import random
import warnings

import numpy as np
import pandas as pd
from scipy.stats import mode
from sklearn.base import ClassifierMixin, BaseEstimator
from sklearn.ensemble import BaseEnsemble
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.feature_selection import mutual_info_classif
import networkx as nx
from pgmpy.estimators import TreeSearch, BayesianEstimator
from pgmpy.models import BayesianNetwork
import matplotlib.pyplot as plt
from fimdlp.mdlp import FImdlp

from ._version import __version__


class BayesBase(BaseEstimator, ClassifierMixin):
    def __init__(self, random_state, show_progress):
        self.random_state = random_state
        self.show_progress = show_progress

    def _more_tags(self):
        return {
            "requires_positive_X": True,
            "requires_positive_y": True,
            "preserve_dtype": [np.int32, np.int64],
            "requires_y": True,
        }

    @staticmethod
    def version() -> str:
        """Return the version of the package."""
        return __version__

    def nodes_edges(self):
        if hasattr(self, "dag_"):
            return len(self.dag_), len(self.dag_.edges())
        return 0, 0

    @staticmethod
    def default_feature_names(num_features):
        return [f"feature_{i}" for i in range(num_features)]

    @staticmethod
    def default_class_name():
        return "class"

    def _check_params_fit(self, X, y, expected_args, kwargs):
        """Check the common parameters passed to fit."""
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        X = self._validate_data(X, reset=True)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        self.n_classes_ = self.classes_.shape[0]
        # Default values
        self.class_name_ = self.default_class_name()
        self.features_ = self.default_feature_names(X.shape[1])
        for key, value in kwargs.items():
            if key in expected_args:
                setattr(self, f"{key}_", value)
            else:
                raise ValueError(f"Unexpected argument: {key}")
        self.feature_names_in_ = self.features_
        if self.random_state is not None:
            random.seed(self.random_state)
        if len(self.feature_names_in_) != X.shape[1]:
            raise ValueError(
                "Number of features does not match the number of columns in X"
            )
        self.n_features_in_ = X.shape[1]
        return X, y

    @property
    def states_(self):
        if hasattr(self, "fitted_"):
            return sum(len(item) for _, item in self.model_.states.items())
        return 0

    @property
    def depth_(self):
        return self.states_

    def fit(self, X, y, **kwargs):
        """Fit classifier

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The training input samples.
        y : array-like, shape (n_samples,)
            The target values. An array of int.
        **kwargs : dict
            class_name : str (default='class') Name of the class column
            features : list (default=None) List of feature names
            head : int (default=None) Index of the head node. By default,
                the node with the highest sum of weights (mutual_info) is
                used.

        Returns
        -------
        self : object
            Returns self.

        Examples
        --------
        >>> import numpy as np
        >>> import pandas as pd
        >>> from bayesclass.clfs import TAN
        >>> features = ['A', 'B', 'C', 'D', 'E']
        >>> np.random.seed(17)
        >>> values = pd.DataFrame(np.random.randint(low=0, high=2,
        ...                       size=(1000, 5)), columns=features)
        >>> train_data = values[:800]
        >>> train_y = train_data['E']
        >>> predict_data = values[800:]
        >>> train_data = train_data.drop('E', axis=1)
        >>> model = TAN(random_state=17)
        >>> features.remove('E')
        >>> model.fit(train_data, train_y, features=features, class_name='E')
        TAN(random_state=17)
        """
        X_, y_ = self._check_params(X, y, kwargs)
        # Store the information needed to build the model
        self.X_ = X_
        self.y_ = y_
        self.dataset_ = pd.DataFrame(
            self.X_, columns=self.feature_names_in_, dtype=np.int32
        )
        self.dataset_[self.class_name_] = self.y_
        # Build the DAG
        self._build()
        # Train the model
        self._train(kwargs)
        self.fitted_ = True
        # To keep compatibility with the benchmark platform
        self.nodes_leaves = self.nodes_edges
        # Return the classifier
        return self

    def _train(self, kwargs):
        self.model_ = BayesianNetwork(
            self.dag_.edges(), show_progress=self.show_progress
        )
        states = dict(state_names=kwargs.pop("state_names", []))
        # prior_type="K2" is a Dirichlet prior with pseudo-count 1 per state
        self.model_.fit(
            self.dataset_,
            estimator=BayesianEstimator,
            prior_type="K2",
            **states,
        )

    def predict(self, X):
        """Predict the class of each sample in X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        y : ndarray, shape (n_samples,)
            The predicted class for each sample.

        Examples
        --------
        >>> import numpy as np
        >>> import pandas as pd
        >>> from bayesclass.clfs import TAN
        >>> features = ['A', 'B', 'C', 'D', 'E']
        >>> np.random.seed(17)
        >>> values = pd.DataFrame(np.random.randint(low=0, high=2,
        ...                       size=(1000, 5)), columns=features)
        >>> train_data = values[:800]
        >>> train_y = train_data['E']
        >>> predict_data = values[800:]
        >>> train_data = train_data.drop('E', axis=1)
        >>> model = TAN(random_state=17)
        >>> features.remove('E')
        >>> model.fit(train_data, train_y, features=features, class_name='E')
        TAN(random_state=17)
        >>> predict_data = predict_data.copy()
        >>> predict_data.drop('E', axis=1, inplace=True)
        >>> y_pred = model.predict(predict_data)
        >>> y_pred[:10]
        array([0, 0, 1, 1, 0, 1, 1, 1, 0, 1])
        """
        # Check that fit has been called
        check_is_fitted(self, ["X_", "y_", "fitted_"])
        # Input validation
        X = check_array(X)
        dataset = pd.DataFrame(
            X, columns=self.feature_names_in_, dtype=np.int32
        )
        return self.model_.predict(dataset).values.ravel()

    def plot(self, title="", node_size=800):
        warnings.simplefilter("ignore", UserWarning)
        nx.draw_circular(
            self.model_,
            with_labels=True,
            arrowsize=20,
            node_size=node_size,
            alpha=0.3,
            font_weight="bold",
        )
        plt.title(title)
        plt.show()


class TAN(BayesBase):
    """Tree Augmented Naive Bayes

    Parameters
    ----------
    random_state : int, default=None
        Random state for reproducibility
    show_progress : bool, default=False
        Whether pgmpy should show progress bars

    Attributes
    ----------
    X_ : ndarray, shape (n_samples, n_features)
        The input passed during :meth:`fit`.
    y_ : ndarray, shape (n_samples,)
        The labels passed during :meth:`fit`.
    classes_ : ndarray, shape (n_classes,)
        The classes seen at :meth:`fit`.
    class_name_ : str
        The name of the class column
    feature_names_in_ : list
        The list of feature names
    head_ : int
        The index of the node used as head for the initial DAG
    dataset_ : pd.DataFrame
        The dataset used to train the model (X_ + y_)
    dag_ : nx.DiGraph
        The TAN DAG
    model_ : BayesianNetwork
        The actual classifier
    """

    def __init__(self, show_progress=False, random_state=None):
        super().__init__(
            show_progress=show_progress, random_state=random_state
        )

    def _check_params(self, X, y, kwargs):
        self.head_ = 0
        expected_args = ["class_name", "features", "head", "state_names"]
        X, y = self._check_params_fit(X, y, expected_args, kwargs)
        if self.head_ == "random":
            self.head_ = random.randint(0, self.n_features_in_ - 1)
        if self.head_ is not None and self.head_ >= self.n_features_in_:
            raise ValueError("Head index out of range")
        return X, y

    def _build(self):
        est = TreeSearch(
            self.dataset_, root_node=self.feature_names_in_[self.head_]
        )
        self.dag_ = est.estimate(
            estimator_type="tan",
            class_node=self.class_name_,
            show_progress=self.show_progress,
        )
        # Code taken from pgmpy
        # n_jobs = -1
        # weights = TreeSearch._get_conditional_weights(
        #     self.dataset_,
        #     self.class_name_,
        #     "mutual_info",
        #     n_jobs,
        #     self.show_progress,
        # )
        # # Step 4.2: Construct chow-liu DAG on {data.columns - class_node}
        # class_node_idx = np.where(self.dataset_.columns == self.class_name_)[
        #     0
        # ][0]
        # weights = np.delete(weights, class_node_idx, axis=0)
        # weights = np.delete(weights, class_node_idx, axis=1)
        # reduced_columns = np.delete(self.dataset_.columns, class_node_idx)
        # D = TreeSearch._create_tree_and_dag(
        #     weights, reduced_columns, self.feature_names_in_[self.head_]
        # )
        # # Step 4.3: Add edges from class_node to all other nodes.
        # D.add_edges_from(
        #     [(self.class_name_, node) for node in reduced_columns]
        # )
        # self.dag_ = D
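

# Illustrative sketch (assumed usage, not part of the API): once a TAN
# model is fitted, the learned structure can be inspected through its
# ``dag_`` attribute; every feature receives an arc from the class plus
# at most one arc from another feature (the tree), e.g.:
#
#   model = TAN(random_state=17)
#   model.fit(train_data, train_y, features=features, class_name='E')
#   list(model.dag_.edges())  # class->feature arcs plus the feature tree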


class KDB(BayesBase):
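    """k-Dependence Bayesian classifier (KDB).

    Builds a DAG in which each feature depends on the class and on up to
    ``k`` other features, selected by conditional mutual information
    (see :meth:`_build`).

    Parameters
    ----------
    k : int
        Maximum number of feature parents per feature, besides the class.
    theta : float, default=0.03
        Minimum conditional mutual information required to add an edge.
    show_progress : bool, default=False
        Whether pgmpy should show progress bars.
    random_state : int, default=None
        Random state for reproducibility.
    """
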
    def __init__(self, k, theta=0.03, show_progress=False, random_state=None):
        self.k = k
        self.theta = theta
        super().__init__(
            show_progress=show_progress, random_state=random_state
        )

    def _check_params(self, X, y, kwargs):
        expected_args = ["class_name", "features", "state_names"]
        return self._check_params_fit(X, y, expected_args, kwargs)

    def _add_m_edges(self, dag, idx, S_nodes, conditional_weights):
        n_edges = min(self.k, len(S_nodes))
        cond_w = conditional_weights.copy()
        exit_cond = self.k == 0
        num = 0
        while not exit_cond:
            max_minfo = np.argmax(cond_w[idx, :])
            if max_minfo in S_nodes and cond_w[idx, max_minfo] > self.theta:
                try:
                    dag.add_edge(
                        self.feature_names_in_[max_minfo],
                        self.feature_names_in_[idx],
                    )
                    num += 1
                except ValueError:
                    # Loops are not allowed
                    pass
            cond_w[idx, max_minfo] = -1
            exit_cond = num == n_edges or np.all(cond_w[idx, :] <= self.theta)

    def _build(self):
        """
        1. For each feature Xi, compute mutual information, I(Xi;C),
           where C is the class.
        2. Compute class conditional mutual information I(Xi;Xj|C), for
           each pair of features Xi and Xj, where i != j.
        3. Let the used variable list, S, be empty.
        4. Let the Bayesian network being constructed, BN, begin with a
           single class node, C.
        5. Repeat until S includes all domain features
            5.1. Select feature Xmax which is not in S and has the
                 largest value I(Xmax;C).
            5.2. Add a node to BN representing Xmax.
            5.3. Add an arc from C to Xmax in BN.
            5.4. Add m = min(|S|, k) arcs from m distinct features Xj in
                 S with the highest value for I(Xmax;Xj|C).
            5.5. Add Xmax to S.
        Compute the conditional probability inferred by the structure of
        BN by using counts from DB, and output BN.
        """
        # 1. get the mutual information between each feature and the class
        mutual = mutual_info_classif(self.X_, self.y_, discrete_features=True)
        # 2. symmetric matrix where each element represents I(X, Y| class_node)
        conditional_weights = TreeSearch(
            self.dataset_
        )._get_conditional_weights(
            self.dataset_, self.class_name_, show_progress=self.show_progress
        )
        # 3. Let the used variable list, S, be empty.
        S_nodes = []
        # 4. Let the BN being constructed, BN, begin with a single class node
        dag = BayesianNetwork()
        dag.add_node(self.class_name_)  # , state_names=self.classes_)
        # 5. Repeat until S includes all domain features
        # 5.1 Select feature Xmax which is not in S and has the largest
        # value I(Xmax;C), i.e. iterate the features in descending order
        # of mutual information with the class
        for idx in np.argsort(mutual)[::-1]:
            # 5.2 Add a node to BN representing Xmax.
            feature = self.feature_names_in_[idx]
            dag.add_node(feature)
            # 5.3 Add an arc from C to Xmax in BN.
            dag.add_edge(self.class_name_, feature)
            # 5.4 Add m = min(|S|, k) arcs from m distinct features Xj in S
            self._add_m_edges(dag, idx, S_nodes, conditional_weights)
            # 5.5 Add Xmax to S.
            S_nodes.append(idx)
        self.dag_ = dag


class AODE(BayesBase, BaseEnsemble):
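    """Averaged One-Dependence Estimators (AODE).

    Builds one SPODE (Super-Parent One-Dependence Estimator) per feature,
    each with that feature acting as super-parent of all the others, and
    predicts by majority vote of the ensemble.

    Parameters
    ----------
    show_progress : bool, default=False
        Whether pgmpy should show progress bars.
    random_state : int, default=None
        Random state for reproducibility.
    """
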
    def __init__(self, show_progress=False, random_state=None):
        super().__init__(
            show_progress=show_progress, random_state=random_state
        )

    def _check_params(self, X, y, kwargs):
        expected_args = ["class_name", "features", "state_names"]
        return self._check_params_fit(X, y, expected_args, kwargs)

    def nodes_edges(self):
        nodes = 0
        edges = 0
        if hasattr(self, "fitted_"):
            nodes = sum(len(x) for x in self.models_)
            edges = sum(len(x.edges()) for x in self.models_)
        return nodes, edges

    @property
    def states_(self):
        if hasattr(self, "fitted_"):
            return sum(
                len(item)
                for model in self.models_
                for _, item in model.states.items()
            ) / len(self.models_)
        return 0

    def _build(self):
        self.dag_ = None

    def _train(self, kwargs):
        """Build the SPODE (Super-Parent One-Dependence Estimator) models"""
        self.models_ = []
        class_edges = [(self.class_name_, f) for f in self.feature_names_in_]
        states = dict(state_names=kwargs.pop("state_names", []))
        for idx in range(self.n_features_in_):
            feature_edges = [
                (self.feature_names_in_[idx], f)
                for f in self.feature_names_in_
                if f != self.feature_names_in_[idx]
            ]
            feature_edges.extend(class_edges)
            model = BayesianNetwork(
                feature_edges, show_progress=self.show_progress
            )
            model.fit(
                self.dataset_,
                estimator=BayesianEstimator,
                prior_type="K2",
                **states,
            )
            self.models_.append(model)

    def plot(self, title=""):
        warnings.simplefilter("ignore", UserWarning)
        for idx, model in enumerate(self.models_):
            self.model_ = model
            super().plot(title=f"{idx} {title}")

    def predict(self, X: np.ndarray) -> np.ndarray:
        # Check that fit has been called
        check_is_fitted(self, ["X_", "y_", "fitted_"])
        # Input validation
        X = check_array(X)
        n_samples = X.shape[0]
        n_estimators = len(self.models_)
        result = np.empty((n_samples, n_estimators))
        dataset = pd.DataFrame(
            X, columns=self.feature_names_in_, dtype=np.int32
        )
        # Each SPODE votes and the most voted class is returned per sample
        for index, model in enumerate(self.models_):
            result[:, index] = model.predict(dataset).values.ravel()
        return mode(result, axis=1, keepdims=False).mode.ravel()
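

# Illustrative sketch (assumed usage, mirroring the TAN doctests above;
# the predicted values depend on the data):
#
#   import numpy as np
#   from bayesclass.clfs import AODE
#   X = np.random.randint(0, 2, size=(100, 4))
#   y = np.random.randint(0, 2, size=100)
#   clf = AODE(random_state=17).fit(X, y)
#   clf.predict(X[:5])  # majority vote of the 4 SPODE models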


class TANNew(TAN):
    def fit(self, X, y, **kwargs):
        self.estimator = Proposal(self)
        return self.estimator.fit(X, y, **kwargs)

    def predict(self, X):
        return self.estimator.predict(X)


class KDBNew(KDB):
    def fit(self, X, y, **kwargs):
        self.estimator = Proposal(self)
        return self.estimator.fit(X, y, **kwargs)

    def predict(self, X):
        return self.estimator.predict(X)


class Proposal:
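    """Iterative local discretization wrapper.

    ``fit`` discretizes the training data with FImdlp, fits the wrapped
    estimator, then re-discretizes every feature that has feature parents
    in the induced DAG (conditioning on those parents) and refits if any
    feature changed.
    """
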
    def __init__(self, estimator):
        self.estimator = estimator
        self.class_type = estimator.__class__

    def fit(self, X, y, **kwargs):
        # Discretize train data
        self.discretizer = FImdlp(n_jobs=1)
        self.Xd = self.discretizer.fit_transform(X, y)
        kwargs = self.update_kwargs(y, kwargs)
        # Build the model
        super(self.class_type, self.estimator).fit(self.Xd, y, **kwargs)
        self.check_integrity("f", self.Xd)
        # Local discretization based on the model
        features = kwargs["features"]
        # assign indices to feature names
        self.idx_features_ = dict(list(zip(features, range(len(features)))))
        upgraded, self.Xd = self._local_discretization()
        if upgraded:
            kwargs = self.update_kwargs(y, kwargs)
            super(self.class_type, self.estimator).fit(self.Xd, y, **kwargs)
        # Return the wrapped estimator so fit can be chained
        return self.estimator

    def predict(self, X):
        Xd = self.discretizer.transform(X)
        self.check_integrity("p", Xd)
        return super(self.class_type, self.estimator).predict(Xd)

    def update_kwargs(self, y, kwargs):
        features = (
            kwargs["features"]
            if "features" in kwargs
            else self.estimator.default_feature_names(self.Xd.shape[1])
        )
        states = {
            features[i]: self.discretizer.get_states_feature(i)
            for i in range(self.Xd.shape[1])
        }
        class_name = (
            kwargs["class_name"]
            if "class_name" in kwargs
            else self.estimator.default_class_name()
        )
        states[class_name] = np.unique(y).tolist()
        kwargs["state_names"] = states
        self.state_names_ = states
        self.features_ = features
        kwargs["features"] = features
        kwargs["class_name"] = class_name
        return kwargs

    def _local_discretization(self):
        """Discretize each feature with its fathers and the class"""
        res = self.Xd.copy()
        upgraded = False
        for idx, feature in enumerate(self.estimator.feature_names_in_):
            fathers = self.estimator.dag_.get_parents(feature)
            if len(fathers) > 1:
                # First remove the class name as it will be added later
                fathers.remove(self.estimator.class_name_)
                # Get the fathers indices
                features = [self.idx_features_[f] for f in fathers]
                # Update the discretization of the feature
                res[:, idx] = self.discretizer.join_fit(
                    target=idx, features=features, data=self.Xd
                )
                upgraded = True
        return upgraded, res

    def check_integrity(self, source, X):
        # Check that every value in X is a known state of its feature
        for i in range(X.shape[1]):
            if not set(np.unique(X[:, i]).tolist()).issubset(
                set(self.state_names_[self.features_[i]])
            ):
                print(
                    "i",
                    i,
                    "features[i]",
                    self.features_[i],
                    "np.unique(X[:, i])",
                    np.unique(X[:, i]),
                    "np.array(state_names[features[i]])",
                    np.array(self.state_names_[self.features_[i]]),
                )
                raise ValueError("Discretization error")
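

# Illustrative sketch (assumed usage): TANNew and KDBNew can take
# continuous features, since Proposal discretizes them with FImdlp
# before delegating to the wrapped classifier:
#
#   from sklearn.datasets import load_iris
#   X, y = load_iris(return_X_y=True)
#   clf = TANNew(random_state=17)
#   clf.fit(X, y, features=[f"f{i}" for i in range(4)], class_name="class")
#   clf.predict(X[:5])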