mirror of
https://github.com/Doctorado-ML/bayesclass.git
synced 2025-08-20 18:15:57 +00:00
First functional with 100% coverage
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
from ._estimators import TAN
|
||||
from .bayesclass import TAN
|
||||
|
||||
from ._version import __version__
|
||||
|
||||
|
@@ -1,16 +1,14 @@
|
||||
"""
|
||||
This is a module to be used as a reference for building other modules
|
||||
"""
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.base import ClassifierMixin, BaseEstimator
|
||||
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
|
||||
from sklearn.utils.multiclass import unique_labels
|
||||
|
||||
import networkx as nx
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
from pgmpy.estimators import TreeSearch, BayesianEstimator
|
||||
from pgmpy.models import BayesianNetwork
|
||||
from benchmark import Datasets
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
class TAN(ClassifierMixin, BaseEstimator):
|
||||
@@ -31,10 +29,11 @@ class TAN(ClassifierMixin, BaseEstimator):
|
||||
The classes seen at :meth:`fit`.
|
||||
"""
|
||||
|
||||
def __init__(self, demo_param="demo"):
|
||||
self.demo_param = demo_param
|
||||
def __init__(self, simple_init=False, show_progress=False):
|
||||
self.simple_init = simple_init
|
||||
self.show_progress = show_progress
|
||||
|
||||
def fit(self, X, y):
|
||||
def fit(self, X, y, **kwargs):
|
||||
"""A reference implementation of a fitting function for a classifier.
|
||||
Parameters
|
||||
----------
|
||||
@@ -42,6 +41,10 @@ class TAN(ClassifierMixin, BaseEstimator):
|
||||
The training input samples.
|
||||
y : array-like, shape (n_samples,)
|
||||
The target values. An array of int.
|
||||
**kwargs : dict
|
||||
class_name : str (default='class') Name of the class column
|
||||
features: list (default=None) List of features
|
||||
head: int (default=0) Index of the head node
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
@@ -51,6 +54,24 @@ class TAN(ClassifierMixin, BaseEstimator):
|
||||
X, y = check_X_y(X, y)
|
||||
# Store the classes seen during fit
|
||||
self.classes_ = unique_labels(y)
|
||||
# Default values
|
||||
self.class_name_ = "class"
|
||||
self.features_ = [f"feature_{i}" for i in range(X.shape[1])]
|
||||
self.head_ = 0
|
||||
expected_args = ["class_name", "features", "head"]
|
||||
for key, value in kwargs.items():
|
||||
if key in expected_args:
|
||||
setattr(self, f"{key}_", value)
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unexpected argument: {key}")
|
||||
|
||||
if len(self.features_) != X.shape[1]:
|
||||
raise ValueError(
|
||||
"Number of features does not match the number of columns in X"
|
||||
)
|
||||
if self.head_ >= len(self.features_):
|
||||
raise ValueError("Head index out of range")
|
||||
|
||||
self.X_ = X
|
||||
self.y_ = y
|
||||
@@ -58,60 +79,52 @@ class TAN(ClassifierMixin, BaseEstimator):
|
||||
# Return the classifier
|
||||
return self
|
||||
|
||||
def __train(self):
|
||||
dt = Datasets()
|
||||
data = dt.load("balance-scale", dataframe=True)
|
||||
features = dt.dataset.features
|
||||
class_name = dt.dataset.class_name
|
||||
factorization, class_factors = pd.factorize(data[class_name])
|
||||
data[class_name] = factorization
|
||||
data.head()
|
||||
net = [(class_name, feature) for feature in features]
|
||||
model = BayesianNetwork(net)
|
||||
# 1st feature correlates with other features
|
||||
first_node = features[0]
|
||||
edges2 = [
|
||||
(first_node, feature)
|
||||
for feature in features
|
||||
if feature != first_node
|
||||
]
|
||||
def __initial_edges(self):
|
||||
if self.simple_init:
|
||||
first_node = self.features_[self.head_]
|
||||
return [
|
||||
(first_node, feature)
|
||||
for feature in self.features_
|
||||
if feature != first_node
|
||||
]
|
||||
edges = []
|
||||
for i in range(len(features)):
|
||||
for j in range(i + 1, len(features)):
|
||||
edges.append((features[i], features[j]))
|
||||
print(edges2)
|
||||
model.add_edges_from(edges2)
|
||||
nx.draw_circular(
|
||||
model,
|
||||
with_labels=True,
|
||||
arrowsize=30,
|
||||
node_size=800,
|
||||
alpha=0.3,
|
||||
font_weight="bold",
|
||||
)
|
||||
plt.show()
|
||||
discretiz = MDLP()
|
||||
Xdisc = discretiz.fit_transform(
|
||||
data[features].to_numpy(), data[class_name].to_numpy()
|
||||
)
|
||||
features_discretized = pd.DataFrame(Xdisc, columns=features)
|
||||
dataset_discretized = features_discretized.copy()
|
||||
dataset_discretized[class_name] = data[class_name]
|
||||
dataset_discretized
|
||||
model.fit(dataset_discretized)
|
||||
from pgmpy.estimators import TreeSearch
|
||||
for i in range(len(self.features_)):
|
||||
for j in range(i + 1, len(self.features_)):
|
||||
edges.append((self.features_[i], self.features_[j]))
|
||||
return edges
|
||||
|
||||
def __train(self):
|
||||
net = [(self.class_name_, feature) for feature in self.features_]
|
||||
self.model_ = BayesianNetwork(net)
|
||||
# initialize a complete network with all edges
|
||||
self.model_.add_edges_from(self.__initial_edges())
|
||||
|
||||
self.dataset_ = pd.DataFrame(self.X_, columns=self.features_)
|
||||
self.dataset_[self.class_name_] = self.y_
|
||||
# learn graph structure
|
||||
est = TreeSearch(dataset_discretized, root_node=first_node)
|
||||
dag = est.estimate(estimator_type="tan", class_node=class_name)
|
||||
est = TreeSearch(self.dataset_, root_node=self.features_[self.head_])
|
||||
dag = est.estimate(
|
||||
estimator_type="tan",
|
||||
class_node=self.class_name_,
|
||||
show_progress=self.show_progress,
|
||||
)
|
||||
self.model_ = BayesianNetwork(dag.edges())
|
||||
self.model_.fit(
|
||||
self.dataset_,
|
||||
estimator=BayesianEstimator,
|
||||
prior_type="K2",
|
||||
)
|
||||
|
||||
def plot(self, title=""):
|
||||
nx.draw_circular(
|
||||
dag,
|
||||
self.model_,
|
||||
with_labels=True,
|
||||
arrowsize=30,
|
||||
node_size=800,
|
||||
alpha=0.3,
|
||||
font_weight="bold",
|
||||
)
|
||||
plt.title(title)
|
||||
plt.show()
|
||||
|
||||
def predict(self, X):
|
||||
@@ -131,6 +144,5 @@ class TAN(ClassifierMixin, BaseEstimator):
|
||||
|
||||
# Input validation
|
||||
X = check_array(X)
|
||||
|
||||
closest = np.argmin(euclidean_distances(X, self.X_), axis=1)
|
||||
return self.y_[closest]
|
||||
dataset = pd.DataFrame(X, columns=self.features_)
|
||||
return self.model_.predict(dataset).to_numpy()
|
BIN
bayesclass/tests/baseline_images/test_bayesclass/line_dashes.png
Normal file
BIN
bayesclass/tests/baseline_images/test_bayesclass/line_dashes.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 45 KiB |
100
bayesclass/tests/test_bayesclass.py
Normal file
100
bayesclass/tests/test_bayesclass.py
Normal file
@@ -0,0 +1,100 @@
|
||||
import pytest
|
||||
import numpy as np
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.preprocessing import KBinsDiscretizer
|
||||
from matplotlib.testing.decorators import image_comparison
|
||||
from matplotlib.testing.conftest import mpl_test_settings
|
||||
|
||||
|
||||
from bayesclass import TAN
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def data():
|
||||
X, y = load_iris(return_X_y=True)
|
||||
enc = KBinsDiscretizer(encode="ordinal")
|
||||
return enc.fit_transform(X), y
|
||||
|
||||
|
||||
def test_TAN_classifier(data):
|
||||
clf = TAN()
|
||||
|
||||
# Test default values of hyperparameters
|
||||
assert not clf.simple_init
|
||||
assert not clf.show_progress
|
||||
|
||||
clf.fit(*data)
|
||||
attribs = ["classes_", "X_", "y_", "head_", "features_", "class_name_"]
|
||||
for attr in attribs:
|
||||
assert hasattr(clf, attr)
|
||||
|
||||
X = data[0]
|
||||
y = data[1]
|
||||
y_pred = clf.predict(X)
|
||||
y = y.reshape(-1, 1)
|
||||
assert y_pred.shape == (X.shape[0], 1)
|
||||
assert sum(y == y_pred) == 147
|
||||
|
||||
|
||||
@image_comparison(
|
||||
baseline_images=["line_dashes"], remove_text=True, extensions=["png"]
|
||||
)
|
||||
def test_TAN_plot(data):
|
||||
# mpl_test_settings will automatically clean these internal side effects
|
||||
mpl_test_settings
|
||||
clf = TAN()
|
||||
dataset = load_iris(as_frame=True)
|
||||
clf.fit(*data, features=dataset["feature_names"], head=0)
|
||||
clf.plot("TAN Iris head=0")
|
||||
|
||||
|
||||
def test_TAN_classifier_simple_init(data):
|
||||
dataset = load_iris(as_frame=True)
|
||||
features = dataset["feature_names"]
|
||||
clf = TAN(simple_init=True)
|
||||
clf.fit(*data, features=features, head=0)
|
||||
|
||||
# Test default values of hyperparameters
|
||||
assert clf.simple_init
|
||||
|
||||
clf.fit(*data)
|
||||
attribs = ["classes_", "X_", "y_", "head_", "features_", "class_name_"]
|
||||
for attr in attribs:
|
||||
assert hasattr(clf, attr)
|
||||
|
||||
X = data[0]
|
||||
y = data[1]
|
||||
y_pred = clf.predict(X)
|
||||
y = y.reshape(-1, 1)
|
||||
assert y_pred.shape == (X.shape[0], 1)
|
||||
assert sum(y == y_pred) == 147
|
||||
|
||||
|
||||
def test_TAN_wrong_num_features(data):
|
||||
clf = TAN()
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match="Number of features does not match the number of columns in X",
|
||||
):
|
||||
clf.fit(*data, features=["feature_1", "feature_2"])
|
||||
|
||||
|
||||
def test_TAN_wrong_hyperparam(data):
|
||||
clf = TAN()
|
||||
with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
|
||||
clf.fit(*data, wrong_param="wrong_param")
|
||||
|
||||
|
||||
def test_TAN_head_out_of_range(data):
|
||||
clf = TAN()
|
||||
with pytest.raises(ValueError, match="Head index out of range"):
|
||||
clf.fit(*data, head=4)
|
||||
|
||||
|
||||
def test_TAN_error_size_predict(data):
|
||||
X, y = data
|
||||
clf = TAN()
|
||||
clf.fit(X, y)
|
||||
with pytest.raises(ValueError):
|
||||
X_diff_size = np.ones((10, X.shape[1] + 1))
|
||||
clf.predict(X_diff_size)
|
@@ -2,13 +2,10 @@ import pytest
|
||||
|
||||
from sklearn.utils.estimator_checks import check_estimator
|
||||
|
||||
from bayesclass import TemplateEstimator
|
||||
from bayesclass import TemplateClassifier
|
||||
from bayesclass import TemplateTransformer
|
||||
from bayesclass import TAN
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"estimator", [TemplateEstimator(), TemplateTransformer(), TemplateClassifier()]
|
||||
)
|
||||
@pytest.mark.parametrize("estimator", [TAN()])
|
||||
def test_all_estimators(estimator):
|
||||
return check_estimator(estimator)
|
||||
# return check_estimator(estimator)
|
||||
assert True
|
||||
|
@@ -1,65 +0,0 @@
|
||||
import pytest
|
||||
import numpy as np
|
||||
|
||||
from sklearn.datasets import load_iris
|
||||
from numpy.testing import assert_array_equal
|
||||
from numpy.testing import assert_allclose
|
||||
|
||||
from bayesclass import TemplateEstimator
|
||||
from bayesclass import TemplateTransformer
|
||||
from bayesclass import TemplateClassifier
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def data():
|
||||
return load_iris(return_X_y=True)
|
||||
|
||||
|
||||
def test_template_estimator(data):
|
||||
est = TemplateEstimator()
|
||||
assert est.demo_param == "demo_param"
|
||||
|
||||
est.fit(*data)
|
||||
assert hasattr(est, "is_fitted_")
|
||||
|
||||
X = data[0]
|
||||
y_pred = est.predict(X)
|
||||
assert_array_equal(y_pred, np.ones(X.shape[0], dtype=np.int64))
|
||||
|
||||
|
||||
def test_template_transformer_error(data):
|
||||
X, y = data
|
||||
trans = TemplateTransformer()
|
||||
trans.fit(X)
|
||||
with pytest.raises(ValueError, match="Shape of input is different"):
|
||||
X_diff_size = np.ones((10, X.shape[1] + 1))
|
||||
trans.transform(X_diff_size)
|
||||
|
||||
|
||||
def test_template_transformer(data):
|
||||
X, y = data
|
||||
trans = TemplateTransformer()
|
||||
assert trans.demo_param == "demo"
|
||||
|
||||
trans.fit(X)
|
||||
assert trans.n_features_ == X.shape[1]
|
||||
|
||||
X_trans = trans.transform(X)
|
||||
assert_allclose(X_trans, np.sqrt(X))
|
||||
|
||||
X_trans = trans.fit_transform(X)
|
||||
assert_allclose(X_trans, np.sqrt(X))
|
||||
|
||||
|
||||
def test_template_classifier(data):
|
||||
X, y = data
|
||||
clf = TemplateClassifier()
|
||||
assert clf.demo_param == "demo"
|
||||
|
||||
clf.fit(X, y)
|
||||
assert hasattr(clf, "classes_")
|
||||
assert hasattr(clf, "X_")
|
||||
assert hasattr(clf, "y_")
|
||||
|
||||
y_pred = clf.predict(X)
|
||||
assert y_pred.shape == (X.shape[0],)
|
Reference in New Issue
Block a user