diff --git a/bayesclass/__init__.py b/bayesclass/__init__.py
index 0ef73dd..803f64a 100644
--- a/bayesclass/__init__.py
+++ b/bayesclass/__init__.py
@@ -17,4 +17,5 @@ __all__ = [
     "KDB",
     "AODE",
     "KDBNew",
+    "AODENew",
 ]
diff --git a/bayesclass/clfs.py b/bayesclass/clfs.py
index 7db719b..566fb76 100644
--- a/bayesclass/clfs.py
+++ b/bayesclass/clfs.py
@@ -460,6 +460,21 @@ class AODE(BayesBase, BaseEnsemble):
 
 
 class TANNew(TAN):
+    def __init__(
+        self,
+        show_progress=False,
+        random_state=None,
+        discretizer_depth=1e6,
+        discretizer_length=3,
+        discretizer_cuts=0,
+    ):
+        self.discretizer_depth = discretizer_depth
+        self.discretizer_length = discretizer_length
+        self.discretizer_cuts = discretizer_cuts
+        super().__init__(
+            show_progress=show_progress, random_state=random_state
+        )
+
     def fit(self, X, y, **kwargs):
         self.estimator = Proposal(self)
         return self.estimator.fit(X, y, **kwargs)
@@ -470,6 +485,22 @@
 
 
 class KDBNew(KDB):
+    def __init__(
+        self,
+        k=2,
+        show_progress=False,
+        random_state=None,
+        discretizer_depth=1e6,
+        discretizer_length=3,
+        discretizer_cuts=0,
+    ):
+        self.discretizer_depth = discretizer_depth
+        self.discretizer_length = discretizer_length
+        self.discretizer_cuts = discretizer_cuts
+        super().__init__(
+            k=k, show_progress=show_progress, random_state=random_state
+        )
+
     def fit(self, X, y, **kwargs):
         self.estimator = Proposal(self)
         return self.estimator.fit(X, y, **kwargs)
@@ -478,14 +509,25 @@
         return self.estimator.predict(X)
 
 
+class AODENew(AODE):
+    pass
+
+
 class Proposal:
     def __init__(self, estimator):
         self.estimator = estimator
         self.class_type = estimator.__class__
 
     def fit(self, X, y, **kwargs):
+        # Check parameters
+        super(self.class_type, self.estimator)._check_params(X, y, kwargs)
         # Discretize train data
-        self.discretizer = FImdlp(n_jobs=1)
+        self.discretizer = FImdlp(
+            n_jobs=1,
+            max_depth=self.estimator.discretizer_depth,
+            min_length=self.estimator.discretizer_length,
+            max_cuts=self.estimator.discretizer_cuts,
+        )
         self.Xd = self.discretizer.fit_transform(X, y)
         kwargs = self.update_kwargs(y, kwargs)
         # Build the model
diff --git a/bayesclass/tests/baseline_images/test_KDBNew/line_dashes_KDBNew.png b/bayesclass/tests/baseline_images/test_KDBNew/line_dashes_KDBNew.png
new file mode 100644
index 0000000..0fd3312
Binary files /dev/null and b/bayesclass/tests/baseline_images/test_KDBNew/line_dashes_KDBNew.png differ
diff --git a/bayesclass/tests/baseline_images/test_TANNew/line_dashes_TANNew.png b/bayesclass/tests/baseline_images/test_TANNew/line_dashes_TANNew.png
new file mode 100644
index 0000000..3f55599
Binary files /dev/null and b/bayesclass/tests/baseline_images/test_TANNew/line_dashes_TANNew.png differ
diff --git a/bayesclass/tests/test_KDB.py b/bayesclass/tests/test_KDB.py
index 41b977d..acda865 100644
--- a/bayesclass/tests/test_KDB.py
+++ b/bayesclass/tests/test_KDB.py
@@ -55,7 +55,6 @@ def test_KDB_nodes_edges(clf, data):
 
 def test_KDB_states(clf, data):
     assert clf.states_ == 0
-    clf = KDB(k=3, random_state=17)
     clf.fit(*data)
     assert clf.states_ == 23
     assert clf.depth_ == clf.states_
diff --git a/bayesclass/tests/test_KDBNew.py b/bayesclass/tests/test_KDBNew.py
new file mode 100644
index 0000000..e8948ae
--- /dev/null
+++ b/bayesclass/tests/test_KDBNew.py
@@ -0,0 +1,127 @@
+import pytest
+import numpy as np
+from sklearn.datasets import load_iris
+from sklearn.preprocessing import KBinsDiscretizer
+from matplotlib.testing.decorators import image_comparison
+from matplotlib.testing.conftest import mpl_test_settings
+from pgmpy.models import BayesianNetwork
+
+
+from bayesclass.clfs import KDBNew
+from .._version import __version__
+
+
+@pytest.fixture
+def data():
+    X, y = load_iris(return_X_y=True)
+    enc = KBinsDiscretizer(encode="ordinal")
+    return enc.fit_transform(X), y
+
+
+@pytest.fixture
+def clf():
+    return KDBNew(k=3)
+
+
+def test_KDBNew_default_hyperparameters(data, clf):
+    # Test default values of hyperparameters
+    assert not clf.show_progress
+    assert clf.random_state is None
+    assert clf.theta == 0.03
+    clf = KDBNew(show_progress=True, random_state=17, k=3)
+    assert clf.show_progress
+    assert clf.random_state == 17
+    assert clf.k == 3
+    clf.fit(*data)
+    assert clf.class_name_ == "class"
+    assert clf.feature_names_in_ == [
+        "feature_0",
+        "feature_1",
+        "feature_2",
+        "feature_3",
+    ]
+
+
+def test_KDBNew_version(clf):
+    """Check KDBNew version."""
+    assert __version__ == clf.version()
+
+
+def test_KDBNew_nodes_edges(clf, data):
+    assert clf.nodes_edges() == (0, 0)
+    clf.fit(*data)
+    assert clf.nodes_leaves() == (5, 10)
+
+
+def test_KDBNew_states(clf, data):
+    assert clf.states_ == 0
+    clf.fit(*data)
+    assert clf.states_ == 23
+    assert clf.depth_ == clf.states_
+
+
+def test_KDBNew_classifier(data, clf):
+    clf.fit(*data)
+    attribs = ["classes_", "X_", "y_", "feature_names_in_", "class_name_"]
+    for attr in attribs:
+        assert hasattr(clf, attr)
+    X = data[0]
+    y = data[1]
+    y_pred = clf.predict(X)
+    assert y_pred.shape == (X.shape[0],)
+    assert sum(y == y_pred) == 148
+
+
+@image_comparison(
+    baseline_images=["line_dashes_KDBNew"],
+    remove_text=True,
+    extensions=["png"],
+)
+def test_KDBNew_plot(data, clf):
+    # mpl_test_settings will automatically clean these internal side effects
+    mpl_test_settings
+    dataset = load_iris(as_frame=True)
+    clf.fit(*data, features=dataset["feature_names"])
+    clf.plot("KDBNew Iris")
+
+
+def test_KDBNew_wrong_num_features(data, clf):
+    with pytest.raises(
+        ValueError,
+        match="Number of features does not match the number of columns in X",
+    ):
+        clf.fit(*data, features=["feature_1", "feature_2"])
+
+
+def test_KDBNew_wrong_hyperparam(data, clf):
+    with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
+        clf.fit(*data, wrong_param="wrong_param")
+
+
+def test_KDBNew_error_size_predict(data, clf):
+    X, y = data
+    clf.fit(X, y)
+    with pytest.raises(ValueError):
+        X_diff_size = np.ones((10, X.shape[1] + 1))
+        clf.predict(X_diff_size)
+
+
+def test_KDBNew_dont_do_cycles():
+    clf = KDBNew(k=4)
+    dag = BayesianNetwork()
+    clf.feature_names_in_ = [
+        "feature_0",
+        "feature_1",
+        "feature_2",
+        "feature_3",
+    ]
+    nodes = list(range(4))
+    weights = np.ones((4, 4))
+    for idx in range(1, 4):
+        dag.add_edge(clf.feature_names_in_[0], clf.feature_names_in_[idx])
+    dag.add_edge(clf.feature_names_in_[1], clf.feature_names_in_[2])
+    dag.add_edge(clf.feature_names_in_[1], clf.feature_names_in_[3])
+    dag.add_edge(clf.feature_names_in_[2], clf.feature_names_in_[3])
+    for idx in range(4):
+        clf._add_m_edges(dag, idx, nodes, weights)
+    assert len(dag.edges()) == 6
diff --git a/bayesclass/tests/test_TAN.py b/bayesclass/tests/test_TAN.py
index ddba503..95ec46a 100644
--- a/bayesclass/tests/test_TAN.py
+++ b/bayesclass/tests/test_TAN.py
@@ -19,16 +19,16 @@ def data():
 
 @pytest.fixture
 def clf():
-    return TAN()
+    return TAN(random_state=17)
 
 
 def test_TAN_default_hyperparameters(data, clf):
     # Test default values of hyperparameters
     assert not clf.show_progress
-    assert clf.random_state is None
-    clf = TAN(show_progress=True, random_state=17)
-    assert clf.show_progress
     assert clf.random_state == 17
+    clf = TAN(show_progress=True)
+    assert clf.show_progress
+    assert clf.random_state is None
     clf.fit(*data)
     assert clf.head_ == 0
     assert clf.class_name_ == "class"
@@ -47,21 +47,18 @@ def test_TAN_version(clf):
 
 
 def test_TAN_nodes_edges(clf, data):
     assert clf.nodes_edges() == (0, 0)
-    clf = TAN(random_state=17)
     clf.fit(*data, head="random")
     assert clf.nodes_leaves() == (5, 7)
 
 
 def test_TAN_states(clf, data):
     assert clf.states_ == 0
-    clf = TAN(random_state=17)
     clf.fit(*data)
     assert clf.states_ == 23
     assert clf.depth_ == clf.states_
 
 
-def test_TAN_random_head(data):
-    clf = TAN(random_state=17)
+def test_TAN_random_head(clf, data):
     clf.fit(*data, head="random")
     assert clf.head_ == 3
diff --git a/bayesclass/tests/test_TANNew.py b/bayesclass/tests/test_TANNew.py
new file mode 100644
index 0000000..2208f26
--- /dev/null
+++ b/bayesclass/tests/test_TANNew.py
@@ -0,0 +1,121 @@
+import pytest
+import numpy as np
+from sklearn.datasets import load_iris
+from sklearn.preprocessing import KBinsDiscretizer
+from matplotlib.testing.decorators import image_comparison
+from matplotlib.testing.conftest import mpl_test_settings
+
+
+from bayesclass.clfs import TANNew
+from .._version import __version__
+
+
+@pytest.fixture
+def data():
+    X, y = load_iris(return_X_y=True)
+    enc = KBinsDiscretizer(encode="ordinal")
+    return enc.fit_transform(X), y
+
+
+@pytest.fixture
+def clf():
+    return TANNew(random_state=17)
+
+
+def test_TANNew_default_hyperparameters(data, clf):
+    # Test default values of hyperparameters
+    assert not clf.show_progress
+    assert clf.random_state == 17
+    clf = TANNew(show_progress=True)
+    assert clf.show_progress
+    assert clf.random_state is None
+    clf.fit(*data)
+    assert clf.head_ == 0
+    assert clf.class_name_ == "class"
+    assert clf.feature_names_in_ == [
+        "feature_0",
+        "feature_1",
+        "feature_2",
+        "feature_3",
+    ]
+
+
+def test_TANNew_version(clf):
+    """Check TANNew version."""
+    assert __version__ == clf.version()
+
+
+def test_TANNew_nodes_edges(clf, data):
+    assert clf.nodes_edges() == (0, 0)
+    clf.fit(*data, head="random")
+    assert clf.nodes_leaves() == (5, 7)
+
+
+def test_TANNew_states(clf, data):
+    assert clf.states_ == 0
+    clf.fit(*data)
+    assert clf.states_ == 22
+    assert clf.depth_ == clf.states_
+
+
+def test_TANNew_random_head(clf, data):
+    clf.fit(*data, head="random")
+    assert clf.head_ == 3
+
+
+def test_TANNew_classifier(data, clf):
+    clf.fit(*data)
+    attribs = [
+        "classes_",
+        "X_",
+        "y_",
+        "head_",
+        "feature_names_in_",
+        "class_name_",
+    ]
+    for attr in attribs:
+        assert hasattr(clf, attr)
+    X = data[0]
+    y = data[1]
+    y_pred = clf.predict(X)
+    assert y_pred.shape == (X.shape[0],)
+    assert sum(y == y_pred) == 145
+
+
+@image_comparison(
+    baseline_images=["line_dashes_TANNew"],
+    remove_text=True,
+    extensions=["png"],
+)
+def test_TANNew_plot(data, clf):
+    # mpl_test_settings will automatically clean these internal side effects
+    mpl_test_settings
+    dataset = load_iris(as_frame=True)
+    clf.fit(*data, features=dataset["feature_names"], head=0)
+    clf.plot("TANNew Iris head=0")
+
+
+def test_TANNew_wrong_num_features(data, clf):
+    with pytest.raises(
+        ValueError,
+        match="Number of features does not match the number of columns in X",
+    ):
+        clf.fit(*data, features=["feature_1", "feature_2"])
+
+
+def test_TANNew_wrong_hyperparam(data, clf):
+    with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
+        clf.fit(*data, wrong_param="wrong_param")
+
+
+def test_TANNew_head_out_of_range(data, clf):
+    with pytest.raises(ValueError, match="Head index out of range"):
+        clf.fit(*data, head=4)
+
+
+def test_TANNew_error_size_predict(data, clf):
+    X, y = data
+    clf.fit(X, y)
+    with pytest.raises(ValueError):
+        X_diff_size = np.ones((10, X.shape[1] + 1))
+        clf.predict(X_diff_size)
diff --git a/pyproject.toml b/pyproject.toml
index 9fa621d..b8c0021 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,7 @@ dependencies = [
     "pgmpy",
     "networkx",
     "matplotlib",
+    "fimdlp",
 ]
 requires-python = ">=3.8"
 classifiers = [
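
Usage sketch (not part of the patch): the snippet below mirrors the fixtures in the new test files, so every class, method, and parameter shown is taken from this diff; the discretizer_* values are the constructor defaults introduced in bayesclass/clfs.py.

    from sklearn.datasets import load_iris
    from sklearn.preprocessing import KBinsDiscretizer

    from bayesclass.clfs import KDBNew, TANNew

    # Same preprocessing as the test fixtures: ordinal-coded bins.
    X, y = load_iris(return_X_y=True)
    X = KBinsDiscretizer(encode="ordinal").fit_transform(X)

    # The *New variants re-discretize the data internally with FImdlp,
    # configured through the new constructor arguments.
    clf = KDBNew(
        k=2,
        discretizer_depth=1e6,
        discretizer_length=3,
        discretizer_cuts=0,
    )
    clf.fit(X, y)
    print(clf.predict(X).shape)  # (150,)

    # TANNew takes the same discretizer arguments, minus k.
    clf_tan = TANNew(random_state=17)
    clf_tan.fit(X, y)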