Add KDBNew and TANNew tests

2025-08-15 07:35:53 +00:00 · 2023-03-23 14:13:01 +01:00
parent 2ffc06b232
commit ea8c5b805e
9 changed files with 298 additions and 10 deletions
--- a/bayesclass/__init__.py
+++ b/bayesclass/__init__.py
@@ -17,4 +17,5 @@ __all__ = [
    "KDB",
    "AODE",
    "KDBNew",
+    "AODENew",
 ]
--- a/bayesclass/clfs.py
+++ b/bayesclass/clfs.py
@@ -460,6 +460,21 @@ class AODE(BayesBase, BaseEnsemble):


 class TANNew(TAN):
+    def __init__(
+        self,
+        show_progress=False,
+        random_state=None,
+        discretizer_depth=1e6,
+        discretizer_length=3,
+        discretizer_cuts=0,
+    ):
+        self.discretizer_depth = discretizer_depth
+        self.discretizer_length = discretizer_length
+        self.discretizer_cuts = discretizer_cuts
+        super().__init__(
+            show_progress=show_progress, random_state=random_state
+        )
+
    def fit(self, X, y, **kwargs):
        self.estimator = Proposal(self)
        return self.estimator.fit(X, y, **kwargs)
@@ -470,6 +485,22 @@ class TANNew(TAN):


 class KDBNew(KDB):
+    def __init__(
+        self,
+        k=2,
+        show_progress=False,
+        random_state=None,
+        discretizer_depth=1e6,
+        discretizer_length=3,
+        discretizer_cuts=0,
+    ):
+        self.discretizer_depth = discretizer_depth
+        self.discretizer_length = discretizer_length
+        self.discretizer_cuts = discretizer_cuts
+        super().__init__(
+            k=k, show_progress=show_progress, random_state=random_state
+        )
+
    def fit(self, X, y, **kwargs):
        self.estimator = Proposal(self)
        return self.estimator.fit(X, y, **kwargs)
@@ -478,14 +509,25 @@ class KDBNew(KDB):
        return self.estimator.predict(X)


+class AODENew(AODE):
+    pass
+
+
 class Proposal:
    def __init__(self, estimator):
        self.estimator = estimator
        self.class_type = estimator.__class__

    def fit(self, X, y, **kwargs):
+        # Check parameters
+        super(self.class_type, self.estimator)._check_params(X, y, kwargs)
        # Discretize train data
-        self.discretizer = FImdlp(n_jobs=1)
+        self.discretizer = FImdlp(
+            n_jobs=1,
+            max_depth=self.estimator.discretizer_depth,
+            min_length=self.estimator.discretizer_length,
+            max_cuts=self.estimator.discretizer_cuts,
+        )
        self.Xd = self.discretizer.fit_transform(X, y)
        kwargs = self.update_kwargs(y, kwargs)
        # Build the model
--- a/bayesclass/tests/baseline_images/test_KDBNew/line_dashes_KDBNew.png
+++ b/bayesclass/tests/baseline_images/test_KDBNew/line_dashes_KDBNew.png
--- a/bayesclass/tests/baseline_images/test_TANNew/line_dashes_TANNew.png
+++ b/bayesclass/tests/baseline_images/test_TANNew/line_dashes_TANNew.png
--- a/bayesclass/tests/test_KDB.py
+++ b/bayesclass/tests/test_KDB.py
@@ -55,7 +55,6 @@ def test_KDB_nodes_edges(clf, data):

 def test_KDB_states(clf, data):
    assert clf.states_ == 0
-    clf = KDB(k=3, random_state=17)
    clf.fit(*data)
    assert clf.states_ == 23
    assert clf.depth_ == clf.states_
--- a/bayesclass/tests/test_KDBNew.py
+++ b/bayesclass/tests/test_KDBNew.py
@@ -0,0 +1,127 @@
+import pytest
+import numpy as np
+from sklearn.datasets import load_iris
+from sklearn.preprocessing import KBinsDiscretizer
+from matplotlib.testing.decorators import image_comparison
+from matplotlib.testing.conftest import mpl_test_settings
+from pgmpy.models import BayesianNetwork
+
+
+from bayesclass.clfs import KDBNew
+from .._version import __version__
+
+
+@pytest.fixture
+def data():
+    X, y = load_iris(return_X_y=True)
+    enc = KBinsDiscretizer(encode="ordinal")
+    return enc.fit_transform(X), y
+
+
+@pytest.fixture
+def clf():
+    return KDBNew(k=3)
+
+
+def test_KDBNew_default_hyperparameters(data, clf):
+    # Test default values of hyperparameters
+    assert not clf.show_progress
+    assert clf.random_state is None
+    assert clf.theta == 0.03
+    clf = KDBNew(show_progress=True, random_state=17, k=3)
+    assert clf.show_progress
+    assert clf.random_state == 17
+    assert clf.k == 3
+    clf.fit(*data)
+    assert clf.class_name_ == "class"
+    assert clf.feature_names_in_ == [
+        "feature_0",
+        "feature_1",
+        "feature_2",
+        "feature_3",
+    ]
+
+
+def test_KDBNew_version(clf):
+    """Check KDBNew version."""
+    assert __version__ == clf.version()
+
+
+def test_KDBNew_nodes_edges(clf, data):
+    assert clf.nodes_edges() == (0, 0)
+    clf.fit(*data)
+    assert clf.nodes_leaves() == (5, 10)
+
+
+def test_KDBNew_states(clf, data):
+    assert clf.states_ == 0
+    clf.fit(*data)
+    assert clf.states_ == 23
+    assert clf.depth_ == clf.states_
+
+
+def test_KDBNew_classifier(data, clf):
+    clf.fit(*data)
+    attribs = ["classes_", "X_", "y_", "feature_names_in_", "class_name_"]
+    for attr in attribs:
+        assert hasattr(clf, attr)
+    X = data[0]
+    y = data[1]
+    y_pred = clf.predict(X)
+    assert y_pred.shape == (X.shape[0],)
+    assert sum(y == y_pred) == 148
+
+
+@image_comparison(
+    baseline_images=["line_dashes_KDBNew"],
+    remove_text=True,
+    extensions=["png"],
+)
+def test_KDBNew_plot(data, clf):
+    # mpl_test_settings will automatically clean these internal side effects
+    mpl_test_settings
+    dataset = load_iris(as_frame=True)
+    clf.fit(*data, features=dataset["feature_names"])
+    clf.plot("KDBNew Iris")
+
+
+def test_KDBNew_wrong_num_features(data, clf):
+    with pytest.raises(
+        ValueError,
+        match="Number of features does not match the number of columns in X",
+    ):
+        clf.fit(*data, features=["feature_1", "feature_2"])
+
+
+def test_KDBNew_wrong_hyperparam(data, clf):
+    with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
+        clf.fit(*data, wrong_param="wrong_param")
+
+
+def test_KDBNew_error_size_predict(data, clf):
+    X, y = data
+    clf.fit(X, y)
+    with pytest.raises(ValueError):
+        X_diff_size = np.ones((10, X.shape[1] + 1))
+        clf.predict(X_diff_size)
+
+
+def test_KDBNew_dont_do_cycles():
+    clf = KDBNew(k=4)
+    dag = BayesianNetwork()
+    clf.feature_names_in_ = [
+        "feature_0",
+        "feature_1",
+        "feature_2",
+        "feature_3",
+    ]
+    nodes = list(range(4))
+    weights = np.ones((4, 4))
+    for idx in range(1, 4):
+        dag.add_edge(clf.feature_names_in_[0], clf.feature_names_in_[idx])
+    dag.add_edge(clf.feature_names_in_[1], clf.feature_names_in_[2])
+    dag.add_edge(clf.feature_names_in_[1], clf.feature_names_in_[3])
+    dag.add_edge(clf.feature_names_in_[2], clf.feature_names_in_[3])
+    for idx in range(4):
+        clf._add_m_edges(dag, idx, nodes, weights)
+        assert len(dag.edges()) == 6
--- a/bayesclass/tests/test_TAN.py
+++ b/bayesclass/tests/test_TAN.py
@@ -19,16 +19,16 @@ def data():

@pytest.fixture
 def clf():
-    return TAN()
+    return TAN(random_state=17)


 def test_TAN_default_hyperparameters(data, clf):
    # Test default values of hyperparameters
    assert not clf.show_progress
-    assert clf.random_state is None
-    clf = TAN(show_progress=True, random_state=17)
-    assert clf.show_progress
    assert clf.random_state == 17
+    clf = TAN(show_progress=True)
+    assert clf.show_progress
+    assert clf.random_state is None
    clf.fit(*data)
    assert clf.head_ == 0
    assert clf.class_name_ == "class"
@@ -47,21 +47,18 @@ def test_TAN_version(clf):

 def test_TAN_nodes_edges(clf, data):
    assert clf.nodes_edges() == (0, 0)
-    clf = TAN(random_state=17)
    clf.fit(*data, head="random")
    assert clf.nodes_leaves() == (5, 7)


 def test_TAN_states(clf, data):
    assert clf.states_ == 0
-    clf = TAN(random_state=17)
    clf.fit(*data)
    assert clf.states_ == 23
    assert clf.depth_ == clf.states_


-def test_TAN_random_head(data):
-    clf = TAN(random_state=17)
+def test_TAN_random_head(clf, data):
    clf.fit(*data, head="random")
    assert clf.head_ == 3

--- a/bayesclass/tests/test_TANNew.py
+++ b/bayesclass/tests/test_TANNew.py
@@ -0,0 +1,121 @@
+import pytest
+import numpy as np
+from sklearn.datasets import load_iris
+from sklearn.preprocessing import KBinsDiscretizer
+from matplotlib.testing.decorators import image_comparison
+from matplotlib.testing.conftest import mpl_test_settings
+
+
+from bayesclass.clfs import TANNew
+from .._version import __version__
+
+
+@pytest.fixture
+def data():
+    X, y = load_iris(return_X_y=True)
+    enc = KBinsDiscretizer(encode="ordinal")
+    return enc.fit_transform(X), y
+
+
+@pytest.fixture
+def clf():
+    return TANNew(random_state=17)
+
+
+def test_TANNew_default_hyperparameters(data, clf):
+    # Test default values of hyperparameters
+    assert not clf.show_progress
+    assert clf.random_state == 17
+    clf = TANNew(show_progress=True)
+    assert clf.show_progress
+    assert clf.random_state is None
+    clf.fit(*data)
+    assert clf.head_ == 0
+    assert clf.class_name_ == "class"
+    assert clf.feature_names_in_ == [
+        "feature_0",
+        "feature_1",
+        "feature_2",
+        "feature_3",
+    ]
+
+
+def test_TANNew_version(clf):
+    """Check TANNew version."""
+    assert __version__ == clf.version()
+
+
+def test_TANNew_nodes_edges(clf, data):
+    assert clf.nodes_edges() == (0, 0)
+    clf.fit(*data, head="random")
+    assert clf.nodes_leaves() == (5, 7)
+
+
+def test_TANNew_states(clf, data):
+    assert clf.states_ == 0
+    clf.fit(*data)
+    assert clf.states_ == 22
+    assert clf.depth_ == clf.states_
+
+
+def test_TANNew_random_head(clf, data):
+    clf.fit(*data, head="random")
+    assert clf.head_ == 3
+
+
+def test_TANNew_classifier(data, clf):
+    clf.fit(*data)
+    attribs = [
+        "classes_",
+        "X_",
+        "y_",
+        "head_",
+        "feature_names_in_",
+        "class_name_",
+    ]
+    for attr in attribs:
+        assert hasattr(clf, attr)
+    X = data[0]
+    y = data[1]
+    y_pred = clf.predict(X)
+    assert y_pred.shape == (X.shape[0],)
+    assert sum(y == y_pred) == 145
+
+
+@image_comparison(
+    baseline_images=["line_dashes_TANNew"],
+    remove_text=True,
+    extensions=["png"],
+)
+def test_TANNew_plot(data, clf):
+    # mpl_test_settings will automatically clean these internal side effects
+    mpl_test_settings
+    dataset = load_iris(as_frame=True)
+    clf.fit(*data, features=dataset["feature_names"], head=0)
+    clf.plot("TANNew Iris head=0")
+
+
+def test_TANNew_wrong_num_features(data, clf):
+    with pytest.raises(
+        ValueError,
+        match="Number of features does not match the number of columns in X",
+    ):
+        clf.fit(*data, features=["feature_1", "feature_2"])
+
+
+def test_TANNew_wrong_hyperparam(data, clf):
+    with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
+        clf.fit(*data, wrong_param="wrong_param")
+
+
+def test_TANNew_head_out_of_range(data, clf):
+    with pytest.raises(ValueError, match="Head index out of range"):
+        clf.fit(*data, head=4)
+
+
+def test_TANNew_error_size_predict(data, clf):
+    X, y = data
+    clf.fit(X, y)
+    with pytest.raises(ValueError):
+        X_diff_size = np.ones((10, X.shape[1] + 1))
+        clf.predict(X_diff_size)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,7 @@ dependencies = [
  "pgmpy",
  "networkx",
  "matplotlib",
+  "fimdlp",
 ]
 requires-python = ">=3.8"
 classifiers = [