Use ancestral order to process local discretization

Fix local discretization
Refactor tests
Unify iris dataset from sklearn with iris.arff
2023-04-20 01:20:33 +02:00
parent 74cd8a6aa2
commit f9b35f61f0
12 changed files with 186 additions and 169 deletions
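
Note: the ordering change below relies on a topological ("ancestral") sort of the model DAG, so every feature is re-discretized only after its parents. A minimal sketch of the idea with networkx (toy node names, not from this commit):

    import networkx as nx

    # Toy DAG: class -> f0, class -> f1, f0 -> f1, f1 -> f2
    dag = nx.DiGraph(
        [("class", "f0"), ("class", "f1"), ("f0", "f1"), ("f1", "f2")]
    )
    # Parents always come before their children in a topological sort,
    # which is the property the new _local_discretization() needs.
    ancestral_order = list(nx.topological_sort(dag))
    print(ancestral_order)  # one valid order: ['class', 'f0', 'f1', 'f2']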

View File

@@ -47,6 +47,12 @@ class BayesBase(BaseEstimator, ClassifierMixin):
     def default_class_name():
         return "class"
 
+    def build_dataset(self):
+        self.dataset_ = pd.DataFrame(
+            self.X_, columns=self.feature_names_in_, dtype=np.int32
+        )
+        self.dataset_[self.class_name_] = self.y_
+
     def _check_params_fit(self, X, y, expected_args, kwargs):
         """Check the common parameters passed to fit"""
         # Check that X and y have correct shape
@@ -64,6 +70,10 @@ class BayesBase(BaseEstimator, ClassifierMixin):
             else:
                 raise ValueError(f"Unexpected argument: {key}")
         self.feature_names_in_ = self.features_
+        # used for local discretization
+        self.indexed_features_ = {
+            feature: i for i, feature in enumerate(self.features_)
+        }
         if self.random_state is not None:
             random.seed(self.random_state)
         if len(self.feature_names_in_) != X.shape[1]:
@@ -125,10 +135,7 @@ class BayesBase(BaseEstimator, ClassifierMixin):
         # Store the information needed to build the model
         self.X_ = X_
         self.y_ = y_
-        self.dataset_ = pd.DataFrame(
-            self.X_, columns=self.feature_names_in_, dtype=np.int32
-        )
-        self.dataset_[self.class_name_] = self.y_
+        self.build_dataset()
         # Build the DAG
         self._build()
         # Train the model
@@ -660,14 +667,8 @@ class Proposal(BaseEstimator):
         # Build the model
         super(self.class_type, self.estimator).fit(self.Xd, y, **kwargs)
         # Local discretization based on the model
-        features = kwargs["features"]
-        # assign indices to feature names
-        self.idx_features_ = dict(list(zip(features, range(len(features)))))
-        upgraded, self.Xd = self._local_discretization()
+        self._local_discretization()
         # self.check_integrity("fit", self.Xd)
-        if upgraded:
-            kwargs = self.update_kwargs(y, kwargs)
-            super(self.class_type, self.estimator).fit(self.Xd, y, **kwargs)
         self.fitted_ = True
         return self
@@ -705,27 +706,45 @@ class Proposal(BaseEstimator):
     def _local_discretization(self):
         """Discretize each feature with its fathers and the class"""
-        res = self.Xd.copy()
-        upgraded = False
-        # print("-" * 80)
-        for idx, feature in enumerate(self.estimator.feature_names_in_):
+        upgrade = False
+        # order of local discretization is important. no good 0, 1, 2...
+        ancestral_order = list(nx.topological_sort(self.estimator.dag_))
+        for feature in ancestral_order:
+            if feature == self.estimator.class_name_:
+                continue
+            idx = self.estimator.indexed_features_[feature]
             fathers = self.estimator.dag_.get_parents(feature)
             if len(fathers) > 1:
-                # print(
-                #     "Discretizing " + feature + " with " + str(fathers),
-                #     end=" ",
-                # )
                 # First remove the class name as it will be added later
                 fathers.remove(self.estimator.class_name_)
                 # Get the fathers indices
-                features = [self.idx_features_[f] for f in fathers]
+                features = [
+                    self.estimator.indexed_features_[f] for f in fathers
+                ]
                 # Update the discretization of the feature
-                res[:, idx] = self.discretizer_.join_fit(
-                    target=idx, features=features, data=self.Xd
+                self.Xd[:, idx] = self.discretizer_.join_fit(
+                    # each feature has to use previous discretization data=res
+                    target=idx,
+                    features=features,
+                    data=self.Xd,
                 )
-                # print(self.discretizer.y_join[:5])
-                upgraded = True
-        return upgraded, res
+                upgrade = True
+        if upgrade:
+            # Update the dataset
+            self.estimator.X_ = self.Xd
+            self.estimator.build_dataset()
+            self.state_names_ = {
+                key: self.discretizer_.get_states_feature(value)
+                for key, value in self.estimator.indexed_features_.items()
+            }
+            states = {"state_names": self.state_names_}
+            # Update the model
+            self.estimator.model_.fit(
+                self.estimator.dataset_,
+                estimator=BayesianEstimator,
+                prior_type="K2",
+                **states,
+            )
 
     # def check_integrity(self, source, X):
    #     # print(f"Checking integrity of {source} data")
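
Note: after a joint re-discretization some feature states may be missing from the training sample, so the refit above passes explicit state_names to pgmpy. A self-contained sketch of that refit pattern (toy network, data, and state lists assumed here; bayesclass builds them from its discretizer):

    import pandas as pd
    from pgmpy.estimators import BayesianEstimator
    from pgmpy.models import BayesianNetwork

    # Toy model and data, illustrative only
    model = BayesianNetwork([("class", "f0"), ("class", "f1"), ("f0", "f1")])
    data = pd.DataFrame(
        {"f0": [0, 1, 0, 1], "f1": [0, 0, 1, 2], "class": [0, 0, 1, 1]}
    )
    # State 3 of f1 never occurs in this sample; declaring it keeps the
    # CPD cardinalities consistent with the discretizer's states.
    state_names = {"f0": [0, 1], "f1": [0, 1, 2, 3], "class": [0, 1]}
    model.fit(
        data,
        estimator=BayesianEstimator,
        prior_type="K2",
        state_names=state_names,
    )
    print(model.get_cpds("f1"))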

bayesclass/test.py Normal file
View File

@@ -0,0 +1,19 @@
+from bayesclass.clfs import AODENew, TANNew, KDBNew, AODE
+from benchmark.datasets import Datasets
+import os
+
+os.chdir("../discretizbench")
+dt = Datasets()
+clfan = AODENew()
+clftn = TANNew()
+clfkn = KDBNew()
+# clfa = AODE()
+
+X, y = dt.load("iris")
+# clfa.fit(X, y)
+clfan.fit(X, y)
+clftn.fit(X, y)
+clfkn.fit(X, y)
+
+self.discretizer_.target_
+self.estimator.indexed_features_

Binary file not shown (image: 50 KiB before, 49 KiB after)

Binary file not shown (image: 50 KiB before, 49 KiB after)

Binary file not shown (image: 41 KiB before, 44 KiB after)

View File

@@ -0,0 +1,38 @@
+import pytest
+from sklearn.datasets import load_iris
+from fimdlp.mdlp import FImdlp
+
+
+@pytest.fixture
+def iris():
+    dataset = load_iris()
+    X = dataset["data"]
+    y = dataset["target"]
+    features = dataset["feature_names"]
+    # To make the iris dataset have the same values as our iris.arff dataset
+    patch = {(34, 3): (0.2, 0.1), (37, 1): (3.6, 3.1), (37, 2): (1.4, 1.5)}
+    for key, value in patch.items():
+        X[key] = value[1]
+    return X, y, features
+
+
+@pytest.fixture
+def data(iris):
+    return iris[0], iris[1]
+
+
+@pytest.fixture
+def features(iris):
+    return iris[2]
+
+
+@pytest.fixture
+def class_name():
+    return "class"
+
+
+@pytest.fixture
+def data_disc(data):
+    clf = FImdlp()
+    X, y = data
+    return clf.fit_transform(X, y), y
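
Note: sklearn's load_iris fixes two samples according to Fisher's paper, while iris.arff follows the UCI file, so rows 34 and 37 (0-indexed) differ in exactly the three cells the fixture patches. A quick sanity check using only sklearn (patch values copied from the fixture above):

    from sklearn.datasets import load_iris

    X = load_iris()["data"]
    patch = {(34, 3): (0.2, 0.1), (37, 1): (3.6, 3.1), (37, 2): (1.4, 1.5)}
    for cell, (sklearn_value, arff_value) in patch.items():
        assert X[cell] == sklearn_value  # value shipped with sklearn
        X[cell] = arff_value  # value found in iris.arff / UCI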

View File

@@ -1,6 +1,5 @@
 import pytest
 import numpy as np
-from sklearn.datasets import load_iris
 from sklearn.preprocessing import KBinsDiscretizer
 from matplotlib.testing.decorators import image_comparison
 from matplotlib.testing.conftest import mpl_test_settings
@@ -10,26 +9,19 @@ from bayesclass.clfs import AODE
 from .._version import __version__
 
 
-@pytest.fixture
-def data():
-    X, y = load_iris(return_X_y=True)
-    enc = KBinsDiscretizer(encode="ordinal")
-    return enc.fit_transform(X), y
-
-
 @pytest.fixture
 def clf():
     return AODE(random_state=17)
 
 
-def test_AODE_default_hyperparameters(data, clf):
+def test_AODE_default_hyperparameters(data_disc, clf):
     # Test default values of hyperparameters
     assert not clf.show_progress
     assert clf.random_state == 17
     clf = AODE(show_progress=True)
     assert clf.show_progress
     assert clf.random_state is None
-    clf.fit(*data)
+    clf.fit(*data_disc)
     assert clf.class_name_ == "class"
     assert clf.feature_names_in_ == [
         "feature_0",
@@ -42,37 +34,35 @@ def test_AODE_default_hyperparameters(data, clf):
 @image_comparison(
     baseline_images=["line_dashes_AODE"], remove_text=True, extensions=["png"]
 )
-def test_AODE_plot(data, clf):
+def test_AODE_plot(data_disc, features, clf):
     # mpl_test_settings will automatically clean these internal side effects
     mpl_test_settings
-    dataset = load_iris(as_frame=True)
-    clf.fit(*data, features=dataset["feature_names"])
+    clf.fit(*data_disc, features=features)
     clf.plot("AODE Iris")
 
 
-def test_AODE_version(clf, data):
+def test_AODE_version(clf, features, data_disc):
     """Check AODE version."""
     assert __version__ == clf.version()
-    dataset = load_iris(as_frame=True)
-    clf.fit(*data, features=dataset["feature_names"])
+    clf.fit(*data_disc, features=features)
     assert __version__ == clf.version()
 
 
-def test_AODE_nodes_edges(clf, data):
+def test_AODE_nodes_edges(clf, data_disc):
     assert clf.nodes_edges() == (0, 0)
-    clf.fit(*data)
+    clf.fit(*data_disc)
     assert clf.nodes_leaves() == (20, 28)
 
 
-def test_AODE_states(clf, data):
+def test_AODE_states(clf, data_disc):
     assert clf.states_ == 0
-    clf.fit(*data)
-    assert clf.states_ == 23
+    clf.fit(*data_disc)
+    assert clf.states_ == 19
     assert clf.depth_ == clf.states_
 
 
-def test_AODE_classifier(data, clf):
-    clf.fit(*data)
+def test_AODE_classifier(data_disc, clf):
+    clf.fit(*data_disc)
     attribs = [
         "feature_names_in_",
         "class_name_",
@@ -82,28 +72,28 @@ def test_AODE_classifier(data, clf):
     ]
     for attr in attribs:
         assert hasattr(clf, attr)
-    X = data[0]
-    y = data[1]
+    X = data_disc[0]
+    y = data_disc[1]
     y_pred = clf.predict(X)
     assert y_pred.shape == (X.shape[0],)
-    assert sum(y == y_pred) == 147
+    assert sum(y == y_pred) == 146
 
 
-def test_AODE_wrong_num_features(data, clf):
+def test_AODE_wrong_num_features(data_disc, clf):
     with pytest.raises(
         ValueError,
         match="Number of features does not match the number of columns in X",
     ):
-        clf.fit(*data, features=["feature_1", "feature_2"])
+        clf.fit(*data_disc, features=["feature_1", "feature_2"])
 
 
-def test_AODE_wrong_hyperparam(data, clf):
+def test_AODE_wrong_hyperparam(data_disc, clf):
     with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
-        clf.fit(*data, wrong_param="wrong_param")
+        clf.fit(*data_disc, wrong_param="wrong_param")
 
 
-def test_AODE_error_size_predict(data, clf):
-    X, y = data
+def test_AODE_error_size_predict(data_disc, clf):
+    X, y = data_disc
     clf.fit(X, y)
     with pytest.raises(ValueError):
         X_diff_size = np.ones((10, X.shape[1] + 1))

View File

@@ -1,7 +1,5 @@
 import pytest
 import numpy as np
-from sklearn.datasets import load_iris
-from sklearn.preprocessing import KBinsDiscretizer
 from matplotlib.testing.decorators import image_comparison
 from matplotlib.testing.conftest import mpl_test_settings
 
@@ -10,13 +8,6 @@ from bayesclass.clfs import AODENew
 from .._version import __version__
 
 
-@pytest.fixture
-def data():
-    X, y = load_iris(return_X_y=True)
-    enc = KBinsDiscretizer(encode="ordinal")
-    return enc.fit_transform(X), y
-
-
 @pytest.fixture
 def clf():
     return AODENew(random_state=17)
@@ -44,19 +35,17 @@ def test_AODENew_default_hyperparameters(data, clf):
     remove_text=True,
     extensions=["png"],
 )
-def test_AODENew_plot(data, clf):
+def test_AODENew_plot(data, features, clf):
     # mpl_test_settings will automatically clean these internal side effects
     mpl_test_settings
-    dataset = load_iris(as_frame=True)
-    clf.fit(*data, features=dataset["feature_names"])
+    clf.fit(*data, features=features)
     clf.plot("AODE Iris")
 
 
 def test_AODENew_version(clf, data):
     """Check AODENew version."""
     assert __version__ == clf.version()
-    dataset = load_iris(as_frame=True)
-    clf.fit(*data, features=dataset["feature_names"])
+    clf.fit(*data)
     assert __version__ == clf.version()
@@ -69,7 +58,7 @@ def test_AODENew_nodes_edges(clf, data):
 def test_AODENew_states(clf, data):
     assert clf.states_ == 0
     clf.fit(*data)
-    assert clf.states_ == 22.75
+    assert clf.states_ == 17.75
     assert clf.depth_ == clf.states_
@@ -88,17 +77,17 @@ def test_AODENew_classifier(data, clf):
     y = data[1]
     y_pred = clf.predict(X)
     assert y_pred.shape == (X.shape[0],)
-    assert sum(y == y_pred) == 147
+    assert sum(y == y_pred) == 146
 
 
-def test_AODENew_local_discretization(clf, data):
+def test_AODENew_local_discretization(clf, data_disc):
     expected_data = [
         [-1, [0, -1], [0, -1], [0, -1]],
         [[1, -1], -1, [1, -1], [1, -1]],
         [[2, -1], [2, -1], -1, [2, -1]],
         [[3, -1], [3, -1], [3, -1], -1],
     ]
-    clf.fit(*data)
+    clf.fit(*data_disc)
     for idx, estimator in enumerate(clf.estimators_):
         expected = expected_data[idx]
         for feature in range(4):

View File

@@ -1,6 +1,5 @@
 import pytest
 import numpy as np
-from sklearn.datasets import load_iris
 from sklearn.preprocessing import KBinsDiscretizer
 from matplotlib.testing.decorators import image_comparison
 from matplotlib.testing.conftest import mpl_test_settings
@@ -11,19 +10,12 @@ from bayesclass.clfs import KDB
 from .._version import __version__
 
 
-@pytest.fixture
-def data():
-    X, y = load_iris(return_X_y=True)
-    enc = KBinsDiscretizer(encode="ordinal")
-    return enc.fit_transform(X), y
-
-
 @pytest.fixture
 def clf():
     return KDB(k=3)
 
 
-def test_KDB_default_hyperparameters(data, clf):
+def test_KDB_default_hyperparameters(data_disc, clf):
     # Test default values of hyperparameters
     assert not clf.show_progress
     assert clf.random_state is None
@@ -32,7 +24,7 @@ def test_KDB_default_hyperparameters(data, clf):
     assert clf.show_progress
     assert clf.random_state == 17
     assert clf.k == 3
-    clf.fit(*data)
+    clf.fit(*data_disc)
     assert clf.class_name_ == "class"
     assert clf.feature_names_in_ == [
         "feature_0",
@@ -47,57 +39,56 @@ def test_KDB_version(clf):
     assert __version__ == clf.version()
 
 
-def test_KDB_nodes_edges(clf, data):
+def test_KDB_nodes_edges(clf, data_disc):
     assert clf.nodes_edges() == (0, 0)
-    clf.fit(*data)
-    assert clf.nodes_leaves() == (5, 10)
+    clf.fit(*data_disc)
+    assert clf.nodes_leaves() == (5, 9)
 
 
-def test_KDB_states(clf, data):
+def test_KDB_states(clf, data_disc):
     assert clf.states_ == 0
-    clf.fit(*data)
-    assert clf.states_ == 23
+    clf.fit(*data_disc)
+    assert clf.states_ == 19
     assert clf.depth_ == clf.states_
 
 
-def test_KDB_classifier(data, clf):
-    clf.fit(*data)
+def test_KDB_classifier(data_disc, clf):
+    clf.fit(*data_disc)
     attribs = ["classes_", "X_", "y_", "feature_names_in_", "class_name_"]
     for attr in attribs:
         assert hasattr(clf, attr)
-    X = data[0]
-    y = data[1]
+    X = data_disc[0]
+    y = data_disc[1]
     y_pred = clf.predict(X)
     assert y_pred.shape == (X.shape[0],)
-    assert sum(y == y_pred) == 148
+    assert sum(y == y_pred) == 146
 
 
 @image_comparison(
     baseline_images=["line_dashes_KDB"], remove_text=True, extensions=["png"]
 )
-def test_KDB_plot(data, clf):
+def test_KDB_plot(data_disc, features, clf):
     # mpl_test_settings will automatically clean these internal side effects
     mpl_test_settings
-    dataset = load_iris(as_frame=True)
-    clf.fit(*data, features=dataset["feature_names"])
+    clf.fit(*data_disc, features=features)
     clf.plot("KDB Iris")
 
 
-def test_KDB_wrong_num_features(data, clf):
+def test_KDB_wrong_num_features(data_disc, clf):
     with pytest.raises(
         ValueError,
         match="Number of features does not match the number of columns in X",
     ):
-        clf.fit(*data, features=["feature_1", "feature_2"])
+        clf.fit(*data_disc, features=["feature_1", "feature_2"])
 
 
-def test_KDB_wrong_hyperparam(data, clf):
+def test_KDB_wrong_hyperparam(data_disc, clf):
     with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
-        clf.fit(*data, wrong_param="wrong_param")
+        clf.fit(*data_disc, wrong_param="wrong_param")
 
 
-def test_KDB_error_size_predict(data, clf):
-    X, y = data
+def test_KDB_error_size_predict(data_disc, clf):
+    X, y = data_disc
     clf.fit(X, y)
     with pytest.raises(ValueError):
         X_diff_size = np.ones((10, X.shape[1] + 1))

View File

@@ -1,7 +1,5 @@
 import pytest
 import numpy as np
-from sklearn.datasets import load_iris
-from sklearn.preprocessing import KBinsDiscretizer
 from matplotlib.testing.decorators import image_comparison
 from matplotlib.testing.conftest import mpl_test_settings
 from pgmpy.models import BayesianNetwork
@@ -11,13 +9,6 @@ from bayesclass.clfs import KDBNew
 from .._version import __version__
 
 
-@pytest.fixture
-def data():
-    X, y = load_iris(return_X_y=True)
-    enc = KBinsDiscretizer(encode="ordinal")
-    return enc.fit_transform(X), y
-
-
 @pytest.fixture
 def clf():
     return KDBNew(k=3)
@@ -50,13 +41,13 @@ def test_KDBNew_version(clf):
 def test_KDBNew_nodes_edges(clf, data):
     assert clf.nodes_edges() == (0, 0)
     clf.fit(*data)
-    assert clf.nodes_leaves() == (5, 10)
+    assert clf.nodes_leaves() == (5, 9)
 
 
 def test_KDBNew_states(clf, data):
     assert clf.states_ == 0
     clf.fit(*data)
-    assert clf.states_ == 23
+    assert clf.states_ == 22
     assert clf.depth_ == clf.states_
@@ -69,14 +60,15 @@ def test_KDBNew_classifier(data, clf):
     y = data[1]
     y_pred = clf.predict(X)
     assert y_pred.shape == (X.shape[0],)
-    assert sum(y == y_pred) == 148
+    assert sum(y == y_pred) == 145
 
 
 def test_KDBNew_local_discretization(clf, data):
-    expected = [[1, -1], -1, [0, 1, 3, -1], [1, 0, -1]]
+    expected = [[1, -1], -1, [0, 1, 3, -1], [1, -1]]
     clf.fit(*data)
     for feature in range(4):
         computed = clf.estimator_.discretizer_.target_[feature]
+        print("computed:", computed)
         if type(computed) == list:
             for j, k in zip(expected[feature], computed):
                 assert j == k
@@ -92,11 +84,10 @@ def test_KDBNew_local_discretization(clf, data):
     remove_text=True,
     extensions=["png"],
 )
-def test_KDBNew_plot(data, clf):
+def test_KDBNew_plot(data, features, class_name, clf):
     # mpl_test_settings will automatically clean these internal side effects
     mpl_test_settings
-    dataset = load_iris(as_frame=True)
-    clf.fit(*data, features=dataset["feature_names"])
+    clf.fit(*data, features=features, class_name=class_name)
     clf.plot("KDBNew Iris")

View File

@@ -1,7 +1,5 @@
 import pytest
 import numpy as np
-from sklearn.datasets import load_iris
-from sklearn.preprocessing import KBinsDiscretizer
 from matplotlib.testing.decorators import image_comparison
 from matplotlib.testing.conftest import mpl_test_settings
 
@@ -10,26 +8,19 @@ from bayesclass.clfs import TAN
 from .._version import __version__
 
 
-@pytest.fixture
-def data():
-    X, y = load_iris(return_X_y=True)
-    enc = KBinsDiscretizer(encode="ordinal")
-    return enc.fit_transform(X), y
-
-
 @pytest.fixture
 def clf():
     return TAN(random_state=17)
 
 
-def test_TAN_default_hyperparameters(data, clf):
+def test_TAN_default_hyperparameters(data_disc, clf):
     # Test default values of hyperparameters
     assert not clf.show_progress
     assert clf.random_state == 17
     clf = TAN(show_progress=True)
     assert clf.show_progress
     assert clf.random_state is None
-    clf.fit(*data)
+    clf.fit(*data_disc)
     assert clf.head_ == 0
     assert clf.class_name_ == "class"
     assert clf.feature_names_in_ == [
@@ -45,26 +36,26 @@ def test_TAN_version(clf):
     assert __version__ == clf.version()
 
 
-def test_TAN_nodes_edges(clf, data):
+def test_TAN_nodes_edges(clf, data_disc):
     assert clf.nodes_edges() == (0, 0)
-    clf.fit(*data, head="random")
+    clf.fit(*data_disc, head="random")
     assert clf.nodes_leaves() == (5, 7)
 
 
-def test_TAN_states(clf, data):
+def test_TAN_states(clf, data_disc):
     assert clf.states_ == 0
-    clf.fit(*data)
-    assert clf.states_ == 23
+    clf.fit(*data_disc)
+    assert clf.states_ == 19
     assert clf.depth_ == clf.states_
 
 
-def test_TAN_random_head(clf, data):
-    clf.fit(*data, head="random")
+def test_TAN_random_head(clf, data_disc):
+    clf.fit(*data_disc, head="random")
     assert clf.head_ == 3
 
 
-def test_TAN_classifier(data, clf):
-    clf.fit(*data)
+def test_TAN_classifier(data_disc, clf):
+    clf.fit(*data_disc)
     attribs = [
         "classes_",
         "X_",
@@ -75,44 +66,43 @@ def test_TAN_classifier(data, clf):
     ]
     for attr in attribs:
         assert hasattr(clf, attr)
-    X = data[0]
-    y = data[1]
+    X = data_disc[0]
+    y = data_disc[1]
     y_pred = clf.predict(X)
     assert y_pred.shape == (X.shape[0],)
-    assert sum(y == y_pred) == 147
+    assert sum(y == y_pred) == 146
 
 
 @image_comparison(
     baseline_images=["line_dashes_TAN"], remove_text=True, extensions=["png"]
 )
-def test_TAN_plot(data, clf):
+def test_TAN_plot(data_disc, features, clf):
     # mpl_test_settings will automatically clean these internal side effects
     mpl_test_settings
-    dataset = load_iris(as_frame=True)
-    clf.fit(*data, features=dataset["feature_names"], head=0)
+    clf.fit(*data_disc, features=features, head=0)
     clf.plot("TAN Iris head=0")
 
 
-def test_TAN_wrong_num_features(data, clf):
+def test_TAN_wrong_num_features(data_disc, clf):
     with pytest.raises(
         ValueError,
         match="Number of features does not match the number of columns in X",
     ):
-        clf.fit(*data, features=["feature_1", "feature_2"])
+        clf.fit(*data_disc, features=["feature_1", "feature_2"])
 
 
-def test_TAN_wrong_hyperparam(data, clf):
+def test_TAN_wrong_hyperparam(data_disc, clf):
     with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
-        clf.fit(*data, wrong_param="wrong_param")
+        clf.fit(*data_disc, wrong_param="wrong_param")
 
 
-def test_TAN_head_out_of_range(data, clf):
+def test_TAN_head_out_of_range(data_disc, clf):
     with pytest.raises(ValueError, match="Head index out of range"):
-        clf.fit(*data, head=4)
+        clf.fit(*data_disc, head=4)
 
 
-def test_TAN_error_size_predict(data, clf):
-    X, y = data
+def test_TAN_error_size_predict(data_disc, clf):
    X, y = data_disc
     clf.fit(X, y)
     with pytest.raises(ValueError):
         X_diff_size = np.ones((10, X.shape[1] + 1))

View File

@@ -1,7 +1,5 @@
 import pytest
 import numpy as np
-from sklearn.datasets import load_iris
-from sklearn.preprocessing import KBinsDiscretizer
 from matplotlib.testing.decorators import image_comparison
 from matplotlib.testing.conftest import mpl_test_settings
 
@@ -10,13 +8,6 @@ from bayesclass.clfs import TANNew
 from .._version import __version__
 
 
-@pytest.fixture
-def data():
-    X, y = load_iris(return_X_y=True)
-    enc = KBinsDiscretizer(encode="ordinal")
-    return enc.fit_transform(X), y
-
-
 @pytest.fixture
 def clf():
     return TANNew(random_state=17)
@@ -54,7 +45,7 @@ def test_TANNew_nodes_edges(clf, data):
 def test_TANNew_states(clf, data):
     assert clf.states_ == 0
     clf.fit(*data)
-    assert clf.states_ == 22
+    assert clf.states_ == 18
     assert clf.depth_ == clf.states_
@@ -88,7 +79,7 @@ def test_TANNew_classifier(data, clf):
     y = data[1]
     y_pred = clf.predict(X)
     assert y_pred.shape == (X.shape[0],)
-    assert sum(y == y_pred) == 145
+    assert sum(y == y_pred) == 146
 
 
 @image_comparison(
@@ -96,11 +87,10 @@ def test_TANNew_classifier(data, clf):
     remove_text=True,
     extensions=["png"],
 )
-def test_TANNew_plot(data, clf):
+def test_TANNew_plot(data, features, clf):
    # mpl_test_settings will automatically clean these internal side effects
     mpl_test_settings
-    dataset = load_iris(as_frame=True)
-    clf.fit(*data, features=dataset["feature_names"], head=0)
+    clf.fit(*data, features=features, head=0)
     clf.plot("TANNew Iris head=0")