feat: Add feature_names_in_ to classifiers

This commit is contained in:
2023-01-27 19:25:01 +01:00
parent a4edc74e8d
commit e837c6cef7
5 changed files with 62 additions and 37 deletions

View File

@@ -19,14 +19,12 @@ class BayesBase(BaseEstimator, ClassifierMixin):
def __init__(self, random_state, show_progress):
self.random_state = random_state
self.show_progress = show_progress
# To keep compatiblity with the benchmark platform
self.nodes_leaves = self.nodes_edges
def _more_tags(self):
return {
"requires_positive_X": True,
"requires_positive_y": True,
"preserve_dtype": [np.int64, np.int32],
"preserve_dtype": [np.int32, np.int64],
"requires_y": True,
}
@@ -44,6 +42,7 @@ class BayesBase(BaseEstimator, ClassifierMixin):
"""Check the common parameters passed to fit"""
# Check that X and y have correct shape
X, y = check_X_y(X, y)
X = self._validate_data(X, reset=True)
# Store the classes seen during fit
self.classes_ = unique_labels(y)
self.n_classes_ = self.classes_.shape[0]
@@ -55,9 +54,10 @@ class BayesBase(BaseEstimator, ClassifierMixin):
setattr(self, f"{key}_", value)
else:
raise ValueError(f"Unexpected argument: {key}")
self.feature_names_in_ = self.features_
if self.random_state is not None:
random.seed(self.random_state)
if len(self.features_) != X.shape[1]:
if len(self.feature_names_in_) != X.shape[1]:
raise ValueError(
"Number of features does not match the number of columns in X"
)
@@ -116,13 +116,17 @@ class BayesBase(BaseEstimator, ClassifierMixin):
# Store the information needed to build the model
self.X_ = X_
self.y_ = y_
self.dataset_ = pd.DataFrame(self.X_, columns=self.features_)
self.dataset_ = pd.DataFrame(
self.X_, columns=self.feature_names_in_, dtype=np.int32
)
self.dataset_[self.class_name_] = self.y_
# Build the DAG
self._build()
# Train the model
self._train(kwargs)
self.fitted_ = True
# To keep compatiblity with the benchmark platform
self.nodes_leaves = self.nodes_edges
# Return the classifier
return self
@@ -189,7 +193,9 @@ class BayesBase(BaseEstimator, ClassifierMixin):
# Input validation
X = check_array(X)
dataset = pd.DataFrame(X, columns=self.features_, dtype="int16")
dataset = pd.DataFrame(
X, columns=self.feature_names_in_, dtype=np.int32
)
return self.model_.predict(dataset).values.ravel()
def plot(self, title="", node_size=800):
@@ -226,7 +232,7 @@ class TAN(BayesBase):
The classes seen at :meth:`fit`.
class_name_ : str
The name of the class column
features_ : list
feature_names_in_ : list
The list of features names
head_ : int
The index of the node used as head for the initial DAG
@@ -254,7 +260,7 @@ class TAN(BayesBase):
return X, y
def _build(self):
# est = TreeSearch(self.dataset_, root_node=self.features_[self.head_])
# est = TreeSearch(self.dataset_, root_node=self. feature_names_in_[self.head_])
# self.dag_ = est.estimate(
# estimator_type="tan",
# class_node=self.class_name_,
@@ -277,9 +283,8 @@ class TAN(BayesBase):
weights = np.delete(weights, class_node_idx, axis=1)
reduced_columns = np.delete(self.dataset_.columns, class_node_idx)
D = TreeSearch._create_tree_and_dag(
weights, reduced_columns, self.features_[self.head_]
weights, reduced_columns, self.feature_names_in_[self.head_]
)
# Step 4.3: Add edges from class_node to all other nodes.
D.add_edges_from(
[(self.class_name_, node) for node in reduced_columns]
@@ -309,7 +314,8 @@ class KDB(BayesBase):
if max_minfo in S_nodes and cond_w[idx, max_minfo] > self.theta:
try:
dag.add_edge(
self.features_[max_minfo], self.features_[idx]
self.feature_names_in_[max_minfo],
self.feature_names_in_[idx],
)
num += 1
except ValueError:
@@ -349,7 +355,7 @@ class KDB(BayesBase):
# 5. 5.1
for idx in np.argsort(mutual):
# 5.2
feature = self.features_[idx]
feature = self.feature_names_in_[idx]
dag.add_node(feature)
# 5.3
dag.add_edge(self.class_name_, feature)
@@ -396,13 +402,13 @@ class AODE(BayesBase, BaseEnsemble):
def _train(self, kwargs):
"""Build SPODE estimators (Super Parent One Dependent Estimator)"""
self.models_ = []
class_edges = [(self.class_name_, f) for f in self.features_]
class_edges = [(self.class_name_, f) for f in self.feature_names_in_]
states = dict(state_names=kwargs.pop("state_names", []))
for idx in range(self.n_features_in_):
feature_edges = [
(self.features_[idx], f)
for f in self.features_
if f != self.features_[idx]
(self.feature_names_in_[idx], f)
for f in self.feature_names_in_
if f != self.feature_names_in_[idx]
]
feature_edges.extend(class_edges)
model = BayesianNetwork(
@@ -425,11 +431,13 @@ class AODE(BayesBase, BaseEnsemble):
def predict(self, X: np.ndarray) -> np.ndarray:
check_is_fitted(self, ["X_", "y_", "fitted_"])
# Input validation
X = self._validate_data(X, reset=False)
X = check_array(X)
n_samples = X.shape[0]
n_estimators = len(self.models_)
result = np.empty((n_samples, n_estimators))
dataset = pd.DataFrame(X, columns=self.features_, dtype="int16")
dataset = pd.DataFrame(
X, columns=self.feature_names_in_, dtype=np.int32
)
for index, model in enumerate(self.models_):
result[:, index] = model.predict(dataset).values.ravel()
return mode(result, axis=1, keepdims=False).mode.ravel()

View File

@@ -31,7 +31,7 @@ def test_AODE_default_hyperparameters(data, clf):
assert clf.random_state == 17
clf.fit(*data)
assert clf.class_name_ == "class"
assert clf.features_ == [
assert clf.feature_names_in_ == [
"feature_0",
"feature_1",
"feature_2",
@@ -56,7 +56,7 @@ def test_AODE_version(clf):
def test_AODE_nodes_edges(clf, data):
assert clf.nodes_leaves() == (0, 0)
assert clf.nodes_edges() == (0, 0)
clf.fit(*data)
assert clf.nodes_leaves() == (20, 28)
@@ -71,7 +71,14 @@ def test_AODE_states(clf, data):
def test_AODE_classifier(data, clf):
clf.fit(*data)
attribs = ["classes_", "X_", "y_", "features_", "class_name_"]
attribs = [
"classes_",
"X_",
"y_",
"feature_names_in_",
"class_name_",
"n_features_in_",
]
for attr in attribs:
assert hasattr(clf, attr)
X = data[0]

View File

@@ -34,7 +34,7 @@ def test_KDB_default_hyperparameters(data, clf):
assert clf.k == 3
clf.fit(*data)
assert clf.class_name_ == "class"
assert clf.features_ == [
assert clf.feature_names_in_ == [
"feature_0",
"feature_1",
"feature_2",
@@ -48,7 +48,7 @@ def test_KDB_version(clf):
def test_KDB_nodes_edges(clf, data):
assert clf.nodes_leaves() == (0, 0)
assert clf.nodes_edges() == (0, 0)
clf.fit(*data)
assert clf.nodes_leaves() == (5, 10)
@@ -63,7 +63,7 @@ def test_KDB_states(clf, data):
def test_KDB_classifier(data, clf):
clf.fit(*data)
attribs = ["classes_", "X_", "y_", "features_", "class_name_"]
attribs = ["classes_", "X_", "y_", "feature_names_in_", "class_name_"]
for attr in attribs:
assert hasattr(clf, attr)
X = data[0]
@@ -108,14 +108,19 @@ def test_KDB_error_size_predict(data, clf):
def test_KDB_dont_do_cycles():
clf = KDB(k=4)
dag = BayesianNetwork()
clf.features_ = ["feature_0", "feature_1", "feature_2", "feature_3"]
clf.feature_names_in_ = [
"feature_0",
"feature_1",
"feature_2",
"feature_3",
]
nodes = list(range(4))
weights = np.ones((4, 4))
for idx in range(1, 4):
dag.add_edge(clf.features_[0], clf.features_[idx])
dag.add_edge(clf.features_[1], clf.features_[2])
dag.add_edge(clf.features_[1], clf.features_[3])
dag.add_edge(clf.features_[2], clf.features_[3])
dag.add_edge(clf.feature_names_in_[0], clf.feature_names_in_[idx])
dag.add_edge(clf.feature_names_in_[1], clf.feature_names_in_[2])
dag.add_edge(clf.feature_names_in_[1], clf.feature_names_in_[3])
dag.add_edge(clf.feature_names_in_[2], clf.feature_names_in_[3])
for idx in range(4):
clf._add_m_edges(dag, idx, nodes, weights)
assert len(dag.edges()) == 6

View File

@@ -32,7 +32,7 @@ def test_TAN_default_hyperparameters(data, clf):
clf.fit(*data)
assert clf.head_ == 0
assert clf.class_name_ == "class"
assert clf.features_ == [
assert clf.feature_names_in_ == [
"feature_0",
"feature_1",
"feature_2",
@@ -46,7 +46,7 @@ def test_TAN_version(clf):
def test_TAN_nodes_edges(clf, data):
assert clf.nodes_leaves() == (0, 0)
assert clf.nodes_edges() == (0, 0)
clf = TAN(random_state=17)
clf.fit(*data, head="random")
assert clf.nodes_leaves() == (5, 7)
@@ -68,7 +68,14 @@ def test_TAN_random_head(data):
def test_TAN_classifier(data, clf):
clf.fit(*data)
attribs = ["classes_", "X_", "y_", "head_", "features_", "class_name_"]
attribs = [
"classes_",
"X_",
"y_",
"head_",
"feature_names_in_",
"class_name_",
]
for attr in attribs:
assert hasattr(clf, attr)
X = data[0]
@@ -89,7 +96,7 @@ def test_TAN_plot(data, clf):
clf.plot("TAN Iris head=0")
def test_KDB_wrong_num_features(data, clf):
def test_TAN_wrong_num_features(data, clf):
with pytest.raises(
ValueError,
match="Number of features does not match the number of columns in X",

View File

@@ -5,10 +5,8 @@ from sklearn.utils.estimator_checks import check_estimator
from bayesclass.clfs import TAN, KDB, AODE
@pytest.mark.parametrize("estimators", [TAN(), KDB(k=2), AODE()])
# @pytest.mark.parametrize("estimators", [TAN()])
# @pytest.mark.parametrize("estimators", [TAN(), KDB(k=2), AODE()])
@pytest.mark.parametrize("estimators", [AODE()])
def test_all_estimators(estimators):
i = 0
for estimator, test in check_estimator(estimators, generate_only=True):