diff --git a/bayesclass/clfs.py b/bayesclass/clfs.py index 95c82f7..a7a37ea 100644 --- a/bayesclass/clfs.py +++ b/bayesclass/clfs.py @@ -818,6 +818,7 @@ class BoostSPODE(BayesBase): def _train(self, kwargs): states = dict(state_names=kwargs.get("state_names", [])) + breakpoint() self.model_ = BayesianNetwork(self.dag_.edges(), show_progress=False) self.model_.fit( self.dataset_, @@ -834,9 +835,11 @@ class BoostAODE(ClassifierMixin, BaseEnsemble): show_progress=False, random_state=None, estimator=None, + n_estimators=10, ): self.show_progress = show_progress self.random_state = random_state + self.n_estimators = n_estimators super().__init__(estimator=estimator) def _validate_estimator(self) -> None: @@ -858,6 +861,7 @@ class BoostAODE(ClassifierMixin, BaseEnsemble): self.y_ = y self.n_samples_ = X.shape[0] self.estimators_ = [] + self._validate_estimator() self._train(kwargs) self.fitted_ = True # To keep compatiblity with the benchmark platform @@ -868,44 +872,37 @@ class BoostAODE(ClassifierMixin, BaseEnsemble): """Build boosted SPODEs""" weights = [1 / self.n_samples_] * self.n_samples_ # Step 0: Set the finish condition - pending_features = self.feature_names_in_.copy() - exit_condition = len(pending_features) == 0 - while not exit_condition: + for num in range(self.n_estimators): # Step 1: Build ranking with mutual information + # WARNING, WRONG: THIS DOES NOT UPDATE THE RANKING WITH THE WEIGHTS + # IT WILL ALWAYS RETURN THE SAME THING feature = ( - SelectKBest(k="all") + SelectKBest(k=1) .fit(self.X_, self.y_) .get_feature_names_out(self.feature_names_in_) .tolist()[0] ) # Step 2: Build & train spode with the first feature as sparent - self._validate_estimator() estimator = clone(self.estimator_) _args = kwargs.copy() _args["sparent"] = feature _args["sample_weight"] = weights _args["weighted"] = True - _args["X"] = self.X_ - _args["y"] = self.y_ # Step 2.1: build dataset # Step 2.2: Train the model - estimator.fit(**_args) + estimator.fit(self.X_, self.y_, **_args) # Step 3: 
Compute errors (epsilon sub m & alpha sub m) # Explanation in https://medium.datadriveninvestor.com/understanding-adaboost-and-scikit-learns-algorithm-c8d8af5ace10 y_pred = estimator.predict(self.X_) em = np.sum(weights * (y_pred != self.y_)) / np.sum(weights) - am = np.log((1 - em) / em) + np.log(self.n_classes_ - 1) + am = np.log((1 - em) / em) + np.log(estimator.n_classes_ - 1) # Step 3.2: Update weights for next classifier weights = [ wm * np.exp(am * (ym != y_pred)) for wm, ym in zip(weights, self.y_) ] - print(weights) # Step 4: Add the new model self.estimators_.append(estimator) - # Final step: Update the finish condition - pending_features.remove(feature) - exit_condition = len(pending_features) == 0 """ class_edges = [(self.class_name_, f) for f in self.feature_names_in_] feature_edges = [ diff --git a/bayesclass/tests/test_BoostAODE.py b/bayesclass/tests/test_BoostAODE.py new file mode 100644 index 0000000..c8fd602 --- /dev/null +++ b/bayesclass/tests/test_BoostAODE.py @@ -0,0 +1,100 @@ +import pytest +import numpy as np +from sklearn.preprocessing import KBinsDiscretizer +from matplotlib.testing.decorators import image_comparison +from matplotlib.testing.conftest import mpl_test_settings + + +from bayesclass.clfs import BoostAODE +from .._version import __version__ + + +@pytest.fixture +def clf(): + return BoostAODE(random_state=17) + + +def test_BoostAODE_default_hyperparameters(data_disc, clf): + # Test default values of hyperparameters + assert not clf.show_progress + assert clf.random_state == 17 + clf = BoostAODE(show_progress=True) + assert clf.show_progress + assert clf.random_state is None + clf.fit(*data_disc) + assert clf.class_name_ == "class" + assert clf.feature_names_in_ == [ + "feature_0", + "feature_1", + "feature_2", + "feature_3", + ] + + +# @image_comparison( +# baseline_images=["line_dashes_AODE"], remove_text=True, extensions=["png"] +# ) +# def test_BoostAODE_plot(data_disc, features, clf): +# # mpl_test_settings will automatically 
clean these internal side effects +# mpl_test_settings +# clf.fit(*data_disc, features=features) +# clf.plot("AODE Iris") + + +# def test_BoostAODE_version(clf, features, data_disc): +# """Check AODE version.""" +# assert __version__ == clf.version() +# clf.fit(*data_disc, features=features) +# assert __version__ == clf.version() + + +# def test_BoostAODE_nodes_edges(clf, data_disc): +# assert clf.nodes_edges() == (0, 0) +# clf.fit(*data_disc) +# assert clf.nodes_leaves() == (20, 28) + + +# def test_BoostAODE_states(clf, data_disc): +# assert clf.states_ == 0 +# clf.fit(*data_disc) +# assert clf.states_ == 19 +# assert clf.depth_ == clf.states_ + + +# def test_BoostAODE_classifier(data_disc, clf): +# clf.fit(*data_disc) +# attribs = [ +# "feature_names_in_", +# "class_name_", +# "n_features_in_", +# "X_", +# "y_", +# ] +# for attr in attribs: +# assert hasattr(clf, attr) +# X = data_disc[0] +# y = data_disc[1] +# y_pred = clf.predict(X) +# assert y_pred.shape == (X.shape[0],) +# assert sum(y == y_pred) == 146 + + +# def test_BoostAODE_wrong_num_features(data_disc, clf): +# with pytest.raises( +# ValueError, +# match="Number of features does not match the number of columns in X", +# ): +# clf.fit(*data_disc, features=["feature_1", "feature_2"]) + + +# def test_BoostAODE_wrong_hyperparam(data_disc, clf): +# with pytest.raises(ValueError, match="Unexpected argument: wrong_param"): +# clf.fit(*data_disc, wrong_param="wrong_param") + + +# def test_BoostAODE_error_size_predict(data_disc, clf): +# X, y = data_disc +# clf.fit(X, y) +# with pytest.raises(ValueError): +# X_diff_size = np.ones((10, X.shape[1] + 1)) +# clf.predict(X_diff_size) diff --git a/requirements.txt b/requirements.txt index 0806932..1384e1c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ numpy scipy +pandas scikit-learn matplotlib networkx