From c906d6a3612907d82f9ea97abdc0a03367b05429 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?=
Date: Thu, 15 Jun 2023 14:13:15 +0200
Subject: [PATCH] Add weights to KDB classifier

---
 bayesclass/clfs.py              | 16 ++++++++++++++--
 bayesclass/test.py              | 19 -------------------
 bayesclass/tests/test_KDB.py    |  7 +++++++
 bayesclass/tests/test_KDBNew.py |  1 -
 4 files changed, 21 insertions(+), 22 deletions(-)
 delete mode 100644 bayesclass/test.py

diff --git a/bayesclass/clfs.py b/bayesclass/clfs.py
index 5868027..db40900 100644
--- a/bayesclass/clfs.py
+++ b/bayesclass/clfs.py
@@ -52,6 +52,8 @@ class BayesBase(BaseEstimator, ClassifierMixin):
             self.X_, columns=self.feature_names_in_, dtype=np.int32
         )
         self.dataset_[self.class_name_] = self.y_
+        if self.sample_weight_ is not None:
+            self.dataset_["_weight"] = self.sample_weight_
 
     def _check_params_fit(self, X, y, expected_args, kwargs):
         """Check the common parameters passed to fit"""
@@ -62,6 +64,8 @@ class BayesBase(BaseEstimator, ClassifierMixin):
         self.classes_ = unique_labels(y)
         self.n_classes_ = self.classes_.shape[0]
         # Default values
+        self.weighted_ = False
+        self.sample_weight_ = None
         self.class_name_ = self.default_class_name()
         self.features_ = default_feature_names(X.shape[1])
         for key, value in kwargs.items():
@@ -80,6 +84,7 @@ class BayesBase(BaseEstimator, ClassifierMixin):
             raise ValueError(
                 "Number of features does not match the number of columns in X"
             )
+        self.n_features_in_ = X.shape[1]
 
         return X, y
 
@@ -151,13 +156,14 @@ class BayesBase(BaseEstimator, ClassifierMixin):
 
     def _train(self, kwargs):
         self.model_ = BayesianNetwork(
-            self.dag_.edges(), show_progress=self.show_progress
+            self.dag_.edges()  # , show_progress=self.show_progress
         )
         states = dict(state_names=kwargs.pop("state_names", []))
         self.model_.fit(
             self.dataset_,
             estimator=BayesianEstimator,
             prior_type="K2",
+            weighted=self.weighted_,
             **states,
         )
 
@@ -321,7 +327,13 @@ class KDB(BayesBase):
         )
 
     def _check_params(self, X, y, kwargs):
-        expected_args = ["class_name", "features", "state_names"]
+        expected_args = [
+            "class_name",
+            "features",
+            "state_names",
+            "sample_weight",
+            "weighted",
+        ]
         return self._check_params_fit(X, y, expected_args, kwargs)
 
     def _add_m_edges(self, dag, idx, S_nodes, conditional_weights):
diff --git a/bayesclass/test.py b/bayesclass/test.py
deleted file mode 100644
index fd983d6..0000000
--- a/bayesclass/test.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from bayesclass.clfs import AODENew, TANNew, KDBNew, AODE
-from benchmark.datasets import Datasets
-import os
-
-os.chdir("../discretizbench")
-dt = Datasets()
-clfan = AODENew()
-clftn = TANNew()
-clfkn = KDBNew()
-# clfa = AODE()
-X, y = dt.load("iris")
-# clfa.fit(X, y)
-clfan.fit(X, y)
-clftn.fit(X, y)
-clfkn.fit(X, y)
-
-
-self.discretizer_.target_
-self.estimator.indexed_features_
diff --git a/bayesclass/tests/test_KDB.py b/bayesclass/tests/test_KDB.py
index aa35751..d365d7d 100644
--- a/bayesclass/tests/test_KDB.py
+++ b/bayesclass/tests/test_KDB.py
@@ -64,6 +64,13 @@ def test_KDB_classifier(data_disc, clf):
     assert sum(y == y_pred) == 146
 
 
+def test_KDB_classifier_weighted(data_disc, clf):
+    sample_weight = [1] * data_disc[0].shape[0]
+    sample_weight[:50] = [0] * 50
+    clf.fit(*data_disc, sample_weight=sample_weight, weighted=True)
+    assert clf.score(*data_disc) == 0.64
+
+
 @image_comparison(
     baseline_images=["line_dashes_KDB"], remove_text=True, extensions=["png"]
 )
diff --git a/bayesclass/tests/test_KDBNew.py b/bayesclass/tests/test_KDBNew.py
index 36de4da..6c0ce73 100644
--- a/bayesclass/tests/test_KDBNew.py
+++ b/bayesclass/tests/test_KDBNew.py
@@ -68,7 +68,6 @@ def test_KDBNew_local_discretization(clf, data):
     clf.fit(*data)
     for feature in range(4):
         computed = clf.estimator_.discretizer_.target_[feature]
-        print("computed:", computed)
         if type(computed) == list:
             for j, k in zip(expected[feature], computed):
                 assert j == k