From 42ac57eb79bc0cd5600a2b98035e77c280b9334d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Tue, 7 Feb 2023 18:02:35 +0100 Subject: [PATCH] Continue with New estimators --- bayesclass/clfs.py | 163 +++++++++++++++++---------------------------- 1 file changed, 61 insertions(+), 102 deletions(-) diff --git a/bayesclass/clfs.py b/bayesclass/clfs.py index 4e5808e..b9cf8de 100644 --- a/bayesclass/clfs.py +++ b/bayesclass/clfs.py @@ -139,9 +139,6 @@ class BayesBase(BaseEstimator, ClassifierMixin): # Return the classifier return self - def _build(self): - pass - def _train(self, kwargs): self.model_ = BayesianNetwork( self.dag_.edges(), show_progress=self.show_progress @@ -463,116 +460,54 @@ class AODE(BayesBase, BaseEnsemble): class TANNew(TAN): - pass + def fit(self, X, y, **kwargs): + self.estimator = Proposal(self) + return self.estimator.fit(X, y, **kwargs) + + def predict(self, X): + self.plot() + return self.estimator.predict(X) class KDBNew(KDB): - def __init__(self, k, theta=0.3, show_progress=False, random_state=None): - super().__init__( - k, theta, show_progress=show_progress, random_state=random_state - ) - self.estimator = Proposal(self) - def fit(self, X, y, **kwargs): - Xd, kwargs = self.estimator.discretize_train(X, y, kwargs) - super().fit(Xd, y, **kwargs) - upgraded, Xd, kwargs = self.estimator.local_discretization(kwargs) - if upgraded: - super().fit(Xd, y, **kwargs) + self.estimator = Proposal(self) + return self.estimator.fit(X, y, **kwargs) def predict(self, X): + self.plot() return self.estimator.predict(X) - # def fit(self, X, y, **kwargs): - # self.discretizer_ = FImdlp(n_jobs=1) - # Xd = self.discretizer_.fit_transform(X, y) - # features = ( - # kwargs["features"] - # if "features" in kwargs - # else self.default_feature_names(Xd.shape[1]) - # ) - # self.compute_kwargs(Xd, y, kwargs) - # # Build the model - # super().fit(Xd, y, **kwargs) - # self.idx_features_ = dict(list(zip(features, range(len(features))))) - # self.proposal(Xd) - # return self - - # def predict(self, X): - # return super().predict(self.discretizer_.transform(X)) - - # def compute_kwargs(self, Xd, y, kwargs): - # features = kwargs["features"] - # states = { - # features[i]: np.unique(Xd[:, i]).tolist() - # for i in range(Xd.shape[1]) - # } - # class_name = ( - # kwargs["class_name"] - # if "class_name" in kwargs - # else self.default_class_name() - # ) - # states[class_name] = np.unique(y).tolist() - # kwargs["state_names"] = states - # self.kwargs_ = kwargs - - # def check_integrity(self, X, state_names, features): - # for i in range(X.shape[1]): - # if not np.array_equal( - # np.unique(X[:, i]), np.array(state_names[features[i]]) - # ): - # print( - # "i", - # i, - # "features[i]", - # features[i], - # "np.unique(X[:, i])", - # np.unique(X[:, i]), - # "np.array(state_names[features[i]])", - # np.array(state_names[features[i]]), - # ) - # raise ValueError("Discretization error") - - # def proposal(self, Xd): - # """Discretize each feature with its fathers and the class""" - # res = Xd.copy() - # upgraded = False - # for idx, feature in enumerate(self.feature_names_in_): - # fathers = self.dag_.get_parents(feature) - # if len(fathers) > 1: - # # First remove the class name as it will be added later - # fathers.remove(self.class_name_) - # # Get the fathers indices - # features = [self.idx_features_[f] for f in fathers] - # # Update the discretization of the feature - # res[:, idx] = self.discretizer_.join_fit( - # target=idx, features=features, data=Xd - # ) - # upgraded = True - # if upgraded: - # self.compute_kwargs(res, self.y_, self.kwargs_) - # super().fit(res, self.y_, **self.kwargs_) - class Proposal: def __init__(self, estimator): self.estimator = estimator + self.class_type = estimator.__class__ - def discretize_train(self, X, y, kwargs): - self.discretizer_ = FImdlp(n_jobs=1) - self.Xd = self.discretizer_.fit_transform(X, y) - kwargs = self.compute_kwargs(y, kwargs) - return self.Xd, kwargs - - def local_discretization(self, kwargs): - features = kwargs["features"] - self.idx_features_ = dict(list(zip(features, range(len(features))))) - return self._local_discretization(kwargs) + def fit(self, X, y, **kwargs): + # Discretize train data + self.discretizer = FImdlp(n_jobs=1) + self.Xd = self.discretizer.fit_transform(X, y) + kwargs = self.update_kwargs(y, kwargs) + # Build the model + super(self.class_type, self.estimator).fit(self.Xd, y, **kwargs) + self.check_integrity("f", self.Xd) + # # Local discretization based on the model + # features = kwargs["features"] + # # assign indices to feature names + # self.idx_features_ = dict(list(zip(features, range(len(features))))) + # upgraded, self.Xd = self._local_discretization() + # if upgraded: + # kwargs = self.update_kwargs(y, kwargs) + # super(self.class_type, self.estimator).fit(self.Xd, y, **kwargs) def predict(self, X): - return self.estimator.predict(self.discretizer_.transform(X)) + self.check_integrity("p", self.discretizer.transform(X)) + return super(self.class_type, self.estimator).predict( + self.discretizer.transform(X) + ) - def compute_kwargs(self, y, kwargs): + def update_kwargs(self, y, kwargs): features = ( kwargs["features"] if "features" in kwargs @@ -589,26 +524,50 @@ class Proposal: ) states[class_name] = np.unique(y).tolist() kwargs["state_names"] = states + self.state_names_ = states + self.features_ = features kwargs["features"] = features kwargs["class_name"] = class_name return kwargs - def _local_discretization(self, kwargs): + def _local_discretization(self): """Discretize each feature with its fathers and the class""" res = self.Xd.copy() upgraded = False + print("-" * 80) for idx, feature in enumerate(self.estimator.feature_names_in_): fathers = self.estimator.dag_.get_parents(feature) if len(fathers) > 1: + print( + "Discretizing " + feature + " with " + str(fathers), + end=" ", + ) # First remove the class name as it will be added later fathers.remove(self.estimator.class_name_) # Get the fathers indices features = [self.idx_features_[f] for f in fathers] # Update the discretization of the feature - res[:, idx] = self.discretizer_.join_fit( + res[:, idx] = self.discretizer.join_fit( target=idx, features=features, data=self.Xd ) + print(self.discretizer.y_join[:5]) upgraded = True - if upgraded: - kwargs = self.compute_kwargs(res, self.estimator.y_, kwargs) - return upgraded, res, kwargs + return upgraded, res + + def check_integrity(self, source, X): + print(f"Checking integrity of {source} data") + for i in range(X.shape[1]): + if not set(np.unique(X[:, i]).tolist()).issubset( + set(self.state_names_[self.features_[i]]) + ): + print( + "i", + i, + "features[i]", + self.features_[i], + "np.unique(X[:, i])", + np.unique(X[:, i]), + "np.array(state_names[features[i]])", + np.array(self.state_names_[self.features_[i]]), + ) + raise ValueError("Discretization error")