diff --git a/bayesclass/clfs.py b/bayesclass/clfs.py
index f734343..43e95b7 100644
--- a/bayesclass/clfs.py
+++ b/bayesclass/clfs.py
@@ -891,7 +891,6 @@ class BoostAODE(ClassifierMixin, BaseEnsemble):
                 SelectKBestWeighted(k=1)
                 .fit(self.X_, self.y_, weights)
                 .get_feature_names_out(self.feature_names_in_)
-                .tolist()[0]
             )
             # Step 2: Build & train spode with the first feature as sparent
             estimator = clone(self.estimator_)
@@ -914,13 +913,3 @@ class BoostAODE(ClassifierMixin, BaseEnsemble):
             ]
             # Step 4: Add the new model
             self.estimators_.append(estimator)
-        """
-        class_edges = [(self.class_name_, f) for f in self.feature_names_in_]
-        feature_edges = [
-            (sparent, f) for f in self.feature_names_in_ if f != sparent
-        ]
-        self.weights_ = weights.copy() if weights is not None else None
-        feature_edges.extend(class_edges)
-        self.model_ = BayesianNetwork(feature_edges, show_progress=False)
-        return self.model_
-        """
diff --git a/bayesclass/feature_selection.py b/bayesclass/feature_selection.py
index c0e3597..bbf6c4b 100644
--- a/bayesclass/feature_selection.py
+++ b/bayesclass/feature_selection.py
@@ -1,6 +1,6 @@
 import numpy as np
 from sklearn.feature_selection import mutual_info_classif
-
+from sklearn.utils.validation import check_X_y, check_is_fitted
 
 """
 Compute the weighted mutual information between each feature and the
@@ -24,10 +24,17 @@ entropy are given.
 
 
 class SelectKBestWeighted:
+    def __init__(self, k):
+        self.k = k
+
     def fit(self, X, y, sample_weight):
+        self.X_, self.y_ = check_X_y(X, y)
         self.X_ = X
         self.y_ = y
+        self.n_features_in_ = X.shape[1]
         self.sample_weight_ = sample_weight
+        if self.k > X.shape[1] or self.k < 1:
+            raise ValueError(f"k must be between 1 and {self.n_features_in_}")
         # Compute the entropy of the target variable
         entropy_y = -np.sum(
             np.multiply(
@@ -42,7 +49,8 @@ class SelectKBestWeighted:
         # Compute the weighted entropy of each feature
         entropy_weighted = []
         for i in range(X.shape[1]):
-            # Compute the weighted frequency of each unique value of the feature
+            # Compute the weighted frequency of each unique value of the
+            # feature
             freq_weighted = np.bincount(X[:, i], weights=sample_weight)
             freq_weighted = freq_weighted[freq_weighted != 0]
 
@@ -52,8 +60,15 @@ class SelectKBestWeighted:
                 / np.sum(sample_weight)
             )
 
-        # Compute the weighted mutual information between each feature and the target
+        # Compute the weighted mutual information between each feature and
+        # the target
         mi_weighted = mi * entropy_weighted / entropy_y
 
         # Return the weighted mutual information scores
         self.mi_weighted_ = mi_weighted
+        return self
+
+    def get_feature_names_out(self, features):
+        check_is_fitted(self, ["X_", "y_", "mi_weighted_"])
+        return [features[i] for i in np.argsort(self.mi_weighted_)[::-1][:self.k]]
+
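
Reviewer note: a minimal usage sketch of the patched SelectKBestWeighted, assuming discretized inputs (fit() uses np.bincount, so features must be non-negative integers). The toy data and feature names below are invented for illustration:

    import numpy as np
    from bayesclass.feature_selection import SelectKBestWeighted

    # Toy discretized data: 8 samples, 3 integer-coded features
    X = np.array(
        [[0, 1, 2], [1, 1, 0], [0, 0, 2], [1, 0, 1],
         [0, 1, 1], [1, 0, 2], [0, 0, 0], [1, 1, 2]]
    )
    y = np.array([0, 1, 0, 1, 0, 1, 0, 1])
    weights = np.full(8, 1 / 8)  # e.g. the boosting weights BoostAODE passes in

    selector = SelectKBestWeighted(k=1).fit(X, y, weights)  # fit() returns self
    print(selector.mi_weighted_)  # weighted mutual information per feature

    # Feature names ranked by weighted MI, truncated to the top k; with k=1
    # this is a one-element list, which the BoostAODE side now consumes
    # directly instead of calling .tolist()[0] on it
    print(selector.get_feature_names_out(["f1", "f2", "f3"]))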