From 99321043ec8bcb98c6a8ec63c180329f02432c30 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?=
Date: Wed, 21 Jun 2023 16:40:29 +0200
Subject: [PATCH] Complete feature_selection with weighted entropy

---
 bayesclass/clfs.py              |  2 ++
 bayesclass/feature_selection.py | 49 +++++++++++++++++++++++----------
 2 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/bayesclass/clfs.py b/bayesclass/clfs.py
index 43e95b7..4fc71eb 100644
--- a/bayesclass/clfs.py
+++ b/bayesclass/clfs.py
@@ -891,6 +891,7 @@ class BoostAODE(ClassifierMixin, BaseEnsemble):
                 SelectKBestWeighted(k=1)
                 .fit(self.X_, self.y_, weights)
                 .get_feature_names_out(self.feature_names_in_)
+                .tolist()
             )
             # Step 2: Build & train spode with the first feature as sparent
             estimator = clone(self.estimator_)
@@ -898,6 +899,7 @@ class BoostAODE(ClassifierMixin, BaseEnsemble):
             _args["sparent"] = feature
             _args["sample_weight"] = weights
             _args["weighted"] = True
+            print("Building SPODE with parent feature", feature)
             # Step 2.1: build dataset
             # Step 2.2: Train the model
             estimator.fit(self.X_, self.y_, **_args)
diff --git a/bayesclass/feature_selection.py b/bayesclass/feature_selection.py
index bbf6c4b..92189fb 100644
--- a/bayesclass/feature_selection.py
+++ b/bayesclass/feature_selection.py
@@ -1,6 +1,10 @@
 import numpy as np
 from sklearn.feature_selection import mutual_info_classif
 from sklearn.utils.validation import check_X_y, check_is_fitted
+from sklearn.feature_selection._univariate_selection import (
+    _BaseFilter,
+    _clean_nans,
+)
 
 """
 Compute the weighted mutual information between each feature and the
@@ -23,18 +27,38 @@ entropy are given.
 """
 
 
-class SelectKBestWeighted:
-    def __init__(self, k):
+class SelectKBestWeighted(_BaseFilter):
+    def __init__(self, *, k=10):
+        super().__init__(score_func=mutual_info_classif)
         self.k = k
-
+
+    def _check_params(self, X, y):
+        if self.k != "all" and not 1 <= self.k <= X.shape[1]:
+            raise ValueError(
+                f"k must be between 1 and {X.shape[1]}; got {self.k}."
+            )
+
+    def _get_support_mask(self):
+        check_is_fitted(self)
+
+        if self.k == "all":
+            return np.ones(self.scores_.shape, dtype=bool)
+        elif self.k == 0:
+            return np.zeros(self.scores_.shape, dtype=bool)
+        else:
+            scores = _clean_nans(self.scores_)
+            mask = np.zeros(scores.shape, dtype=bool)
+
+            # Request a stable sort. Mergesort takes more memory (~40MB per
+            # megafeature on x86-64).
+            mask[np.argsort(scores, kind="mergesort")[-self.k :]] = 1
+            return mask
+
     def fit(self, X, y, sample_weight):
         self.X_, self.y_ = check_X_y(X, y)
-        self.X_ = X
-        self.y_ = y
+        self._check_params(X, y)
         self.n_features_in_ = X.shape[1]
         self.sample_weight_ = sample_weight
-        if self.k > X.shape[1] or self.k<1:
-            raise ValueError(f"k must be between 1 and {self.n_features_in_}")
         # Compute the entropy of the target variable
         entropy_y = -np.sum(
             np.multiply(
@@ -44,12 +68,12 @@ class SelectKBestWeighted:
         )
 
         # Compute the mutual information between each feature and the target
-        mi = mutual_info_classif(X, y)
+        mi = self.score_func(X, y)
 
         # Compute the weighted entropy of each feature
         entropy_weighted = []
         for i in range(X.shape[1]):
-            # Compute the weighted frequency of each unique value of the 
+            # Compute the weighted frequency of each unique value of the
             # feature
             freq_weighted = np.bincount(X[:, i], weights=sample_weight)
             freq_weighted = freq_weighted[freq_weighted != 0]
@@ -65,10 +89,5 @@ class SelectKBestWeighted:
         mi_weighted = mi * entropy_weighted / entropy_y
 
         # Return the weighted mutual information scores
-        self.mi_weighted_ = mi_weighted
+        self.scores_ = mi_weighted
         return self
-
-    def get_feature_names_out(self, features):
-        check_is_fitted(self, ["X_", "y_", "mi_weighted_"])
-        return [features[i] for i in np.argsort(self.mi_weighted_)[::-1][:self.k]]
-
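
For reviewers, a minimal usage sketch of the selector as it behaves after this patch (not part of the commit). It assumes discretized, non-negative integer-coded features, since fit() bins each column with np.bincount; the random data, the uniform weights, and the feature names f0..f4 are illustrative only:

    import numpy as np
    from bayesclass.feature_selection import SelectKBestWeighted

    # Illustrative data: 100 samples, 5 discretized features, binary target
    rng = np.random.default_rng(0)
    X = rng.integers(0, 3, size=(100, 5))
    y = rng.integers(0, 2, size=100)
    weights = np.full(100, 1 / 100)  # uniform boosting weights

    selector = SelectKBestWeighted(k=2).fit(X, y, weights)
    print(selector.scores_)  # weighted mutual information per feature
    # get_feature_names_out() is now inherited from _BaseFilter/SelectorMixin
    # and returns an ndarray, hence the .tolist() added in BoostAODE above
    print(selector.get_feature_names_out(["f0", "f1", "f2", "f3", "f4"]).tolist())

Each score is the plain mutual information rescaled by the ratio of the feature's weighted entropy to the weighted entropy of the target (mi * entropy_weighted / entropy_y), so the boosting weights passed to fit() shift which features rank highest. Note that the inherited get_feature_names_out() returns the selected names in column order rather than sorted by score, unlike the removed hand-rolled version; with k=1 in BoostAODE the distinction is moot.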