From 99321043ec8bcb98c6a8ec63c180329f02432c30 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?=
Date: Wed, 21 Jun 2023 16:40:29 +0200
Subject: [PATCH] Complete feature_selection with weighted entropy

---
 bayesclass/clfs.py              |  2 ++
 bayesclass/feature_selection.py | 49 +++++++++++++++++++++++----------
 2 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/bayesclass/clfs.py b/bayesclass/clfs.py
index 43e95b7..4fc71eb 100644
--- a/bayesclass/clfs.py
+++ b/bayesclass/clfs.py
@@ -891,6 +891,7 @@ class BoostAODE(ClassifierMixin, BaseEnsemble):
                 SelectKBestWeighted(k=1)
                 .fit(self.X_, self.y_, weights)
                 .get_feature_names_out(self.feature_names_in_)
+                .tolist()
             )
             # Step 2: Build & train spode with the first feature as sparent
             estimator = clone(self.estimator_)
@@ -898,6 +899,7 @@ class BoostAODE(ClassifierMixin, BaseEnsemble):
             _args["sparent"] = feature
             _args["sample_weight"] = weights
             _args["weighted"] = True
+            print("Building SPODE with parent feature", feature)
             # Step 2.1: build dataset
             # Step 2.2: Train the model
             estimator.fit(self.X_, self.y_, **_args)
diff --git a/bayesclass/feature_selection.py b/bayesclass/feature_selection.py
index bbf6c4b..92189fb 100644
--- a/bayesclass/feature_selection.py
+++ b/bayesclass/feature_selection.py
@@ -1,6 +1,10 @@
 import numpy as np
 from sklearn.feature_selection import mutual_info_classif
 from sklearn.utils.validation import check_X_y, check_is_fitted
+from sklearn.feature_selection._univariate_selection import (
+    _BaseFilter,
+    _clean_nans,
+)
 
 """
 Compute the weighted mutual information between each feature and the
@@ -23,18 +27,38 @@ entropy are given.
 """
 
 
-class SelectKBestWeighted:
-    def __init__(self, k):
+class SelectKBestWeighted(_BaseFilter):
+    def __init__(self, *, k=10):
+        super().__init__(score_func=mutual_info_classif)
         self.k = k
-
+
+    def _check_params(self, X, y):
+        if self.k != "all" and not 1 <= self.k <= X.shape[1]:
+            raise ValueError(
+                f"k must be between 1 and {X.shape[1]}; got {self.k}."
+            )
+
+    def _get_support_mask(self):
+        check_is_fitted(self)
+
+        if self.k == "all":
+            return np.ones(self.scores_.shape, dtype=bool)
+        elif self.k == 0:
+            return np.zeros(self.scores_.shape, dtype=bool)
+        else:
+            scores = _clean_nans(self.scores_)
+            mask = np.zeros(scores.shape, dtype=bool)
+
+            # Request a stable sort. Mergesort takes more memory (~40MB per
+            # megafeature on x86-64).
+            mask[np.argsort(scores, kind="mergesort")[-self.k :]] = 1
+            return mask
+
     def fit(self, X, y, sample_weight):
         self.X_, self.y_ = check_X_y(X, y)
-        self.X_ = X
-        self.y_ = y
+        self._check_params(X, y)
         self.n_features_in_ = X.shape[1]
         self.sample_weight_ = sample_weight
-        if self.k > X.shape[1] or self.k<1:
-            raise ValueError(f"k must be between 1 and {self.n_features_in_}")
         # Compute the entropy of the target variable
         entropy_y = -np.sum(
             np.multiply(
@@ -44,12 +68,12 @@ class SelectKBestWeighted:
         )
 
         # Compute the mutual information between each feature and the target
-        mi = mutual_info_classif(X, y)
+        mi = self.score_func(X, y)
 
         # Compute the weighted entropy of each feature
         entropy_weighted = []
         for i in range(X.shape[1]):
-            # Compute the weighted frequency of each unique value of the 
+            # Compute the weighted frequency of each unique value of the
             # feature
             freq_weighted = np.bincount(X[:, i], weights=sample_weight)
             freq_weighted = freq_weighted[freq_weighted != 0]
@@ -65,10 +89,5 @@ class SelectKBestWeighted:
         mi_weighted = mi * entropy_weighted / entropy_y
 
         # Return the weighted mutual information scores
-        self.mi_weighted_ = mi_weighted
+        self.scores_ = mi_weighted
         return self
-
-    def get_feature_names_out(self, features):
-        check_is_fitted(self, ["X_", "y_", "mi_weighted_"])
-        return [features[i] for i in np.argsort(self.mi_weighted_)[::-1][:self.k]]
-
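
For reviewers, a minimal usage sketch of the selector as it behaves after this patch (not part of the commit). It assumes discretized, non-negative integer-coded features, since fit() bins each column with np.bincount; the random data, the uniform weights, and the feature names f0..f4 are illustrative only:

    import numpy as np
    from bayesclass.feature_selection import SelectKBestWeighted

    # Illustrative data: 100 samples, 5 discretized features, binary target
    rng = np.random.default_rng(0)
    X = rng.integers(0, 3, size=(100, 5))
    y = rng.integers(0, 2, size=100)
    weights = np.full(100, 1 / 100)  # uniform boosting weights

    selector = SelectKBestWeighted(k=2).fit(X, y, weights)
    print(selector.scores_)  # weighted mutual information per feature
    # get_feature_names_out() is now inherited from _BaseFilter/SelectorMixin
    # and returns an ndarray, hence the .tolist() added in BoostAODE above
    print(selector.get_feature_names_out(["f0", "f1", "f2", "f3", "f4"]).tolist())

Each score is the plain mutual information rescaled by the ratio of the feature's weighted entropy to the weighted entropy of the target (mi * entropy_weighted / entropy_y), so the boosting weights passed to fit() shift which features rank highest. Note that the inherited get_feature_names_out() returns the selected names in column order rather than sorted by score, unlike the removed hand-rolled version; with k=1 in BoostAODE the distinction is moot.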