Complete feature_selection with weighted entropy

2023-06-21 16:40:29 +02:00
parent fbaa5eb7d3
commit 99321043ec
2 changed files with 36 additions and 15 deletions


@@ -891,6 +891,7 @@ class BoostAODE(ClassifierMixin, BaseEnsemble):
                 SelectKBestWeighted(k=1)
                 .fit(self.X_, self.y_, weights)
                 .get_feature_names_out(self.feature_names_in_)
+                .tolist()
             )
             # Step 2: Build & train spode with the first feature as sparent
             estimator = clone(self.estimator_)
@@ -898,6 +899,7 @@ class BoostAODE(ClassifierMixin, BaseEnsemble):
_args["sparent"] = feature
_args["sample_weight"] = weights
_args["weighted"] = True
print("I'm gonna build a spode with", feature)
# Step 2.1: build dataset
# Step 2.2: Train the model
estimator.fit(self.X_, self.y_, **_args)


@@ -1,6 +1,10 @@
 import numpy as np
 from sklearn.feature_selection import mutual_info_classif
 from sklearn.utils.validation import check_X_y, check_is_fitted
+from sklearn.feature_selection._univariate_selection import (
+    _BaseFilter,
+    _clean_nans,
+)

 """
 Compute the weighted mutual information between each feature and the
@@ -23,18 +27,38 @@ entropy are given.
"""
class SelectKBestWeighted:
def __init__(self, k):
class SelectKBestWeighted(_BaseFilter):
def __init__(self, *, k=10):
super().__init__(score_func=mutual_info_classif)
self.k = k
def _check_params(self, X, y):
if self.k > X.shape[1] or self.k < 1:
raise ValueError(
f"k must be between 1 and {X.shape[1]} got {self.k}."
)
def _get_support_mask(self):
check_is_fitted(self)
if self.k == "all":
return np.ones(self.scores_.shape, dtype=bool)
elif self.k == 0:
return np.zeros(self.scores_.shape, dtype=bool)
else:
scores = _clean_nans(self.scores_)
mask = np.zeros(scores.shape, dtype=bool)
# Request a stable sort. Mergesort takes more memory (~40MB per
# megafeature on x86-64).
mask[np.argsort(scores, kind="mergesort")[-self.k :]] = 1
return mask
def fit(self, X, y, sample_weight):
self.X_, self.y_ = check_X_y(X, y)
self.X_ = X
self.y_ = y
self._check_params(X, y)
self.n_features_in_ = X.shape[1]
self.sample_weight_ = sample_weight
if self.k > X.shape[1] or self.k<1:
raise ValueError(f"k must be between 1 and {self.n_features_in_}")
# Compute the entropy of the target variable
entropy_y = -np.sum(
np.multiply(
@@ -44,12 +68,12 @@ class SelectKBestWeighted:
         )
         # Compute the mutual information between each feature and the target
-        mi = mutual_info_classif(X, y)
+        mi = self.score_func(X, y)
         # Compute the weighted entropy of each feature
         entropy_weighted = []
         for i in range(X.shape[1]):
             # Compute the weighted frequency of each unique value of the
             # feature
             freq_weighted = np.bincount(X[:, i], weights=sample_weight)
             freq_weighted = freq_weighted[freq_weighted != 0]
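The loop above converts each feature's weight-adjusted value counts into an entropy term; the final score (next hunk) is mi * entropy_weighted / entropy_y. A self-contained sketch of that entropy term, assuming integer-encoded features as np.bincount requires; weighted_entropy is an illustrative name, not a function in this repository:

import numpy as np

def weighted_entropy(column, sample_weight):
    freq = np.bincount(column, weights=sample_weight)  # weighted value counts
    freq = freq[freq != 0]  # drop values that never occur
    p = freq / freq.sum()  # weighted empirical probabilities
    return -np.sum(p * np.log2(p))  # Shannon entropy in bits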
@@ -65,10 +89,5 @@ class SelectKBestWeighted:
         mi_weighted = mi * entropy_weighted / entropy_y
         # Return the weighted mutual information scores
-        self.mi_weighted_ = mi_weighted
+        self.scores_ = mi_weighted
         return self
-
-    def get_feature_names_out(self, features):
-        check_is_fitted(self, ["X_", "y_", "mi_weighted_"])
-        return [features[i] for i in np.argsort(self.mi_weighted_)[::-1][: self.k]]
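Storing the result in scores_ (instead of the old mi_weighted_) is what wires the class into _BaseFilter: get_support, transform and get_feature_names_out are all inherited and driven by _get_support_mask, so the hand-written get_feature_names_out can be dropped. A hedged usage sketch; the data, k and weights are invented for illustration:

import numpy as np

rng = np.random.default_rng(0)
X = rng.integers(0, 3, size=(100, 5))  # discrete features, as np.bincount needs
y = rng.integers(0, 2, size=100)
weights = np.ones(100)  # uniform sample weights

selector = SelectKBestWeighted(k=2).fit(X, y, weights)
print(selector.scores_)        # weighted mutual information per feature
print(selector.get_support())  # boolean mask from _get_support_mask
X_sel = selector.transform(X)  # keeps the two top-scoring columns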