Add L1FS feature selection
279  bayesnet/feature_selection/L1FS.cc  Normal file
@@ -0,0 +1,279 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************

#include <algorithm>
#include <cmath>
#include <numeric>
#include "bayesnet/utils/bayesnetUtils.h"
#include "L1FS.h"

namespace bayesnet {
    using namespace torch::indexing;

    L1FS::L1FS(const torch::Tensor& samples,
        const std::vector<std::string>& features,
        const std::string& className,
        const int maxFeatures,
        const int classNumStates,
        const torch::Tensor& weights,
        const double alpha,
        const int maxIter,
        const double tolerance,
        const bool fitIntercept)
        : FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights),
        alpha(alpha), maxIter(maxIter), tolerance(tolerance), fitIntercept(fitIntercept)
    {
        if (alpha < 0) {
            throw std::invalid_argument("Alpha (regularization strength) must be non-negative");
        }
        if (maxIter < 1) {
            throw std::invalid_argument("Maximum iterations must be positive");
        }
        if (tolerance <= 0) {
            throw std::invalid_argument("Tolerance must be positive");
        }

        // Determine whether this is a regression or a classification task.
        // For simplicity, assume binary classification when classNumStates == 2
        // and regression otherwise; this heuristic can be refined if needed.
        isRegression = (classNumStates > 2 || classNumStates == 0);
    }

    void L1FS::fit()
    {
        initialize();

        // Prepare data
        int n_features = features.size();

        // Extract features (all rows except the last)
        auto X = samples.index({ Slice(0, n_features), Slice() }).t().contiguous();

        // Extract labels (last row)
        auto y = samples.index({ -1, Slice() }).contiguous();

        // Convert to float for numerical operations
        X = X.to(torch::kFloat32);
        y = y.to(torch::kFloat32);

        // Standardize features for better convergence
        auto X_mean = X.mean(0);
        auto X_std = X.std(0);
        X_std = torch::where(X_std == 0, torch::ones_like(X_std), X_std);
        X = (X - X_mean) / X_std;

        if (isRegression) {
            // Normalize y for regression
            auto y_mean = y.mean();
            auto y_std = y.std();
            if (y_std.item<double>() > 0) {
                y = (y - y_mean) / y_std;
            }
            fitLasso(X, y, weights);
        } else {
            // Binary classification
            fitL1Logistic(X, y, weights);
        }

        // Select features with non-zero coefficients
        std::vector<std::pair<int, double>> featureImportance;
        for (int i = 0; i < n_features; ++i) {
            double coef_magnitude = std::abs(coefficients[i]);
            if (coef_magnitude > 1e-10) { // threshold for numerical zero
                featureImportance.push_back({ i, coef_magnitude });
            }
        }

        // If all coefficients are zero (regularization too strong), fall back to
        // ranking features by their symmetrical uncertainty with the class labels.
        if (featureImportance.empty() && maxFeatures > 0) {
            computeSuLabels();
            auto featureOrder = argsort(suLabels);

            // Select the top features by SU score
            int numToSelect = std::min(static_cast<int>(featureOrder.size()),
                                       std::min(maxFeatures, 3)); // at most 3 features as fallback

            for (int i = 0; i < numToSelect; ++i) {
                selectedFeatures.push_back(featureOrder[i]);
                selectedScores.push_back(suLabels[featureOrder[i]]);
            }
        } else {
            // Sort by importance (absolute coefficient value)
            std::sort(featureImportance.begin(), featureImportance.end(),
                      [](const auto& a, const auto& b) { return a.second > b.second; });

            // Select the top features, up to maxFeatures
            int numToSelect = std::min(static_cast<int>(featureImportance.size()),
                                       maxFeatures);

            for (int i = 0; i < numToSelect; ++i) {
                selectedFeatures.push_back(featureImportance[i].first);
                selectedScores.push_back(featureImportance[i].second);
            }
        }

        fitted = true;
    }
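
    // The regression branch above minimizes a weighted Lasso objective of the
    // form (w_i are the sample weights; note that alpha is applied directly in
    // the soft threshold, without the 1/m scaling some formulations use):
    //
    //   \min_{\beta, b} \tfrac{1}{2} \sum_i w_i (y_i - x_i^\top \beta - b)^2
    //                   + \alpha \lVert \beta \rVert_1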

    void L1FS::fitLasso(const torch::Tensor& X, const torch::Tensor& y,
        const torch::Tensor& sampleWeights)
    {
        int n_features = X.size(1);

        // Initialize coefficients
        coefficients.resize(n_features, 0.0);
        double intercept = 0.0;

        // Ensure consistent types
        torch::Tensor weights = sampleWeights.to(torch::kFloat32);

        // Coordinate descent for Lasso
        torch::Tensor residuals = y.clone();
        if (fitIntercept) {
            intercept = (y * weights).sum().item<float>() / weights.sum().item<float>();
            residuals = y - intercept;
        }

        // Precompute weighted feature norms
        std::vector<double> featureNorms(n_features);
        for (int j = 0; j < n_features; ++j) {
            auto Xj = X.index({ Slice(), j });
            featureNorms[j] = (Xj * Xj * weights).sum().item<float>();
        }

        // Coordinate descent iterations
        for (int iter = 0; iter < maxIter; ++iter) {
            double maxChange = 0.0;

            // Update each coordinate
            for (int j = 0; j < n_features; ++j) {
                if (featureNorms[j] <= 0.0) {
                    continue; // constant feature: avoid a 0/0 division, keep coefficient at zero
                }
                auto Xj = X.index({ Slice(), j });

                // Partial residuals, excluding feature j's current contribution
                torch::Tensor partialResiduals = residuals + coefficients[j] * Xj;

                // rho: weighted correlation of feature j with the partial residuals
                double rho = (Xj * partialResiduals * weights).sum().item<float>();

                // Soft thresholding
                double oldCoef = coefficients[j];
                coefficients[j] = softThreshold(rho, alpha) / featureNorms[j];

                // Update residuals
                residuals = partialResiduals - coefficients[j] * Xj;

                maxChange = std::max(maxChange, std::abs(coefficients[j] - oldCoef));
            }

            // Update the intercept if needed. The optimal intercept shifts by the
            // weighted mean of the current residuals, so it is accumulated rather
            // than overwritten.
            if (fitIntercept) {
                double oldIntercept = intercept;
                intercept = oldIntercept +
                    (residuals * weights).sum().item<float>() / weights.sum().item<float>();
                residuals = residuals - (intercept - oldIntercept);
                maxChange = std::max(maxChange, std::abs(intercept - oldIntercept));
            }

            // Check convergence
            if (maxChange < tolerance) {
                break;
            }
        }
    }
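
    // Each coordinate update above is the closed-form soft-threshold solution
    // for a single coefficient with all the others held fixed:
    //
    //   \rho_j = \sum_i w_i x_{ij} r_i^{(j)}, \qquad
    //   \beta_j \leftarrow S(\rho_j, \alpha) / \sum_i w_i x_{ij}^2
    //
    // where r^{(j)} are the partial residuals excluding feature j, and
    // S(x, \lambda) = \operatorname{sign}(x) \max(|x| - \lambda, 0) is the
    // soft-thresholding operator implemented further below.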

    void L1FS::fitL1Logistic(const torch::Tensor& X, const torch::Tensor& y,
        const torch::Tensor& sampleWeights)
    {
        int n_samples = X.size(0);
        int n_features = X.size(1);

        // Initialize coefficients
        torch::Tensor coef = torch::zeros({ n_features }, torch::kFloat32);
        double intercept = 0.0;

        // Ensure consistent types
        torch::Tensor weights = sampleWeights.to(torch::kFloat32);

        // Learning rate (decayed below)
        double learningRate = 0.01;

        // Proximal gradient descent
        for (int iter = 0; iter < maxIter; ++iter) {
            // Compute predictions
            torch::Tensor linearPred = X.matmul(coef);
            if (fitIntercept) {
                linearPred = linearPred + intercept;
            }
            torch::Tensor pred = sigmoid(linearPred);

            // Gradient of the weighted logistic loss
            torch::Tensor diff = pred - y;
            torch::Tensor grad = X.t().matmul(diff * weights) / n_samples;

            // Gradient descent step
            torch::Tensor coef_new = coef - learningRate * grad;

            // Proximal step (soft thresholding)
            for (int j = 0; j < n_features; ++j) {
                coef_new[j] = softThreshold(coef_new[j].item<float>(),
                                            learningRate * alpha);
            }

            // Update the intercept (not penalized) if needed
            if (fitIntercept) {
                double grad_intercept = (diff * weights).sum().item<float>() / n_samples;
                intercept -= learningRate * grad_intercept;
            }

            // Check convergence
            double change = (coef_new - coef).abs().max().item<float>();
            coef = coef_new;

            if (change < tolerance) {
                break;
            }

            // Simple learning-rate decay every 100 iterations (skipping iter 0,
            // so the initial rate is actually used for the first 100 steps)
            if (iter > 0 && iter % 100 == 0) {
                learningRate *= 0.9;
            }
        }

        // Store the final coefficients
        coefficients.resize(n_features);
        for (int j = 0; j < n_features; ++j) {
            coefficients[j] = coef[j].item<float>();
        }
    }
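
    // The loop above is proximal gradient descent (ISTA): a gradient step on
    // the smooth weighted logistic loss followed by soft thresholding, which
    // is the proximal operator of the L1 penalty:
    //
    //   \beta \leftarrow S(\beta - \eta \nabla \ell(\beta), \; \eta \alpha)
    //
    // with step size \eta = learningRate, decayed every 100 iterations.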

    double L1FS::softThreshold(double x, double lambda) const
    {
        if (x > lambda) {
            return x - lambda;
        } else if (x < -lambda) {
            return x + lambda;
        } else {
            return 0.0;
        }
    }

    torch::Tensor L1FS::sigmoid(const torch::Tensor& z) const
    {
        return 1.0 / (1.0 + torch::exp(-z));
    }

    std::vector<double> L1FS::getCoefficients() const
    {
        if (!fitted) {
            throw std::runtime_error("L1FS not fitted");
        }
        return coefficients;
    }

} // namespace bayesnet
83  bayesnet/feature_selection/L1FS.h  Normal file
@@ -0,0 +1,83 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************

#ifndef L1FS_H
#define L1FS_H
#include <torch/torch.h>
#include <vector>
#include "bayesnet/feature_selection/FeatureSelect.h"

namespace bayesnet {
    /**
     * L1-Regularized Feature Selection (L1FS)
     *
     * This class implements feature selection using L1-regularized linear models.
     * For binary classification tasks (classNumStates == 2), it uses logistic
     * regression with an L1 penalty; otherwise it treats the task as regression
     * and uses Lasso.
     *
     * The L1 penalty induces sparsity in the model coefficients, effectively
     * performing feature selection by setting irrelevant feature weights to zero.
     */
    class L1FS : public FeatureSelect {
    public:
        /**
         * Constructor for L1FS
         * @param samples (n+1) x m tensor where samples[-1] is the target variable
         * @param features vector of feature names
         * @param className name of the class/target variable
         * @param maxFeatures maximum number of features to select (0 = all)
         * @param classNumStates number of class states; 2 selects binary
         *        classification, any other value selects regression
         * @param weights sample weights
         * @param alpha L1 regularization strength (higher = more sparsity)
         * @param maxIter maximum iterations for optimization
         * @param tolerance convergence tolerance
         * @param fitIntercept whether to fit an intercept term
         */
        L1FS(const torch::Tensor& samples,
            const std::vector<std::string>& features,
            const std::string& className,
            const int maxFeatures,
            const int classNumStates,
            const torch::Tensor& weights,
            const double alpha = 1.0,
            const int maxIter = 1000,
            const double tolerance = 1e-4,
            const bool fitIntercept = true);

        virtual ~L1FS() {};

        void fit() override;

        // Get the learned coefficients for each feature
        std::vector<double> getCoefficients() const;

    private:
        double alpha;      // L1 regularization strength
        int maxIter;       // Maximum iterations for optimization
        double tolerance;  // Convergence tolerance
        bool fitIntercept; // Whether to fit an intercept
        bool isRegression; // Task type (regression vs classification)

        std::vector<double> coefficients; // Learned coefficients

        // Coordinate descent for Lasso regression
        void fitLasso(const torch::Tensor& X, const torch::Tensor& y, const torch::Tensor& sampleWeights);

        // Proximal gradient descent for L1-regularized logistic regression
        void fitL1Logistic(const torch::Tensor& X, const torch::Tensor& y, const torch::Tensor& sampleWeights);

        // Soft thresholding operator for L1 regularization
        double softThreshold(double x, double lambda) const;

        // Logistic function
        torch::Tensor sigmoid(const torch::Tensor& z) const;

        // Compute logistic loss
        double logisticLoss(const torch::Tensor& X, const torch::Tensor& y,
            const torch::Tensor& coef, const torch::Tensor& sampleWeights) const;
    };
}
#endif
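
A minimal usage sketch (not part of this commit): it lays out the (n+1) x m samples tensor the constructor documents, with the class labels in the last row, and reads back the learned coefficients after fit(). Only getCoefficients() is defined in this commit; the toy data, feature names, and alpha value here are illustrative assumptions.

#include <iostream>
#include <string>
#include <vector>
#include <torch/torch.h>
#include "bayesnet/feature_selection/L1FS.h"

int main()
{
    // Toy dataset: 3 binary features over 6 samples; class labels in the last row.
    auto samples = torch::tensor({
        { 0, 1, 0, 1, 0, 1 },    // feature A (tracks the class)
        { 1, 1, 0, 0, 1, 0 },    // feature B (noise)
        { 0, 0, 1, 1, 1, 1 },    // feature C
        { 0, 1, 0, 1, 0, 1 } },  // class labels -> classNumStates = 2
        torch::kInt32);
    std::vector<std::string> features = { "A", "B", "C" };
    auto weights = torch::full({ 6 }, 1.0 / 6.0, torch::kFloat64); // uniform sample weights

    bayesnet::L1FS selector(samples, features, "class",
        /*maxFeatures*/ 2, /*classNumStates*/ 2, weights, /*alpha*/ 0.1);
    selector.fit();

    // Non-zero coefficients mark the selected features.
    auto coefs = selector.getCoefficients();
    for (size_t j = 0; j < coefs.size(); ++j) {
        std::cout << features[j] << ": " << coefs[j] << std::endl;
    }
    return 0;
}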