#include #include #include #include "BinDisc.h" #include #include namespace mdlp { BinDisc::BinDisc(int n_bins, strategy_t strategy) : n_bins{ n_bins }, strategy{ strategy } { if (n_bins < 3) { throw std::invalid_argument("n_bins must be greater than 2"); } } BinDisc::~BinDisc() = default; void BinDisc::fit(samples_t& X) { cutPoints.clear(); if (X.empty()) { cutPoints.push_back(std::numeric_limits::max()); return; } if (strategy == strategy_t::QUANTILE) { fit_quantile(X); } else if (strategy == strategy_t::UNIFORM) { fit_uniform(X); } } std::vector linspace(precision_t start, precision_t end, int num) { // Doesn't include end point as it is not needed if (start == end) { return { 0 }; } precision_t delta = (end - start) / static_cast(num - 1); std::vector linspc; for (size_t i = 0; i < num - 1; ++i) { precision_t val = start + delta * static_cast(i); linspc.push_back(val); } return linspc; } size_t clip(const size_t n, size_t lower, size_t upper) { return std::max(lower, std::min(n, upper)); } std::vector percentile(samples_t& data, std::vector& percentiles) { // Implementation taken from https://dpilger26.github.io/NumCpp/doxygen/html/percentile_8hpp_source.html std::vector results; results.reserve(percentiles.size()); for (auto percentile : percentiles) { const size_t i = static_cast(std::floor(static_cast(data.size() - 1) * percentile / 100.)); const auto indexLower = clip(i, 0, data.size() - 1); const double percentI = static_cast(indexLower) / static_cast(data.size() - 1); const double fraction = (percentile / 100.0 - percentI) / (static_cast(indexLower + 1) / static_cast(data.size() - 1) - percentI); const auto value = data[indexLower] + (data[indexLower + 1] - data[indexLower]) * fraction; if (value != results.back()) results.push_back(value); } return results; } void BinDisc::fit_quantile(samples_t& X) { auto quantiles = linspace(0.0, 100.0, n_bins + 1); auto data = X; std::sort(data.begin(), data.end()); if (data.front() == data.back() || data.size() == 1) { // if X is constant cutPoints.push_back(std::numeric_limits::max()); return; } cutPoints = percentile(data, quantiles); normalizeCutPoints(); } void BinDisc::fit_uniform(samples_t& X) { auto minmax = std::minmax_element(X.begin(), X.end()); cutPoints = linspace(*minmax.first, *minmax.second, n_bins + 1); normalizeCutPoints(); } void BinDisc::normalizeCutPoints() { // Add max value to the end cutPoints.push_back(std::numeric_limits::max()); // Remove first as it is not needed cutPoints.erase(cutPoints.begin()); } labels_t& BinDisc::transform(const samples_t& X) { discretizedData.clear(); discretizedData.reserve(X.size()); for (const precision_t& item : X) { auto upper = std::upper_bound(cutPoints.begin(), cutPoints.end(), item); discretizedData.push_back(upper - cutPoints.begin()); } return discretizedData; } } // void BinDisc::fit_quantile(samples_t& X) // { // cutPoints.clear(); // if (X.empty()) { // cutPoints.push_back(std::numeric_limits::max()); // return; // } // samples_t data = X; // std::sort(data.begin(), data.end()); // float min_val = data.front(); // float max_val = data.back(); // // Handle case of all data points having the same value // if (min_val == max_val) { // cutPoints.push_back(std::numeric_limits::max()); // return; // } // int first = X.size() / n_bins; // cutPoints.push_back(data.at(first - 1)); // int bins_done = 1; // int prev = first - 1; // while (bins_done < n_bins) { // int next = first * (bins_done + 1) - 1; // while (next < X.size() && data.at(next) == data[prev]) { // ++next; // } // if (next == X.size() || bins_done == n_bins - 1) { // cutPoints.push_back(std::numeric_limits::max()); // break; // } else { // cutPoints.push_back(data[next]); // bins_done++; // prev = next; // } // } // }