Fix BinDisc quantile mistakes (#9)

* Fix BinDisc quantile mistakes

* Fix FImdlp tests

* Fix tests, samples and remove uneeded support files

* Add coypright header to sources
Fix coverage report
Add coverage badge to README

* Update sonar github action

* Move sources to a folder and change ArffFiles files to library

* Add recursive submodules to github action
This commit is contained in:
Ricardo Montañana Gómez
2024-07-04 17:27:39 +02:00
committed by GitHub
parent 7b0673fd4b
commit e36d9af8f9
35 changed files with 1383 additions and 923 deletions

98
src/BinDisc.cpp Normal file
View File

@@ -0,0 +1,98 @@
// ****************************************************************
// SPDX - FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
// SPDX - FileType: SOURCE
// SPDX - License - Identifier: MIT
// ****************************************************************
#include <algorithm>
#include <cmath>
#include "BinDisc.h"
#include <iostream>
#include <string>
namespace mdlp {
BinDisc::BinDisc(int n_bins, strategy_t strategy) :
Discretizer(), n_bins{ n_bins }, strategy{ strategy }
{
if (n_bins < 3) {
throw std::invalid_argument("n_bins must be greater than 2");
}
}
BinDisc::~BinDisc() = default;
void BinDisc::fit(samples_t& X)
{
// y is included for compatibility with the Discretizer interface
cutPoints.clear();
if (X.empty()) {
cutPoints.push_back(0.0);
cutPoints.push_back(0.0);
return;
}
if (strategy == strategy_t::QUANTILE) {
direction = bound_dir_t::RIGHT;
fit_quantile(X);
} else if (strategy == strategy_t::UNIFORM) {
direction = bound_dir_t::RIGHT;
fit_uniform(X);
}
}
void BinDisc::fit(samples_t& X, labels_t& y)
{
fit(X);
}
std::vector<precision_t> linspace(precision_t start, precision_t end, int num)
{
if (start == end) {
return { start, end };
}
precision_t delta = (end - start) / static_cast<precision_t>(num - 1);
std::vector<precision_t> linspc;
for (size_t i = 0; i < num; ++i) {
precision_t val = start + delta * static_cast<precision_t>(i);
linspc.push_back(val);
}
return linspc;
}
size_t clip(const size_t n, const size_t lower, const size_t upper)
{
return std::max(lower, std::min(n, upper));
}
std::vector<precision_t> percentile(samples_t& data, const std::vector<precision_t>& percentiles)
{
// Implementation taken from https://dpilger26.github.io/NumCpp/doxygen/html/percentile_8hpp_source.html
std::vector<precision_t> results;
bool first = true;
results.reserve(percentiles.size());
for (auto percentile : percentiles) {
const auto i = static_cast<size_t>(std::floor(static_cast<precision_t>(data.size() - 1) * percentile / 100.));
const auto indexLower = clip(i, 0, data.size() - 2);
const precision_t percentI = static_cast<precision_t>(indexLower) / static_cast<precision_t>(data.size() - 1);
const precision_t fraction =
(percentile / 100.0 - percentI) /
(static_cast<precision_t>(indexLower + 1) / static_cast<precision_t>(data.size() - 1) - percentI);
if (const auto value = data[indexLower] + (data[indexLower + 1] - data[indexLower]) * fraction; value != results.back() || first) // first needed as results.back() return is undefined for empty vectors
results.push_back(value);
first = false;
}
return results;
}
void BinDisc::fit_quantile(const samples_t& X)
{
auto quantiles = linspace(0.0, 100.0, n_bins + 1);
auto data = X;
std::sort(data.begin(), data.end());
if (data.front() == data.back() || data.size() == 1) {
// if X is constant, pass any two given points that shall be ignored in transform
cutPoints.push_back(data.front());
cutPoints.push_back(data.front());
return;
}
cutPoints = percentile(data, quantiles);
}
void BinDisc::fit_uniform(const samples_t& X)
{
auto [vmin, vmax] = std::minmax_element(X.begin(), X.end());
cutPoints = linspace(*vmin, *vmax, n_bins + 1);
}
}

33
src/BinDisc.h Normal file
View File

@@ -0,0 +1,33 @@
// ****************************************************************
// SPDX - FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
// SPDX - FileType: SOURCE
// SPDX - License - Identifier: MIT
// ****************************************************************
#ifndef BINDISC_H
#define BINDISC_H
#include "typesFImdlp.h"
#include "Discretizer.h"
#include <string>
namespace mdlp {
enum class strategy_t {
UNIFORM,
QUANTILE
};
class BinDisc : public Discretizer {
public:
BinDisc(int n_bins = 3, strategy_t strategy = strategy_t::UNIFORM);
~BinDisc();
// y is included for compatibility with the Discretizer interface
void fit(samples_t& X_, labels_t& y) override;
void fit(samples_t& X);
private:
void fit_uniform(const samples_t&);
void fit_quantile(const samples_t&);
int n_bins;
strategy_t strategy;
};
}
#endif

221
src/CPPFImdlp.cpp Normal file
View File

@@ -0,0 +1,221 @@
// ****************************************************************
// SPDX - FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
// SPDX - FileType: SOURCE
// SPDX - License - Identifier: MIT
// ****************************************************************
#include <numeric>
#include <algorithm>
#include <set>
#include <cmath>
#include "CPPFImdlp.h"
namespace mdlp {
CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed) :
Discretizer(),
min_length(min_length_),
max_depth(max_depth_),
proposed_cuts(proposed)
{
direction = bound_dir_t::RIGHT;
}
size_t CPPFImdlp::compute_max_num_cut_points() const
{
// Set the actual maximum number of cut points as a number or as a percentage of the number of samples
if (proposed_cuts == 0) {
return numeric_limits<size_t>::max();
}
if (proposed_cuts < 0 || proposed_cuts > static_cast<precision_t>(X.size())) {
throw invalid_argument("wrong proposed num_cuts value");
}
if (proposed_cuts < 1)
return static_cast<size_t>(round(static_cast<precision_t>(X.size()) * proposed_cuts));
return static_cast<size_t>(proposed_cuts); // The 2 extra cutpoints should not be considered here as this parameter is considered before they are added
}
void CPPFImdlp::fit(samples_t& X_, labels_t& y_)
{
X = X_;
y = y_;
num_cut_points = compute_max_num_cut_points();
depth = 0;
discretizedData.clear();
cutPoints.clear();
if (X.size() != y.size()) {
throw invalid_argument("X and y must have the same size");
}
if (X.empty() || y.empty()) {
throw invalid_argument("X and y must have at least one element");
}
if (min_length < 3) {
throw invalid_argument("min_length must be greater than 2");
}
if (max_depth < 1) {
throw invalid_argument("max_depth must be greater than 0");
}
indices = sortIndices(X_, y_);
metrics.setData(y, indices);
computeCutPoints(0, X.size(), 1);
sort(cutPoints.begin(), cutPoints.end());
if (num_cut_points > 0) {
// Select the best (with lower entropy) cut points
while (cutPoints.size() > num_cut_points) {
resizeCutPoints();
}
}
// Insert first & last X value to the cutpoints as them shall be ignored in transform
auto [vmin, vmax] = std::minmax_element(X.begin(), X.end());
cutPoints.push_back(*vmax);
cutPoints.insert(cutPoints.begin(), *vmin);
}
pair<precision_t, size_t> CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end)
{
size_t n;
size_t m;
size_t idxPrev = cut - 1 >= start ? cut - 1 : cut;
size_t idxNext = cut + 1 < end ? cut + 1 : cut;
bool backWall; // true if duplicates reach beginning of the interval
precision_t previous;
precision_t actual;
precision_t next;
previous = X[indices[idxPrev]];
actual = X[indices[cut]];
next = X[indices[idxNext]];
// definition 2 of the paper => X[t-1] < X[t]
// get the first equal value of X in the interval
while (idxPrev > start && actual == previous) {
previous = X[indices[--idxPrev]];
}
backWall = idxPrev == start && actual == previous;
// get the last equal value of X in the interval
while (idxNext < end - 1 && actual == next) {
next = X[indices[++idxNext]];
}
// # of duplicates before cutpoint
n = cut - 1 - idxPrev;
// # of duplicates after cutpoint
m = idxNext - cut - 1;
// Decide which values to use
cut = cut + (backWall ? m + 1 : -n);
actual = X[indices[cut]];
return { (actual + previous) / 2, cut };
}
void CPPFImdlp::computeCutPoints(size_t start, size_t end, int depth_)
{
size_t cut;
pair<precision_t, size_t> result;
// Check if the interval length and the depth are Ok
if (end - start < min_length || depth_ > max_depth)
return;
depth = depth_ > depth ? depth_ : depth;
cut = getCandidate(start, end);
if (cut == numeric_limits<size_t>::max())
return;
if (mdlp(start, cut, end)) {
result = valueCutPoint(start, cut, end);
cut = result.second;
cutPoints.push_back(result.first);
computeCutPoints(start, cut, depth_ + 1);
computeCutPoints(cut, end, depth_ + 1);
}
}
size_t CPPFImdlp::getCandidate(size_t start, size_t end)
{
/* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which
E(A, TA; S) is minimal amongst all the candidate cut points. */
size_t candidate = numeric_limits<size_t>::max();
size_t elements = end - start;
bool sameValues = true;
precision_t entropy_left;
precision_t entropy_right;
precision_t minEntropy;
// Check if all the values of the variable in the interval are the same
for (size_t idx = start + 1; idx < end; idx++) {
if (X[indices[idx]] != X[indices[start]]) {
sameValues = false;
break;
}
}
if (sameValues)
return candidate;
minEntropy = metrics.entropy(start, end);
for (size_t idx = start + 1; idx < end; idx++) {
// Cutpoints are always on boundaries (definition 2)
if (y[indices[idx]] == y[indices[idx - 1]])
continue;
entropy_left = precision_t(idx - start) / static_cast<precision_t>(elements) * metrics.entropy(start, idx);
entropy_right = precision_t(end - idx) / static_cast<precision_t>(elements) * metrics.entropy(idx, end);
if (entropy_left + entropy_right < minEntropy) {
minEntropy = entropy_left + entropy_right;
candidate = idx;
}
}
return candidate;
}
bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end)
{
int k;
int k1;
int k2;
precision_t ig;
precision_t delta;
precision_t ent;
precision_t ent1;
precision_t ent2;
auto N = precision_t(end - start);
k = metrics.computeNumClasses(start, end);
k1 = metrics.computeNumClasses(start, cut);
k2 = metrics.computeNumClasses(cut, end);
ent = metrics.entropy(start, end);
ent1 = metrics.entropy(start, cut);
ent2 = metrics.entropy(cut, end);
ig = metrics.informationGain(start, cut, end);
delta = static_cast<precision_t>(log2(pow(3, precision_t(k)) - 2) -
(precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2));
precision_t term = 1 / N * (log2(N - 1) + delta);
return ig > term;
}
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_)
{
indices_t idx(X_.size());
std::iota(idx.begin(), idx.end(), 0);
stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) {
if (X_[i1] == X_[i2])
return y_[i1] < y_[i2];
else
return X_[i1] < X_[i2];
});
return idx;
}
void CPPFImdlp::resizeCutPoints()
{
//Compute entropy of each of the whole cutpoint set and discards the biggest value
precision_t maxEntropy = 0;
precision_t entropy;
size_t maxEntropyIdx = 0;
size_t begin = 0;
size_t end;
for (size_t idx = 0; idx < cutPoints.size(); idx++) {
end = begin;
while (X[indices[end]] < cutPoints[idx] && end < X.size())
end++;
entropy = metrics.entropy(begin, end);
if (entropy > maxEntropy) {
maxEntropy = entropy;
maxEntropyIdx = idx;
}
begin = end;
}
cutPoints.erase(cutPoints.begin() + static_cast<long>(maxEntropyIdx));
}
}

44
src/CPPFImdlp.h Normal file
View File

@@ -0,0 +1,44 @@
// ****************************************************************
// SPDX - FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
// SPDX - FileType: SOURCE
// SPDX - License - Identifier: MIT
// ****************************************************************
#ifndef CPPFIMDLP_H
#define CPPFIMDLP_H
#include "typesFImdlp.h"
#include <limits>
#include <utility>
#include <string>
#include "Metrics.h"
#include "Discretizer.h"
namespace mdlp {
class CPPFImdlp : public Discretizer {
public:
CPPFImdlp() = default;
CPPFImdlp(size_t min_length_, int max_depth_, float proposed);
virtual ~CPPFImdlp() = default;
void fit(samples_t& X_, labels_t& y_) override;
inline int get_depth() const { return depth; };
protected:
size_t min_length = 3;
int depth = 0;
int max_depth = numeric_limits<int>::max();
float proposed_cuts = 0;
indices_t indices = indices_t();
samples_t X = samples_t();
labels_t y = labels_t();
Metrics metrics = Metrics(y, indices);
size_t num_cut_points = numeric_limits<size_t>::max();
static indices_t sortIndices(samples_t&, labels_t&);
void computeCutPoints(size_t, size_t, int);
void resizeCutPoints();
bool mdlp(size_t, size_t, size_t);
size_t getCandidate(size_t, size_t);
size_t compute_max_num_cut_points() const;
pair<precision_t, size_t> valueCutPoint(size_t, size_t, size_t);
};
}
#endif

54
src/Discretizer.cpp Normal file
View File

@@ -0,0 +1,54 @@
// ****************************************************************
// SPDX - FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
// SPDX - FileType: SOURCE
// SPDX - License - Identifier: MIT
// ****************************************************************
#include "Discretizer.h"
namespace mdlp {
labels_t& Discretizer::transform(const samples_t& data)
{
discretizedData.clear();
discretizedData.reserve(data.size());
// CutPoints always have at least two items
// Have to ignore first and last cut points provided
auto first = cutPoints.begin() + 1;
auto last = cutPoints.end() - 1;
auto bound = direction == bound_dir_t::LEFT ? std::lower_bound<std::vector<precision_t>::iterator, precision_t> : std::upper_bound<std::vector<precision_t>::iterator, precision_t>;
for (const precision_t& item : data) {
auto pos = bound(first, last, item);
auto number = pos - first;
discretizedData.push_back(static_cast<label_t>(number));
}
return discretizedData;
}
labels_t& Discretizer::fit_transform(samples_t& X_, labels_t& y_)
{
fit(X_, y_);
return transform(X_);
}
void Discretizer::fit_t(const torch::Tensor& X_, const torch::Tensor& y_)
{
auto num_elements = X_.numel();
samples_t X(X_.data_ptr<precision_t>(), X_.data_ptr<precision_t>() + num_elements);
labels_t y(y_.data_ptr<int>(), y_.data_ptr<int>() + num_elements);
fit(X, y);
}
torch::Tensor Discretizer::transform_t(const torch::Tensor& X_)
{
auto num_elements = X_.numel();
samples_t X(X_.data_ptr<precision_t>(), X_.data_ptr<precision_t>() + num_elements);
auto result = transform(X);
return torch::tensor(result, torch_label_t);
}
torch::Tensor Discretizer::fit_transform_t(const torch::Tensor& X_, const torch::Tensor& y_)
{
auto num_elements = X_.numel();
samples_t X(X_.data_ptr<precision_t>(), X_.data_ptr<precision_t>() + num_elements);
labels_t y(y_.data_ptr<int>(), y_.data_ptr<int>() + num_elements);
auto result = fit_transform(X, y);
return torch::tensor(result, torch_label_t);
}
}

39
src/Discretizer.h Normal file
View File

@@ -0,0 +1,39 @@
// ****************************************************************
// SPDX - FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
// SPDX - FileType: SOURCE
// SPDX - License - Identifier: MIT
// ****************************************************************
#ifndef DISCRETIZER_H
#define DISCRETIZER_H
#include <string>
#include <algorithm>
#include "typesFImdlp.h"
#include <torch/torch.h>
namespace mdlp {
enum class bound_dir_t {
LEFT,
RIGHT
};
const auto torch_label_t = torch::kInt32;
class Discretizer {
public:
Discretizer() = default;
virtual ~Discretizer() = default;
inline cutPoints_t getCutPoints() const { return cutPoints; };
virtual void fit(samples_t& X_, labels_t& y_) = 0;
labels_t& transform(const samples_t& data);
labels_t& fit_transform(samples_t& X_, labels_t& y_);
void fit_t(const torch::Tensor& X_, const torch::Tensor& y_);
torch::Tensor transform_t(const torch::Tensor& X_);
torch::Tensor fit_transform_t(const torch::Tensor& X_, const torch::Tensor& y_);
static inline std::string version() { return "1.2.3"; };
protected:
labels_t discretizedData = labels_t();
cutPoints_t cutPoints; // At least two cutpoints must be provided, the first and the last will be ignored in transform
bound_dir_t direction; // used in transform
};
}
#endif

84
src/Metrics.cpp Normal file
View File

@@ -0,0 +1,84 @@
// ****************************************************************
// SPDX - FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
// SPDX - FileType: SOURCE
// SPDX - License - Identifier: MIT
// ****************************************************************
#include "Metrics.h"
#include <set>
#include <cmath>
using namespace std;
namespace mdlp {
Metrics::Metrics(labels_t& y_, indices_t& indices_) : y(y_), indices(indices_),
numClasses(computeNumClasses(0, indices_.size()))
{
}
int Metrics::computeNumClasses(size_t start, size_t end)
{
set<int> nClasses;
for (auto i = start; i < end; ++i) {
nClasses.insert(y[indices[i]]);
}
return static_cast<int>(nClasses.size());
}
void Metrics::setData(const labels_t& y_, const indices_t& indices_)
{
indices = indices_;
y = y_;
numClasses = computeNumClasses(0, indices.size());
entropyCache.clear();
igCache.clear();
}
precision_t Metrics::entropy(size_t start, size_t end)
{
precision_t p;
precision_t ventropy = 0;
int nElements = 0;
labels_t counts(numClasses + 1, 0);
if (end - start < 2)
return 0;
if (entropyCache.find({ start, end }) != entropyCache.end()) {
return entropyCache[{start, end}];
}
for (auto i = &indices[start]; i != &indices[end]; ++i) {
counts[y[*i]]++;
nElements++;
}
for (auto count : counts) {
if (count > 0) {
p = static_cast<precision_t>(count) / static_cast<precision_t>(nElements);
ventropy -= p * log2(p);
}
}
entropyCache[{start, end}] = ventropy;
return ventropy;
}
precision_t Metrics::informationGain(size_t start, size_t cut, size_t end)
{
precision_t iGain;
precision_t entropyInterval;
precision_t entropyLeft;
precision_t entropyRight;
size_t nElementsLeft = cut - start;
size_t nElementsRight = end - cut;
size_t nElements = end - start;
if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) {
return igCache[make_tuple(start, cut, end)];
}
entropyInterval = entropy(start, end);
entropyLeft = entropy(start, cut);
entropyRight = entropy(cut, end);
iGain = entropyInterval -
(static_cast<precision_t>(nElementsLeft) * entropyLeft +
static_cast<precision_t>(nElementsRight) * entropyRight) /
static_cast<precision_t>(nElements);
igCache[make_tuple(start, cut, end)] = iGain;
return iGain;
}
}

28
src/Metrics.h Normal file
View File

@@ -0,0 +1,28 @@
// ****************************************************************
// SPDX - FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
// SPDX - FileType: SOURCE
// SPDX - License - Identifier: MIT
// ****************************************************************
#ifndef CCMETRICS_H
#define CCMETRICS_H
#include "typesFImdlp.h"
namespace mdlp {
class Metrics {
protected:
labels_t& y;
indices_t& indices;
int numClasses;
cacheEnt_t entropyCache = cacheEnt_t();
cacheIg_t igCache = cacheIg_t();
public:
Metrics(labels_t&, indices_t&);
void setData(const labels_t&, const indices_t&);
int computeNumClasses(size_t, size_t);
precision_t entropy(size_t, size_t);
precision_t informationGain(size_t, size_t, size_t);
};
}
#endif

19
src/typesFImdlp.h Normal file
View File

@@ -0,0 +1,19 @@
#ifndef TYPES_H
#define TYPES_H
#include <vector>
#include <map>
#include <stdexcept>
using namespace std;
namespace mdlp {
typedef float precision_t;
typedef int label_t;
typedef std::vector<precision_t> samples_t;
typedef std::vector<label_t> labels_t;
typedef std::vector<size_t> indices_t;
typedef std::vector<precision_t> cutPoints_t;
typedef std::map<std::pair<int, int>, precision_t> cacheEnt_t;
typedef std::map<std::tuple<int, int, int>, precision_t> cacheIg_t;
}
#endif