Implement BinDisc and tests

This commit is contained in:
2024-06-05 10:45:11 +02:00
parent 236d1b2f8b
commit 6b68a41c42
11 changed files with 954 additions and 29 deletions

2
.gitignore vendored
View File

@@ -31,6 +31,8 @@
*.out
*.app
**/build
build_Debug
build_Release
**/lcoverage
.idea
cmake-*

138
BinDisc.cpp Normal file
View File

@@ -0,0 +1,138 @@
#include <algorithm>
#include <limits>
#include <cmath>
#include "BinDisc.h"
#include <iostream>
#include <string>
namespace mdlp {
BinDisc::BinDisc(int n_bins, strategy_t strategy) : n_bins{ n_bins }, strategy{ strategy }
{
if (n_bins < 3) {
throw std::invalid_argument("n_bins must be greater than 2");
}
}
BinDisc::~BinDisc() = default;
void BinDisc::fit(samples_t& X)
{
cutPoints.clear();
if (X.empty()) {
cutPoints.push_back(std::numeric_limits<precision_t>::max());
return;
}
if (strategy == strategy_t::QUANTILE) {
fit_quantile(X);
} else if (strategy == strategy_t::UNIFORM) {
fit_uniform(X);
}
}
std::vector<precision_t> linspace(precision_t start, precision_t end, int num)
{
// Doesn't include end point as it is not needed
if (start == end) {
return { 0 };
}
precision_t delta = (end - start) / static_cast<precision_t>(num - 1);
std::vector<precision_t> linspc;
for (size_t i = 0; i < num - 1; ++i) {
precision_t val = start + delta * static_cast<precision_t>(i);
linspc.push_back(val);
}
return linspc;
}
size_t clip(const size_t n, size_t lower, size_t upper)
{
return std::max(lower, std::min(n, upper));
}
std::vector<precision_t> percentile(samples_t& data, std::vector<precision_t>& percentiles)
{
// Implementation taken from https://dpilger26.github.io/NumCpp/doxygen/html/percentile_8hpp_source.html
std::vector<precision_t> results;
results.reserve(percentiles.size());
for (auto percentile : percentiles) {
const size_t i = static_cast<size_t>(std::floor(static_cast<double>(data.size() - 1) * percentile / 100.));
const auto indexLower = clip(i, 0, data.size() - 1);
const double percentI = static_cast<double>(indexLower) / static_cast<double>(data.size() - 1);
const double fraction =
(percentile / 100.0 - percentI) /
(static_cast<double>(indexLower + 1) / static_cast<double>(data.size() - 1) - percentI);
const auto value = data[indexLower] + (data[indexLower + 1] - data[indexLower]) * fraction;
if (value != results.back())
results.push_back(value);
}
return results;
}
void BinDisc::fit_quantile(samples_t& X)
{
auto quantiles = linspace(0.0, 100.0, n_bins + 1);
auto data = X;
std::sort(data.begin(), data.end());
if (data.front() == data.back() || data.size() == 1) {
// if X is constant
cutPoints.push_back(std::numeric_limits<precision_t>::max());
return;
}
cutPoints = percentile(data, quantiles);
normalizeCutPoints();
}
void BinDisc::fit_uniform(samples_t& X)
{
auto minmax = std::minmax_element(X.begin(), X.end());
cutPoints = linspace(*minmax.first, *minmax.second, n_bins + 1);
normalizeCutPoints();
}
void BinDisc::normalizeCutPoints()
{
// Add max value to the end
cutPoints.push_back(std::numeric_limits<precision_t>::max());
// Remove first as it is not needed
cutPoints.erase(cutPoints.begin());
}
labels_t& BinDisc::transform(const samples_t& X)
{
discretizedData.clear();
discretizedData.reserve(X.size());
for (const precision_t& item : X) {
auto upper = std::upper_bound(cutPoints.begin(), cutPoints.end(), item);
discretizedData.push_back(upper - cutPoints.begin());
}
return discretizedData;
}
}
// void BinDisc::fit_quantile(samples_t& X)
// {
// cutPoints.clear();
// if (X.empty()) {
// cutPoints.push_back(std::numeric_limits<float>::max());
// return;
// }
// samples_t data = X;
// std::sort(data.begin(), data.end());
// float min_val = data.front();
// float max_val = data.back();
// // Handle case of all data points having the same value
// if (min_val == max_val) {
// cutPoints.push_back(std::numeric_limits<float>::max());
// return;
// }
// int first = X.size() / n_bins;
// cutPoints.push_back(data.at(first - 1));
// int bins_done = 1;
// int prev = first - 1;
// while (bins_done < n_bins) {
// int next = first * (bins_done + 1) - 1;
// while (next < X.size() && data.at(next) == data[prev]) {
// ++next;
// }
// if (next == X.size() || bins_done == n_bins - 1) {
// cutPoints.push_back(std::numeric_limits<float>::max());
// break;
// } else {
// cutPoints.push_back(data[next]);
// bins_done++;
// prev = next;
// }
// }
// }

31
BinDisc.h Normal file
View File

@@ -0,0 +1,31 @@
#ifndef BINDISC_H
#define BINDISC_H
#include "typesFImdlp.h"
#include <string>
namespace mdlp {
enum class strategy_t {
UNIFORM,
QUANTILE
};
class BinDisc {
public:
BinDisc(int n_bins = 3, strategy_t strategy = strategy_t::UNIFORM);
~BinDisc();
void fit(samples_t&);
inline cutPoints_t getCutPoints() const { return cutPoints; };
labels_t& transform(const samples_t&);
static inline std::string version() { return "1.0.0"; };
private:
void fit_uniform(samples_t&);
void fit_quantile(samples_t&);
void normalizeCutPoints();
int n_bins;
strategy_t strategy;
labels_t discretizedData = labels_t();
cutPoints_t cutPoints;
};
}
#endif

View File

@@ -3,7 +3,6 @@
#include <set>
#include <cmath>
#include "CPPFImdlp.h"
#include "Metrics.h"
namespace mdlp {
@@ -178,7 +177,7 @@ namespace mdlp {
indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_)
{
indices_t idx(X_.size());
iota(idx.begin(), idx.end(), 0);
std::iota(idx.begin(), idx.end(), 0);
stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) {
if (X_[i1] == X_[i2])
return y_[i1] < y_[i2];
@@ -214,7 +213,7 @@ namespace mdlp {
discretizedData.clear();
discretizedData.reserve(data.size());
for (const precision_t& item : data) {
auto upper = upper_bound(cutPoints.begin(), cutPoints.end(), item);
auto upper = std::upper_bound(cutPoints.begin(), cutPoints.end(), item);
discretizedData.push_back(upper - cutPoints.begin());
}
return discretizedData;

View File

@@ -2,13 +2,22 @@
#define CPPFIMDLP_H
#include "typesFImdlp.h"
#include "Metrics.h"
#include <limits>
#include <utility>
#include <string>
#include "Metrics.h"
namespace mdlp {
class CPPFImdlp {
public:
CPPFImdlp();
CPPFImdlp(size_t, int, float);
~CPPFImdlp();
void fit(samples_t&, labels_t&);
inline cutPoints_t getCutPoints() const { return cutPoints; };
labels_t& transform(const samples_t&);
inline int get_depth() const { return depth; };
static inline std::string version() { return "1.1.3"; };
protected:
size_t min_length = 3;
int depth = 0;
@@ -21,25 +30,13 @@ namespace mdlp {
cutPoints_t cutPoints;
size_t num_cut_points = numeric_limits<size_t>::max();
labels_t discretizedData = labels_t();
static indices_t sortIndices(samples_t&, labels_t&);
void computeCutPoints(size_t, size_t, int);
void resizeCutPoints();
bool mdlp(size_t, size_t, size_t);
size_t getCandidate(size_t, size_t);
size_t compute_max_num_cut_points() const;
pair<precision_t, size_t> valueCutPoint(size_t, size_t, size_t);
public:
CPPFImdlp();
CPPFImdlp(size_t, int, float);
~CPPFImdlp();
void fit(samples_t&, labels_t&);
inline cutPoints_t getCutPoints() const { return cutPoints; };
labels_t& transform(const samples_t&);
inline int get_depth() const { return depth; };
static inline string version() { return "1.1.2"; };
};
}
#endif

View File

@@ -3,7 +3,7 @@ sonar.organization=rmontanana
# This is the name and version displayed in the SonarCloud UI.
sonar.projectName=mdlp
sonar.projectVersion=1.0.2
sonar.projectVersion=1.1.3
# sonar.test.exclusions=tests/**
# sonar.tests=tests/
# sonar.coverage.exclusions=tests/**,sample/**

351
tests/BinDisc_unittest.cpp Normal file
View File

@@ -0,0 +1,351 @@
#include <fstream>
#include <string>
#include <iostream>
#include "gtest/gtest.h"
#include "ArffFiles.h"
#include "../BinDisc.h"
namespace mdlp {
const float margin = 1e-4;
static std::string set_data_path()
{
std::string path = "../datasets/";
std::ifstream file(path + "iris.arff");
if (file.is_open()) {
file.close();
return path;
}
return "../../tests/datasets/";
}
const std::string data_path = set_data_path();
class TestBinDisc3U : public BinDisc, public testing::Test {
public:
TestBinDisc3U(int n_bins = 3) : BinDisc(n_bins, strategy_t::UNIFORM) {};
};
class TestBinDisc3Q : public BinDisc, public testing::Test {
public:
TestBinDisc3Q(int n_bins = 3) : BinDisc(n_bins, strategy_t::QUANTILE) {};
};
class TestBinDisc4U : public BinDisc, public testing::Test {
public:
TestBinDisc4U(int n_bins = 4) : BinDisc(n_bins, strategy_t::UNIFORM) {};
};
class TestBinDisc4Q : public BinDisc, public testing::Test {
public:
TestBinDisc4Q(int n_bins = 4) : BinDisc(n_bins, strategy_t::QUANTILE) {};
};
TEST_F(TestBinDisc3U, Easy3BinsUniform)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_NEAR(3.66667, cuts[0], margin);
EXPECT_NEAR(6.33333, cuts[1], margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
EXPECT_EQ(3, cuts.size());
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc3Q, Easy3BinsQuantile)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_NEAR(3.666667, cuts[0], margin);
EXPECT_NEAR(6.333333, cuts[1], margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
EXPECT_EQ(3, cuts.size());
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc3U, X10BinsUniform)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(4.0, cuts[0]);
EXPECT_EQ(7.0, cuts[1]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
EXPECT_EQ(3, cuts.size());
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc3Q, X10BinsQuantile)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(4, cuts[0]);
EXPECT_EQ(7, cuts[1]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
EXPECT_EQ(3, cuts.size());
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc3U, X11BinsUniform)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_NEAR(4.33333, cuts[0], margin);
EXPECT_NEAR(7.66667, cuts[1], margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
EXPECT_EQ(3, cuts.size());
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc3U, X11BinsQuantile)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_NEAR(4.33333, cuts[0], margin);
EXPECT_NEAR(7.66667, cuts[1], margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
EXPECT_EQ(3, cuts.size());
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc3U, ConstantUniform)
{
samples_t X = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
EXPECT_EQ(1, cuts.size());
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 0, 0 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc3Q, ConstantQuantile)
{
samples_t X = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
EXPECT_EQ(1, cuts.size());
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 0, 0 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc3U, EmptyUniform)
{
samples_t X = {};
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
EXPECT_EQ(1, cuts.size());
}
TEST_F(TestBinDisc3Q, EmptyQuantile)
{
samples_t X = {};
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
EXPECT_EQ(1, cuts.size());
}
TEST(TestBinDisc3, ExceptionNumberBins)
{
EXPECT_THROW(BinDisc(2), std::invalid_argument);
}
TEST_F(TestBinDisc3U, EasyRepeated)
{
samples_t X = { 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_NEAR(1.66667, cuts[0], margin);
EXPECT_NEAR(2.33333, cuts[1], margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
EXPECT_EQ(3, cuts.size());
auto labels = transform(X);
labels_t expected = { 2, 0, 0, 2, 0, 0, 2, 0, 0 };
EXPECT_EQ(expected, labels);
EXPECT_EQ(3.0, X[0]); // X is not modified
}
TEST_F(TestBinDisc3Q, EasyRepeated)
{
samples_t X = { 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0 };
fit(X);
auto cuts = getCutPoints();
std::cout << "cuts: ";
for (auto cut : cuts) {
std::cout << cut << " ";
}
std::cout << std::endl;
std::cout << std::string(80, '-') << std::endl;
EXPECT_NEAR(1.66667, cuts[0], margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[1]);
EXPECT_EQ(2, cuts.size());
auto labels = transform(X);
labels_t expected = { 1, 0, 0, 1, 0, 0, 1, 0, 0 };
EXPECT_EQ(expected, labels);
EXPECT_EQ(3.0, X[0]); // X is not modified
}
TEST_F(TestBinDisc4U, Easy4BinsUniform)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(3.75, cuts[0]);
EXPECT_EQ(6.5, cuts[1]);
EXPECT_EQ(9.25, cuts[2]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
EXPECT_EQ(4, cuts.size());
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc4Q, Easy4BinsQuantile)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(3.75, cuts[0]);
EXPECT_EQ(6.5, cuts[1]);
EXPECT_EQ(9.25, cuts[2]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
EXPECT_EQ(4, cuts.size());
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc4U, X13BinsUniform)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(4.0, cuts[0]);
EXPECT_EQ(7.0, cuts[1]);
EXPECT_EQ(10.0, cuts[2]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
EXPECT_EQ(4, cuts.size());
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc4Q, X13BinsQuantile)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(4.0, cuts[0]);
EXPECT_EQ(7.0, cuts[1]);
EXPECT_EQ(10.0, cuts[2]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
EXPECT_EQ(4, cuts.size());
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc4U, X14BinsUniform)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(4.25, cuts[0]);
EXPECT_EQ(7.5, cuts[1]);
EXPECT_EQ(10.75, cuts[2]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
EXPECT_EQ(4, cuts.size());
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc4Q, X14BinsQuantile)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(4.25, cuts[0]);
EXPECT_EQ(7.5, cuts[1]);
EXPECT_EQ(10.75, cuts[2]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
EXPECT_EQ(4, cuts.size());
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc4U, X15BinsUniform)
{
samples_t X = { 15.0, 8.0, 12.0, 14.0, 6.0, 1.0, 13.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(4.5, cuts[0]);
EXPECT_EQ(8, cuts[1]);
EXPECT_EQ(11.5, cuts[2]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
EXPECT_EQ(4, cuts.size());
auto labels = transform(X);
labels_t expected = { 3, 2, 3, 3, 1, 0, 3, 2, 2, 2, 1, 0, 0, 1, 0 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc4Q, X15BinsQuantile)
{
samples_t X = { 15.0, 13.0, 12.0, 14.0, 6.0, 1.0, 8.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(4.5, cuts[0]);
EXPECT_EQ(8, cuts[1]);
EXPECT_EQ(11.5, cuts[2]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
EXPECT_EQ(4, cuts.size());
auto labels = transform(X);
labels_t expected = { 3, 3, 3, 3, 1, 0, 2, 2, 2, 2, 1, 0, 0, 1, 0 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc4U, RepeatedValuesUniform)
{
samples_t X = { 0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0 };
// 0 1 2 3 4 5 6 7 8 9
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(1.0, cuts[0]);
EXPECT_EQ(2.0, cuts[1]);
EXPECT_EQ(3.0, cuts[2]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
EXPECT_EQ(4, cuts.size());
auto labels = transform(X);
labels_t expected = { 0, 1, 1, 1, 2, 2, 3, 3, 3, 3 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc4Q, RepeatedValuesQuantile)
{
samples_t X = { 0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0 };
// 0 1 2 3 4 5 6 7 8 9
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(2.0, cuts[0]);
EXPECT_EQ(3.0, cuts[1]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
EXPECT_EQ(3, cuts.size());
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 1, 1, 2, 2, 2, 2 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc4U, irisUniform)
{
ArffFiles file;
file.load(data_path + "iris.arff", true);
vector<samples_t>& X = file.getX();
fit(X[0]);
auto Xt = transform(X[0]);
labels_t expected = { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 0, 1, 2, 1, 3, 2, 2, 3, 0, 3, 2, 3, 2, 2, 2, 1, 1, 2, 2, 3, 3, 1, 2, 1, 3, 2, 2, 3, 2, 1, 2, 3, 3, 3, 2, 2, 1, 3, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 };
EXPECT_EQ(expected, Xt);
}
TEST_F(TestBinDisc4Q, irisQuantile)
{
ArffFiles file;
file.load(data_path + "iris.arff", true);
vector<samples_t>& X = file.getX();
fit(X[0]);
auto Xt = transform(X[0]);
labels_t expected = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 3, 3, 3, 1, 3, 1, 2, 0, 3, 1, 0, 2, 2, 2, 1, 3, 1, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 3, 2, 1, 1, 1, 2, 2, 1, 2, 3, 2, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2, 1, 1, 2, 2, 3, 2, 3, 3, 0, 3, 3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 3, 2, 3, 1, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 2 };
EXPECT_EQ(expected, Xt);
}
}

View File

@@ -1,3 +1,4 @@
cmake_minimum_required(VERSION 3.20)
set(CMAKE_CXX_STANDARD 11)
include(FetchContent)
@@ -16,14 +17,18 @@ enable_testing()
add_executable(Metrics_unittest ../Metrics.cpp Metrics_unittest.cpp)
add_executable(FImdlp_unittest ../CPPFImdlp.cpp ArffFiles.cpp ../Metrics.cpp FImdlp_unittest.cpp)
add_executable(BinDisc_unittest ../BinDisc.cpp ArffFiles.cpp BinDisc_unittest.cpp)
target_link_libraries(Metrics_unittest GTest::gtest_main)
target_link_libraries(FImdlp_unittest GTest::gtest_main)
target_link_libraries(BinDisc_unittest GTest::gtest_main)
target_compile_options(Metrics_unittest PRIVATE --coverage)
target_compile_options(FImdlp_unittest PRIVATE --coverage)
target_compile_options(BinDisc_unittest PRIVATE --coverage)
target_link_options(Metrics_unittest PRIVATE --coverage)
target_link_options(FImdlp_unittest PRIVATE --coverage)
target_link_options(BinDisc_unittest PRIVATE --coverage)
include(GoogleTest)
gtest_discover_tests(Metrics_unittest)
gtest_discover_tests(FImdlp_unittest)
gtest_discover_tests(BinDisc_unittest)

View File

@@ -1,3 +1,4 @@
#!/bin/bash
if [ -d build ] ; then
rm -fr build
fi
@@ -9,12 +10,9 @@ cmake --build build
cd build
ctest --output-on-failure
cd ..
if [ ! -d gcovr-report ] ; then
mkdir gcovr-report
fi
rm -fr gcovr-report/* 2>/dev/null
mkdir gcovr-report
#lcov --capture --directory ./ --output-file lcoverage/main_coverage.info
#lcov --remove lcoverage/main_coverage.info 'v1/*' '/Applications/*' '*/tests/*' --output-file lcoverage/main_coverage.info -q
#lcov --list lcoverage/main_coverage.info
cd ..
gcovr --gcov-filter "CPPFImdlp.cpp" --gcov-filter "Metrics.cpp" --txt --sonarqube=tests/gcovr-report/coverage.xml
gcovr --gcov-filter "CPPFImdlp.cpp" --gcov-filter "Metrics.cpp" --gcov-filter "BinDisc.cpp" --txt --sonarqube=tests/gcovr-report/coverage.xml --exclude-noncode-lines

404
tests/testKbins.py Normal file
View File

@@ -0,0 +1,404 @@
from scipy.io.arff import loadarff
from sklearn.preprocessing import KBinsDiscretizer
def test(clf, X, expected, title):
X = [[x] for x in X]
clf.fit(X)
computed = [int(x[0]) for x in clf.transform(X)]
print(f"{title}")
print(f"{computed=}")
print(f"{expected=}")
assert computed == expected
print("-" * 80)
# Test Uniform Strategy
clf3u = KBinsDiscretizer(
n_bins=3, encode="ordinal", strategy="uniform", subsample=200_000
)
clf3q = KBinsDiscretizer(
n_bins=3, encode="ordinal", strategy="quantile", subsample=200_000
)
clf4u = KBinsDiscretizer(
n_bins=4, encode="ordinal", strategy="uniform", subsample=200_000
)
clf4q = KBinsDiscretizer(
n_bins=4, encode="ordinal", strategy="quantile", subsample=200_000
)
#
X = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
labels = [0, 0, 0, 1, 1, 1, 2, 2, 2]
test(clf3u, X, labels, title="Easy3BinsUniform")
test(clf3q, X, labels, title="Easy3BinsQuantile")
#
X = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
labels = [0, 0, 0, 1, 1, 1, 2, 2, 2, 2]
# En C++ se obtiene el mismo resultado en ambos, no como aquí
labels2 = [0, 0, 0, 1, 1, 1, 1, 2, 2, 2]
test(clf3u, X, labels, title="X10BinsUniform")
test(clf3q, X, labels2, title="X10BinsQuantile")
#
X = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]
labels = [0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2]
# En C++ se obtiene el mismo resultado en ambos, no como aquí
# labels2 = [0, 0, 0, 1, 1, 1, 1, 2, 2, 2]
test(clf3u, X, labels, title="X11BinsUniform")
test(clf3q, X, labels, title="X11BinsQuantile")
#
X = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
labels = [0, 0, 0, 0, 0, 0]
test(clf3u, X, labels, title="ConstantUniform")
test(clf3q, X, labels, title="ConstantQuantile")
#
X = [3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0]
labels = [2, 0, 0, 2, 0, 0, 2, 0, 0]
labels2 = [1, 0, 0, 1, 0, 0, 1, 0, 0] # igual que en C++
test(clf3u, X, labels, title="EasyRepeatedUniform")
test(clf3q, X, labels2, title="EasyRepeatedQuantile")
#
X = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]
labels = [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
test(clf4u, X, labels, title="Easy4BinsUniform")
test(clf4q, X, labels, title="Easy4BinsQuantile")
#
X = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0]
labels = [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
test(clf4u, X, labels, title="X13BinsUniform")
test(clf4q, X, labels, title="X13BinsQuantile")
#
X = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0]
labels = [0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
test(clf4u, X, labels, title="X14BinsUniform")
test(clf4q, X, labels, title="X14BinsQuantile")
#
X1 = [15.0, 8.0, 12.0, 14.0, 6.0, 1.0, 13.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0]
X2 = [15.0, 13.0, 12.0, 14.0, 6.0, 1.0, 8.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0]
labels1 = [3, 2, 3, 3, 1, 0, 3, 2, 2, 2, 1, 0, 0, 1, 0]
labels2 = [3, 3, 3, 3, 1, 0, 2, 2, 2, 2, 1, 0, 0, 1, 0]
test(clf4u, X1, labels1, title="X15BinsUniform")
test(clf4q, X2, labels2, title="X15BinsQuantile")
#
X = [0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0]
labels = [0, 1, 1, 1, 2, 2, 3, 3, 3, 3]
test(clf4u, X, labels, title="RepeatedValuesUniform")
test(clf4q, X, labels, title="RepeatedValuesQuantile")
print(f"Uniform {clf4u.bin_edges_=}")
print(f"Quaintile {clf4q.bin_edges_=}")
print("-" * 80)
#
data, meta = loadarff("tests/datasets/iris.arff")
labelsu = [
0,
0,
0,
0,
0,
1,
0,
0,
0,
0,
1,
0,
0,
0,
1,
1,
1,
0,
1,
0,
1,
0,
0,
0,
0,
0,
0,
1,
1,
0,
0,
1,
1,
1,
0,
0,
1,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
1,
0,
3,
2,
2,
1,
2,
1,
2,
0,
2,
1,
0,
1,
1,
2,
1,
2,
1,
1,
2,
1,
1,
2,
2,
2,
2,
2,
2,
2,
1,
1,
1,
1,
1,
1,
1,
1,
2,
2,
1,
1,
1,
2,
1,
0,
1,
1,
1,
2,
0,
1,
2,
1,
3,
2,
2,
3,
0,
3,
2,
3,
2,
2,
2,
1,
1,
2,
2,
3,
3,
1,
2,
1,
3,
2,
2,
3,
2,
2,
2,
3,
3,
3,
2,
2,
2,
3,
2,
2,
1,
2,
2,
2,
1,
2,
2,
2,
2,
2,
2,
1,
]
labelsq = [
1,
0,
0,
0,
0,
1,
0,
0,
0,
0,
1,
0,
0,
0,
2,
1,
1,
1,
1,
1,
1,
1,
0,
1,
0,
0,
0,
1,
1,
0,
0,
1,
1,
1,
0,
0,
1,
0,
0,
1,
0,
0,
0,
0,
1,
0,
1,
0,
1,
0,
3,
3,
3,
1,
3,
1,
2,
0,
3,
1,
0,
2,
2,
2,
1,
3,
1,
2,
2,
1,
2,
2,
2,
2,
3,
3,
3,
3,
2,
1,
1,
1,
2,
2,
1,
2,
3,
2,
1,
1,
1,
2,
2,
0,
1,
1,
1,
2,
1,
1,
2,
2,
3,
2,
3,
3,
0,
3,
3,
3,
3,
3,
3,
1,
2,
3,
3,
3,
3,
2,
3,
1,
3,
2,
3,
3,
2,
2,
3,
3,
3,
3,
3,
2,
2,
3,
2,
3,
2,
3,
3,
3,
2,
3,
3,
3,
2,
3,
2,
2,
]
test(clf4u, data["sepallength"], labelsu, title="IrisUniform")
test(clf4q, data["sepallength"], labelsq, title="IrisQuantile")
# print("Labels")
# print(labels)
# print("Expected")
# print(expected)
# for i in range(len(labels)):
# if labels[i] != expected[i]:
# print(f"Error at {i} {labels[i]} != {expected[i]}")

View File

@@ -8,11 +8,11 @@
using namespace std;
namespace mdlp {
typedef float precision_t;
typedef vector<precision_t> samples_t;
typedef vector<int> labels_t;
typedef vector<size_t> indices_t;
typedef vector<precision_t> cutPoints_t;
typedef map<pair<int, int>, precision_t> cacheEnt_t;
typedef map<tuple<int, int, int>, precision_t> cacheIg_t;
typedef std::vector<precision_t> samples_t;
typedef std::vector<int> labels_t;
typedef std::vector<size_t> indices_t;
typedef std::vector<precision_t> cutPoints_t;
typedef std::map<std::pair<int, int>, precision_t> cacheEnt_t;
typedef std::map<std::tuple<int, int, int>, precision_t> cacheIg_t;
}
#endif