mirror of
https://github.com/rmontanana/mdlp.git
synced 2025-08-15 15:35:55 +00:00
Discretizer (#8)
* Add better check in testKBins.py * Add Discretizer base class for Both discretizers * Refactor order of constructors init
This commit is contained in:
committed by
GitHub
parent
f258fc220f
commit
638bb2a59e
11
.vscode/launch.json
vendored
11
.vscode/launch.json
vendored
@@ -8,15 +8,10 @@
|
||||
"name": "C++ Launch config",
|
||||
"type": "cppdbg",
|
||||
"request": "launch",
|
||||
"program": "${workspaceFolder}/build/sample/sample",
|
||||
"cwd": "${workspaceFolder}/build/sample",
|
||||
"args": [
|
||||
"-f",
|
||||
"glass"
|
||||
],
|
||||
"targetArchitecture": "arm64",
|
||||
"program": "${workspaceFolder}/tests/build/BinDisc_unittest",
|
||||
"cwd": "${workspaceFolder}/tests/build",
|
||||
"args": [],
|
||||
"launchCompleteCommand": "exec-run",
|
||||
"preLaunchTask": "CMake: build",
|
||||
"stopAtEntry": false,
|
||||
"linux": {
|
||||
"MIMode": "gdb",
|
||||
|
86
.vscode/settings.json
vendored
86
.vscode/settings.json
vendored
@@ -5,5 +5,89 @@
|
||||
},
|
||||
"C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools",
|
||||
"cmake.configureOnOpen": true,
|
||||
"sonarlint.pathToCompileCommands": "${workspaceFolder}/build/compile_commands.json"
|
||||
"sonarlint.pathToCompileCommands": "${workspaceFolder}/build/compile_commands.json",
|
||||
"files.associations": {
|
||||
"*.rmd": "markdown",
|
||||
"*.py": "python",
|
||||
"vector": "cpp",
|
||||
"__bit_reference": "cpp",
|
||||
"__bits": "cpp",
|
||||
"__config": "cpp",
|
||||
"__debug": "cpp",
|
||||
"__errc": "cpp",
|
||||
"__hash_table": "cpp",
|
||||
"__locale": "cpp",
|
||||
"__mutex_base": "cpp",
|
||||
"__node_handle": "cpp",
|
||||
"__nullptr": "cpp",
|
||||
"__split_buffer": "cpp",
|
||||
"__string": "cpp",
|
||||
"__threading_support": "cpp",
|
||||
"__tuple": "cpp",
|
||||
"array": "cpp",
|
||||
"atomic": "cpp",
|
||||
"bitset": "cpp",
|
||||
"cctype": "cpp",
|
||||
"chrono": "cpp",
|
||||
"clocale": "cpp",
|
||||
"cmath": "cpp",
|
||||
"compare": "cpp",
|
||||
"complex": "cpp",
|
||||
"concepts": "cpp",
|
||||
"cstdarg": "cpp",
|
||||
"cstddef": "cpp",
|
||||
"cstdint": "cpp",
|
||||
"cstdio": "cpp",
|
||||
"cstdlib": "cpp",
|
||||
"cstring": "cpp",
|
||||
"ctime": "cpp",
|
||||
"cwchar": "cpp",
|
||||
"cwctype": "cpp",
|
||||
"exception": "cpp",
|
||||
"initializer_list": "cpp",
|
||||
"ios": "cpp",
|
||||
"iosfwd": "cpp",
|
||||
"istream": "cpp",
|
||||
"limits": "cpp",
|
||||
"locale": "cpp",
|
||||
"memory": "cpp",
|
||||
"mutex": "cpp",
|
||||
"new": "cpp",
|
||||
"optional": "cpp",
|
||||
"ostream": "cpp",
|
||||
"ratio": "cpp",
|
||||
"sstream": "cpp",
|
||||
"stdexcept": "cpp",
|
||||
"streambuf": "cpp",
|
||||
"string": "cpp",
|
||||
"string_view": "cpp",
|
||||
"system_error": "cpp",
|
||||
"tuple": "cpp",
|
||||
"type_traits": "cpp",
|
||||
"typeinfo": "cpp",
|
||||
"unordered_map": "cpp",
|
||||
"variant": "cpp",
|
||||
"algorithm": "cpp",
|
||||
"iostream": "cpp",
|
||||
"iomanip": "cpp",
|
||||
"numeric": "cpp",
|
||||
"set": "cpp",
|
||||
"__tree": "cpp",
|
||||
"deque": "cpp",
|
||||
"list": "cpp",
|
||||
"map": "cpp",
|
||||
"unordered_set": "cpp",
|
||||
"any": "cpp",
|
||||
"condition_variable": "cpp",
|
||||
"forward_list": "cpp",
|
||||
"fstream": "cpp",
|
||||
"stack": "cpp",
|
||||
"thread": "cpp",
|
||||
"__memory": "cpp",
|
||||
"filesystem": "cpp",
|
||||
"*.toml": "toml",
|
||||
"utility": "cpp",
|
||||
"span": "cpp",
|
||||
"*.tcc": "cpp"
|
||||
}
|
||||
}
|
55
BinDisc.cpp
55
BinDisc.cpp
@@ -7,7 +7,8 @@
|
||||
|
||||
namespace mdlp {
|
||||
|
||||
BinDisc::BinDisc(int n_bins, strategy_t strategy) : n_bins{ n_bins }, strategy{ strategy }
|
||||
BinDisc::BinDisc(int n_bins, strategy_t strategy) :
|
||||
Discretizer(), n_bins{ n_bins }, strategy{ strategy }
|
||||
{
|
||||
if (n_bins < 3) {
|
||||
throw std::invalid_argument("n_bins must be greater than 2");
|
||||
@@ -16,6 +17,7 @@ namespace mdlp {
|
||||
BinDisc::~BinDisc() = default;
|
||||
void BinDisc::fit(samples_t& X)
|
||||
{
|
||||
// y is included for compatibility with the Discretizer interface
|
||||
cutPoints.clear();
|
||||
if (X.empty()) {
|
||||
cutPoints.push_back(std::numeric_limits<precision_t>::max());
|
||||
@@ -27,6 +29,10 @@ namespace mdlp {
|
||||
fit_uniform(X);
|
||||
}
|
||||
}
|
||||
// Overload matching the Discretizer interface. The labels in y are accepted
// only so the signature lines up with the base class; they are not used and
// the call is forwarded to the single-argument fit(X).
void BinDisc::fit(samples_t& X, labels_t& y)
{
    static_cast<void>(y); // labels play no role in this discretizer
    this->fit(X);
}
|
||||
std::vector<precision_t> linspace(precision_t start, precision_t end, int num)
|
||||
{
|
||||
// Doesn't include end point as it is not needed
|
||||
@@ -90,49 +96,4 @@ namespace mdlp {
|
||||
// Remove first as it is not needed
|
||||
cutPoints.erase(cutPoints.begin());
|
||||
}
|
||||
labels_t& BinDisc::transform(const samples_t& X)
|
||||
{
|
||||
discretizedData.clear();
|
||||
discretizedData.reserve(X.size());
|
||||
for (const precision_t& item : X) {
|
||||
auto upper = std::upper_bound(cutPoints.begin(), cutPoints.end(), item);
|
||||
discretizedData.push_back(upper - cutPoints.begin());
|
||||
}
|
||||
return discretizedData;
|
||||
}
|
||||
}
|
||||
// void BinDisc::fit_quantile(samples_t& X)
|
||||
// {
|
||||
// cutPoints.clear();
|
||||
// if (X.empty()) {
|
||||
// cutPoints.push_back(std::numeric_limits<float>::max());
|
||||
// return;
|
||||
// }
|
||||
// samples_t data = X;
|
||||
// std::sort(data.begin(), data.end());
|
||||
// float min_val = data.front();
|
||||
// float max_val = data.back();
|
||||
// // Handle case of all data points having the same value
|
||||
// if (min_val == max_val) {
|
||||
// cutPoints.push_back(std::numeric_limits<float>::max());
|
||||
// return;
|
||||
// }
|
||||
// int first = X.size() / n_bins;
|
||||
// cutPoints.push_back(data.at(first - 1));
|
||||
// int bins_done = 1;
|
||||
// int prev = first - 1;
|
||||
// while (bins_done < n_bins) {
|
||||
// int next = first * (bins_done + 1) - 1;
|
||||
// while (next < X.size() && data.at(next) == data[prev]) {
|
||||
// ++next;
|
||||
// }
|
||||
// if (next == X.size() || bins_done == n_bins - 1) {
|
||||
// cutPoints.push_back(std::numeric_limits<float>::max());
|
||||
// break;
|
||||
// } else {
|
||||
// cutPoints.push_back(data[next]);
|
||||
// bins_done++;
|
||||
// prev = next;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
}
|
12
BinDisc.h
12
BinDisc.h
@@ -2,6 +2,7 @@
|
||||
#define BINDISC_H
|
||||
|
||||
#include "typesFImdlp.h"
|
||||
#include "Discretizer.h"
|
||||
#include <string>
|
||||
|
||||
namespace mdlp {
|
||||
@@ -10,22 +11,19 @@ namespace mdlp {
|
||||
UNIFORM,
|
||||
QUANTILE
|
||||
};
|
||||
class BinDisc {
|
||||
class BinDisc : public Discretizer {
|
||||
public:
|
||||
BinDisc(int n_bins = 3, strategy_t strategy = strategy_t::UNIFORM);
|
||||
~BinDisc();
|
||||
void fit(samples_t&);
|
||||
inline cutPoints_t getCutPoints() const { return cutPoints; };
|
||||
labels_t& transform(const samples_t&);
|
||||
static inline std::string version() { return "1.0.0"; };
|
||||
// y is included for compatibility with the Discretizer interface
|
||||
void fit(samples_t& X_, labels_t& y) override;
|
||||
void fit(samples_t& X);
|
||||
private:
|
||||
void fit_uniform(samples_t&);
|
||||
void fit_quantile(samples_t&);
|
||||
void normalizeCutPoints();
|
||||
int n_bins;
|
||||
strategy_t strategy;
|
||||
labels_t discretizedData = labels_t();
|
||||
cutPoints_t cutPoints;
|
||||
};
|
||||
}
|
||||
#endif
|
||||
|
@@ -6,16 +6,14 @@
|
||||
|
||||
namespace mdlp {
|
||||
|
||||
CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed) : min_length(min_length_),
|
||||
CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed) :
|
||||
Discretizer(),
|
||||
min_length(min_length_),
|
||||
max_depth(max_depth_),
|
||||
proposed_cuts(proposed)
|
||||
{
|
||||
}
|
||||
|
||||
CPPFImdlp::CPPFImdlp() = default;
|
||||
|
||||
CPPFImdlp::~CPPFImdlp() = default;
|
||||
|
||||
size_t CPPFImdlp::compute_max_num_cut_points() const
|
||||
{
|
||||
// Set the actual maximum number of cut points as a number or as a percentage of the number of samples
|
||||
@@ -208,14 +206,5 @@ namespace mdlp {
|
||||
}
|
||||
cutPoints.erase(cutPoints.begin() + static_cast<long>(maxEntropyIdx));
|
||||
}
|
||||
labels_t& CPPFImdlp::transform(const samples_t& data)
|
||||
{
|
||||
discretizedData.clear();
|
||||
discretizedData.reserve(data.size());
|
||||
for (const precision_t& item : data) {
|
||||
auto upper = std::upper_bound(cutPoints.begin(), cutPoints.end(), item);
|
||||
discretizedData.push_back(upper - cutPoints.begin());
|
||||
}
|
||||
return discretizedData;
|
||||
}
|
||||
|
||||
}
|
||||
|
16
CPPFImdlp.h
16
CPPFImdlp.h
@@ -6,18 +6,16 @@
|
||||
#include <utility>
|
||||
#include <string>
|
||||
#include "Metrics.h"
|
||||
#include "Discretizer.h"
|
||||
|
||||
namespace mdlp {
|
||||
class CPPFImdlp {
|
||||
class CPPFImdlp : public Discretizer {
|
||||
public:
|
||||
CPPFImdlp();
|
||||
CPPFImdlp(size_t, int, float);
|
||||
~CPPFImdlp();
|
||||
void fit(samples_t&, labels_t&);
|
||||
inline cutPoints_t getCutPoints() const { return cutPoints; };
|
||||
labels_t& transform(const samples_t&);
|
||||
CPPFImdlp() = default;
|
||||
CPPFImdlp(size_t min_length_, int max_depth_, float proposed);
|
||||
virtual ~CPPFImdlp() = default;
|
||||
void fit(samples_t& X_, labels_t& y_) override;
|
||||
inline int get_depth() const { return depth; };
|
||||
static inline std::string version() { return "1.1.3"; };
|
||||
protected:
|
||||
size_t min_length = 3;
|
||||
int depth = 0;
|
||||
@@ -27,9 +25,7 @@ namespace mdlp {
|
||||
samples_t X = samples_t();
|
||||
labels_t y = labels_t();
|
||||
Metrics metrics = Metrics(y, indices);
|
||||
cutPoints_t cutPoints;
|
||||
size_t num_cut_points = numeric_limits<size_t>::max();
|
||||
labels_t discretizedData = labels_t();
|
||||
static indices_t sortIndices(samples_t&, labels_t&);
|
||||
void computeCutPoints(size_t, size_t, int);
|
||||
void resizeCutPoints();
|
||||
|
31
Discretizer.h
Normal file
31
Discretizer.h
Normal file
@@ -0,0 +1,31 @@
|
||||
#ifndef DISCRETIZER_H
|
||||
#define DISCRETIZER_H
|
||||
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include "typesFImdlp.h"
|
||||
|
||||
namespace mdlp {
|
||||
class Discretizer {
|
||||
public:
|
||||
Discretizer() = default;
|
||||
virtual ~Discretizer() = default;
|
||||
virtual void fit(samples_t& X_, labels_t& y_) = 0;
|
||||
inline cutPoints_t getCutPoints() const { return cutPoints; };
|
||||
labels_t& transform(const samples_t& data)
|
||||
{
|
||||
discretizedData.clear();
|
||||
discretizedData.reserve(data.size());
|
||||
for (const precision_t& item : data) {
|
||||
auto upper = std::upper_bound(cutPoints.begin(), cutPoints.end(), item);
|
||||
discretizedData.push_back(upper - cutPoints.begin());
|
||||
}
|
||||
return discretizedData;
|
||||
};
|
||||
static inline std::string version() { return "1.1.3"; };
|
||||
protected:
|
||||
labels_t discretizedData = labels_t();
|
||||
cutPoints_t cutPoints;
|
||||
};
|
||||
}
|
||||
#endif
|
@@ -37,12 +37,13 @@ namespace mdlp {
|
||||
TEST_F(TestBinDisc3U, Easy3BinsUniform)
|
||||
{
|
||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
|
||||
fit(X);
|
||||
auto y = labels_t();
|
||||
fit(X, y);
|
||||
auto cuts = getCutPoints();
|
||||
EXPECT_NEAR(3.66667, cuts[0], margin);
|
||||
EXPECT_NEAR(6.33333, cuts[1], margin);
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
|
||||
EXPECT_EQ(3, cuts.size());
|
||||
ASSERT_EQ(3, cuts.size());
|
||||
EXPECT_NEAR(3.66667, cuts.at(0), margin);
|
||||
EXPECT_NEAR(6.33333, cuts.at(1), margin);
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts.at(2));
|
||||
auto labels = transform(X);
|
||||
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2 };
|
||||
EXPECT_EQ(expected, labels);
|
||||
@@ -52,10 +53,10 @@ namespace mdlp {
|
||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
ASSERT_EQ(3, cuts.size());
|
||||
EXPECT_NEAR(3.666667, cuts[0], margin);
|
||||
EXPECT_NEAR(6.333333, cuts[1], margin);
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
|
||||
EXPECT_EQ(3, cuts.size());
|
||||
auto labels = transform(X);
|
||||
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2 };
|
||||
EXPECT_EQ(expected, labels);
|
||||
@@ -65,10 +66,10 @@ namespace mdlp {
|
||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 };
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
ASSERT_EQ(3, cuts.size());
|
||||
EXPECT_EQ(4.0, cuts[0]);
|
||||
EXPECT_EQ(7.0, cuts[1]);
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
|
||||
EXPECT_EQ(3, cuts.size());
|
||||
auto labels = transform(X);
|
||||
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
|
||||
EXPECT_EQ(expected, labels);
|
||||
@@ -78,10 +79,10 @@ namespace mdlp {
|
||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 };
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
ASSERT_EQ(3, cuts.size());
|
||||
EXPECT_EQ(4, cuts[0]);
|
||||
EXPECT_EQ(7, cuts[1]);
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
|
||||
EXPECT_EQ(3, cuts.size());
|
||||
auto labels = transform(X);
|
||||
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
|
||||
EXPECT_EQ(expected, labels);
|
||||
@@ -91,10 +92,10 @@ namespace mdlp {
|
||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 };
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
ASSERT_EQ(3, cuts.size());
|
||||
EXPECT_NEAR(4.33333, cuts[0], margin);
|
||||
EXPECT_NEAR(7.66667, cuts[1], margin);
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
|
||||
EXPECT_EQ(3, cuts.size());
|
||||
auto labels = transform(X);
|
||||
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
|
||||
EXPECT_EQ(expected, labels);
|
||||
@@ -104,10 +105,10 @@ namespace mdlp {
|
||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 };
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
ASSERT_EQ(3, cuts.size());
|
||||
EXPECT_NEAR(4.33333, cuts[0], margin);
|
||||
EXPECT_NEAR(7.66667, cuts[1], margin);
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
|
||||
EXPECT_EQ(3, cuts.size());
|
||||
auto labels = transform(X);
|
||||
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
|
||||
EXPECT_EQ(expected, labels);
|
||||
@@ -117,8 +118,8 @@ namespace mdlp {
|
||||
samples_t X = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
ASSERT_EQ(1, cuts.size());
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
|
||||
EXPECT_EQ(1, cuts.size());
|
||||
auto labels = transform(X);
|
||||
labels_t expected = { 0, 0, 0, 0, 0, 0 };
|
||||
EXPECT_EQ(expected, labels);
|
||||
@@ -128,8 +129,8 @@ namespace mdlp {
|
||||
samples_t X = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
|
||||
EXPECT_EQ(1, cuts.size());
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
|
||||
auto labels = transform(X);
|
||||
labels_t expected = { 0, 0, 0, 0, 0, 0 };
|
||||
EXPECT_EQ(expected, labels);
|
||||
@@ -139,16 +140,16 @@ namespace mdlp {
|
||||
samples_t X = {};
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
|
||||
EXPECT_EQ(1, cuts.size());
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
|
||||
}
|
||||
TEST_F(TestBinDisc3Q, EmptyQuantile)
|
||||
{
|
||||
samples_t X = {};
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
|
||||
EXPECT_EQ(1, cuts.size());
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
|
||||
}
|
||||
TEST(TestBinDisc3, ExceptionNumberBins)
|
||||
{
|
||||
@@ -159,44 +160,38 @@ namespace mdlp {
|
||||
samples_t X = { 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0 };
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
ASSERT_EQ(3, cuts.size());
|
||||
EXPECT_NEAR(1.66667, cuts[0], margin);
|
||||
EXPECT_NEAR(2.33333, cuts[1], margin);
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
|
||||
EXPECT_EQ(3, cuts.size());
|
||||
auto labels = transform(X);
|
||||
labels_t expected = { 2, 0, 0, 2, 0, 0, 2, 0, 0 };
|
||||
EXPECT_EQ(expected, labels);
|
||||
EXPECT_EQ(3.0, X[0]); // X is not modified
|
||||
ASSERT_EQ(3.0, X[0]); // X is not modified
|
||||
}
|
||||
TEST_F(TestBinDisc3Q, EasyRepeated)
|
||||
{
|
||||
samples_t X = { 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0 };
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
std::cout << "cuts: ";
|
||||
for (auto cut : cuts) {
|
||||
std::cout << cut << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
std::cout << std::string(80, '-') << std::endl;
|
||||
EXPECT_EQ(2, cuts.size());
|
||||
EXPECT_NEAR(1.66667, cuts[0], margin);
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[1]);
|
||||
EXPECT_EQ(2, cuts.size());
|
||||
auto labels = transform(X);
|
||||
labels_t expected = { 1, 0, 0, 1, 0, 0, 1, 0, 0 };
|
||||
EXPECT_EQ(expected, labels);
|
||||
EXPECT_EQ(3.0, X[0]); // X is not modified
|
||||
ASSERT_EQ(3.0, X[0]); // X is not modified
|
||||
}
|
||||
TEST_F(TestBinDisc4U, Easy4BinsUniform)
|
||||
{
|
||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 };
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
EXPECT_EQ(3.75, cuts[0]);
|
||||
EXPECT_EQ(4, cuts.size());
|
||||
ASSERT_EQ(3.75, cuts[0]);
|
||||
EXPECT_EQ(6.5, cuts[1]);
|
||||
EXPECT_EQ(9.25, cuts[2]);
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
|
||||
EXPECT_EQ(4, cuts.size());
|
||||
auto labels = transform(X);
|
||||
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 };
|
||||
EXPECT_EQ(expected, labels);
|
||||
@@ -206,11 +201,11 @@ namespace mdlp {
|
||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 };
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
EXPECT_EQ(3.75, cuts[0]);
|
||||
EXPECT_EQ(4, cuts.size());
|
||||
ASSERT_EQ(3.75, cuts[0]);
|
||||
EXPECT_EQ(6.5, cuts[1]);
|
||||
EXPECT_EQ(9.25, cuts[2]);
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
|
||||
EXPECT_EQ(4, cuts.size());
|
||||
auto labels = transform(X);
|
||||
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 };
|
||||
EXPECT_EQ(expected, labels);
|
||||
@@ -220,11 +215,11 @@ namespace mdlp {
|
||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0 };
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
EXPECT_EQ(4, cuts.size());
|
||||
EXPECT_EQ(4.0, cuts[0]);
|
||||
EXPECT_EQ(7.0, cuts[1]);
|
||||
EXPECT_EQ(10.0, cuts[2]);
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
|
||||
EXPECT_EQ(4, cuts.size());
|
||||
auto labels = transform(X);
|
||||
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
|
||||
EXPECT_EQ(expected, labels);
|
||||
@@ -234,11 +229,11 @@ namespace mdlp {
|
||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0 };
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
EXPECT_EQ(4, cuts.size());
|
||||
EXPECT_EQ(4.0, cuts[0]);
|
||||
EXPECT_EQ(7.0, cuts[1]);
|
||||
EXPECT_EQ(10.0, cuts[2]);
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
|
||||
EXPECT_EQ(4, cuts.size());
|
||||
auto labels = transform(X);
|
||||
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
|
||||
EXPECT_EQ(expected, labels);
|
||||
@@ -248,11 +243,11 @@ namespace mdlp {
|
||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 };
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
EXPECT_EQ(4, cuts.size());
|
||||
EXPECT_EQ(4.25, cuts[0]);
|
||||
EXPECT_EQ(7.5, cuts[1]);
|
||||
EXPECT_EQ(10.75, cuts[2]);
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
|
||||
EXPECT_EQ(4, cuts.size());
|
||||
auto labels = transform(X);
|
||||
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
|
||||
EXPECT_EQ(expected, labels);
|
||||
@@ -262,11 +257,11 @@ namespace mdlp {
|
||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 };
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
EXPECT_EQ(4, cuts.size());
|
||||
EXPECT_EQ(4.25, cuts[0]);
|
||||
EXPECT_EQ(7.5, cuts[1]);
|
||||
EXPECT_EQ(10.75, cuts[2]);
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
|
||||
EXPECT_EQ(4, cuts.size());
|
||||
auto labels = transform(X);
|
||||
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
|
||||
EXPECT_EQ(expected, labels);
|
||||
@@ -276,11 +271,11 @@ namespace mdlp {
|
||||
samples_t X = { 15.0, 8.0, 12.0, 14.0, 6.0, 1.0, 13.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0 };
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
EXPECT_EQ(4, cuts.size());
|
||||
EXPECT_EQ(4.5, cuts[0]);
|
||||
EXPECT_EQ(8, cuts[1]);
|
||||
EXPECT_EQ(11.5, cuts[2]);
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
|
||||
EXPECT_EQ(4, cuts.size());
|
||||
auto labels = transform(X);
|
||||
labels_t expected = { 3, 2, 3, 3, 1, 0, 3, 2, 2, 2, 1, 0, 0, 1, 0 };
|
||||
EXPECT_EQ(expected, labels);
|
||||
@@ -290,11 +285,11 @@ namespace mdlp {
|
||||
samples_t X = { 15.0, 13.0, 12.0, 14.0, 6.0, 1.0, 8.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0 };
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
EXPECT_EQ(4, cuts.size());
|
||||
EXPECT_EQ(4.5, cuts[0]);
|
||||
EXPECT_EQ(8, cuts[1]);
|
||||
EXPECT_EQ(11.5, cuts[2]);
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
|
||||
EXPECT_EQ(4, cuts.size());
|
||||
auto labels = transform(X);
|
||||
labels_t expected = { 3, 3, 3, 3, 1, 0, 2, 2, 2, 2, 1, 0, 0, 1, 0 };
|
||||
EXPECT_EQ(expected, labels);
|
||||
@@ -305,11 +300,11 @@ namespace mdlp {
|
||||
// 0 1 2 3 4 5 6 7 8 9
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
EXPECT_EQ(4, cuts.size());
|
||||
EXPECT_EQ(1.0, cuts[0]);
|
||||
EXPECT_EQ(2.0, cuts[1]);
|
||||
EXPECT_EQ(3.0, cuts[2]);
|
||||
ASSERT_EQ(3.0, cuts[2]);
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
|
||||
EXPECT_EQ(4, cuts.size());
|
||||
auto labels = transform(X);
|
||||
labels_t expected = { 0, 1, 1, 1, 2, 2, 3, 3, 3, 3 };
|
||||
EXPECT_EQ(expected, labels);
|
||||
@@ -320,10 +315,10 @@ namespace mdlp {
|
||||
// 0 1 2 3 4 5 6 7 8 9
|
||||
fit(X);
|
||||
auto cuts = getCutPoints();
|
||||
ASSERT_EQ(3, cuts.size());
|
||||
EXPECT_EQ(2.0, cuts[0]);
|
||||
EXPECT_EQ(3.0, cuts[1]);
|
||||
ASSERT_EQ(3.0, cuts[1]);
|
||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
|
||||
EXPECT_EQ(3, cuts.size());
|
||||
auto labels = transform(X);
|
||||
labels_t expected = { 0, 0, 0, 0, 1, 1, 2, 2, 2, 2 };
|
||||
EXPECT_EQ(expected, labels);
|
||||
|
@@ -16,19 +16,28 @@ FetchContent_MakeAvailable(googletest)
|
||||
enable_testing()
|
||||
|
||||
add_executable(Metrics_unittest ../Metrics.cpp Metrics_unittest.cpp)
|
||||
add_executable(FImdlp_unittest ../CPPFImdlp.cpp ArffFiles.cpp ../Metrics.cpp FImdlp_unittest.cpp)
|
||||
add_executable(BinDisc_unittest ../BinDisc.cpp ArffFiles.cpp BinDisc_unittest.cpp)
|
||||
target_link_libraries(Metrics_unittest GTest::gtest_main)
|
||||
target_link_libraries(FImdlp_unittest GTest::gtest_main)
|
||||
target_link_libraries(BinDisc_unittest GTest::gtest_main)
|
||||
target_compile_options(Metrics_unittest PRIVATE --coverage)
|
||||
target_compile_options(FImdlp_unittest PRIVATE --coverage)
|
||||
target_compile_options(BinDisc_unittest PRIVATE --coverage)
|
||||
target_link_options(Metrics_unittest PRIVATE --coverage)
|
||||
|
||||
add_executable(FImdlp_unittest ../CPPFImdlp.cpp ArffFiles.cpp ../Metrics.cpp FImdlp_unittest.cpp)
|
||||
target_link_libraries(FImdlp_unittest GTest::gtest_main)
|
||||
target_compile_options(FImdlp_unittest PRIVATE --coverage)
|
||||
target_link_options(FImdlp_unittest PRIVATE --coverage)
|
||||
|
||||
add_executable(BinDisc_unittest ../BinDisc.cpp ArffFiles.cpp BinDisc_unittest.cpp)
|
||||
target_link_libraries(BinDisc_unittest GTest::gtest_main)
|
||||
target_compile_options(BinDisc_unittest PRIVATE --coverage)
|
||||
target_link_options(BinDisc_unittest PRIVATE --coverage)
|
||||
|
||||
add_executable(Discretizer_unittest ../BinDisc.cpp ../CPPFImdlp.cpp ArffFiles.cpp ../Metrics.cpp Discretizer_unittest.cpp)
|
||||
target_link_libraries(Discretizer_unittest GTest::gtest_main)
|
||||
target_compile_options(Discretizer_unittest PRIVATE --coverage)
|
||||
target_link_options(Discretizer_unittest PRIVATE --coverage)
|
||||
|
||||
include(GoogleTest)
|
||||
|
||||
gtest_discover_tests(Metrics_unittest)
|
||||
gtest_discover_tests(FImdlp_unittest)
|
||||
gtest_discover_tests(BinDisc_unittest)
|
||||
gtest_discover_tests(BinDisc_unittest)
|
||||
gtest_discover_tests(Discretizer_unittest)
|
74
tests/Discretizer_unittest.cpp
Normal file
74
tests/Discretizer_unittest.cpp
Normal file
@@ -0,0 +1,74 @@
|
||||
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include "gtest/gtest.h"
#include "ArffFiles.h"
#include "../Discretizer.h"
#include "../BinDisc.h"
#include "../CPPFImdlp.h"
|
||||
|
||||
namespace mdlp {
|
||||
const float margin = 1e-4;
|
||||
// Picks the directory the test datasets are read from.
// Tries to open "../datasets/iris.arff"; when that succeeds "../datasets/"
// is returned, otherwise the function falls back to "../../tests/datasets/".
static std::string set_data_path()
{
    const std::string local_dir = "../datasets/";
    // The stream is only a probe; it is closed when it goes out of scope.
    const std::ifstream probe(local_dir + "iris.arff");
    return probe.is_open() ? local_dir : std::string("../../tests/datasets/");
}
|
||||
const std::string data_path = set_data_path();
|
||||
|
||||
// Uses a 4-bin uniform BinDisc through the Discretizer base interface on the
// feature at index 0 of iris.arff and compares the labels with the reference
// values below.
TEST(Discretizer, BinIrisUniform)
{
    ArffFiles file;
    // Owned by unique_ptr so the discretizer is released (through the virtual
    // base destructor) even if loading, fitting or transforming throws.
    std::unique_ptr<Discretizer> disc(new BinDisc(4, strategy_t::UNIFORM));
    file.load(data_path + "iris.arff", true);
    vector<samples_t>& X = file.getX();
    // An empty label vector is passed only to satisfy the fit(X, y) signature.
    auto y = labels_t();
    disc->fit(X[0], y);
    // transform() returns a reference to an internal buffer; auto makes a copy.
    auto Xt = disc->transform(X[0]);
    labels_t expected = { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 0, 1, 2, 1, 3, 2, 2, 3, 0, 3, 2, 3, 2, 2, 2, 1, 1, 2, 2, 3, 3, 1, 2, 1, 3, 2, 2, 3, 2, 1, 2, 3, 3, 3, 2, 2, 1, 3, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 };
    EXPECT_EQ(expected, Xt);
}
|
||||
// Uses a 4-bin quantile BinDisc through the Discretizer base interface on the
// feature at index 0 of iris.arff and compares the labels with the reference
// values below.
TEST(Discretizer, BinIrisQuantile)
{
    ArffFiles file;
    // Owned by unique_ptr so the discretizer is released (through the virtual
    // base destructor) even if loading, fitting or transforming throws.
    std::unique_ptr<Discretizer> disc(new BinDisc(4, strategy_t::QUANTILE));
    file.load(data_path + "iris.arff", true);
    vector<samples_t>& X = file.getX();
    // An empty label vector is passed only to satisfy the fit(X, y) signature.
    auto y = labels_t();
    disc->fit(X[0], y);
    // transform() returns a reference to an internal buffer; auto makes a copy.
    auto Xt = disc->transform(X[0]);
    labels_t expected = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 3, 3, 3, 1, 3, 1, 2, 0, 3, 1, 0, 2, 2, 2, 1, 3, 1, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 3, 2, 1, 1, 1, 2, 2, 1, 2, 3, 2, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2, 1, 1, 2, 2, 3, 2, 3, 3, 0, 3, 3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 3, 2, 3, 1, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 2 };
    EXPECT_EQ(expected, Xt);
}
|
||||
// Uses a default-constructed CPPFImdlp through the Discretizer base interface
// on the feature at index 1 of iris.arff (with the labels read from the file)
// and compares the result element by element with the reference values below.
TEST(Discretizer, FImdlpIris)
{
    labels_t expected = {
        5, 3, 4, 4, 5, 5, 5, 5, 2, 4, 5, 5, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 4, 5, 3, 5, 5, 5, 4, 4, 5, 5, 5, 4, 4, 5, 4, 3, 5, 5, 0, 4, 5,
        5, 3, 5, 4, 5, 4, 4, 4, 4, 0, 1, 1, 4, 0, 2, 0, 0, 3, 0, 2, 2, 4,
        3, 0, 0, 0, 4, 1, 0, 1, 2, 3, 1, 3, 2, 0, 0, 0, 0, 0, 3, 5, 4, 0,
        3, 0, 0, 3, 0, 0, 0, 3, 2, 2, 0, 1, 4, 0, 3, 2, 3, 3, 0, 2, 0, 5,
        4, 0, 3, 0, 1, 4, 3, 5, 0, 0, 4, 1, 1, 0, 4, 4, 1, 3, 1, 3, 1, 5,
        1, 1, 0, 3, 5, 4, 3, 4, 4, 4, 0, 4, 4, 3, 0, 3, 5, 3
    };
    ArffFiles file;
    // Owned by unique_ptr so the discretizer is released (through the virtual
    // base destructor) even if loading, fitting or transforming throws.
    std::unique_ptr<Discretizer> disc(new CPPFImdlp());
    file.load(data_path + "iris.arff", true);
    vector<samples_t>& X = file.getX();
    labels_t& y = file.getY();
    disc->fit(X[1], y);
    // transform() returns a reference to an internal buffer; auto makes a copy.
    auto computed = disc->transform(X[1]);
    // Fatal assertion: the loop below indexes both vectors up to
    // computed.size(), so continuing after a size mismatch could read past
    // the end of expected.
    ASSERT_EQ(computed.size(), expected.size());
    for (unsigned long i = 0; i < computed.size(); i++) {
        EXPECT_EQ(computed[i], expected[i]);
    }
}
|
||||
}
|
@@ -5,7 +5,7 @@ fi
|
||||
if [ -d gcovr-report ] ; then
|
||||
rm -fr gcovr-report
|
||||
fi
|
||||
cmake -S . -B build -Wno-dev
|
||||
cmake -S . -B build -Wno-dev -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS="--coverage" -DCMAKE_C_FLAGS="--coverage"
|
||||
cmake --build build
|
||||
cd build
|
||||
ctest --output-on-failure
|
||||
@@ -15,4 +15,4 @@ mkdir gcovr-report
|
||||
#lcov --remove lcoverage/main_coverage.info 'v1/*' '/Applications/*' '*/tests/*' --output-file lcoverage/main_coverage.info -q
|
||||
#lcov --list lcoverage/main_coverage.info
|
||||
cd ..
|
||||
gcovr --gcov-filter "CPPFImdlp.cpp" --gcov-filter "Metrics.cpp" --gcov-filter "BinDisc.cpp" --txt --sonarqube=tests/gcovr-report/coverage.xml --exclude-noncode-lines
|
||||
gcovr --gcov-filter "CPPFImdlp.cpp" --gcov-filter "Metrics.cpp" --gcov-filter "BinDisc.cpp" --gcov-filter "Discretizer.h" --txt --sonarqube=tests/gcovr-report/coverage.xml --exclude-noncode-lines
|
||||
|
@@ -89,6 +89,7 @@ print(f"Quaintile {clf4q.bin_edges_=}")
|
||||
print("-" * 80)
|
||||
#
|
||||
data, meta = loadarff("tests/datasets/iris.arff")
|
||||
|
||||
labelsu = [
|
||||
0,
|
||||
0,
|
||||
@@ -117,12 +118,12 @@ labelsu = [
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
1,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
1,
|
||||
1,
|
||||
0,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
@@ -149,11 +150,11 @@ labelsu = [
|
||||
2,
|
||||
0,
|
||||
2,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
1,
|
||||
1,
|
||||
2,
|
||||
1,
|
||||
1,
|
||||
2,
|
||||
1,
|
||||
@@ -161,9 +162,9 @@ labelsu = [
|
||||
2,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
1,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
@@ -181,7 +182,7 @@ labelsu = [
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
2,
|
||||
1,
|
||||
1,
|
||||
0,
|
||||
1,
|
||||
@@ -217,14 +218,14 @@ labelsu = [
|
||||
2,
|
||||
3,
|
||||
2,
|
||||
2,
|
||||
1,
|
||||
2,
|
||||
3,
|
||||
3,
|
||||
3,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
1,
|
||||
3,
|
||||
2,
|
||||
2,
|
||||
@@ -393,12 +394,19 @@ labelsq = [
|
||||
2,
|
||||
2,
|
||||
]
|
||||
test(clf4u, data["sepallength"], labelsu, title="IrisUniform")
|
||||
test(clf4q, data["sepallength"], labelsq, title="IrisQuantile")
|
||||
# print("Labels")
|
||||
# print(labels)
|
||||
# print("Expected")
|
||||
# print(expected)
|
||||
# for i in range(len(labels)):
|
||||
# if labels[i] != expected[i]:
|
||||
# print(f"Error at {i} {labels[i]} != {expected[i]}")
|
||||
# test(clf4u, data["sepallength"], labelsu, title="IrisUniform")
|
||||
# test(clf4q, data["sepallength"], labelsq, title="IrisQuantile")
|
||||
sepallength = [[x] for x in data["sepallength"]]
|
||||
clf4u.fit(sepallength)
|
||||
clf4q.fit(sepallength)
|
||||
computedu = clf4u.transform(sepallength)
|
||||
computedq = clf4q.transform(sepallength)
|
||||
wrongu = 0
|
||||
wrongq = 0
|
||||
for i in range(len(labelsu)):
|
||||
if labelsu[i] != computedu[i]:
|
||||
wrongu += 1
|
||||
if labelsq[i] != computedq[i]:
|
||||
wrongq += 1
|
||||
print(f"Iris sepallength diff. between BinDisc & sklearn::KBins Uniform ={wrongu:3d}")
|
||||
print(f"Iris sepallength diff. between BinDisc & sklearn::KBins Quantile ={wrongq:3d}")
|
||||
|
Reference in New Issue
Block a user