mirror of
https://github.com/rmontanana/mdlp.git
synced 2025-08-21 10:26:02 +00:00
Compare commits
2 Commits
v2.1.1
...
6f90516b3d
Author | SHA1 | Date | |
---|---|---|---|
|
6f90516b3d | ||
8f6e16f04f
|
2
.vscode/launch.json
vendored
2
.vscode/launch.json
vendored
@@ -8,7 +8,7 @@
|
|||||||
"name": "C++ Launch config",
|
"name": "C++ Launch config",
|
||||||
"type": "cppdbg",
|
"type": "cppdbg",
|
||||||
"request": "launch",
|
"request": "launch",
|
||||||
"program": "${workspaceFolder}/tests/build/Metrics_unittest",
|
"program": "${workspaceFolder}/tests/build/BinDisc_unittest",
|
||||||
"cwd": "${workspaceFolder}/tests/build",
|
"cwd": "${workspaceFolder}/tests/build",
|
||||||
"args": [],
|
"args": [],
|
||||||
"launchCompleteCommand": "exec-run",
|
"launchCompleteCommand": "exec-run",
|
||||||
|
28
BinDisc.cpp
28
BinDisc.cpp
@@ -1,5 +1,4 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <limits>
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include "BinDisc.h"
|
#include "BinDisc.h"
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
@@ -20,7 +19,8 @@ namespace mdlp {
|
|||||||
// y is included for compatibility with the Discretizer interface
|
// y is included for compatibility with the Discretizer interface
|
||||||
cutPoints.clear();
|
cutPoints.clear();
|
||||||
if (X.empty()) {
|
if (X.empty()) {
|
||||||
cutPoints.push_back(std::numeric_limits<precision_t>::max());
|
cutPoints.push_back(0.0);
|
||||||
|
cutPoints.push_back(0.0);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (strategy == strategy_t::QUANTILE) {
|
if (strategy == strategy_t::QUANTILE) {
|
||||||
@@ -35,13 +35,12 @@ namespace mdlp {
|
|||||||
}
|
}
|
||||||
std::vector<precision_t> linspace(precision_t start, precision_t end, int num)
|
std::vector<precision_t> linspace(precision_t start, precision_t end, int num)
|
||||||
{
|
{
|
||||||
// Doesn't include end point as it is not needed
|
|
||||||
if (start == end) {
|
if (start == end) {
|
||||||
return { 0 };
|
return { start, end };
|
||||||
}
|
}
|
||||||
precision_t delta = (end - start) / static_cast<precision_t>(num - 1);
|
precision_t delta = (end - start) / static_cast<precision_t>(num - 1);
|
||||||
std::vector<precision_t> linspc;
|
std::vector<precision_t> linspc;
|
||||||
for (size_t i = 0; i < num - 1; ++i) {
|
for (size_t i = 0; i < num; ++i) {
|
||||||
precision_t val = start + delta * static_cast<precision_t>(i);
|
precision_t val = start + delta * static_cast<precision_t>(i);
|
||||||
linspc.push_back(val);
|
linspc.push_back(val);
|
||||||
}
|
}
|
||||||
@@ -55,6 +54,7 @@ namespace mdlp {
|
|||||||
{
|
{
|
||||||
// Implementation taken from https://dpilger26.github.io/NumCpp/doxygen/html/percentile_8hpp_source.html
|
// Implementation taken from https://dpilger26.github.io/NumCpp/doxygen/html/percentile_8hpp_source.html
|
||||||
std::vector<precision_t> results;
|
std::vector<precision_t> results;
|
||||||
|
bool first = true;
|
||||||
results.reserve(percentiles.size());
|
results.reserve(percentiles.size());
|
||||||
for (auto percentile : percentiles) {
|
for (auto percentile : percentiles) {
|
||||||
const size_t i = static_cast<size_t>(std::floor(static_cast<double>(data.size() - 1) * percentile / 100.));
|
const size_t i = static_cast<size_t>(std::floor(static_cast<double>(data.size() - 1) * percentile / 100.));
|
||||||
@@ -64,8 +64,9 @@ namespace mdlp {
|
|||||||
(percentile / 100.0 - percentI) /
|
(percentile / 100.0 - percentI) /
|
||||||
(static_cast<double>(indexLower + 1) / static_cast<double>(data.size() - 1) - percentI);
|
(static_cast<double>(indexLower + 1) / static_cast<double>(data.size() - 1) - percentI);
|
||||||
const auto value = data[indexLower] + (data[indexLower + 1] - data[indexLower]) * fraction;
|
const auto value = data[indexLower] + (data[indexLower + 1] - data[indexLower]) * fraction;
|
||||||
if (value != results.back())
|
if (value != results.back() || first) // first needed as results.back() return is undefined for empty vectors
|
||||||
results.push_back(value);
|
results.push_back(value);
|
||||||
|
first = false;
|
||||||
}
|
}
|
||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
@@ -75,25 +76,16 @@ namespace mdlp {
|
|||||||
auto data = X;
|
auto data = X;
|
||||||
std::sort(data.begin(), data.end());
|
std::sort(data.begin(), data.end());
|
||||||
if (data.front() == data.back() || data.size() == 1) {
|
if (data.front() == data.back() || data.size() == 1) {
|
||||||
// if X is constant
|
// if X is constant, pass any two given points that shall be ignored in transform
|
||||||
cutPoints.push_back(std::numeric_limits<precision_t>::max());
|
cutPoints.push_back(data.front());
|
||||||
|
cutPoints.push_back(data.front());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
cutPoints = percentile(data, quantiles);
|
cutPoints = percentile(data, quantiles);
|
||||||
normalizeCutPoints();
|
|
||||||
}
|
}
|
||||||
void BinDisc::fit_uniform(samples_t& X)
|
void BinDisc::fit_uniform(samples_t& X)
|
||||||
{
|
{
|
||||||
|
|
||||||
auto minmax = std::minmax_element(X.begin(), X.end());
|
auto minmax = std::minmax_element(X.begin(), X.end());
|
||||||
cutPoints = linspace(*minmax.first, *minmax.second, n_bins + 1);
|
cutPoints = linspace(*minmax.first, *minmax.second, n_bins + 1);
|
||||||
normalizeCutPoints();
|
|
||||||
}
|
|
||||||
void BinDisc::normalizeCutPoints()
|
|
||||||
{
|
|
||||||
// Add max value to the end
|
|
||||||
cutPoints.push_back(std::numeric_limits<precision_t>::max());
|
|
||||||
// Remove first as it is not needed
|
|
||||||
cutPoints.erase(cutPoints.begin());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
@@ -20,7 +20,6 @@ namespace mdlp {
|
|||||||
private:
|
private:
|
||||||
void fit_uniform(samples_t&);
|
void fit_uniform(samples_t&);
|
||||||
void fit_quantile(samples_t&);
|
void fit_quantile(samples_t&);
|
||||||
void normalizeCutPoints();
|
|
||||||
int n_bins;
|
int n_bins;
|
||||||
strategy_t strategy;
|
strategy_t strategy;
|
||||||
};
|
};
|
||||||
|
@@ -25,7 +25,7 @@ namespace mdlp {
|
|||||||
}
|
}
|
||||||
if (proposed_cuts < 1)
|
if (proposed_cuts < 1)
|
||||||
return static_cast<size_t>(round(static_cast<float>(X.size()) * proposed_cuts));
|
return static_cast<size_t>(round(static_cast<float>(X.size()) * proposed_cuts));
|
||||||
return static_cast<size_t>(proposed_cuts);
|
return static_cast<size_t>(proposed_cuts); // As the first and last cutpoints shall be ignored in transform
|
||||||
}
|
}
|
||||||
|
|
||||||
void CPPFImdlp::fit(samples_t& X_, labels_t& y_)
|
void CPPFImdlp::fit(samples_t& X_, labels_t& y_)
|
||||||
@@ -58,6 +58,10 @@ namespace mdlp {
|
|||||||
resizeCutPoints();
|
resizeCutPoints();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Insert the first & last X values into the cutpoints, as they shall be ignored in transform
|
||||||
|
auto minmax = std::minmax_element(X.begin(), X.end());
|
||||||
|
cutPoints.push_back(*minmax.second);
|
||||||
|
cutPoints.insert(cutPoints.begin(), *minmax.first);
|
||||||
}
|
}
|
||||||
|
|
||||||
pair<precision_t, size_t> CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end)
|
pair<precision_t, size_t> CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end)
|
||||||
|
@@ -5,9 +5,19 @@ namespace mdlp {
|
|||||||
{
|
{
|
||||||
discretizedData.clear();
|
discretizedData.clear();
|
||||||
discretizedData.reserve(data.size());
|
discretizedData.reserve(data.size());
|
||||||
|
// CutPoints always have at least two items
|
||||||
|
// Have to ignore first and last cut points provided
|
||||||
|
auto first = cutPoints.begin() + 1;
|
||||||
|
auto last = cutPoints.end() - 1;
|
||||||
for (const precision_t& item : data) {
|
for (const precision_t& item : data) {
|
||||||
auto upper = std::upper_bound(cutPoints.begin(), cutPoints.end(), item);
|
auto upper = std::lower_bound(first, last, item);
|
||||||
discretizedData.push_back(upper - cutPoints.begin());
|
int number = upper - first;
|
||||||
|
/*
|
||||||
|
OJO
|
||||||
|
*/
|
||||||
|
if (number < 0)
|
||||||
|
throw std::runtime_error("number is less than 0 in discretizer::transform");
|
||||||
|
discretizedData.push_back(number);
|
||||||
}
|
}
|
||||||
return discretizedData;
|
return discretizedData;
|
||||||
}
|
}
|
||||||
|
@@ -18,10 +18,10 @@ namespace mdlp {
|
|||||||
void fit_t(torch::Tensor& X_, torch::Tensor& y_);
|
void fit_t(torch::Tensor& X_, torch::Tensor& y_);
|
||||||
torch::Tensor transform_t(torch::Tensor& X_);
|
torch::Tensor transform_t(torch::Tensor& X_);
|
||||||
torch::Tensor fit_transform_t(torch::Tensor& X_, torch::Tensor& y_);
|
torch::Tensor fit_transform_t(torch::Tensor& X_, torch::Tensor& y_);
|
||||||
static inline std::string version() { return "1.2.2"; };
|
static inline std::string version() { return "1.2.3"; };
|
||||||
protected:
|
protected:
|
||||||
labels_t discretizedData = labels_t();
|
labels_t discretizedData = labels_t();
|
||||||
cutPoints_t cutPoints;
|
cutPoints_t cutPoints; // At least two cutpoints must be provided, the first and the last will be ignored in transform
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@@ -4,6 +4,7 @@
|
|||||||
#include "gtest/gtest.h"
|
#include "gtest/gtest.h"
|
||||||
#include "ArffFiles.h"
|
#include "ArffFiles.h"
|
||||||
#include "../BinDisc.h"
|
#include "../BinDisc.h"
|
||||||
|
#include "Experiments.hpp"
|
||||||
|
|
||||||
namespace mdlp {
|
namespace mdlp {
|
||||||
const float margin = 1e-4;
|
const float margin = 1e-4;
|
||||||
@@ -40,10 +41,11 @@ namespace mdlp {
|
|||||||
auto y = labels_t();
|
auto y = labels_t();
|
||||||
fit(X, y);
|
fit(X, y);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
ASSERT_EQ(3, cuts.size());
|
ASSERT_EQ(4, cuts.size());
|
||||||
EXPECT_NEAR(3.66667, cuts.at(0), margin);
|
EXPECT_NEAR(1, cuts.at(0), margin);
|
||||||
EXPECT_NEAR(6.33333, cuts.at(1), margin);
|
EXPECT_NEAR(3.66667, cuts.at(1), margin);
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts.at(2));
|
EXPECT_NEAR(6.33333, cuts.at(2), margin);
|
||||||
|
EXPECT_NEAR(9.0, cuts.at(3), margin);
|
||||||
auto labels = transform(X);
|
auto labels = transform(X);
|
||||||
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2 };
|
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2 };
|
||||||
EXPECT_EQ(expected, labels);
|
EXPECT_EQ(expected, labels);
|
||||||
@@ -53,10 +55,11 @@ namespace mdlp {
|
|||||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
|
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
ASSERT_EQ(3, cuts.size());
|
ASSERT_EQ(4, cuts.size());
|
||||||
EXPECT_NEAR(3.666667, cuts[0], margin);
|
EXPECT_NEAR(1, cuts[0], margin);
|
||||||
EXPECT_NEAR(6.333333, cuts[1], margin);
|
EXPECT_NEAR(3.666667, cuts[1], margin);
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
|
EXPECT_NEAR(6.333333, cuts[2], margin);
|
||||||
|
EXPECT_NEAR(9, cuts[3], margin);
|
||||||
auto labels = transform(X);
|
auto labels = transform(X);
|
||||||
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2 };
|
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2 };
|
||||||
EXPECT_EQ(expected, labels);
|
EXPECT_EQ(expected, labels);
|
||||||
@@ -66,12 +69,13 @@ namespace mdlp {
|
|||||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 };
|
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 };
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
ASSERT_EQ(3, cuts.size());
|
ASSERT_EQ(4, cuts.size());
|
||||||
EXPECT_EQ(4.0, cuts[0]);
|
EXPECT_NEAR(1, cuts.at(0), margin);
|
||||||
EXPECT_EQ(7.0, cuts[1]);
|
EXPECT_NEAR(4.0, cuts.at(1), margin);
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
|
EXPECT_NEAR(7.0, cuts.at(2), margin);
|
||||||
|
EXPECT_NEAR(10.0, cuts.at(3), margin);
|
||||||
auto labels = transform(X);
|
auto labels = transform(X);
|
||||||
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
|
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2 };
|
||||||
EXPECT_EQ(expected, labels);
|
EXPECT_EQ(expected, labels);
|
||||||
}
|
}
|
||||||
TEST_F(TestBinDisc3Q, X10BinsQuantile)
|
TEST_F(TestBinDisc3Q, X10BinsQuantile)
|
||||||
@@ -79,12 +83,13 @@ namespace mdlp {
|
|||||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 };
|
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 };
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
ASSERT_EQ(3, cuts.size());
|
ASSERT_EQ(4, cuts.size());
|
||||||
EXPECT_EQ(4, cuts[0]);
|
EXPECT_NEAR(1, cuts.at(0), margin);
|
||||||
EXPECT_EQ(7, cuts[1]);
|
EXPECT_NEAR(4.0, cuts.at(1), margin);
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
|
EXPECT_NEAR(7.0, cuts.at(2), margin);
|
||||||
|
EXPECT_NEAR(10.0, cuts.at(3), margin);
|
||||||
auto labels = transform(X);
|
auto labels = transform(X);
|
||||||
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
|
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2 };
|
||||||
EXPECT_EQ(expected, labels);
|
EXPECT_EQ(expected, labels);
|
||||||
}
|
}
|
||||||
TEST_F(TestBinDisc3U, X11BinsUniform)
|
TEST_F(TestBinDisc3U, X11BinsUniform)
|
||||||
@@ -92,10 +97,11 @@ namespace mdlp {
|
|||||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 };
|
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 };
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
ASSERT_EQ(3, cuts.size());
|
ASSERT_EQ(4, cuts.size());
|
||||||
EXPECT_NEAR(4.33333, cuts[0], margin);
|
EXPECT_NEAR(1, cuts.at(0), margin);
|
||||||
EXPECT_NEAR(7.66667, cuts[1], margin);
|
EXPECT_NEAR(4.33333, cuts.at(1), margin);
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
|
EXPECT_NEAR(7.66667, cuts.at(2), margin);
|
||||||
|
EXPECT_NEAR(11.0, cuts.at(3), margin);
|
||||||
auto labels = transform(X);
|
auto labels = transform(X);
|
||||||
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
|
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
|
||||||
EXPECT_EQ(expected, labels);
|
EXPECT_EQ(expected, labels);
|
||||||
@@ -105,10 +111,11 @@ namespace mdlp {
|
|||||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 };
|
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 };
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
ASSERT_EQ(3, cuts.size());
|
ASSERT_EQ(4, cuts.size());
|
||||||
EXPECT_NEAR(4.33333, cuts[0], margin);
|
EXPECT_NEAR(1, cuts.at(0), margin);
|
||||||
EXPECT_NEAR(7.66667, cuts[1], margin);
|
EXPECT_NEAR(4.33333, cuts.at(1), margin);
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
|
EXPECT_NEAR(7.66667, cuts.at(2), margin);
|
||||||
|
EXPECT_NEAR(11.0, cuts.at(3), margin);
|
||||||
auto labels = transform(X);
|
auto labels = transform(X);
|
||||||
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
|
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
|
||||||
EXPECT_EQ(expected, labels);
|
EXPECT_EQ(expected, labels);
|
||||||
@@ -118,8 +125,9 @@ namespace mdlp {
|
|||||||
samples_t X = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
|
samples_t X = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
ASSERT_EQ(1, cuts.size());
|
ASSERT_EQ(2, cuts.size());
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
|
EXPECT_NEAR(1, cuts.at(0), margin);
|
||||||
|
EXPECT_NEAR(1, cuts.at(1), margin);
|
||||||
auto labels = transform(X);
|
auto labels = transform(X);
|
||||||
labels_t expected = { 0, 0, 0, 0, 0, 0 };
|
labels_t expected = { 0, 0, 0, 0, 0, 0 };
|
||||||
EXPECT_EQ(expected, labels);
|
EXPECT_EQ(expected, labels);
|
||||||
@@ -129,8 +137,9 @@ namespace mdlp {
|
|||||||
samples_t X = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
|
samples_t X = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
EXPECT_EQ(1, cuts.size());
|
ASSERT_EQ(2, cuts.size());
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
|
EXPECT_NEAR(1, cuts.at(0), margin);
|
||||||
|
EXPECT_NEAR(1, cuts.at(1), margin);
|
||||||
auto labels = transform(X);
|
auto labels = transform(X);
|
||||||
labels_t expected = { 0, 0, 0, 0, 0, 0 };
|
labels_t expected = { 0, 0, 0, 0, 0, 0 };
|
||||||
EXPECT_EQ(expected, labels);
|
EXPECT_EQ(expected, labels);
|
||||||
@@ -140,16 +149,18 @@ namespace mdlp {
|
|||||||
samples_t X = {};
|
samples_t X = {};
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
EXPECT_EQ(1, cuts.size());
|
ASSERT_EQ(2, cuts.size());
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
|
EXPECT_NEAR(0, cuts.at(0), margin);
|
||||||
|
EXPECT_NEAR(0, cuts.at(1), margin);
|
||||||
}
|
}
|
||||||
TEST_F(TestBinDisc3Q, EmptyQuantile)
|
TEST_F(TestBinDisc3Q, EmptyQuantile)
|
||||||
{
|
{
|
||||||
samples_t X = {};
|
samples_t X = {};
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
EXPECT_EQ(1, cuts.size());
|
ASSERT_EQ(2, cuts.size());
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
|
EXPECT_NEAR(0, cuts.at(0), margin);
|
||||||
|
EXPECT_NEAR(0, cuts.at(1), margin);
|
||||||
}
|
}
|
||||||
TEST(TestBinDisc3, ExceptionNumberBins)
|
TEST(TestBinDisc3, ExceptionNumberBins)
|
||||||
{
|
{
|
||||||
@@ -160,10 +171,11 @@ namespace mdlp {
|
|||||||
samples_t X = { 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0 };
|
samples_t X = { 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0 };
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
ASSERT_EQ(3, cuts.size());
|
ASSERT_EQ(4, cuts.size());
|
||||||
EXPECT_NEAR(1.66667, cuts[0], margin);
|
EXPECT_NEAR(1, cuts.at(0), margin);
|
||||||
EXPECT_NEAR(2.33333, cuts[1], margin);
|
EXPECT_NEAR(1.66667, cuts.at(1), margin);
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
|
EXPECT_NEAR(2.33333, cuts.at(2), margin);
|
||||||
|
EXPECT_NEAR(3.0, cuts.at(3), margin);
|
||||||
auto labels = transform(X);
|
auto labels = transform(X);
|
||||||
labels_t expected = { 2, 0, 0, 2, 0, 0, 2, 0, 0 };
|
labels_t expected = { 2, 0, 0, 2, 0, 0, 2, 0, 0 };
|
||||||
EXPECT_EQ(expected, labels);
|
EXPECT_EQ(expected, labels);
|
||||||
@@ -174,9 +186,10 @@ namespace mdlp {
|
|||||||
samples_t X = { 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0 };
|
samples_t X = { 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0 };
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
EXPECT_EQ(2, cuts.size());
|
ASSERT_EQ(3, cuts.size());
|
||||||
EXPECT_NEAR(1.66667, cuts[0], margin);
|
EXPECT_NEAR(1, cuts.at(0), margin);
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[1]);
|
EXPECT_NEAR(1.66667, cuts.at(1), margin);
|
||||||
|
EXPECT_NEAR(3.0, cuts.at(2), margin);
|
||||||
auto labels = transform(X);
|
auto labels = transform(X);
|
||||||
labels_t expected = { 1, 0, 0, 1, 0, 0, 1, 0, 0 };
|
labels_t expected = { 1, 0, 0, 1, 0, 0, 1, 0, 0 };
|
||||||
EXPECT_EQ(expected, labels);
|
EXPECT_EQ(expected, labels);
|
||||||
@@ -187,11 +200,12 @@ namespace mdlp {
|
|||||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 };
|
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 };
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
EXPECT_EQ(4, cuts.size());
|
ASSERT_EQ(5, cuts.size());
|
||||||
ASSERT_EQ(3.75, cuts[0]);
|
EXPECT_NEAR(1.0, cuts.at(0), margin);
|
||||||
EXPECT_EQ(6.5, cuts[1]);
|
EXPECT_NEAR(3.75, cuts.at(1), margin);
|
||||||
EXPECT_EQ(9.25, cuts[2]);
|
EXPECT_NEAR(6.5, cuts.at(2), margin);
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
|
EXPECT_NEAR(9.25, cuts.at(3), margin);
|
||||||
|
EXPECT_NEAR(12.0, cuts.at(4), margin);
|
||||||
auto labels = transform(X);
|
auto labels = transform(X);
|
||||||
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 };
|
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 };
|
||||||
EXPECT_EQ(expected, labels);
|
EXPECT_EQ(expected, labels);
|
||||||
@@ -201,11 +215,12 @@ namespace mdlp {
|
|||||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 };
|
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 };
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
EXPECT_EQ(4, cuts.size());
|
ASSERT_EQ(5, cuts.size());
|
||||||
ASSERT_EQ(3.75, cuts[0]);
|
EXPECT_NEAR(1.0, cuts.at(0), margin);
|
||||||
EXPECT_EQ(6.5, cuts[1]);
|
EXPECT_NEAR(3.75, cuts.at(1), margin);
|
||||||
EXPECT_EQ(9.25, cuts[2]);
|
EXPECT_NEAR(6.5, cuts.at(2), margin);
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
|
EXPECT_NEAR(9.25, cuts.at(3), margin);
|
||||||
|
EXPECT_NEAR(12.0, cuts.at(4), margin);
|
||||||
auto labels = transform(X);
|
auto labels = transform(X);
|
||||||
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 };
|
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 };
|
||||||
EXPECT_EQ(expected, labels);
|
EXPECT_EQ(expected, labels);
|
||||||
@@ -215,13 +230,14 @@ namespace mdlp {
|
|||||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0 };
|
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0 };
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
EXPECT_EQ(4, cuts.size());
|
ASSERT_EQ(5, cuts.size());
|
||||||
EXPECT_EQ(4.0, cuts[0]);
|
EXPECT_NEAR(1.0, cuts.at(0), margin);
|
||||||
EXPECT_EQ(7.0, cuts[1]);
|
EXPECT_NEAR(4.0, cuts.at(1), margin);
|
||||||
EXPECT_EQ(10.0, cuts[2]);
|
EXPECT_NEAR(7.0, cuts.at(2), margin);
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
|
EXPECT_NEAR(10.0, cuts.at(3), margin);
|
||||||
|
EXPECT_NEAR(13.0, cuts.at(4), margin);
|
||||||
auto labels = transform(X);
|
auto labels = transform(X);
|
||||||
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
|
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 };
|
||||||
EXPECT_EQ(expected, labels);
|
EXPECT_EQ(expected, labels);
|
||||||
}
|
}
|
||||||
TEST_F(TestBinDisc4Q, X13BinsQuantile)
|
TEST_F(TestBinDisc4Q, X13BinsQuantile)
|
||||||
@@ -229,13 +245,14 @@ namespace mdlp {
|
|||||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0 };
|
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0 };
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
EXPECT_EQ(4, cuts.size());
|
ASSERT_EQ(5, cuts.size());
|
||||||
EXPECT_EQ(4.0, cuts[0]);
|
EXPECT_NEAR(1.0, cuts.at(0), margin);
|
||||||
EXPECT_EQ(7.0, cuts[1]);
|
EXPECT_NEAR(4.0, cuts.at(1), margin);
|
||||||
EXPECT_EQ(10.0, cuts[2]);
|
EXPECT_NEAR(7.0, cuts.at(2), margin);
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
|
EXPECT_NEAR(10.0, cuts.at(3), margin);
|
||||||
|
EXPECT_NEAR(13.0, cuts.at(4), margin);
|
||||||
auto labels = transform(X);
|
auto labels = transform(X);
|
||||||
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
|
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 };
|
||||||
EXPECT_EQ(expected, labels);
|
EXPECT_EQ(expected, labels);
|
||||||
}
|
}
|
||||||
TEST_F(TestBinDisc4U, X14BinsUniform)
|
TEST_F(TestBinDisc4U, X14BinsUniform)
|
||||||
@@ -243,11 +260,12 @@ namespace mdlp {
|
|||||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 };
|
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 };
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
EXPECT_EQ(4, cuts.size());
|
ASSERT_EQ(5, cuts.size());
|
||||||
EXPECT_EQ(4.25, cuts[0]);
|
EXPECT_NEAR(1.0, cuts.at(0), margin);
|
||||||
EXPECT_EQ(7.5, cuts[1]);
|
EXPECT_NEAR(4.25, cuts.at(1), margin);
|
||||||
EXPECT_EQ(10.75, cuts[2]);
|
EXPECT_NEAR(7.5, cuts.at(2), margin);
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
|
EXPECT_NEAR(10.75, cuts.at(3), margin);
|
||||||
|
EXPECT_NEAR(14.0, cuts.at(4), margin);
|
||||||
auto labels = transform(X);
|
auto labels = transform(X);
|
||||||
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
|
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
|
||||||
EXPECT_EQ(expected, labels);
|
EXPECT_EQ(expected, labels);
|
||||||
@@ -257,11 +275,12 @@ namespace mdlp {
|
|||||||
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 };
|
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 };
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
EXPECT_EQ(4, cuts.size());
|
ASSERT_EQ(5, cuts.size());
|
||||||
EXPECT_EQ(4.25, cuts[0]);
|
EXPECT_NEAR(1.0, cuts.at(0), margin);
|
||||||
EXPECT_EQ(7.5, cuts[1]);
|
EXPECT_NEAR(4.25, cuts.at(1), margin);
|
||||||
EXPECT_EQ(10.75, cuts[2]);
|
EXPECT_NEAR(7.5, cuts.at(2), margin);
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
|
EXPECT_NEAR(10.75, cuts.at(3), margin);
|
||||||
|
EXPECT_NEAR(14.0, cuts.at(4), margin);
|
||||||
auto labels = transform(X);
|
auto labels = transform(X);
|
||||||
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
|
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
|
||||||
EXPECT_EQ(expected, labels);
|
EXPECT_EQ(expected, labels);
|
||||||
@@ -271,13 +290,14 @@ namespace mdlp {
|
|||||||
samples_t X = { 15.0, 8.0, 12.0, 14.0, 6.0, 1.0, 13.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0 };
|
samples_t X = { 15.0, 8.0, 12.0, 14.0, 6.0, 1.0, 13.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0 };
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
EXPECT_EQ(4, cuts.size());
|
ASSERT_EQ(5, cuts.size());
|
||||||
EXPECT_EQ(4.5, cuts[0]);
|
EXPECT_NEAR(1.0, cuts.at(0), margin);
|
||||||
EXPECT_EQ(8, cuts[1]);
|
EXPECT_NEAR(4.5, cuts.at(1), margin);
|
||||||
EXPECT_EQ(11.5, cuts[2]);
|
EXPECT_NEAR(8, cuts.at(2), margin);
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
|
EXPECT_NEAR(11.5, cuts.at(3), margin);
|
||||||
|
EXPECT_NEAR(15.0, cuts.at(4), margin);
|
||||||
auto labels = transform(X);
|
auto labels = transform(X);
|
||||||
labels_t expected = { 3, 2, 3, 3, 1, 0, 3, 2, 2, 2, 1, 0, 0, 1, 0 };
|
labels_t expected = { 3, 1, 3, 3, 1, 0, 3, 2, 2, 2, 1, 0, 0, 1, 0 };
|
||||||
EXPECT_EQ(expected, labels);
|
EXPECT_EQ(expected, labels);
|
||||||
}
|
}
|
||||||
TEST_F(TestBinDisc4Q, X15BinsQuantile)
|
TEST_F(TestBinDisc4Q, X15BinsQuantile)
|
||||||
@@ -285,13 +305,14 @@ namespace mdlp {
|
|||||||
samples_t X = { 15.0, 13.0, 12.0, 14.0, 6.0, 1.0, 8.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0 };
|
samples_t X = { 15.0, 13.0, 12.0, 14.0, 6.0, 1.0, 8.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0 };
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
EXPECT_EQ(4, cuts.size());
|
ASSERT_EQ(5, cuts.size());
|
||||||
EXPECT_EQ(4.5, cuts[0]);
|
EXPECT_NEAR(1.0, cuts.at(0), margin);
|
||||||
EXPECT_EQ(8, cuts[1]);
|
EXPECT_NEAR(4.5, cuts.at(1), margin);
|
||||||
EXPECT_EQ(11.5, cuts[2]);
|
EXPECT_NEAR(8, cuts.at(2), margin);
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
|
EXPECT_NEAR(11.5, cuts.at(3), margin);
|
||||||
|
EXPECT_NEAR(15.0, cuts.at(4), margin);
|
||||||
auto labels = transform(X);
|
auto labels = transform(X);
|
||||||
labels_t expected = { 3, 3, 3, 3, 1, 0, 2, 2, 2, 2, 1, 0, 0, 1, 0 };
|
labels_t expected = { 3, 3, 3, 3, 1, 0, 1, 2, 2, 2, 1, 0, 0, 1, 0 };
|
||||||
EXPECT_EQ(expected, labels);
|
EXPECT_EQ(expected, labels);
|
||||||
}
|
}
|
||||||
TEST_F(TestBinDisc4U, RepeatedValuesUniform)
|
TEST_F(TestBinDisc4U, RepeatedValuesUniform)
|
||||||
@@ -300,13 +321,14 @@ namespace mdlp {
|
|||||||
// 0 1 2 3 4 5 6 7 8 9
|
// 0 1 2 3 4 5 6 7 8 9
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
EXPECT_EQ(4, cuts.size());
|
ASSERT_EQ(5, cuts.size());
|
||||||
EXPECT_EQ(1.0, cuts[0]);
|
EXPECT_NEAR(0.0, cuts.at(0), margin);
|
||||||
EXPECT_EQ(2.0, cuts[1]);
|
EXPECT_NEAR(1.0, cuts.at(1), margin);
|
||||||
ASSERT_EQ(3.0, cuts[2]);
|
EXPECT_NEAR(2.0, cuts.at(2), margin);
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
|
EXPECT_NEAR(3.0, cuts.at(3), margin);
|
||||||
|
EXPECT_NEAR(4.0, cuts.at(4), margin);
|
||||||
auto labels = transform(X);
|
auto labels = transform(X);
|
||||||
labels_t expected = { 0, 1, 1, 1, 2, 2, 3, 3, 3, 3 };
|
labels_t expected = { 0, 0, 0, 0, 1, 1, 2, 2, 2, 3 };
|
||||||
EXPECT_EQ(expected, labels);
|
EXPECT_EQ(expected, labels);
|
||||||
}
|
}
|
||||||
TEST_F(TestBinDisc4Q, RepeatedValuesQuantile)
|
TEST_F(TestBinDisc4Q, RepeatedValuesQuantile)
|
||||||
@@ -315,50 +337,80 @@ namespace mdlp {
|
|||||||
// 0 1 2 3 4 5 6 7 8 9
|
// 0 1 2 3 4 5 6 7 8 9
|
||||||
fit(X);
|
fit(X);
|
||||||
auto cuts = getCutPoints();
|
auto cuts = getCutPoints();
|
||||||
ASSERT_EQ(3, cuts.size());
|
ASSERT_EQ(5, cuts.size());
|
||||||
EXPECT_EQ(2.0, cuts[0]);
|
EXPECT_NEAR(0.0, cuts.at(0), margin);
|
||||||
ASSERT_EQ(3.0, cuts[1]);
|
EXPECT_NEAR(1.0, cuts.at(1), margin);
|
||||||
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
|
EXPECT_NEAR(2.0, cuts.at(2), margin);
|
||||||
|
EXPECT_NEAR(3.0, cuts.at(3), margin);
|
||||||
|
EXPECT_NEAR(4.0, cuts.at(4), margin);
|
||||||
auto labels = transform(X);
|
auto labels = transform(X);
|
||||||
labels_t expected = { 0, 0, 0, 0, 1, 1, 2, 2, 2, 2 };
|
labels_t expected = { 0, 0, 0, 0, 1, 1, 2, 2, 2, 3 };
|
||||||
EXPECT_EQ(expected, labels);
|
EXPECT_EQ(expected, labels);
|
||||||
}
|
}
|
||||||
TEST_F(TestBinDisc4U, irisUniform)
|
// TEST_F(TestBinDisc4U, irisUniform)
|
||||||
|
// {
|
||||||
|
// ArffFiles file;
|
||||||
|
// file.load(data_path + "iris.arff", true);
|
||||||
|
// vector<samples_t>& X = file.getX();
|
||||||
|
// fit(X[0]);
|
||||||
|
// auto Xt = transform(X[0]);
|
||||||
|
// labels_t expected = { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 0, 1, 2, 1, 3, 2, 2, 3, 0, 3, 2, 3, 2, 2, 2, 1, 1, 2, 2, 3, 3, 1, 2, 1, 3, 2, 2, 3, 2, 1, 2, 3, 3, 3, 2, 2, 1, 3, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 };
|
||||||
|
// EXPECT_EQ(expected, Xt);
|
||||||
|
// auto Xtt = fit_transform(X[0], file.getY());
|
||||||
|
// EXPECT_EQ(expected, Xtt);
|
||||||
|
// auto Xt_t = torch::tensor(X[0], torch::kFloat32);
|
||||||
|
// auto y_t = torch::tensor(file.getY(), torch::kInt32);
|
||||||
|
// auto Xtt_t = fit_transform_t(Xt_t, y_t);
|
||||||
|
// for (int i = 0; i < expected.size(); i++)
|
||||||
|
// EXPECT_EQ(expected[i], Xtt_t[i].item<int>());
|
||||||
|
// }
|
||||||
|
// TEST_F(TestBinDisc4Q, irisQuantile)
|
||||||
|
// {
|
||||||
|
// ArffFiles file;
|
||||||
|
// file.load(data_path + "iris.arff", true);
|
||||||
|
// vector<samples_t>& X = file.getX();
|
||||||
|
// fit(X[0]);
|
||||||
|
// auto Xt = transform(X[0]);
|
||||||
|
// labels_t expected = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 3, 3, 3, 1, 3, 1, 2, 0, 3, 1, 0, 2, 2, 2, 1, 3, 1, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 3, 2, 1, 1, 1, 2, 2, 1, 2, 3, 2, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2, 1, 1, 2, 2, 3, 2, 3, 3, 0, 3, 3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 3, 2, 3, 1, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 2 };
|
||||||
|
// EXPECT_EQ(expected, Xt);
|
||||||
|
// auto Xtt = fit_transform(X[0], file.getY());
|
||||||
|
// EXPECT_EQ(expected, Xtt);
|
||||||
|
// auto Xt_t = torch::tensor(X[0], torch::kFloat32);
|
||||||
|
// auto y_t = torch::tensor(file.getY(), torch::kInt32);
|
||||||
|
// auto Xtt_t = fit_transform_t(Xt_t, y_t);
|
||||||
|
// for (int i = 0; i < expected.size(); i++)
|
||||||
|
// EXPECT_EQ(expected[i], Xtt_t[i].item<int>());
|
||||||
|
// fit_t(Xt_t, y_t);
|
||||||
|
// auto Xt_t2 = transform_t(Xt_t);
|
||||||
|
// for (int i = 0; i < expected.size(); i++)
|
||||||
|
// EXPECT_EQ(expected[i], Xt_t2[i].item<int>());
|
||||||
|
// }
|
||||||
|
TEST(TestBinDiscGeneric, Fileset)
|
||||||
{
|
{
|
||||||
ArffFiles file;
|
Experiments exps(data_path + "tests.txt");
|
||||||
file.load(data_path + "iris.arff", true);
|
int num = 0;
|
||||||
vector<samples_t>& X = file.getX();
|
while (exps.is_next()) {
|
||||||
fit(X[0]);
|
Experiment exp = exps.next();
|
||||||
auto Xt = transform(X[0]);
|
std::cout << "Exp #: " << ++num << " From: " << exp.from_ << " To: " << exp.to_ << " Step: " << exp.step_ << " Bins: " << exp.n_bins_ << " Strategy: " << exp.strategy_ << std::endl;
|
||||||
labels_t expected = { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 0, 1, 2, 1, 3, 2, 2, 3, 0, 3, 2, 3, 2, 2, 2, 1, 1, 2, 2, 3, 3, 1, 2, 1, 3, 2, 2, 3, 2, 1, 2, 3, 3, 3, 2, 2, 1, 3, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 };
|
BinDisc disc(exp.n_bins_, exp.strategy_ == "Q" ? strategy_t::QUANTILE : strategy_t::UNIFORM);
|
||||||
EXPECT_EQ(expected, Xt);
|
std::vector<float> test;
|
||||||
auto Xtt = fit_transform(X[0], file.getY());
|
for (float i = exp.from_; i < exp.to_; i += exp.step_) {
|
||||||
EXPECT_EQ(expected, Xtt);
|
test.push_back(i);
|
||||||
auto Xt_t = torch::tensor(X[0], torch::kFloat32);
|
}
|
||||||
auto y_t = torch::tensor(file.getY(), torch::kInt32);
|
// show_vector(test, "Test");
|
||||||
auto Xtt_t = fit_transform_t(Xt_t, y_t);
|
auto empty = std::vector<int>();
|
||||||
for (int i = 0; i < expected.size(); i++)
|
auto Xt = disc.fit_transform(test, empty);
|
||||||
EXPECT_EQ(expected[i], Xtt_t[i].item<int>());
|
auto cuts = disc.getCutPoints();
|
||||||
|
EXPECT_EQ(exp.discretized_data_.size(), Xt.size());
|
||||||
|
for (int i = 0; i < exp.discretized_data_.size(); ++i) {
|
||||||
|
if (exp.discretized_data_.at(i) != Xt.at(i)) {
|
||||||
|
std::cout << "Error at " << i << " Expected: " << exp.discretized_data_.at(i) << " Got: " << Xt.at(i) << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EXPECT_EQ(exp.cutpoints_.size(), cuts.size());
|
||||||
|
for (int i = 0; i < exp.cutpoints_.size(); ++i) {
|
||||||
|
EXPECT_NEAR(exp.cutpoints_.at(i), cuts.at(i), margin);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
TEST_F(TestBinDisc4Q, irisQuantile)
|
|
||||||
{
|
|
||||||
ArffFiles file;
|
|
||||||
file.load(data_path + "iris.arff", true);
|
|
||||||
vector<samples_t>& X = file.getX();
|
|
||||||
fit(X[0]);
|
|
||||||
auto Xt = transform(X[0]);
|
|
||||||
labels_t expected = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 3, 3, 3, 1, 3, 1, 2, 0, 3, 1, 0, 2, 2, 2, 1, 3, 1, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 3, 2, 1, 1, 1, 2, 2, 1, 2, 3, 2, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2, 1, 1, 2, 2, 3, 2, 3, 3, 0, 3, 3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 3, 2, 3, 1, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 2 };
|
|
||||||
EXPECT_EQ(expected, Xt);
|
|
||||||
auto Xtt = fit_transform(X[0], file.getY());
|
|
||||||
EXPECT_EQ(expected, Xtt);
|
|
||||||
auto Xt_t = torch::tensor(X[0], torch::kFloat32);
|
|
||||||
auto y_t = torch::tensor(file.getY(), torch::kInt32);
|
|
||||||
auto Xtt_t = fit_transform_t(Xt_t, y_t);
|
|
||||||
for (int i = 0; i < expected.size(); i++)
|
|
||||||
EXPECT_EQ(expected[i], Xtt_t[i].item<int>());
|
|
||||||
fit_t(Xt_t, y_t);
|
|
||||||
auto Xt_t2 = transform_t(Xt_t);
|
|
||||||
for (int i = 0; i < expected.size(); i++)
|
|
||||||
EXPECT_EQ(expected[i], Xt_t2[i].item<int>());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -21,6 +21,15 @@ namespace mdlp {
|
|||||||
}
|
}
|
||||||
const std::string data_path = set_data_path();
|
const std::string data_path = set_data_path();
|
||||||
|
|
||||||
|
// Checks that the version string reported through the Discretizer base
// interface equals "1.2.3".
TEST(Discretizer, Version)
{
    // NOTE(review): the object is created and destroyed through a base-class
    // pointer; presumably this is on purpose so that virtual dispatch and the
    // base destructor are exercised -- confirm before replacing with a stack
    // object or a smart pointer.
    Discretizer* disc = new BinDisc(4, strategy_t::UNIFORM);
    auto version = disc->version();
    delete disc;
    // Terminate the line so the message does not run into the next line of
    // test-runner output.
    std::cout << "Version computed: " << version << std::endl;
    EXPECT_EQ("1.2.3", version);
}
|
||||||
|
|
||||||
TEST(Discretizer, BinIrisUniform)
|
TEST(Discretizer, BinIrisUniform)
|
||||||
{
|
{
|
||||||
ArffFiles file;
|
ArffFiles file;
|
||||||
|
102
tests/Experiments.hpp
Normal file
102
tests/Experiments.hpp
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
#ifndef EXPERIMENTS_HPP
|
||||||
|
#define EXPERIMENTS_HPP
|
||||||
|
#include<fstream>
#include<iostream>
#include<sstream>
#include<stdexcept>
#include<string>
#include<tuple>
#include<type_traits>
#include<utility>
#include<vector>
#include "../typesFImdlp.h"
|
||||||
|
/**
 * One discretization test case.
 *
 * The input samples are described by the half-open range [from_, to_) walked
 * in increments of step_; n_bins_ and strategy_ configure the discretizer, and
 * discretized_data_ / cutpoints_ hold the expected results.
 */
class Experiment {
public:
    /**
     * @param from_            first sample value
     * @param to_              upper bound of the sample range
     * @param step_            increment between consecutive samples
     * @param n_bins           number of bins requested
     * @param strategy         "Q" or "U"
     * @param data_discretized expected bin index for every sample
     * @param cutpoints        expected cut points
     * @throws std::invalid_argument if strategy is neither "Q" nor "U"
     */
    Experiment(float from_, float to_, float step_, int n_bins, std::string strategy, std::vector<int> data_discretized, std::vector<float> cutpoints) :
        from_{ from_ }, to_{ to_ }, step_{ step_ }, n_bins_{ n_bins },
        strategy_{ std::move(strategy) },
        discretized_data_{ std::move(data_discretized) },
        cutpoints_{ std::move(cutpoints) }
    {
        // The by-value arguments were moved into the members above, so the
        // check must read the member, not the (now moved-from) parameter.
        if (strategy_ != "Q" && strategy_ != "U") {
            throw std::invalid_argument("Invalid strategy " + strategy_);
        }
    }
    float from_;
    float to_;
    float step_;
    int n_bins_;
    std::string strategy_;
    std::vector<int> discretized_data_;
    std::vector<float> cutpoints_;
};
|
||||||
|
class Experiments {
|
||||||
|
public:
|
||||||
|
Experiments(const std::string filename) : filename{ filename }
|
||||||
|
{
|
||||||
|
test_file.open(filename);
|
||||||
|
if (!test_file.is_open()) {
|
||||||
|
throw std::runtime_error("File " + filename + " not found");
|
||||||
|
}
|
||||||
|
exp_end = false;
|
||||||
|
}
|
||||||
|
~Experiments()
|
||||||
|
{
|
||||||
|
test_file.close();
|
||||||
|
}
|
||||||
|
bool end() const
|
||||||
|
{
|
||||||
|
return exp_end;
|
||||||
|
}
|
||||||
|
bool is_next()
|
||||||
|
{
|
||||||
|
while (std::getline(test_file, line) && line[0] == '#');
|
||||||
|
if (test_file.eof()) {
|
||||||
|
exp_end = true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
Experiment next()
|
||||||
|
{
|
||||||
|
return parse_experiment(line);
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
std::tuple<float, float, float, int, std::string> parse_header(const std::string& line)
|
||||||
|
{
|
||||||
|
std::istringstream iss(line);
|
||||||
|
std::string from_, to_, step_, n_bins, strategy;
|
||||||
|
iss >> from_ >> to_ >> step_ >> n_bins >> strategy;
|
||||||
|
return { std::stof(from_), std::stof(to_), std::stof(step_), std::stoi(n_bins), strategy };
|
||||||
|
}
|
||||||
|
template <typename T>
|
||||||
|
std::vector<T> parse_vector(const std::string& line)
|
||||||
|
{
|
||||||
|
std::istringstream iss(line);
|
||||||
|
std::vector<T> data;
|
||||||
|
std::string d;
|
||||||
|
while (iss >> d) {
|
||||||
|
data.push_back(std::is_same<T, float>::value ? std::stof(d) : std::stoi(d));
|
||||||
|
}
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
Experiment parse_experiment(std::string& line)
|
||||||
|
{
|
||||||
|
auto [from_, to_, step_, n_bins, strategy] = parse_header(line);
|
||||||
|
std::getline(test_file, line);
|
||||||
|
auto data_discretized = parse_vector<int>(line);
|
||||||
|
std::getline(test_file, line);
|
||||||
|
auto cutpoints = parse_vector<float>(line);
|
||||||
|
return Experiment{ from_, to_, step_, n_bins, strategy, data_discretized, cutpoints };
|
||||||
|
}
|
||||||
|
std::ifstream test_file;
|
||||||
|
std::string filename;
|
||||||
|
std::string line;
|
||||||
|
bool exp_end;
|
||||||
|
};
|
||||||
|
/**
 * Prints `title`, a colon, and the elements of `data` separated by ", " on a
 * single line of std::cout, then ends the line and flushes.
 */
template <typename T>
void show_vector(const std::vector<T>& data, const std::string& title)
{
    std::cout << title << ": ";
    // Empty before the first element, ", " before every later one.
    const char* sep = "";
    for (const auto& d : data) {
        std::cout << sep << d;
        sep = ", ";
    }
    std::cout << std::endl;
}
|
||||||
|
#endif
|
35
tests/datasets/tests.txt
Normal file
35
tests/datasets/tests.txt
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
#
|
||||||
|
# from, to, step, #bins, Q/U
|
||||||
|
# discretized data
|
||||||
|
# cut points
|
||||||
|
#
|
||||||
|
0, 100, 1, 4, Q
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
|
||||||
|
0.0, 24.75, 49.5, 74.25, 99.0
|
||||||
|
0, 50, 1, 4, Q
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
|
||||||
|
0.0, 12.25, 24.5, 36.75, 49.0
|
||||||
|
0, 100, 1, 3, Q
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
||||||
|
0.0, 33.0, 66.0, 99.0
|
||||||
|
0, 50, 1, 3, Q
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
||||||
|
0.0, 16.33333, 32.66667, 49.0
|
||||||
|
0, 10, 1, 3, Q
|
||||||
|
0, 0, 0, 0, 1, 1, 1, 2, 2, 2
|
||||||
|
0.0, 3.0, 6.0, 9.0
|
||||||
|
0, 100, 1, 4, U
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
|
||||||
|
0.0, 24.75, 49.5, 74.25, 99.0
|
||||||
|
0, 50, 1, 4, U
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
|
||||||
|
0.0, 12.25, 24.5, 36.75, 49.0
|
||||||
|
0, 100, 1, 3, U
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
||||||
|
0.0, 33.0, 66.0, 99.0
|
||||||
|
0, 50, 1, 3, U
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
||||||
|
0.0, 16.33333, 32.66667, 49.0
|
||||||
|
0, 10, 1, 3, U
|
||||||
|
0, 0, 0, 1, 1, 1, 2, 2, 2, 2
|
||||||
|
0.0, 3.0, 6.0, 9.0
|
32
tests/k.cpp
Normal file
32
tests/k.cpp
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
#include <iostream>
|
||||||
|
#include <vector>
|
||||||
|
#include <algorithm> // For std::lower_bound
|
||||||
|
|
||||||
|
// For every value in `data`, returns the index of the first element of `cuts`
// that is not less than the value (cuts.size() if there is none).
// `cuts` must be sorted in ascending order, as std::lower_bound requires.
std::vector<int> searchsorted(const std::vector<float>& cuts, const std::vector<float>& data) {
    std::vector<int> indices(data.size());
    std::transform(data.begin(), data.end(), indices.begin(), [&cuts](float value) {
        // First position in `cuts` where `value` could be inserted while
        // keeping the order.
        const auto it = std::lower_bound(cuts.begin(), cuts.end(), value);
        return static_cast<int>(it - cuts.begin());
    });
    return indices;
}
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
std::vector<float> cuts = { 10.0 };
|
||||||
|
std::vector<float> data = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 };
|
||||||
|
|
||||||
|
std::vector<int> result = searchsorted(cuts, data);
|
||||||
|
|
||||||
|
for (int idx : result) {
|
||||||
|
std::cout << idx << " ";
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
102
tests/t.cpp
Normal file
102
tests/t.cpp
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
#include <iostream>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cmath>
|
||||||
|
#include <vector>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
typedef float precision_t;
|
||||||
|
|
||||||
|
// Maps every item of `data` to the index of the first cut point that is not
// less than it (cutPoints.size() when the item is above all cut points).
// `cutPoints` must be sorted ascending; it is taken by const reference so the
// vector is not copied on every call.
std::vector<int> transform(const std::vector<float>& cutPoints, const std::vector<float>& data)
{
    std::vector<int> discretizedData;
    discretizedData.reserve(data.size());
    for (const float item : data) {
        const auto upper = std::lower_bound(cutPoints.begin(), cutPoints.end(), item);
        discretizedData.push_back(static_cast<int>(upper - cutPoints.begin()));
    }
    return discretizedData;
}
|
||||||
|
// Prints `title`, a colon, and the elements of `data` separated by ", " on a
// single line of std::cout, then ends the line and flushes.
template <typename T>
void show_vector(const std::vector<T>& data, const std::string& title)
{
    std::cout << title << ": ";
    // Empty before the first element, ", " before every later one.
    const char* sep = "";
    for (const auto& d : data) {
        std::cout << sep << d;
        sep = ", ";
    }
    std::cout << std::endl;
}
|
||||||
|
// Returns the first num - 1 points of an evenly spaced grid of `num` points
// over [start, end]; the end point itself is not emitted.
// Special cases:
//   * start == end  -> { start, end }
//   * num < 2       -> empty vector (no spacing can be derived)
std::vector<precision_t> linspace(precision_t start, precision_t end, int num)
{
    if (start == end) {
        return { start, end };
    }
    std::vector<precision_t> linspc;
    // Guard before converting to an unsigned count: with num <= 0 the
    // expression num - 1 would wrap to a huge bound when compared to size_t.
    if (num < 2) {
        return linspc;
    }
    const precision_t delta = (end - start) / static_cast<precision_t>(num - 1);
    const size_t count = static_cast<size_t>(num - 1);
    linspc.reserve(count);
    for (size_t i = 0; i < count; ++i) {
        linspc.push_back(start + delta * static_cast<precision_t>(i));
    }
    return linspc;
}
|
||||||
|
// Limits n to the range [lower, upper]: values above `upper` become `upper`,
// and the result is never below `lower`.
size_t clip(const size_t n, size_t lower, size_t upper)
{
    const size_t capped = (upper < n) ? upper : n;
    return (lower < capped) ? capped : lower;
}
|
||||||
|
std::vector<precision_t> percentile(std::vector<precision_t>& data, std::vector<precision_t>& percentiles)
|
||||||
|
{
|
||||||
|
// Implementation taken from https://dpilger26.github.io/NumCpp/doxygen/html/percentile_8hpp_source.html
|
||||||
|
std::vector<precision_t> results;
|
||||||
|
results.reserve(percentiles.size());
|
||||||
|
for (auto percentile : percentiles) {
|
||||||
|
const size_t i = static_cast<size_t>(std::floor(static_cast<double>(data.size() - 1) * percentile / 100.));
|
||||||
|
const auto indexLower = clip(i, 0, data.size() - 2);
|
||||||
|
const double percentI = static_cast<double>(indexLower) / static_cast<double>(data.size() - 1);
|
||||||
|
const double fraction =
|
||||||
|
(percentile / 100.0 - percentI) /
|
||||||
|
(static_cast<double>(indexLower + 1) / static_cast<double>(data.size() - 1) - percentI);
|
||||||
|
const auto value = data[indexLower] + (data[indexLower + 1] - data[indexLower]) * fraction;
|
||||||
|
if (value != results.back())
|
||||||
|
results.push_back(value);
|
||||||
|
}
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
int main()
|
||||||
|
{
|
||||||
|
// std::vector<float> test;
|
||||||
|
// std::vector<float> cuts = { 0, 24.75, 49.5, 74.25, 10000 };
|
||||||
|
// for (int i = 0; i < 100; ++i) {
|
||||||
|
// test.push_back(i);
|
||||||
|
// }
|
||||||
|
// auto Xt = transform(cuts, test);
|
||||||
|
// show_vector(Xt, "Discretized data:");
|
||||||
|
// std::vector<float> test2 = { 0,1,2,3,4,5,6,7,8,9,10,11 };
|
||||||
|
// std::vector<float> cuts2 = { 0,1,2,3,4,5,6,7,8,9 };
|
||||||
|
// auto Xt2 = transform(cuts2, test2);
|
||||||
|
// show_vector(Xt2, "discretized data2: ");
|
||||||
|
auto quantiles = linspace(0.0, 100.0, 3 + 1);
|
||||||
|
std::vector<float> data = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
|
||||||
|
std::vector<float> cutPoints;
|
||||||
|
std::sort(data.begin(), data.end());
|
||||||
|
cutPoints = percentile(data, quantiles);
|
||||||
|
cutPoints.push_back(std::numeric_limits<precision_t>::max());
|
||||||
|
data.push_back(15);
|
||||||
|
data.push_back(0);
|
||||||
|
cutPoints.pop_back();
|
||||||
|
cutPoints.erase(cutPoints.begin());
|
||||||
|
cutPoints.clear();
|
||||||
|
cutPoints.push_back(9.0);
|
||||||
|
auto Xt = transform(cutPoints, data);
|
||||||
|
show_vector(data, "Original data");
|
||||||
|
show_vector(Xt, "Discretized data");
|
||||||
|
show_vector(cutPoints, "Cutpoints");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
n_bins = 3
|
||||||
|
data = [1,2,3,4,5,6,7,8,9,10]
|
||||||
|
quantiles = np.linspace(0, 100, n_bins + 1)
|
||||||
|
bin_edges = np.percentile(data, quantiles)
|
||||||
|
|
||||||
|
*/
|
39
tests/tests_do.py
Normal file
39
tests/tests_do.py
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
# Cross-checks datasets/tests.txt against scikit-learn's KBinsDiscretizer.
#
# The file holds three-line records (header "from, to, step, #bins, Q/U",
# discretized data, cut points); lines starting with '#' are comments.
# Every mismatch is printed; the script exits with status 1 if any was found.
import sys

from sklearn.preprocessing import KBinsDiscretizer

with open("datasets/tests.txt") as f:
    lines = f.readlines()

# Drop comments and blank lines so the three-line grouping cannot drift.
data = [x.strip() for x in lines if x[0] != "#" and x.strip()]
if len(data) % 3 != 0:
    sys.exit(
        f"datasets/tests.txt: expected 3 lines per experiment, found {len(data)} data lines"
    )

mismatches = 0
for i in range(0, len(data), 3):
    print("Experiment:", data[i])
    from_, to_, step_, n_bins_, strategy_ = data[i].split(",")
    strategy = "quantile" if strategy_.strip() == "Q" else "uniform"
    disc = KBinsDiscretizer(
        n_bins=int(n_bins_),
        encode="ordinal",
        strategy=strategy,
    )
    X = [[float(x)] for x in range(int(from_), int(to_), int(step_))]
    # fit and transform are called separately (not fit_transform).
    disc.fit(X)
    result = [int(x) for x in disc.transform(X).flatten()]
    expected = [int(x) for x in data[i + 1].split(",")]
    assert len(result) == len(expected)
    for j, (got, want) in enumerate(zip(result, expected)):
        if got != want:
            mismatches += 1
            print("Error at", j, "Expected=", want, "Result=", got)
    # Here scikit-learn's edges are the reference and the file is under test.
    expected_cuts = disc.bin_edges_[0]
    computed_cuts = [float(x) for x in data[i + 2].split(",")]
    assert len(expected_cuts) == len(computed_cuts)
    for j, (want, got) in enumerate(zip(expected_cuts, computed_cuts)):
        # The file stores cut points rounded to 5 decimals.
        if round(want, 5) != got:
            mismatches += 1
            print(
                "Error at",
                j,
                "Expected=",
                want,
                "Result=",
                got,
            )

# Report failure to the caller instead of always exiting with status 0.
if mismatches:
    sys.exit(1)
|
85
tests/tests_generate.ipynb
Normal file
85
tests/tests_generate.ipynb
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.preprocessing import KBinsDiscretizer"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"experiments = [\n",
|
||||||
|
" [0, 100, 1, 4, \"Q\"],\n",
|
||||||
|
" [0, 50, 1, 4, \"Q\"],\n",
|
||||||
|
" [0, 100, 1, 3, \"Q\"],\n",
|
||||||
|
" [0, 50, 1, 3, \"Q\"],\n",
|
||||||
|
" [0, 10, 1, 3, \"Q\"],\n",
|
||||||
|
" [0, 100, 1, 4, \"U\"],\n",
|
||||||
|
" [0, 50, 1, 4, \"U\"],\n",
|
||||||
|
" [0, 100, 1, 3, \"U\"],\n",
|
||||||
|
" [0, 50, 1, 3, \"U\"],\n",
|
||||||
|
" [0, 10, 1, 3, \"U\"],\n",
|
||||||
|
"]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"with open(\"datasets/tests.txt\", \"w\") as file:\n",
|
||||||
|
" file.write(\"#\\n\")\n",
|
||||||
|
" file.write(\"# from, to, step, #bins, Q/U\\n\")\n",
|
||||||
|
" file.write(\"# discretized data\\n\")\n",
|
||||||
|
" file.write(\"# cut points\\n\")\n",
|
||||||
|
" file.write(\"#\\n\")\n",
|
||||||
|
" for experiment in experiments:\n",
|
||||||
|
" (from_, to_, step_, bins_, strategy) = experiment\n",
|
||||||
|
" disc = KBinsDiscretizer(n_bins=bins_, encode='ordinal', strategy='quantile' if strategy.strip() == \"Q\" else 'uniform')\n",
|
||||||
|
" data = [[x] for x in range(from_, to_, step_)]\n",
|
||||||
|
" disc.fit(data)\n",
|
||||||
|
" result = disc.transform(data)\n",
|
||||||
|
" file.write(f\"{from_}, {to_}, {step_}, {bins_}, {strategy}\\n\")\n",
|
||||||
|
" sep = \"\"\n",
|
||||||
|
" for res in result:\n",
|
||||||
|
" file.write(f\"{sep}{int(res):d}\")\n",
|
||||||
|
" sep= \", \"\n",
|
||||||
|
" file.write(\"\\n\")\n",
|
||||||
|
" sep = \"\"\n",
|
||||||
|
" for res in disc.bin_edges_[0]:\n",
|
||||||
|
" file.write(sep + str(round(res,5)))\n",
|
||||||
|
" sep = \", \"\n",
|
||||||
|
" file.write(\"\\n\")"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "base",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.8"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
Reference in New Issue
Block a user