diff --git a/.gitignore b/.gitignore
index 8934161..055db60 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,6 +31,8 @@
 *.out
 *.app
 **/build
+build_Debug
+build_Release
 **/lcoverage
 .idea
 cmake-*
diff --git a/BinDisc.cpp b/BinDisc.cpp
new file mode 100644
index 0000000..4865fdc
--- /dev/null
+++ b/BinDisc.cpp
@@ -0,0 +1,123 @@
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include "BinDisc.h"
+#include <stdexcept>
+#include <vector>
+
+namespace mdlp {
+
+    // Build a discretizer with n_bins bins; fewer than 3 bins is rejected with std::invalid_argument.
+    BinDisc::BinDisc(int n_bins, strategy_t strategy) : n_bins{ n_bins }, strategy{ strategy }
+    {
+        if (n_bins < 3) {
+            throw std::invalid_argument("n_bins must be greater than 2");
+        }
+    }
+    BinDisc::~BinDisc() = default;
+    // Compute the cut points of X with the configured strategy.
+    // An empty X produces a single cut point (the largest precision_t value).
+    void BinDisc::fit(samples_t& X)
+    {
+        cutPoints.clear();
+        if (X.empty()) {
+            cutPoints.push_back(std::numeric_limits<precision_t>::max());
+            return;
+        }
+        if (strategy == strategy_t::QUANTILE) {
+            fit_quantile(X);
+        } else if (strategy == strategy_t::UNIFORM) {
+            fit_uniform(X);
+        }
+    }
+    // Return the first num - 1 of num evenly spaced values in [start, end], or { 0 } when start == end.
+    std::vector<precision_t> linspace(precision_t start, precision_t end, int num)
+    {
+        // Doesn't include end point as it is not needed
+        if (start == end) {
+            return { 0 };
+        }
+        precision_t delta = (end - start) / static_cast<precision_t>(num - 1);
+        std::vector<precision_t> linspc;
+        // Signed counter: with an unsigned one, num - 1 is converted to unsigned and wraps when num < 1
+        for (int i = 0; i < num - 1; ++i) {
+            precision_t val = start + delta * static_cast<precision_t>(i);
+            linspc.push_back(val);
+        }
+        return linspc;
+    }
+    // Clamp n to the closed range [lower, upper].
+    size_t clip(const size_t n, size_t lower, size_t upper)
+    {
+        return std::max(lower, std::min(n, upper));
+    }
+    // Linearly interpolated percentiles (given as 0..100) of data; fit_quantile sorts data before calling.
+    // A value equal to the one stored just before it is skipped, so the result may be shorter than percentiles.
+    std::vector<precision_t> percentile(samples_t& data, std::vector<precision_t>& percentiles)
+    {
+        // Implementation taken from https://dpilger26.github.io/NumCpp/doxygen/html/percentile_8hpp_source.html
+        std::vector<precision_t> results;
+        results.reserve(percentiles.size());
+        if (data.size() < 2) {
+            // Nothing to interpolate between (and size() - 2 below would wrap around)
+            if (!data.empty()) {
+                results.push_back(data.front());
+            }
+            return results;
+        }
+        for (auto percentile : percentiles) {
+            const size_t i = static_cast<size_t>(std::floor(static_cast<double>(data.size() - 1) * percentile / 100.));
+            // Clip to size - 2 so that data[indexLower + 1] is always a valid element
+            const auto indexLower = clip(i, 0, data.size() - 2);
+            const double percentI = static_cast<double>(indexLower) / static_cast<double>(data.size() - 1);
+            const double fraction =
+                (percentile / 100.0 - percentI) /
+                (static_cast<double>(indexLower + 1) / static_cast<double>(data.size() - 1) - percentI);
+            // Round to the stored type before comparing so duplicates are detected on what is actually kept
+            const auto value = static_cast<precision_t>(data[indexLower] + (data[indexLower + 1] - data[indexLower]) * fraction);
+            // back() on an empty vector is undefined behavior, so the first value is always stored
+            if (results.empty() || value != results.back())
+                results.push_back(value);
+        }
+        return results;
+    }
+    // Cut points at the i * 100 / n_bins percentiles of a sorted copy of X; X itself is left untouched.
+    void BinDisc::fit_quantile(samples_t& X)
+    {
+        auto quantiles = linspace(0.0, 100.0, n_bins + 1);
+        auto data = X;
+        std::sort(data.begin(), data.end());
+        if (data.front() == data.back() || data.size() == 1) {
+            // if X is constant
+            cutPoints.push_back(std::numeric_limits<precision_t>::max());
+            return;
+        }
+        cutPoints = percentile(data, quantiles);
+        normalizeCutPoints();
+    }
+    // Cut points that split [min(X), max(X)] into n_bins intervals of equal width.
+    void BinDisc::fit_uniform(samples_t& X)
+    {
+        auto minmax = std::minmax_element(X.begin(), X.end());
+        cutPoints = linspace(*minmax.first, *minmax.second, n_bins + 1);
+        normalizeCutPoints();
+    }
+    void BinDisc::normalizeCutPoints()
+    {
+        // Add max value to the end
+        cutPoints.push_back(std::numeric_limits<precision_t>::max());
+        // Remove first as it is not needed
+        cutPoints.erase(cutPoints.begin());
+    }
+    // Map every sample of X to the index of the first cut point greater than it.
+    labels_t& BinDisc::transform(const samples_t& X)
+    {
+        discretizedData.clear();
+        discretizedData.reserve(X.size());
+        for (const precision_t& item : X) {
+            auto upper = std::upper_bound(cutPoints.begin(), cutPoints.end(), item);
+            discretizedData.push_back(upper - cutPoints.begin());
+        }
+        return discretizedData;
+    }
+}
diff --git a/BinDisc.h b/BinDisc.h
new file mode 100644
index 0000000..b0ab432
--- /dev/null
+++ b/BinDisc.h
@@ -0,0 +1,31 @@
+#ifndef BINDISC_H
+#define BINDISC_H
+
+#include "typesFImdlp.h"
+#include <string>
+
+namespace mdlp {
+
+    enum class strategy_t {
+        UNIFORM,
+        QUANTILE
+    };
+    class BinDisc {
+    public:
+        BinDisc(int n_bins = 3, strategy_t strategy = strategy_t::UNIFORM);
+        ~BinDisc();
+        void fit(samples_t&);
+        inline cutPoints_t getCutPoints() const { return cutPoints; };
+        labels_t& transform(const samples_t&);
+        static inline std::string version() { return "1.0.0"; };
+    private:
+        void fit_uniform(samples_t&);
+        void fit_quantile(samples_t&);
+        void normalizeCutPoints();
+        int n_bins;
+        strategy_t strategy;
+        labels_t discretizedData = labels_t();
+        cutPoints_t cutPoints;
+    };
+}
+#endif
diff --git a/CPPFImdlp.cpp b/CPPFImdlp.cpp
index 0b3d184..4b62222 100644
--- a/CPPFImdlp.cpp
+++ b/CPPFImdlp.cpp
@@ -3,7 +3,6 @@
 #include <set>
 #include <cmath>
 #include "CPPFImdlp.h"
-#include "Metrics.h"
 
 namespace mdlp {
 
@@ -178,7 +177,7 @@ namespace mdlp {
     indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_)
     {
         indices_t idx(X_.size());
-        iota(idx.begin(), idx.end(), 0);
+        std::iota(idx.begin(), idx.end(), 0);
         stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) {
             if (X_[i1] == X_[i2])
                 return y_[i1] < y_[i2];
@@ -214,7 +213,7 @@ namespace mdlp {
         discretizedData.clear();
         discretizedData.reserve(data.size());
         for (const precision_t& item : data) {
-            auto upper = upper_bound(cutPoints.begin(), cutPoints.end(), item);
+            auto upper = std::upper_bound(cutPoints.begin(), cutPoints.end(), item);
             discretizedData.push_back(upper - cutPoints.begin());
         }
         return discretizedData;
diff --git a/CPPFImdlp.h b/CPPFImdlp.h
index 1fb0cab..4286c9e 100644
--- a/CPPFImdlp.h
+++ b/CPPFImdlp.h
@@ -2,13 +2,22 @@
 #define CPPFIMDLP_H
 
 #include "typesFImdlp.h"
-#include "Metrics.h"
 #include <limits>
 #include <utility>
 #include <string>
+#include "Metrics.h"
 
 namespace mdlp {
     class CPPFImdlp {
+    public:
+        CPPFImdlp();
+        CPPFImdlp(size_t, int, float);
+        ~CPPFImdlp();
+        void fit(samples_t&, labels_t&);
+        inline cutPoints_t getCutPoints() const { return cutPoints; };
+        labels_t& transform(const samples_t&);
+        inline int get_depth() const { return depth; };
+        static inline std::string version() { return "1.1.3"; };
     protected:
         size_t min_length = 3;
         int depth = 0;
@@ -21,25 +30,13 @@ namespace mdlp {
         cutPoints_t cutPoints;
         size_t num_cut_points = numeric_limits<size_t>::max();
         labels_t discretizedData = labels_t();
-
         static indices_t sortIndices(samples_t&, labels_t&);
-
         void computeCutPoints(size_t, size_t, int);
         void resizeCutPoints();
         bool mdlp(size_t, size_t, size_t);
         size_t getCandidate(size_t, size_t);
         size_t compute_max_num_cut_points() const;
         pair<precision_t, size_t> valueCutPoint(size_t, size_t, size_t);
-
-    public:
-        CPPFImdlp();
-        CPPFImdlp(size_t, int, float);
-        ~CPPFImdlp();
-        void fit(samples_t&, labels_t&);
-        inline cutPoints_t getCutPoints() const { return cutPoints; };
-        labels_t& transform(const samples_t&);
-        inline int get_depth() const { return depth; };
-        static inline string version() { return "1.1.2"; };
     };
 }
 #endif
diff --git a/sonar-project.properties b/sonar-project.properties
index f2ef3e8..ab13fcc 100644
--- a/sonar-project.properties
+++ b/sonar-project.properties
@@ -3,7 +3,7 @@ sonar.organization=rmontanana
 
 # This is the name and version displayed in the SonarCloud UI.
 sonar.projectName=mdlp
-sonar.projectVersion=1.0.2
+sonar.projectVersion=1.1.3
 # sonar.test.exclusions=tests/**
 # sonar.tests=tests/
 # sonar.coverage.exclusions=tests/**,sample/**
diff --git a/tests/BinDisc_unittest.cpp b/tests/BinDisc_unittest.cpp
new file mode 100644
index 0000000..f6e78dc
--- /dev/null
+++ b/tests/BinDisc_unittest.cpp
@@ -0,0 +1,347 @@
+#include <fstream>
+#include <string>
+#include <iostream>
+#include "gtest/gtest.h"
+#include "ArffFiles.h"
+#include "../BinDisc.h"
+
+namespace mdlp {
+    const float margin = 1e-4;
+    static std::string set_data_path()
+    {
+        std::string path = "../datasets/";
+        std::ifstream file(path + "iris.arff");
+        if (file.is_open()) {
+            file.close();
+            return path;
+        }
+        return "../../tests/datasets/";
+    }
+    const std::string data_path = set_data_path();
+    class TestBinDisc3U : public BinDisc, public testing::Test {
+    public:
+        TestBinDisc3U(int n_bins = 3) : BinDisc(n_bins, strategy_t::UNIFORM) {};
+    };
+    class TestBinDisc3Q : public BinDisc, public testing::Test {
+    public:
+        TestBinDisc3Q(int n_bins = 3) : BinDisc(n_bins, strategy_t::QUANTILE) {};
+    };
+    class TestBinDisc4U : public BinDisc, public testing::Test {
+    public:
+        TestBinDisc4U(int n_bins = 4) : BinDisc(n_bins, strategy_t::UNIFORM) {};
+    };
+    class TestBinDisc4Q : public BinDisc, public testing::Test {
+    public:
+        TestBinDisc4Q(int n_bins = 4) : BinDisc(n_bins, strategy_t::QUANTILE) {};
+    };
+    TEST_F(TestBinDisc3U, Easy3BinsUniform)
+    {
+        samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_NEAR(3.66667, cuts[0], margin);
+        EXPECT_NEAR(6.33333, cuts[1], margin);
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
+        EXPECT_EQ(3, cuts.size());
+        auto labels = transform(X);
+        labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2 };
+        EXPECT_EQ(expected, labels);
+    }
+    TEST_F(TestBinDisc3Q, Easy3BinsQuantile)
+    {
+        samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_NEAR(3.666667, cuts[0], margin);
+        EXPECT_NEAR(6.333333, cuts[1], margin);
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
+        EXPECT_EQ(3, cuts.size());
+        auto labels = transform(X);
+        labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2 };
+        EXPECT_EQ(expected, labels);
+    }
+    TEST_F(TestBinDisc3U, X10BinsUniform)
+    {
+        samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 };
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_EQ(4.0, cuts[0]);
+        EXPECT_EQ(7.0, cuts[1]);
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
+        EXPECT_EQ(3, cuts.size());
+        auto labels = transform(X);
+        labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
+        EXPECT_EQ(expected, labels);
+    }
+    TEST_F(TestBinDisc3Q, X10BinsQuantile)
+    {
+        samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 };
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_EQ(4, cuts[0]);
+        EXPECT_EQ(7, cuts[1]);
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
+        EXPECT_EQ(3, cuts.size());
+        auto labels = transform(X);
+        labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
+        EXPECT_EQ(expected, labels);
+    }
+    TEST_F(TestBinDisc3U, X11BinsUniform)
+    {
+        samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 };
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_NEAR(4.33333, cuts[0], margin);
+        EXPECT_NEAR(7.66667, cuts[1], margin);
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
+        EXPECT_EQ(3, cuts.size());
+        auto labels = transform(X);
+        labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
+        EXPECT_EQ(expected, labels);
+    }
+    TEST_F(TestBinDisc3Q, X11BinsQuantile)
+    {
+        samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 };
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_NEAR(4.33333, cuts[0], margin);
+        EXPECT_NEAR(7.66667, cuts[1], margin);
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
+        EXPECT_EQ(3, cuts.size());
+        auto labels = transform(X);
+        labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
+        EXPECT_EQ(expected, labels);
+    }
+    TEST_F(TestBinDisc3U, ConstantUniform)
+    {
+        samples_t X = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
+        EXPECT_EQ(1, cuts.size());
+        auto labels = transform(X);
+        labels_t expected = { 0, 0, 0, 0, 0, 0 };
+        EXPECT_EQ(expected, labels);
+    }
+    TEST_F(TestBinDisc3Q, ConstantQuantile)
+    {
+        samples_t X = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
+        EXPECT_EQ(1, cuts.size());
+        auto labels = transform(X);
+        labels_t expected = { 0, 0, 0, 0, 0, 0 };
+        EXPECT_EQ(expected, labels);
+    }
+    TEST_F(TestBinDisc3U, EmptyUniform)
+    {
+        samples_t X = {};
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
+        EXPECT_EQ(1, cuts.size());
+    }
+    TEST_F(TestBinDisc3Q, EmptyQuantile)
+    {
+        samples_t X = {};
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
+        EXPECT_EQ(1, cuts.size());
+    }
+    TEST(TestBinDisc3, ExceptionNumberBins)
+    {
+        EXPECT_THROW(BinDisc(2), std::invalid_argument);
+    }
+    TEST_F(TestBinDisc3U, EasyRepeated)
+    {
+        samples_t X = { 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0 };
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_NEAR(1.66667, cuts[0], margin);
+        EXPECT_NEAR(2.33333, cuts[1], margin);
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
+        EXPECT_EQ(3, cuts.size());
+        auto labels = transform(X);
+        labels_t expected = { 2, 0, 0, 2, 0, 0, 2, 0, 0 };
+        EXPECT_EQ(expected, labels);
+        EXPECT_EQ(3.0, X[0]); // X is not modified
+    }
+    TEST_F(TestBinDisc3Q, EasyRepeated)
+    {
+        samples_t X = { 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0 };
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_NEAR(1.66667, cuts[0], margin);
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[1]);
+        EXPECT_EQ(2, cuts.size());
+        auto labels = transform(X);
+        labels_t expected = { 1, 0, 0, 1, 0, 0, 1, 0, 0 };
+        EXPECT_EQ(expected, labels);
+        EXPECT_EQ(3.0, X[0]); // X is not modified
+    }
+    TEST_F(TestBinDisc4U, Easy4BinsUniform)
+    {
+        samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 };
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_EQ(3.75, cuts[0]);
+        EXPECT_EQ(6.5, cuts[1]);
+        EXPECT_EQ(9.25, cuts[2]);
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
+        EXPECT_EQ(4, cuts.size());
+        auto labels = transform(X);
+        labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 };
+        EXPECT_EQ(expected, labels);
+    }
+    TEST_F(TestBinDisc4Q, Easy4BinsQuantile)
+    {
+        samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 };
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_EQ(3.75, cuts[0]);
+        EXPECT_EQ(6.5, cuts[1]);
+        EXPECT_EQ(9.25, cuts[2]);
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
+        EXPECT_EQ(4, cuts.size());
+        auto labels = transform(X);
+        labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 };
+        EXPECT_EQ(expected, labels);
+    }
+    TEST_F(TestBinDisc4U, X13BinsUniform)
+    {
+        samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0 };
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_EQ(4.0, cuts[0]);
+        EXPECT_EQ(7.0, cuts[1]);
+        EXPECT_EQ(10.0, cuts[2]);
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
+        EXPECT_EQ(4, cuts.size());
+        auto labels = transform(X);
+        labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
+        EXPECT_EQ(expected, labels);
+    }
+    TEST_F(TestBinDisc4Q, X13BinsQuantile)
+    {
+        samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0 };
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_EQ(4.0, cuts[0]);
+        EXPECT_EQ(7.0, cuts[1]);
+        EXPECT_EQ(10.0, cuts[2]);
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
+        EXPECT_EQ(4, cuts.size());
+        auto labels = transform(X);
+        labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
+        EXPECT_EQ(expected, labels);
+    }
+    TEST_F(TestBinDisc4U, X14BinsUniform)
+    {
+        samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 };
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_EQ(4.25, cuts[0]);
+        EXPECT_EQ(7.5, cuts[1]);
+        EXPECT_EQ(10.75, cuts[2]);
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
+        EXPECT_EQ(4, cuts.size());
+        auto labels = transform(X);
+        labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
+        EXPECT_EQ(expected, labels);
+    }
+    TEST_F(TestBinDisc4Q, X14BinsQuantile)
+    {
+        samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 };
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_EQ(4.25, cuts[0]);
+        EXPECT_EQ(7.5, cuts[1]);
+        EXPECT_EQ(10.75, cuts[2]);
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
+        EXPECT_EQ(4, cuts.size());
+        auto labels = transform(X);
+        labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
+        EXPECT_EQ(expected, labels);
+    }
+    TEST_F(TestBinDisc4U, X15BinsUniform)
+    {
+        samples_t X = { 15.0, 8.0, 12.0, 14.0, 6.0, 1.0, 13.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0 };
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_EQ(4.5, cuts[0]);
+        EXPECT_EQ(8, cuts[1]);
+        EXPECT_EQ(11.5, cuts[2]);
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
+        EXPECT_EQ(4, cuts.size());
+        auto labels = transform(X);
+        labels_t expected = { 3, 2, 3, 3, 1, 0, 3, 2, 2, 2, 1, 0, 0, 1, 0 };
+        EXPECT_EQ(expected, labels);
+    }
+    TEST_F(TestBinDisc4Q, X15BinsQuantile)
+    {
+        samples_t X = { 15.0, 13.0, 12.0, 14.0, 6.0, 1.0, 8.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0 };
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_EQ(4.5, cuts[0]);
+        EXPECT_EQ(8, cuts[1]);
+        EXPECT_EQ(11.5, cuts[2]);
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
+        EXPECT_EQ(4, cuts.size());
+        auto labels = transform(X);
+        labels_t expected = { 3, 3, 3, 3, 1, 0, 2, 2, 2, 2, 1, 0, 0, 1, 0 };
+        EXPECT_EQ(expected, labels);
+    }
+    TEST_F(TestBinDisc4U, RepeatedValuesUniform)
+    {
+        samples_t X = { 0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0 };
+        //              0    1    2    3    4    5    6    7    8    9
+        fit(X);
+        auto cuts = getCutPoints();
+        EXPECT_EQ(1.0, cuts[0]);
+        EXPECT_EQ(2.0, cuts[1]);
+        EXPECT_EQ(3.0, cuts[2]);
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
+        EXPECT_EQ(4, cuts.size());
+        auto labels = transform(X);
+        labels_t expected = { 0, 1, 1, 1, 2, 2, 3, 3, 3, 3 };
+        EXPECT_EQ(expected, labels);
+    }
+    TEST_F(TestBinDisc4Q, RepeatedValuesQuantile)
+    {
+        samples_t X = { 0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0 };
+        //              0    1    2    3    4    5    6    7    8    9
+        fit(X);
+        auto cuts = getCutPoints();
+        // The 0/25/50/75 percentiles are 0, 1, 2, 3; only the first (the minimum) is dropped
+        EXPECT_EQ(1.0, cuts[0]);
+        EXPECT_EQ(2.0, cuts[1]);
+        EXPECT_EQ(3.0, cuts[2]);
+        EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
+        EXPECT_EQ(4, cuts.size());
+        auto labels = transform(X);
+        labels_t expected = { 0, 1, 1, 1, 2, 2, 3, 3, 3, 3 };
+        EXPECT_EQ(expected, labels);
+    }
+    TEST_F(TestBinDisc4U, irisUniform)
+    {
+        ArffFiles file;
+        file.load(data_path + "iris.arff", true);
+        vector<samples_t>& X = file.getX();
+        fit(X[0]);
+        auto Xt = transform(X[0]);
+        labels_t expected = { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 0, 1, 2, 1, 3, 2, 2, 3, 0, 3, 2, 3, 2, 2, 2, 1, 1, 2, 2, 3, 3, 1, 2, 1, 3, 2, 2, 3, 2, 1, 2, 3, 3, 3, 2, 2, 1, 3, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 };
+        EXPECT_EQ(expected, Xt);
+    }
+    TEST_F(TestBinDisc4Q, irisQuantile)
+    {
+        ArffFiles file;
+        file.load(data_path + "iris.arff", true);
+        vector<samples_t>& X = file.getX();
+        fit(X[0]);
+        auto Xt = transform(X[0]);
+        labels_t expected = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 3, 3, 3, 1, 3, 1, 2, 0, 3, 1, 0, 2, 2, 2, 1, 3, 1, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 3, 2, 1, 1, 1, 2, 2, 1, 2, 3, 2, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2, 1, 1, 2, 2, 3, 2, 3, 3, 0, 3, 3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 3, 2, 3, 1, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 2 };
+        EXPECT_EQ(expected, Xt);
+    }
+}
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 4eb31f5..a382899 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,3 +1,4 @@
+cmake_minimum_required(VERSION 3.20)
 set(CMAKE_CXX_STANDARD 11)
 
 include(FetchContent)
@@ -16,14 +17,18 @@ enable_testing()
 
 add_executable(Metrics_unittest ../Metrics.cpp Metrics_unittest.cpp)
 add_executable(FImdlp_unittest ../CPPFImdlp.cpp ArffFiles.cpp ../Metrics.cpp FImdlp_unittest.cpp)
+add_executable(BinDisc_unittest ../BinDisc.cpp ArffFiles.cpp BinDisc_unittest.cpp)
 target_link_libraries(Metrics_unittest GTest::gtest_main)
 target_link_libraries(FImdlp_unittest GTest::gtest_main)
+target_link_libraries(BinDisc_unittest GTest::gtest_main)
 target_compile_options(Metrics_unittest PRIVATE --coverage)
 target_compile_options(FImdlp_unittest PRIVATE --coverage)
+target_compile_options(BinDisc_unittest PRIVATE --coverage)
 target_link_options(Metrics_unittest PRIVATE --coverage)
 target_link_options(FImdlp_unittest PRIVATE --coverage)
+target_link_options(BinDisc_unittest PRIVATE --coverage)
 
 include(GoogleTest)
 gtest_discover_tests(Metrics_unittest)
 gtest_discover_tests(FImdlp_unittest)
-
+gtest_discover_tests(BinDisc_unittest)
diff --git a/tests/test b/tests/test
index 33fde47..552d755 100755
--- a/tests/test
+++ b/tests/test
@@ -1,3 +1,4 @@
+#!/bin/bash
 if [ -d build ] ; then
     rm -fr build
 fi
@@ -9,12 +10,9 @@ cmake --build build
 cd build
 ctest --output-on-failure
 cd ..
-if [ ! -d gcovr-report ] ; then
-    mkdir gcovr-report
-fi
-rm -fr gcovr-report/* 2>/dev/null
+mkdir -p gcovr-report
 #lcov --capture --directory ./ --output-file lcoverage/main_coverage.info
 #lcov --remove lcoverage/main_coverage.info 'v1/*' '/Applications/*' '*/tests/*' --output-file lcoverage/main_coverage.info -q
 #lcov --list lcoverage/main_coverage.info
 cd ..
-gcovr --gcov-filter "CPPFImdlp.cpp" --gcov-filter "Metrics.cpp" --txt --sonarqube=tests/gcovr-report/coverage.xml
+gcovr --gcov-filter "CPPFImdlp.cpp" --gcov-filter "Metrics.cpp" --gcov-filter "BinDisc.cpp" --txt --sonarqube=tests/gcovr-report/coverage.xml --exclude-noncode-lines
diff --git a/tests/testKbins.py b/tests/testKbins.py
new file mode 100644
index 0000000..e2f8fea
--- /dev/null
+++ b/tests/testKbins.py
@@ -0,0 +1,135 @@
+from scipy.io.arff import loadarff
+from sklearn.preprocessing import KBinsDiscretizer
+
+
+def test(clf, X, expected, title):
+    """Fit clf on the single-feature samples X, print computed and expected labels, assert they match."""
+    X = [[x] for x in X]
+    clf.fit(X)
+    computed = [int(x[0]) for x in clf.transform(X)]
+    print(f"{title}")
+    print(f"{computed=}")
+    print(f"{expected=}")
+    assert computed == expected
+    print("-" * 80)
+
+
+# Discretizers under test: 3 and 4 bins, uniform and quantile strategies
+clf3u = KBinsDiscretizer(
+    n_bins=3, encode="ordinal", strategy="uniform", subsample=200_000
+)
+clf3q = KBinsDiscretizer(
+    n_bins=3, encode="ordinal", strategy="quantile", subsample=200_000
+)
+clf4u = KBinsDiscretizer(
+    n_bins=4, encode="ordinal", strategy="uniform", subsample=200_000
+)
+clf4q = KBinsDiscretizer(
+    n_bins=4, encode="ordinal", strategy="quantile", subsample=200_000
+)
+#
+X = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
+labels = [0, 0, 0, 1, 1, 1, 2, 2, 2]
+test(clf3u, X, labels, title="Easy3BinsUniform")
+test(clf3q, X, labels, title="Easy3BinsQuantile")
+#
+X = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
+labels = [0, 0, 0, 1, 1, 1, 2, 2, 2, 2]
+# In C++ both strategies produce the same result, unlike here
+labels2 = [0, 0, 0, 1, 1, 1, 1, 2, 2, 2]
+test(clf3u, X, labels, title="X10BinsUniform")
+test(clf3q, X, labels2, title="X10BinsQuantile")
+#
+X = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]
+labels = [0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2]
+# In C++ both strategies produce the same result, unlike here
+# labels2 = [0, 0, 0, 1, 1, 1, 1, 2, 2, 2]
+test(clf3u, X, labels, title="X11BinsUniform")
+test(clf3q, X, labels, title="X11BinsQuantile")
+#
+X = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
+labels = [0, 0, 0, 0, 0, 0]
+test(clf3u, X, labels, title="ConstantUniform")
+test(clf3q, X, labels, title="ConstantQuantile")
+#
+X = [3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0]
+labels = [2, 0, 0, 2, 0, 0, 2, 0, 0]
+labels2 = [1, 0, 0, 1, 0, 0, 1, 0, 0]  # same as in C++
+test(clf3u, X, labels, title="EasyRepeatedUniform")
+test(clf3q, X, labels2, title="EasyRepeatedQuantile")
+#
+X = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]
+labels = [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
+test(clf4u, X, labels, title="Easy4BinsUniform")
+test(clf4q, X, labels, title="Easy4BinsQuantile")
+#
+X = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0]
+labels = [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
+test(clf4u, X, labels, title="X13BinsUniform")
+test(clf4q, X, labels, title="X13BinsQuantile")
+#
+X = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0]
+labels = [0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
+test(clf4u, X, labels, title="X14BinsUniform")
+test(clf4q, X, labels, title="X14BinsQuantile")
+#
+X1 = [15.0, 8.0, 12.0, 14.0, 6.0, 1.0, 13.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0]
+X2 = [15.0, 13.0, 12.0, 14.0, 6.0, 1.0, 8.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0]
+labels1 = [3, 2, 3, 3, 1, 0, 3, 2, 2, 2, 1, 0, 0, 1, 0]
+labels2 = [3, 3, 3, 3, 1, 0, 2, 2, 2, 2, 1, 0, 0, 1, 0]
+test(clf4u, X1, labels1, title="X15BinsUniform")
+test(clf4q, X2, labels2, title="X15BinsQuantile")
+#
+X = [0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0]
+labels = [0, 1, 1, 1, 2, 2, 3, 3, 3, 3]
+test(clf4u, X, labels, title="RepeatedValuesUniform")
+test(clf4q, X, labels, title="RepeatedValuesQuantile")
+
+print(f"Uniform {clf4u.bin_edges_=}")
+print(f"Quantile {clf4q.bin_edges_=}")
+print("-" * 80)
+#
+data, meta = loadarff("tests/datasets/iris.arff")
+labelsu = [
+    0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
+    1, 0, 0, 0, 1, 1, 1, 0, 1, 0,
+    1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
+    0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
+    3, 2, 2, 1, 2, 1, 2, 0, 2, 1,
+    0, 1, 1, 2, 1, 2, 1, 1, 2, 1,
+    1, 2, 2, 2, 2, 2, 2, 2, 1, 1,
+    1, 1, 1, 1, 1, 1, 2, 2, 1, 1,
+    1, 2, 1, 0, 1, 1, 1, 2, 0, 1,
+    2, 1, 3, 2, 2, 3, 0, 3, 2, 3,
+    2, 2, 2, 1, 1, 2, 2, 3, 3, 1,
+    2, 1, 3, 2, 2, 3, 2, 2, 2, 3,
+    3, 3, 2, 2, 2, 3, 2, 2, 1, 2,
+    2, 2, 1, 2, 2, 2, 2, 2, 2, 1,
+]
+labelsq = [
+    1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
+    1, 0, 0, 0, 2, 1, 1, 1, 1, 1,
+    1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
+    0, 1, 1, 1, 0, 0, 1, 0, 0, 1,
+    0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
+    3, 3, 3, 1, 3, 1, 2, 0, 3, 1,
+    0, 2, 2, 2, 1, 3, 1, 2, 2, 1,
+    2, 2, 2, 2, 3, 3, 3, 3, 2, 1,
+    1, 1, 2, 2, 1, 2, 3, 2, 1, 1,
+    1, 2, 2, 0, 1, 1, 1, 2, 1, 1,
+    2, 2, 3, 2, 3, 3, 0, 3, 3, 3,
+    3, 3, 3, 1, 2, 3, 3, 3, 3, 2,
+    3, 1, 3, 2, 3, 3, 2, 2, 3, 3,
+    3, 3, 3, 2, 2, 3, 2, 3, 2, 3,
+    3, 3, 2, 3, 3, 3, 2, 3, 2, 2,
+]
+test(clf4u, data["sepallength"], labelsu, title="IrisUniform")
+test(clf4q, data["sepallength"], labelsq, title="IrisQuantile")
+# print("Labels")
+# print(labels)
+# print("Expected")
+# print(expected)
+# for i in range(len(labels)):
+#     if labels[i] != expected[i]:
+#         print(f"Error at {i} {labels[i]} != {expected[i]}")
diff --git a/typesFImdlp.h b/typesFImdlp.h
index b28b2ca..a0b6d54 100644
--- a/typesFImdlp.h
+++ b/typesFImdlp.h
@@ -8,11 +8,11 @@
 using namespace std;
 namespace mdlp {
     typedef float precision_t;
-    typedef vector<precision_t> samples_t;
-    typedef vector<int> labels_t;
-    typedef vector<size_t> indices_t;
-    typedef vector<precision_t> cutPoints_t;
-    typedef map<pair<int, int>, precision_t> cacheEnt_t;
-    typedef map<tuple<int, int, int>, precision_t> cacheIg_t;
+    typedef std::vector<precision_t> samples_t;
+    typedef std::vector<int> labels_t;
+    typedef std::vector<size_t> indices_t;
+    typedef std::vector<precision_t> cutPoints_t;
+    typedef std::map<std::pair<int, int>, precision_t> cacheEnt_t;
+    typedef std::map<std::tuple<int, int, int>, precision_t> cacheIg_t;
 }
 #endif