Fix BinDisc quantile mistakes (#9)

* Fix BinDisc quantile mistakes * Fix FImdlp tests * Fix tests, samples and remove uneeded support files * Add coypright header to sources Fix coverage report Add coverage badge to README * Update sonar github action * Move sources to a folder and change ArffFiles files to library * Add recursive submodules to github action
2025-08-16 16:05:57 +00:00 · 2024-07-04 17:27:39 +02:00
parent 7b0673fd4b
commit e36d9af8f9
35 changed files with 1383 additions and 923 deletions
--- a/src/BinDisc.cpp
+++ b/src/BinDisc.cpp
@@ -0,0 +1,98 @@
+// ****************************************************************
+// SPDX - FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
+// SPDX - FileType: SOURCE
+// SPDX - License - Identifier: MIT
+// ****************************************************************
+
+#include <algorithm>
+#include <cmath>
+#include "BinDisc.h"
+#include <iostream>
+#include <string>
+
+namespace mdlp {
+
+    BinDisc::BinDisc(int n_bins, strategy_t strategy) :
+        Discretizer(), n_bins{ n_bins }, strategy{ strategy }
+    {
+        if (n_bins < 3) {
+            throw std::invalid_argument("n_bins must be greater than 2");
+        }
+    }
+    BinDisc::~BinDisc() = default;
+    void BinDisc::fit(samples_t& X)
+    {
+        // y is included for compatibility with the Discretizer interface
+        cutPoints.clear();
+        if (X.empty()) {
+            cutPoints.push_back(0.0);
+            cutPoints.push_back(0.0);
+            return;
+        }
+        if (strategy == strategy_t::QUANTILE) {
+            direction = bound_dir_t::RIGHT;
+            fit_quantile(X);
+        } else if (strategy == strategy_t::UNIFORM) {
+            direction = bound_dir_t::RIGHT;
+            fit_uniform(X);
+        }
+    }
+    void BinDisc::fit(samples_t& X, labels_t& y)
+    {
+        fit(X);
+    }
+    std::vector<precision_t> linspace(precision_t start, precision_t end, int num)
+    {
+        if (start == end) {
+            return { start, end };
+        }
+        precision_t delta = (end - start) / static_cast<precision_t>(num - 1);
+        std::vector<precision_t> linspc;
+        for (size_t i = 0; i < num; ++i) {
+            precision_t val = start + delta * static_cast<precision_t>(i);
+            linspc.push_back(val);
+        }
+        return linspc;
+    }
+    size_t clip(const size_t n, const size_t lower, const size_t upper)
+    {
+        return std::max(lower, std::min(n, upper));
+    }
+    std::vector<precision_t> percentile(samples_t& data, const std::vector<precision_t>& percentiles)
+    {
+        // Implementation taken from https://dpilger26.github.io/NumCpp/doxygen/html/percentile_8hpp_source.html
+        std::vector<precision_t> results;
+        bool first = true;
+        results.reserve(percentiles.size());
+        for (auto percentile : percentiles) {
+            const auto i = static_cast<size_t>(std::floor(static_cast<precision_t>(data.size() - 1) * percentile / 100.));
+            const auto indexLower = clip(i, 0, data.size() - 2);
+            const precision_t percentI = static_cast<precision_t>(indexLower) / static_cast<precision_t>(data.size() - 1);
+            const precision_t fraction =
+                (percentile / 100.0 - percentI) /
+                (static_cast<precision_t>(indexLower + 1) / static_cast<precision_t>(data.size() - 1) - percentI);
+            if (const auto value = data[indexLower] + (data[indexLower + 1] - data[indexLower]) * fraction; value != results.back() || first) // first needed as results.back() return is undefined for empty vectors
+                results.push_back(value);
+            first = false;
+        }
+        return results;
+    }
+    void BinDisc::fit_quantile(const samples_t& X)
+    {
+        auto quantiles = linspace(0.0, 100.0, n_bins + 1);
+        auto data = X;
+        std::sort(data.begin(), data.end());
+        if (data.front() == data.back() || data.size() == 1) {
+            // if X is constant, pass any two given points that shall be ignored in transform
+            cutPoints.push_back(data.front());
+            cutPoints.push_back(data.front());
+            return;
+        }
+        cutPoints = percentile(data, quantiles);
+    }
+    void BinDisc::fit_uniform(const samples_t& X)
+    {
+        auto [vmin, vmax] = std::minmax_element(X.begin(), X.end());
+        cutPoints = linspace(*vmin, *vmax, n_bins + 1);
+    }
+}
--- a/src/BinDisc.h
+++ b/src/BinDisc.h
@@ -0,0 +1,33 @@
+// ****************************************************************
+// SPDX - FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
+// SPDX - FileType: SOURCE
+// SPDX - License - Identifier: MIT
+// ****************************************************************
+
+#ifndef BINDISC_H
+#define BINDISC_H
+
+#include "typesFImdlp.h"
+#include "Discretizer.h"
+#include <string>
+
+namespace mdlp {
+    enum class strategy_t {
+        UNIFORM,
+        QUANTILE
+    };
+    class BinDisc : public Discretizer {
+    public:
+        BinDisc(int n_bins = 3, strategy_t strategy = strategy_t::UNIFORM);
+        ~BinDisc();
+        // y is included for compatibility with the Discretizer interface
+        void fit(samples_t& X_, labels_t& y) override;
+        void fit(samples_t& X);
+    private:
+        void fit_uniform(const samples_t&);
+        void fit_quantile(const samples_t&);
+        int n_bins;
+        strategy_t strategy;
+    };
+}
+#endif
--- a/src/CPPFImdlp.cpp
+++ b/src/CPPFImdlp.cpp
@@ -0,0 +1,221 @@
+// ****************************************************************
+// SPDX - FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
+// SPDX - FileType: SOURCE
+// SPDX - License - Identifier: MIT
+// ****************************************************************
+
+#include <numeric>
+#include <algorithm>
+#include <set>
+#include <cmath>
+#include "CPPFImdlp.h"
+
+namespace mdlp {
+
+    CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed) :
+        Discretizer(),
+        min_length(min_length_),
+        max_depth(max_depth_),
+        proposed_cuts(proposed)
+    {
+        direction = bound_dir_t::RIGHT;
+    }
+
+    size_t CPPFImdlp::compute_max_num_cut_points() const
+    {
+        // Set the actual maximum number of cut points as a number or as a percentage of the number of samples
+        if (proposed_cuts == 0) {
+            return numeric_limits<size_t>::max();
+        }
+        if (proposed_cuts < 0 || proposed_cuts > static_cast<precision_t>(X.size())) {
+            throw invalid_argument("wrong proposed num_cuts value");
+        }
+        if (proposed_cuts < 1)
+            return static_cast<size_t>(round(static_cast<precision_t>(X.size()) * proposed_cuts));
+        return static_cast<size_t>(proposed_cuts); // The 2 extra cutpoints should not be considered here as this parameter is considered before they are added
+    }
+
+    void CPPFImdlp::fit(samples_t& X_, labels_t& y_)
+    {
+        X = X_;
+        y = y_;
+        num_cut_points = compute_max_num_cut_points();
+        depth = 0;
+        discretizedData.clear();
+        cutPoints.clear();
+        if (X.size() != y.size()) {
+            throw invalid_argument("X and y must have the same size");
+        }
+        if (X.empty() || y.empty()) {
+            throw invalid_argument("X and y must have at least one element");
+        }
+        if (min_length < 3) {
+            throw invalid_argument("min_length must be greater than 2");
+        }
+        if (max_depth < 1) {
+            throw invalid_argument("max_depth must be greater than 0");
+        }
+        indices = sortIndices(X_, y_);
+        metrics.setData(y, indices);
+        computeCutPoints(0, X.size(), 1);
+        sort(cutPoints.begin(), cutPoints.end());
+        if (num_cut_points > 0) {
+            // Select the best (with lower entropy) cut points
+            while (cutPoints.size() > num_cut_points) {
+                resizeCutPoints();
+            }
+        }
+        // Insert first & last X value to the cutpoints as them shall be ignored in transform
+        auto [vmin, vmax] = std::minmax_element(X.begin(), X.end());
+        cutPoints.push_back(*vmax);
+        cutPoints.insert(cutPoints.begin(), *vmin);
+    }
+
+    pair<precision_t, size_t> CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end)
+    {
+        size_t n;
+        size_t m;
+        size_t idxPrev = cut - 1 >= start ? cut - 1 : cut;
+        size_t idxNext = cut + 1 < end ? cut + 1 : cut;
+        bool backWall; // true if duplicates reach beginning of the interval
+        precision_t previous;
+        precision_t actual;
+        precision_t next;
+        previous = X[indices[idxPrev]];
+        actual = X[indices[cut]];
+        next = X[indices[idxNext]];
+        // definition 2 of the paper => X[t-1] < X[t]
+        // get the first equal value of X in the interval
+        while (idxPrev > start && actual == previous) {
+            previous = X[indices[--idxPrev]];
+        }
+        backWall = idxPrev == start && actual == previous;
+        // get the last equal value of X in the interval
+        while (idxNext < end - 1 && actual == next) {
+            next = X[indices[++idxNext]];
+        }
+        // # of duplicates before cutpoint
+        n = cut - 1 - idxPrev;
+        // # of duplicates after cutpoint
+        m = idxNext - cut - 1;
+        // Decide which values to use
+        cut = cut + (backWall ? m + 1 : -n);
+        actual = X[indices[cut]];
+        return { (actual + previous) / 2, cut };
+    }
+
+    void CPPFImdlp::computeCutPoints(size_t start, size_t end, int depth_)
+    {
+        size_t cut;
+        pair<precision_t, size_t> result;
+        // Check if the interval length and the depth are Ok
+        if (end - start < min_length || depth_ > max_depth)
+            return;
+        depth = depth_ > depth ? depth_ : depth;
+        cut = getCandidate(start, end);
+        if (cut == numeric_limits<size_t>::max())
+            return;
+        if (mdlp(start, cut, end)) {
+            result = valueCutPoint(start, cut, end);
+            cut = result.second;
+            cutPoints.push_back(result.first);
+            computeCutPoints(start, cut, depth_ + 1);
+            computeCutPoints(cut, end, depth_ + 1);
+        }
+    }
+
+    size_t CPPFImdlp::getCandidate(size_t start, size_t end)
+    {
+        /* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which
+        E(A, TA; S) is minimal amongst all the candidate cut points. */
+        size_t candidate = numeric_limits<size_t>::max();
+        size_t elements = end - start;
+        bool sameValues = true;
+        precision_t entropy_left;
+        precision_t entropy_right;
+        precision_t minEntropy;
+        // Check if all the values of the variable in the interval are the same
+        for (size_t idx = start + 1; idx < end; idx++) {
+            if (X[indices[idx]] != X[indices[start]]) {
+                sameValues = false;
+                break;
+            }
+        }
+        if (sameValues)
+            return candidate;
+        minEntropy = metrics.entropy(start, end);
+        for (size_t idx = start + 1; idx < end; idx++) {
+            // Cutpoints are always on boundaries (definition 2)
+            if (y[indices[idx]] == y[indices[idx - 1]])
+                continue;
+            entropy_left = precision_t(idx - start) / static_cast<precision_t>(elements) * metrics.entropy(start, idx);
+            entropy_right = precision_t(end - idx) / static_cast<precision_t>(elements) * metrics.entropy(idx, end);
+            if (entropy_left + entropy_right < minEntropy) {
+                minEntropy = entropy_left + entropy_right;
+                candidate = idx;
+            }
+        }
+        return candidate;
+    }
+
+    bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end)
+    {
+        int k;
+        int k1;
+        int k2;
+        precision_t ig;
+        precision_t delta;
+        precision_t ent;
+        precision_t ent1;
+        precision_t ent2;
+        auto N = precision_t(end - start);
+        k = metrics.computeNumClasses(start, end);
+        k1 = metrics.computeNumClasses(start, cut);
+        k2 = metrics.computeNumClasses(cut, end);
+        ent = metrics.entropy(start, end);
+        ent1 = metrics.entropy(start, cut);
+        ent2 = metrics.entropy(cut, end);
+        ig = metrics.informationGain(start, cut, end);
+        delta = static_cast<precision_t>(log2(pow(3, precision_t(k)) - 2) -
+            (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2));
+        precision_t term = 1 / N * (log2(N - 1) + delta);
+        return ig > term;
+    }
+
+    // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
+    indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_)
+    {
+        indices_t idx(X_.size());
+        std::iota(idx.begin(), idx.end(), 0);
+        stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) {
+            if (X_[i1] == X_[i2])
+                return y_[i1] < y_[i2];
+            else
+                return X_[i1] < X_[i2];
+            });
+        return idx;
+    }
+
+    void CPPFImdlp::resizeCutPoints()
+    {
+        //Compute entropy of each of the whole cutpoint set and discards the biggest value
+        precision_t maxEntropy = 0;
+        precision_t entropy;
+        size_t maxEntropyIdx = 0;
+        size_t begin = 0;
+        size_t end;
+        for (size_t idx = 0; idx < cutPoints.size(); idx++) {
+            end = begin;
+            while (X[indices[end]] < cutPoints[idx] && end < X.size())
+                end++;
+            entropy = metrics.entropy(begin, end);
+            if (entropy > maxEntropy) {
+                maxEntropy = entropy;
+                maxEntropyIdx = idx;
+            }
+            begin = end;
+        }
+        cutPoints.erase(cutPoints.begin() + static_cast<long>(maxEntropyIdx));
+    }
+
+}
--- a/src/CPPFImdlp.h
+++ b/src/CPPFImdlp.h
@@ -0,0 +1,44 @@
+// ****************************************************************
+// SPDX - FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
+// SPDX - FileType: SOURCE
+// SPDX - License - Identifier: MIT
+// ****************************************************************
+
+#ifndef CPPFIMDLP_H
+#define CPPFIMDLP_H
+
+#include "typesFImdlp.h"
+#include <limits>
+#include <utility>
+#include <string>
+#include "Metrics.h"
+#include "Discretizer.h"
+
+namespace mdlp {
+    class CPPFImdlp : public Discretizer {
+    public:
+        CPPFImdlp() = default;
+        CPPFImdlp(size_t min_length_, int max_depth_, float proposed);
+        virtual ~CPPFImdlp() = default;
+        void fit(samples_t& X_, labels_t& y_) override;
+        inline int get_depth() const { return depth; };
+    protected:
+        size_t min_length = 3;
+        int depth = 0;
+        int max_depth = numeric_limits<int>::max();
+        float proposed_cuts = 0;
+        indices_t indices = indices_t();
+        samples_t X = samples_t();
+        labels_t y = labels_t();
+        Metrics metrics = Metrics(y, indices);
+        size_t num_cut_points = numeric_limits<size_t>::max();
+        static indices_t sortIndices(samples_t&, labels_t&);
+        void computeCutPoints(size_t, size_t, int);
+        void resizeCutPoints();
+        bool mdlp(size_t, size_t, size_t);
+        size_t getCandidate(size_t, size_t);
+        size_t compute_max_num_cut_points() const;
+        pair<precision_t, size_t> valueCutPoint(size_t, size_t, size_t);
+    };
+}
+#endif
--- a/src/Discretizer.cpp
+++ b/src/Discretizer.cpp
@@ -0,0 +1,54 @@
+// ****************************************************************
+// SPDX - FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
+// SPDX - FileType: SOURCE
+// SPDX - License - Identifier: MIT
+// ****************************************************************
+
+#include "Discretizer.h"
+
+namespace mdlp {
+
+    labels_t& Discretizer::transform(const samples_t& data)
+    {
+        discretizedData.clear();
+        discretizedData.reserve(data.size());
+        // CutPoints always have at least two items
+        // Have to ignore first and last cut points provided
+        auto first = cutPoints.begin() + 1;
+        auto last = cutPoints.end() - 1;
+        auto bound = direction == bound_dir_t::LEFT ? std::lower_bound<std::vector<precision_t>::iterator, precision_t> : std::upper_bound<std::vector<precision_t>::iterator, precision_t>;
+        for (const precision_t& item : data) {
+            auto pos = bound(first, last, item);
+            auto number = pos - first;
+            discretizedData.push_back(static_cast<label_t>(number));
+        }
+        return discretizedData;
+    }
+    labels_t& Discretizer::fit_transform(samples_t& X_, labels_t& y_)
+    {
+        fit(X_, y_);
+        return transform(X_);
+    }
+    void Discretizer::fit_t(const torch::Tensor& X_, const torch::Tensor& y_)
+    {
+        auto num_elements = X_.numel();
+        samples_t X(X_.data_ptr<precision_t>(), X_.data_ptr<precision_t>() + num_elements);
+        labels_t y(y_.data_ptr<int>(), y_.data_ptr<int>() + num_elements);
+        fit(X, y);
+    }
+    torch::Tensor Discretizer::transform_t(const torch::Tensor& X_)
+    {
+        auto num_elements = X_.numel();
+        samples_t X(X_.data_ptr<precision_t>(), X_.data_ptr<precision_t>() + num_elements);
+        auto result = transform(X);
+        return torch::tensor(result, torch_label_t);
+    }
+    torch::Tensor Discretizer::fit_transform_t(const torch::Tensor& X_, const torch::Tensor& y_)
+    {
+        auto num_elements = X_.numel();
+        samples_t X(X_.data_ptr<precision_t>(), X_.data_ptr<precision_t>() + num_elements);
+        labels_t y(y_.data_ptr<int>(), y_.data_ptr<int>() + num_elements);
+        auto result = fit_transform(X, y);
+        return torch::tensor(result, torch_label_t);
+    }
+}
--- a/src/Discretizer.h
+++ b/src/Discretizer.h
@@ -0,0 +1,39 @@
+// ****************************************************************
+// SPDX - FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
+// SPDX - FileType: SOURCE
+// SPDX - License - Identifier: MIT
+// ****************************************************************
+
+#ifndef DISCRETIZER_H
+#define DISCRETIZER_H
+
+#include <string>
+#include <algorithm>
+#include "typesFImdlp.h"
+#include <torch/torch.h>
+
+namespace mdlp {
+    enum class bound_dir_t {
+        LEFT,
+        RIGHT
+    };
+    const auto torch_label_t = torch::kInt32;
+    class Discretizer {
+    public:
+        Discretizer() = default;
+        virtual ~Discretizer() = default;
+        inline cutPoints_t getCutPoints() const { return cutPoints; };
+        virtual void fit(samples_t& X_, labels_t& y_) = 0;
+        labels_t& transform(const samples_t& data);
+        labels_t& fit_transform(samples_t& X_, labels_t& y_);
+        void fit_t(const torch::Tensor& X_, const torch::Tensor& y_);
+        torch::Tensor transform_t(const torch::Tensor& X_);
+        torch::Tensor fit_transform_t(const torch::Tensor& X_, const torch::Tensor& y_);
+        static inline std::string version() { return "1.2.3"; };
+    protected:
+        labels_t discretizedData = labels_t();
+        cutPoints_t cutPoints; // At least two cutpoints must be provided, the first and the last will be ignored in transform
+        bound_dir_t direction; // used in transform
+    };
+}
+#endif
--- a/src/Metrics.cpp
+++ b/src/Metrics.cpp
@@ -0,0 +1,84 @@
+// ****************************************************************
+// SPDX - FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
+// SPDX - FileType: SOURCE
+// SPDX - License - Identifier: MIT
+// ****************************************************************
+
+#include "Metrics.h"
+#include <set>
+#include <cmath>
+
+using namespace std;
+namespace mdlp {
+    Metrics::Metrics(labels_t& y_, indices_t& indices_) : y(y_), indices(indices_),
+        numClasses(computeNumClasses(0, indices_.size()))
+    {
+    }
+
+    int Metrics::computeNumClasses(size_t start, size_t end)
+    {
+        set<int> nClasses;
+        for (auto i = start; i < end; ++i) {
+            nClasses.insert(y[indices[i]]);
+        }
+        return static_cast<int>(nClasses.size());
+    }
+
+    void Metrics::setData(const labels_t& y_, const indices_t& indices_)
+    {
+        indices = indices_;
+        y = y_;
+        numClasses = computeNumClasses(0, indices.size());
+        entropyCache.clear();
+        igCache.clear();
+    }
+
+    precision_t Metrics::entropy(size_t start, size_t end)
+    {
+        precision_t p;
+        precision_t ventropy = 0;
+        int nElements = 0;
+        labels_t counts(numClasses + 1, 0);
+        if (end - start < 2)
+            return 0;
+        if (entropyCache.find({ start, end }) != entropyCache.end()) {
+            return entropyCache[{start, end}];
+        }
+        for (auto i = &indices[start]; i != &indices[end]; ++i) {
+            counts[y[*i]]++;
+            nElements++;
+        }
+        for (auto count : counts) {
+            if (count > 0) {
+                p = static_cast<precision_t>(count) / static_cast<precision_t>(nElements);
+                ventropy -= p * log2(p);
+            }
+        }
+        entropyCache[{start, end}] = ventropy;
+        return ventropy;
+    }
+
+    precision_t Metrics::informationGain(size_t start, size_t cut, size_t end)
+    {
+        precision_t iGain;
+        precision_t entropyInterval;
+        precision_t entropyLeft;
+        precision_t entropyRight;
+        size_t nElementsLeft = cut - start;
+        size_t nElementsRight = end - cut;
+        size_t nElements = end - start;
+        if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) {
+            return igCache[make_tuple(start, cut, end)];
+        }
+        entropyInterval = entropy(start, end);
+        entropyLeft = entropy(start, cut);
+        entropyRight = entropy(cut, end);
+        iGain = entropyInterval -
+            (static_cast<precision_t>(nElementsLeft) * entropyLeft +
+                static_cast<precision_t>(nElementsRight) * entropyRight) /
+            static_cast<precision_t>(nElements);
+        igCache[make_tuple(start, cut, end)] = iGain;
+        return iGain;
+    }
+
+}
--- a/src/Metrics.h
+++ b/src/Metrics.h
@@ -0,0 +1,28 @@
+// ****************************************************************
+// SPDX - FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
+// SPDX - FileType: SOURCE
+// SPDX - License - Identifier: MIT
+// ****************************************************************
+
+#ifndef CCMETRICS_H
+#define CCMETRICS_H
+
+#include "typesFImdlp.h"
+
+namespace mdlp {
+    class Metrics {
+    protected:
+        labels_t& y;
+        indices_t& indices;
+        int numClasses;
+        cacheEnt_t entropyCache = cacheEnt_t();
+        cacheIg_t igCache = cacheIg_t();
+    public:
+        Metrics(labels_t&, indices_t&);
+        void setData(const labels_t&, const indices_t&);
+        int computeNumClasses(size_t, size_t);
+        precision_t entropy(size_t, size_t);
+        precision_t informationGain(size_t, size_t, size_t);
+    };
+}
+#endif
--- a/src/typesFImdlp.h
+++ b/src/typesFImdlp.h
@@ -0,0 +1,19 @@
+#ifndef TYPES_H
+#define TYPES_H
+
+#include <vector>
+#include <map>
+#include <stdexcept>
+
+using namespace std;
+namespace mdlp {
+    typedef float precision_t;
+    typedef int label_t;
+    typedef std::vector<precision_t> samples_t;
+    typedef std::vector<label_t> labels_t;
+    typedef std::vector<size_t> indices_t;
+    typedef std::vector<precision_t> cutPoints_t;
+    typedef std::map<std::pair<int, int>, precision_t> cacheEnt_t;
+    typedef std::map<std::tuple<int, int, int>, precision_t> cacheIg_t;
+}
+#endif