fimdlp/fimdlp/CPPFImdlp.cpp

#include "CPPFImdlp.h"
#include <numeric>
#include <iostream>
#include <algorithm>
#include "Metrics.h"
namespace mdlp {
    // Writes one cut point to `os` as
    //   classNumber -> (start, end) - (fromValue, toValue)
    // followed by two spaces and a flushed newline, then returns the stream
    // so calls can be chained.
    std::ostream& operator << (std::ostream& os, const cutPoint_t& cut)
    {
        os << cut.classNumber << " -> (";
        os << cut.start << ", " << cut.end << ") - (";
        os << cut.fromValue << ", " << cut.toValue << ")  ";
        os << std::endl;
        return os;
    }
    // Default construction: proposed = true, precision = 6, debug = false.
    // Delegates to the three-argument constructor, which also derives
    // `divider` from the precision and resets `numClasses` to 0.
    CPPFImdlp::CPPFImdlp() : CPPFImdlp(true, 6, false)
    {
    }
    // Builds a discretizer with explicit settings.
    //   proposed  - selects computeCutPointsProposed() (true) or
    //               computeCutPointsOriginal() (false) inside fit().
    //   precision - exponent used for `divider` (10^precision), the scale
    //               applied when cut values are rounded.
    //   debug     - enables diagnostic output on stdout.
    CPPFImdlp::CPPFImdlp(bool proposed, int precision, bool debug)
        : proposed(proposed), precision(precision), debug(debug)
    {
        numClasses = 0;
        divider = pow(10, precision);
    }
    // Nothing to release by hand: the compiler-generated destructor is used.
    CPPFImdlp::~CPPFImdlp() = default;
    std::vector<cutPoint_t> CPPFImdlp::getCutPoints()
    {
        return cutPoints;
    }
    // Returns a copy of the per-sample labels stored in `xDiscretized`.
    labels CPPFImdlp::getDiscretizedValues()
    {
        return this->xDiscretized;
    }
    // Learns the cut points for feature values X_ with class labels y_.
    //
    // Steps: copy the inputs into the object, sort sample indices by value,
    // count the classes, build the candidate cut points (proposed or original
    // algorithm, depending on the `proposed` flag), filter them and finally
    // write the resulting interval number of every sample into xDiscretized.
    //
    // X_ and y_ must have the same, non-zero size. Otherwise a message is
    // written to std::cerr and the call returns without touching the object.
    //
    // Returns *this so calls can be chained.
    CPPFImdlp& CPPFImdlp::fit(samples& X_, labels& y_)
    {
        // Validate before mutating any member: a rejected call must not leave
        // new X/y next to indices and cut points of a previous fit.
        if (X_.size() != y_.size()) {
            std::cerr << "X and y must have the same size" << std::endl;
            return *this;
        }
        if (X_.size() == 0) {
            std::cerr << "X and y must have at least one element" << std::endl;
            return *this;
        }
        X = X_;
        y = y_;
        indices = sortIndices(X_);
        // -1 marks a sample that has not been assigned to any interval yet.
        xDiscretized = labels(X.size(), -1);
        numClasses = Metrics::numClasses(y, indices, 0, X.size());

        if (proposed) {
            computeCutPointsProposed();
        } else {
            computeCutPointsOriginal();
        }
        filterCutPoints();
        applyCutPoints();
        return *this;
    }
    // Re-applies the stored cut points and returns a reference to the
    // object's xDiscretized vector.
    //
    // NOTE(review): X_ is not consulted. applyCutPoints() works on the
    // `indices` computed by the last fit(), so the result always describes
    // the data passed to fit(). The previous version sorted X_ into a local
    // that was never read; that dead (and costly) sort has been removed.
    // Mapping arbitrary new samples onto the intervals is still to be done.
    labels& CPPFImdlp::transform(samples& X_)
    {
        (void)X_; // intentionally unused, see note above
        applyCutPoints();
        return xDiscretized;
    }
    // Debug helper: dumps the samples of X_ in ascending value order as
    // "(position, original index) -> (value, label)", runs fit(X_, y_) and
    // then lists the resulting cut points on stdout.
    void CPPFImdlp::debugPoints(samples& X_, labels& y_)
    {
        std::cout << "+++++++++++++++++++++++" << std::endl;
        // Sort the data that was passed in. The member X still holds whatever
        // a previous fit() stored (possibly nothing) until fit() runs below.
        indices_t indices_n = sortIndices(X_);
        for (size_t i = 0; i < indices_n.size(); i++) {
            const size_t original = indices_n[i];
            // %zu is the portable conversion for size_t; the casts make the
            // variadic arguments match %f and %d exactly.
            printf("(%3zu, %3zu) -> (%3.1f, %d)\n", i, original,
                static_cast<double>(X_[original]), static_cast<int>(y_[original]));
        }
        std::cout << "+++++++++++++++++++++++" << std::endl;
        fit(X_, y_);
        for (const auto& item : cutPoints) {
            std::cout << item.start << "  X_[" << item.end << "]=";
            // `end` may equal X_.size() (one past the last sample), so guard
            // the lookup instead of reading out of range.
            if (item.end < X_.size()) {
                std::cout << X_[item.end];
            } else {
                std::cout << "n/a";
            }
            std::cout << std::endl;
        }
    }
    // Labels every sample covered by a stored cut point: for each interval the
    // sorted positions [start, end) are mapped through `indices` to the
    // original sample index, whose entry in xDiscretized receives the
    // interval's classNumber.
    void CPPFImdlp::applyCutPoints()
    {
        for (const auto& interval : cutPoints) {
            const auto label = interval.classNumber;
            for (size_t pos = interval.start; pos < interval.end; ++pos) {
                xDiscretized[indices[pos]] = label;
            }
        }
    }
    // Decides whether splitting the range `rest` at candidate.end is worth it.
    //
    // With N = rest.end - rest.start samples, k/k1/k2 the number of classes and
    // ent/ent1/ent2 the entropies of the whole range, the left part
    // [rest.start, candidate.end) and the right part [candidate.end, rest.end):
    //
    //   delta = log2(3^k - 2) - (k*ent - k1*ent1 - k2*ent2)
    //   accept  <=>  informationGain > (log2(N - 1) + delta) / N
    //
    // which has the form of the Fayyad-Irani MDLP acceptance test.
    // Ranges with fewer than two samples are never split.
    bool CPPFImdlp::evaluateCutPoint(cutPoint_t rest, cutPoint_t candidate)
    {
        const auto N = float(rest.end - rest.start);
        if (N < 2) {
            return false;
        }
        // Class counts: whole range, left part, right part.
        const int k = Metrics::numClasses(y, indices, rest.start, rest.end);
        const int k1 = Metrics::numClasses(y, indices, rest.start, candidate.end);
        const int k2 = Metrics::numClasses(y, indices, candidate.end, rest.end);
        // Entropies over the same three ranges.
        const float ent = Metrics::entropy(y, indices, rest.start, rest.end, numClasses);
        const float ent1 = Metrics::entropy(y, indices, rest.start, candidate.end, numClasses);
        const float ent2 = Metrics::entropy(y, indices, candidate.end, rest.end, numClasses);
        const float ig = Metrics::informationGain(y, indices, rest.start, rest.end, candidate.end, numClasses);
        const float delta = log2(pow(3, float(k)) - 2) - (float(k) * ent - float(k1) * ent1 - float(k2) * ent2);
        const float term = 1 / N * (log2(N - 1) + delta);
        if (debug) {
            std::cout << "Rest: " << rest;
            std::cout << "Candidate: " << candidate;
            std::cout << "k=" << k << " k1=" << k1 << " k2=" << k2 << " ent=" << ent << " ent1=" << ent1 << " ent2=" << ent2 << std::endl;
            std::cout << "ig=" << ig << " delta=" << delta << " N " << N << " term " << term << std::endl;
        }
        return ig > term;
    }
    // Keeps only the candidate cut points accepted by evaluateCutPoint() and
    // numbers the surviving intervals 0, 1, 2, ... in order.
    //
    // `rest` is the part of the sorted data not yet consumed: it starts as the
    // whole range [0, X.size()) and its start moves to the end of every
    // accepted candidate. An accepted candidate that follows a rejected one is
    // stretched backwards so that it also covers the rejected range. If no
    // candidate is accepted, the result is the single interval `rest`.
    //
    // `start`/`end` are positions in the sorted order (they are mapped through
    // `indices` by applyCutPoints()), and `end` is exclusive.
    void CPPFImdlp::filterCutPoints()
    {
        cutPoints_t filtered;
        cutPoint_t rest;
        int classNumber = 0;

        rest.start = 0;
        rest.end = X.size();
        rest.fromValue = std::numeric_limits<float>::lowest();
        rest.toValue = std::numeric_limits<float>::max();
        rest.classNumber = classNumber;
        // NOTE(review): lastReject is never cleared after an accepted
        // candidate; once set, every later accepted candidate is re-anchored
        // to the end of the previous accepted one.
        bool lastReject = false, first = true;
        // By-value copy on purpose: the candidate is adjusted before storing.
        for (auto item : cutPoints) {
            if (evaluateCutPoint(rest, item)) {
                if (debug)
                    std::cout << "Accepted" << std::endl;
                if (lastReject) {
                    if (first) {
                        item.fromValue = std::numeric_limits<float>::lowest();
                        // Sorted position 0, not indices[0]: `start` is a
                        // position in sorted order, whereas indices[0] is the
                        // original index of the smallest sample.
                        item.start = 0;
                    } else {
                        item.fromValue = filtered.back().toValue;
                        item.start = filtered.back().end;
                    }
                }
                //Assign class number to the interval (cutpoint)
                item.classNumber = classNumber++;
                filtered.push_back(item);
                first = false;
                rest.start = item.end;
            } else {
                if (debug)
                    std::cout << "Rejected" << std::endl;
                lastReject = true;
            }
        }
        if (!first) {
            // The last accepted interval absorbs everything up to the end of
            // the data. `end` is exclusive, so it must be X.size(); with
            // X.size() - 1 the largest sample was never labelled.
            filtered.back().toValue = std::numeric_limits<float>::max();
            filtered.back().end = X.size();
        } else {
            filtered.push_back(rest);
        }

        cutPoints = filtered;
    }
    // Builds the candidate cut points with the "proposed" algorithm and stores
    // them in `cutPoints`.
    //
    // The samples are walked in ascending value order. Each outer iteration
    // takes the current sample as pivot and advances over the following
    // samples while they share the pivot's value; if one of them has a
    // different class, the pivot class is set to -1. A candidate is emitted
    // when the pivot class is -1 or when the class of the previous pivot
    // differs from the class of the sample the scan stopped on. A final
    // candidate reaching up to the largest float value closes the list.
    //
    // Requires at least one sample (fit() guarantees this). Diagnostics are
    // written to stdout only when `debug` is set.
    void CPPFImdlp::computeCutPointsProposed()
    {
        cutPoints_t cutPts;
        cutPoint_t cutPoint;
        float xPrev, xCur, xPivot;
        int yPrev, yCur, yPivot;
        size_t idx, numElements, start;

        // The pivot is initialised as well: with a single sample the loop
        // below never runs, yet the final debug printf still reads it.
        xCur = xPrev = xPivot = X[indices[0]];
        yCur = yPrev = yPivot = y[indices[0]];
        numElements = indices.size() - 1; // position of the last sample
        idx = start = 0;
        bool firstCutPoint = true;
        if (debug)
            printf("*idx=%zu -> (-1, -1) Prev(%3.1f, %d) Elementos: %zu\n", idx, xCur, yCur, numElements);
        while (idx < numElements) {
            xPivot = xCur;
            yPivot = yCur;
            if (debug)
                printf("<idx=%zu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
            // Read the same values and check class changes
            do {
                idx++;
                xCur = X[indices[idx]];
                yCur = y[indices[idx]];
                if (yCur != yPivot && xCur == xPivot) {
                    // Same value, different class: the pivot group is mixed.
                    yPivot = -1;
                }
                if (debug)
                    printf(">idx=%zu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
            }
            while (idx < numElements && xCur == xPivot);
            if (yPivot == -1 || yPrev != yCur) {
                cutPoint.start = start;
                cutPoint.end = idx - 1;
                start = idx;
                cutPoint.fromValue = firstCutPoint ? std::numeric_limits<float>::lowest() : cutPts.back().toValue;
                cutPoint.toValue = (xPrev + xCur) / 2;
                cutPoint.classNumber = -1;
                firstCutPoint = false;
                if (debug) {
                    printf("Cutpoint idx=%zu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = (%3.1g, %3.1g] \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint.fromValue, cutPoint.toValue);
                }
                cutPts.push_back(cutPoint);
            }
            yPrev = yPivot;
            xPrev = xPivot;
        }
        if (idx == numElements) {
            // Closing candidate: from the last start up to the end of the data.
            cutPoint.start = start;
            cutPoint.end = numElements + 1;
            cutPoint.fromValue = firstCutPoint ? std::numeric_limits<float>::lowest() : cutPts.back().toValue;
            cutPoint.toValue = std::numeric_limits<float>::max();
            cutPoint.classNumber = -1;
            if (debug)
                printf("Final Cutpoint idx=%zu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = (%3.1g, %3.1g] \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint.fromValue, cutPoint.toValue);
            cutPts.push_back(cutPoint);
        }
        // Listing the candidates is a diagnostic like every other print in
        // this function, so it is emitted only in debug mode.
        if (debug) {
            for (const auto& cutPt : cutPts)
                std::cout << "Cut point: " << cutPt;
        }
        cutPoints = cutPts;
    }
    void CPPFImdlp::computeCutPointsOriginal()
    {
        cutPoints_t cutPts;
        cutPoint_t cutPoint;
        float xPrev;
        int yPrev;
        bool first = true;
        // idxPrev is the index of the init instance of the cutPoint
        size_t index, idxPrev = 0, idx = indices[0];
        xPrev = X[idx];
        yPrev = y[idx];
        for (index = 0; index < size_t(indices.size()) - 1; index++) {
            idx = indices[index];
            //  Definition 2 Cut points are always on boundaries
            if (y[idx] != yPrev && xPrev < X[idx]) {
                if (first) {
                    first = false;
                    cutPoint.fromValue = std::numeric_limits<float>::lowest();
                } else {
                    cutPoint.fromValue = cutPts.back().toValue;
                }
                cutPoint.start = idxPrev;
                cutPoint.end = index;
                cutPoint.classNumber = -1;
                cutPoint.toValue = round(divider * (X[idx] + xPrev) / 2) / divider;
                idxPrev = index;
                cutPts.push_back(cutPoint);
            }
            xPrev = X[idx];
            yPrev = y[idx];
        }
        if (first) {
            cutPoint.start = 0;
            cutPoint.classNumber = -1;
            cutPoint.fromValue = std::numeric_limits<float>::lowest();
            cutPoint.toValue = std::numeric_limits<float>::max();
            cutPts.push_back(cutPoint);
        } else
            cutPts.back().toValue = std::numeric_limits<float>::max();
        cutPts.back().end = X.size();
        if (debug)
            for (auto cutPt : cutPts)
                std::cout << "-Cut point: " << cutPt;
        cutPoints = cutPts;
    }
    // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
    //
    // Returns the positions 0..X_.size()-1 ordered so that the referenced
    // values of X_ are ascending. The sort is stable: equal values keep their
    // original relative order. X_ itself is not modified.
    indices_t CPPFImdlp::sortIndices(samples& X_)
    {
        indices_t idx(X_.size());
        std::iota(idx.begin(), idx.end(), 0);
        // One stable sort is enough. The previous version repeated it
        // X_.size() times, which produced the same order at far higher cost.
        std::stable_sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2)
            { return X_[i1] < X_[i2]; });
        return idx;
    }
}