Merge pull request #1 from rmontanana/proposal

Proposal
2025-08-16 07:55:58 +00:00 · 2022-12-21 11:49:45 +01:00
parent 50543e4921 036b41a0eb
commit b6a7d1e1fa
7 changed files with 167 additions and 151 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -34,3 +34,4 @@
 **/lcoverage
 .idea
 cmake-*
+**/CMakeFiles
--- a/CPPFImdlp.cpp
+++ b/CPPFImdlp.cpp
@@ -4,14 +4,12 @@
 #include <cmath>
 #include "CPPFImdlp.h"
 #include "Metrics.h"
-
 namespace mdlp {
-    CPPFImdlp::CPPFImdlp(int proposal):proposal(proposal), indices(indices_t()), X(samples_t()), y(labels_t()), metrics(Metrics(y, indices))
+    CPPFImdlp::CPPFImdlp(int algorithm):algorithm(algorithm), indices(indices_t()), X(samples_t()), y(labels_t()), metrics(Metrics(y, indices))
    {
    }
    CPPFImdlp::~CPPFImdlp()
        = default;
-
    CPPFImdlp& CPPFImdlp::fit(samples_t& X_, labels_t& y_)
    {
        X = X_;
@@ -23,93 +21,87 @@ namespace mdlp {
        if (X.size() == 0 || y.size() == 0) {
            throw invalid_argument("X and y must have at least one element");
        }
-        indices = sortIndices(X_);
+        indices = sortIndices(X_, y_);
        metrics.setData(y, indices);
-        switch (proposal) {
+        switch (algorithm) {
            case 0:
                computeCutPoints(0, X.size());
                break;
            case 1:
-                computeCutPointsProposal();
-                break;
-            case 2:
                computeCutPointsAlternative(0, X.size());
                break;
+            default:
+                throw invalid_argument("algorithm must be 0 or 1");
        }
        return *this;
    }
+    precision_t CPPFImdlp::halfWayValueCutPoint(size_t start, size_t idx)
+    {
+        size_t idxPrev = idx - 1;
+        precision_t previous = X[indices[idxPrev]], actual = X[indices[idx]];
+        // definition 2 of the paper => X[t-1] < X[t]
+        while (idxPrev-- > start && actual == previous) {
+            previous = X[indices[idxPrev]];
+        }
+        return (previous + actual) / 2;
+    }
+    tuple<precision_t, size_t> CPPFImdlp::completeValueCutPoint(size_t start, size_t cut, size_t end)
+    {
+        size_t idxPrev = cut - 1;
+        precision_t previous, next, actual;
+        previous = X[indices[idxPrev]];
+        next = actual = X[indices[cut]];
+        // definition 2 of the paper => X[t-1] < X[t]
+        while (idxPrev-- > start && actual == previous) {
+            previous = X[indices[idxPrev]];
+        }
+        // get the last equal value of X in the interval
+        while (actual == X[indices[cut++]] && cut < end);
+        if (previous == actual && cut < end)
+            actual = X[indices[cut]];
+        cut--;
+        return make_tuple((previous + actual) / 2, cut);
+    }
    void CPPFImdlp::computeCutPoints(size_t start, size_t end)
    {
-        int cut;
+        size_t cut;
+        tuple<precision_t, size_t> result;
        if (end - start < 2)
            return;
        cut = getCandidate(start, end);
-        if (cut == -1 || !mdlp(start, cut, end)) {
-            // cut.value == -1 means that there is no candidate in the interval
-            // No boundary found, so we add both ends of the interval as cutpoints
-            // because they were selected by the algorithm before
-            if (start != 0)
-                cutPoints.push_back((X[indices[start]] + X[indices[start - 1]]) / 2);
-            if (end != X.size())
-                cutPoints.push_back((X[indices[end]] + X[indices[end - 1]]) / 2);
+        if (cut == numeric_limits<size_t>::max())
            return;
+        if (mdlp(start, cut, end)) {
+            result = completeValueCutPoint(start, cut, end);
+            cut = get<1>(result);
+            cutPoints.push_back(get<0>(result));
+            computeCutPoints(start, cut);
+            computeCutPoints(cut, end);
        }
-        computeCutPoints(start, cut);
-        computeCutPoints(cut, end);
    }
    void CPPFImdlp::computeCutPointsAlternative(size_t start, size_t end)
    {
-        precision_t cut;
+        size_t cut;
        if (end - start < 2)
            return;
        cut = getCandidate(start, end);
-        if (cut == -1)
+        if (cut == numeric_limits<size_t>::max())
            return;
        if (mdlp(start, cut, end)) {
-            cutPoints.push_back((X[indices[cut]] + X[indices[cut - 1]]) / 2);
-        }
-        computeCutPointsAlternative(start, cut);
-        computeCutPointsAlternative(cut, end);
-    }
-    void CPPFImdlp::computeCutPointsProposal()
-    {
-        precision_t xPrev, xCur, xPivot, cutPoint;
-        int yPrev, yCur, yPivot;
-        size_t idx, numElements, start;
-
-        xCur = xPrev = X[indices[0]];
-        yCur = yPrev = y[indices[0]];
-        numElements = indices.size() - 1;
-        idx = start = 0;
-        while (idx < numElements) {
-            xPivot = xCur;
-            yPivot = yCur;
-            // Read the same values and check class changes
-            do {
-                idx++;
-                xCur = X[indices[idx]];
-                yCur = y[indices[idx]];
-                if (yCur != yPivot && xCur == xPivot) {
-                    yPivot = -1;
-                }
-            }
-            while (idx < numElements && xCur == xPivot);
-            // Check if the class changed and there are more than 1 element
-            if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur) && mdlp(start, idx, indices.size())) {
-                start = idx;
-                cutPoint = (xPrev + xCur) / 2;
-                cutPoints.push_back(cutPoint);
-            }
-            yPrev = yPivot;
-            xPrev = xPivot;
+            cutPoints.push_back(halfWayValueCutPoint(start, cut));
+            computeCutPointsAlternative(start, cut);
+            computeCutPointsAlternative(cut, end);
        }
    }
-    long int CPPFImdlp::getCandidate(size_t start, size_t end)
+    size_t CPPFImdlp::getCandidate(size_t start, size_t end)
    {
-        long int candidate = -1, elements = end - start;
-        precision_t entropy_left, entropy_right, minEntropy = numeric_limits<precision_t>::max();
+        /* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which
+        E(A, TA; S) is minimal amongst all the candidate cut points. */
+        size_t candidate = numeric_limits<size_t>::max(), elements = end - start;
+        precision_t entropy_left, entropy_right, minEntropy;
+        minEntropy = metrics.entropy(start, end);
        for (auto idx = start + 1; idx < end; idx++) {
-            // Cutpoints are always on boundaries
+            // Cutpoints are always on boundaries (definition 2)
            if (y[indices[idx]] == y[indices[idx - 1]])
                continue;
            entropy_left = precision_t(idx - start) / elements * metrics.entropy(start, idx);
@@ -142,6 +134,20 @@ namespace mdlp {
        precision_t term = 1 / N * (log2(N - 1) + delta);
        return ig > term;
    }
+    // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
+    indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_)
+    {
+        indices_t idx(X_.size());
+        iota(idx.begin(), idx.end(), 0);
+        for (size_t i = 0; i < X_.size(); i++)
+            stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2)
+                {
+                    if (X_[i1] == X_[i2]) return y_[i1] < y_[i2];
+                    else
+                        return X_[i1] < X_[i2];
+                });
+        return idx;
+    }
    cutPoints_t CPPFImdlp::getCutPoints()
    {
        // Remove duplicates and sort
@@ -154,14 +160,4 @@ namespace mdlp {
        sort(output.begin(), output.end());
        return output;
    }
-    // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
-    indices_t CPPFImdlp::sortIndices(samples_t& X_)
-    {
-        indices_t idx(X_.size());
-        iota(idx.begin(), idx.end(), 0);
-        for (size_t i = 0; i < X_.size(); i++)
-            stable_sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2)
-                { return X_[i1] < X_[i2]; });
-        return idx;
-    }
 }
--- a/CPPFImdlp.h
+++ b/CPPFImdlp.h
@@ -3,28 +3,31 @@
 #include "typesFImdlp.h"
 #include "Metrics.h"
 #include <utility>
+#include <tuple>
+#include <string>
 namespace mdlp {
    class CPPFImdlp {
    protected:
-        int proposal;
-        indices_t indices; // sorted indices to use with X and y
+        int algorithm;
+        indices_t indices;
        samples_t X;
        labels_t y;
        Metrics metrics;
        cutPoints_t cutPoints;

-        static indices_t sortIndices(samples_t&);
+        static indices_t sortIndices(samples_t&, labels_t&);
        void computeCutPoints(size_t, size_t);
-        bool mdlp(size_t, size_t, size_t);
-        long int getCandidate(size_t, size_t);
        void computeCutPointsAlternative(size_t, size_t);
-        void computeCutPointsProposal();
-
+        bool mdlp(size_t, size_t, size_t);
+        size_t getCandidate(size_t, size_t);
+        precision_t halfWayValueCutPoint(size_t, size_t);
+        tuple<precision_t, size_t> completeValueCutPoint(size_t, size_t, size_t);
    public:
-        CPPFImdlp(int);
+        CPPFImdlp(int algorithm = 0);
        ~CPPFImdlp();
        CPPFImdlp& fit(samples_t&, labels_t&);
        samples_t getCutPoints();
+        inline string version() { return "0.9.7"; };
    };
 }
 #endif
--- a/README.md
+++ b/README.md
@@ -1,2 +1,22 @@
 # mdlp
-Discretization algorithm based on the paper by Fayyad &amp; Irani Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning
+Discretization algorithm based on the paper by Fayyad &amp; Irani [Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning](https://www.ijcai.org/Proceedings/93-2/Papers/022.pdf)
+
+The implementation tries to mitigate the problem of different label values with the same value of the variable:
+
+- Sorts the values of the variable using the label values as a tie-breaker
+- Once a valid candidate for the split is found, it checks whether the previous value is the same as the current one; if so, it tries to use the previous distinct value, or the next one if that is not possible.
+
+The algorithm returns the cut points for the variable.
+
+## Sample
+
+To run the sample, just execute the following commands:
+
+```bash
+cd sample
+mkdir build
+cd build
+cmake ..
+make
+./sample iris
+```
--- a/sample/ArffFiles.cpp
+++ b/sample/ArffFiles.cpp
@@ -1,5 +1,4 @@
 #include "ArffFiles.h"
-
 #include <fstream>
 #include <sstream>
 #include <map>
--- a/sample/sample.cpp
+++ b/sample/sample.cpp
@@ -5,6 +5,7 @@
 #include "../CPPFImdlp.h"

 using namespace std;
+using namespace mdlp;

 int main(int argc, char** argv)
 {
@@ -33,8 +34,8 @@ int main(int argc, char** argv)
    cout << "Class name: " << file.getClassName() << endl;
    cout << "Class type: " << file.getClassType() << endl;
    cout << "Data: " << endl;
-    vector<vector<float>>& X = file.getX();
-    vector<int>& y = file.getY();
+    vector<samples_t>& X = file.getX();
+    labels_t& y = file.getY();
    for (int i = 0; i < 50; i++) {
        for (auto feature : X) {
            cout << fixed << setprecision(1) << feature[i] << " ";
--- a/tests/FImdlp_unittest.cpp
+++ b/tests/FImdlp_unittest.cpp
@@ -4,31 +4,26 @@
 #include <iostream>

 namespace mdlp {
-    class TestFImdlp : public CPPFImdlp, public testing::Test {
+    class TestFImdlp: public CPPFImdlp, public testing::Test {
    public:
        precision_t precision = 0.000001;
-
-        TestFImdlp() : CPPFImdlp(false) {}
-
-        void SetUp() {
+        TestFImdlp(): CPPFImdlp() {}
+        void SetUp()
+        {
            //    5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0]
            //(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2)
-            X = {5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9};
-            y = {1, 1, 1, 1, 1, 2, 2, 2, 2, 2};
-            proposal = false;
+            X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
+            y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
+            algorithm = false;
            fit(X, y);
        }
-
-        void setProposal(bool value) {
-            proposal = value;
+        void setalgorithm(bool value)
+        {
+            algorithm = value;
        }
-
-        // void initIndices()
-        // {
-        //     indices = indices_t();
-        // }
-        void checkSortedVector() {
-            indices_t testSortedIndices = sortIndices(X);
+        void checkSortedVector()
+        {
+            indices_t testSortedIndices = sortIndices(X, y);
            precision_t prev = X[testSortedIndices[0]];
            for (auto i = 0; i < X.size(); ++i) {
                EXPECT_EQ(testSortedIndices[i], indices[i]);
@@ -36,54 +31,55 @@ namespace mdlp {
                prev = X[testSortedIndices[i]];
            }
        }
-
-        void checkCutPoints(cutPoints_t &expected) {
+        void checkCutPoints(cutPoints_t& expected)
+        {
            int expectedSize = expected.size();
            EXPECT_EQ(cutPoints.size(), expectedSize);
            for (auto i = 0; i < cutPoints.size(); i++) {
                EXPECT_NEAR(cutPoints[i], expected[i], precision);
            }
        }
-
        template<typename T, typename A>
-        void checkVectors(std::vector<T, A> const &expected, std::vector<T, A> const &computed) {
+        void checkVectors(std::vector<T, A> const& expected, std::vector<T, A> const& computed)
+        {
            EXPECT_EQ(expected.size(), computed.size());
            ASSERT_EQ(expected.size(), computed.size());
            for (auto i = 0; i < expected.size(); i++) {
-                EXPECT_NEAR(expected[i], computed[i],precision);
+                EXPECT_NEAR(expected[i], computed[i], precision);
            }
        }
    };
-
-    TEST_F(TestFImdlp, FitErrorEmptyDataset) {
+    TEST_F(TestFImdlp, FitErrorEmptyDataset)
+    {
        X = samples_t();
        y = labels_t();
        EXPECT_THROW(fit(X, y), std::invalid_argument);
    }
-
-    TEST_F(TestFImdlp, FitErrorDifferentSize) {
-        X = {1, 2, 3};
-        y = {1, 2};
+    TEST_F(TestFImdlp, FitErrorDifferentSize)
+    {
+        X = { 1, 2, 3 };
+        y = { 1, 2 };
        EXPECT_THROW(fit(X, y), std::invalid_argument);
    }
-
-    TEST_F(TestFImdlp, SortIndices) {
-        X = {5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9};
-        indices = {4, 3, 6, 8, 2, 1, 5, 0, 9, 7};
+    TEST_F(TestFImdlp, SortIndices)
+    {
+        X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
+        indices = { 4, 3, 6, 8, 2, 1, 5, 0, 9, 7 };
        checkSortedVector();
-        X = {5.77, 5.88, 5.99};
-        indices = {0, 1, 2};
+        X = { 5.77, 5.88, 5.99 };
+        indices = { 0, 1, 2 };
        checkSortedVector();
-        X = {5.33, 5.22, 5.11};
-        indices = {2, 1, 0};
+        X = { 5.33, 5.22, 5.11 };
+        indices = { 2, 1, 0 };
        checkSortedVector();
    }

-    TEST_F(TestFImdlp, TestDataset) {
-        proposal = false;
+    TEST_F(TestFImdlp, TestDataset)
+    {
+        algorithm = 0;
        fit(X, y);
-        computeCutPointsOriginal(0, 10);
-        cutPoints_t expected = {5.6499996185302734};
+        computeCutPoints(0, 10);
+        cutPoints_t expected = { 5.6499996185302734 };
        vector<precision_t> computed = getCutPoints();
        computed = getCutPoints();
        int expectedSize = expected.size();
@@ -92,49 +88,49 @@ namespace mdlp {
            EXPECT_NEAR(computed[i], expected[i], precision);
        }
    }
-
-    TEST_F(TestFImdlp, ComputeCutPointsOriginal) {
-        cutPoints_t expected = {5.65};
-        proposal = false;
-        computeCutPointsOriginal(0, 10);
+    TEST_F(TestFImdlp, ComputeCutPoints)
+    {
+        cutPoints_t expected = { 5.65 };
+        algorithm = false;
+        computeCutPoints(0, 10);
        checkCutPoints(expected);
    }
-
-    TEST_F(TestFImdlp, ComputeCutPointsOriginalGCase) {
+    TEST_F(TestFImdlp, ComputeCutPointsGCase)
+    {
        cutPoints_t expected;
-        proposal = false;
-        expected = {2};
-        samples_t X_ = {0, 1, 2, 2};
-        labels_t y_ = {1, 1, 1, 2};
+        algorithm = false;
+        expected = { 2 };
+        samples_t X_ = { 0, 1, 2, 2 };
+        labels_t y_ = { 1, 1, 1, 2 };
        fit(X_, y_);
        checkCutPoints(expected);
    }
-
-    TEST_F(TestFImdlp, ComputeCutPointsProposal) {
-        proposal = true;
+    TEST_F(TestFImdlp, ComputeCutPointsalAlternative)
+    {
+        algorithm = true;
        cutPoints_t expected;
        expected = {};
        fit(X, y);
-        computeCutPointsProposal();
+        computeCutPointsAlternative(0, 10);
        checkCutPoints(expected);
    }
-
-    TEST_F(TestFImdlp, ComputeCutPointsProposalGCase) {
+    TEST_F(TestFImdlp, ComputeCutPointsAlternativeGCase)
+    {
        cutPoints_t expected;
-        expected = {1.5};
-        proposal = true;
-        samples_t X_ = {0, 1, 2, 2};
-        labels_t y_ = {1, 1, 1, 2};
+        expected = { 1.5 };
+        algorithm = true;
+        samples_t X_ = { 0, 1, 2, 2 };
+        labels_t y_ = { 1, 1, 1, 2 };
        fit(X_, y_);
        checkCutPoints(expected);
    }
-
-    TEST_F(TestFImdlp, GetCutPoints) {
-        samples_t computed, expected = {5.65};
-        proposal = false;
-        computeCutPointsOriginal(0, 10);
+    TEST_F(TestFImdlp, GetCutPoints)
+    {
+        samples_t computed, expected = { 5.65 };
+        algorithm = false;
+        computeCutPoints(0, 10);
        computed = getCutPoints();
-        for (auto item: cutPoints)
+        for (auto item : cutPoints)
            cout << setprecision(6) << item << endl;
        checkVectors(expected, computed);
    }