Enhance cutpoints computation

2025-08-17 00:15:52 +00:00 · 2022-12-02 19:22:13 +01:00
parent 5657c1cd9f
commit 97cd2243fa
13 changed files with 207 additions and 104 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -129,3 +129,5 @@ dmypy.json
 .pyre/
 cfimdlp.cpp
 .vscode/*
+**/.idea/*
+
--- a/2
+++ b/2
@@ -8,7 +8,7 @@ clean: ## Clean up
 	if [ -f fimdlp/cppfimdlp.cpython-310-darwin.so ]; then rm fimdlp/cppfimdlp.cpython-310-darwin.so; fi;

 test:
-	cd fimdlp/testcpp && ./test.sh
+	cd fimdlp/testcpp && ./test

 lint:  ## Lint and static-check
 	black fimdlp
--- a/fimdlp/CPPFImdlp.cpp
+++ b/fimdlp/CPPFImdlp.cpp
@@ -5,18 +5,26 @@
 #include <algorithm>
 #include "Metrics.h"
 namespace mdlp {
-    CPPFImdlp::CPPFImdlp() : debug(false), precision(6)
+    std::ostream& operator << (std::ostream& os, const cutPoint_t& cut)  // Streams one cut point as human-readable text and returns the stream for chaining
+    {
+        os << cut.classNumber << " -> (" << cut.start << ", " << cut.end <<  // class number, then the (start, end) index pair
+            ") - (" << cut.fromValue << ", " << cut.toValue << ")  "  // followed by the (fromValue, toValue) pair
+            << std::endl;  // every record terminates its own line; std::endl also flushes the stream
+        return os;
+
+    }
+    CPPFImdlp::CPPFImdlp() : proposed(true), precision(6), debug(false)
    {
        divider = pow(10, precision);
    }
-    CPPFImdlp::CPPFImdlp(int precision, bool debug) : debug(debug), precision(precision)
+    CPPFImdlp::CPPFImdlp(bool proposed, int precision, bool debug) : proposed(proposed), precision(precision), debug(debug)
    {
        divider = pow(10, precision);
    }
    CPPFImdlp::~CPPFImdlp()
    {
    }
-    std::vector<CutPoint_t> CPPFImdlp::getCutPoints()
+    std::vector<cutPoint_t> CPPFImdlp::getCutPoints()
    {
        return cutPoints;
    }
@@ -32,7 +40,11 @@ namespace mdlp {
        this->xDiscretized = labels(X.size(), -1);
        this->numClasses = Metrics::numClasses(y, indices, 0, X.size());

-        computeCutPoints();
+        if (proposed) {
+            computeCutPointsProposed();
+        } else {
+            computeCutPointsOriginal();
+        }
        filterCutPoints();
        applyCutPoints();
    }
@@ -64,7 +76,7 @@ namespace mdlp {
            }
        }
    }
-    bool CPPFImdlp::evaluateCutPoint(CutPoint_t rest, CutPoint_t candidate)
+    bool CPPFImdlp::evaluateCutPoint(cutPoint_t rest, cutPoint_t candidate)
    {
        int k, k1, k2;
        float ig, delta;
@@ -73,7 +85,6 @@ namespace mdlp {
        if (N < 2) {
            return false;
        }
-
        k = Metrics::numClasses(y, indices, rest.start, rest.end);
        k1 = Metrics::numClasses(y, indices, rest.start, candidate.end);
        k2 = Metrics::numClasses(y, indices, candidate.end, rest.end);
@@ -83,15 +94,18 @@ namespace mdlp {
        ig = Metrics::informationGain(y, indices, rest.start, rest.end, candidate.end, numClasses);
        delta = log2(pow(3, k) - 2) - (k * ent - k1 * ent1 - k2 * ent2);
        float term = 1 / N * (log2(N - 1) + delta);
-        std::cout << candidate
+        if (debug) {
+            std::cout << "Rest: " << rest;
+            std::cout << "Candidate: " << candidate;
            std::cout << "k=" << k << " k1=" << k1 << " k2=" << k2 << " ent=" << ent << " ent1=" << ent1 << " ent2=" << ent2 << std::endl;
            std::cout << "ig=" << ig << " delta=" << delta << " N " << N << " term " << term << std::endl;
+        }
        return (ig > term);
    }
    void CPPFImdlp::filterCutPoints()
    {
-        std::vector<CutPoint_t> filtered;
-        CutPoint_t rest;
+        cutPoints_t filtered;
+        cutPoint_t rest;
        int classNumber = 0;

        rest.start = 0;
@@ -116,24 +130,25 @@ namespace mdlp {
                item.classNumber = classNumber++;
                filtered.push_back(item);
                first = false;
+                rest.start = item.end;
            } else {
                std::cout << "Rejected" << std::endl;
                lastReject = true;
            }
        }
-        if (!first)
+        if (!first) {
            filtered.back().toValue = std::numeric_limits<float>::max();
-        else {
+            filtered.back().end = X.size();
+        } else {
            filtered.push_back(rest);
        }

        cutPoints = filtered;
    }
-    void CPPFImdlp::computeCutPoints()
+    void CPPFImdlp::computeCutPointsProposed()
    {
-
-        std::vector<CutPoint_t> cutPts;
-        CutPoint_t cutPoint;
+        cutPoints_t cutPts;
+        cutPoint_t cutPoint;
        indices_t cutIdx;
        float xPrev, xCur, xPivot;
        int yPrev, yCur, yPivot;
@@ -196,38 +211,56 @@ namespace mdlp {
        }
        cutPoints = cutPts;
    }
-    void CPPFImdlp::computeCutPointsAnt()
+    void CPPFImdlp::computeCutPointsOriginal()  // Scans the samples in sorted order, opens a candidate interval at every class boundary and stores the result in cutPoints
    {
-        samples cutPts;
-        labels cutIdx;
-        float xPrev, cutPoint;
+        cutPoints_t cutPts;  // local accumulator; copied into the member cutPoints on the last line
+        cutPoint_t cutPoint;
+        float xPrev = std::numeric_limits<float>::lowest();
        int yPrev;
-        size_t idxPrev;
-        xPrev = X.at(indices[0]);
-        yPrev = y.at(indices[0]);
-        idxPrev = indices[0];
-        if (debug) {
-            std::cout << "Entropy: " << Metrics::entropy(y, indices, 0, y.size(), Metrics::numClasses(y, indices, 0, indices.size())) << std::endl;
-        }
-        for (auto index = indices.begin(); index != indices.end(); ++index) {
+        bool first = true;  // stays true until the first boundary is found
+        // idxPrev is the position (within indices) where the interval being built starts
+        size_t index, idxPrev = 0, idx = indices[0];  // assumes indices is non-empty -- TODO confirm the caller guarantees this
+        xPrev = X[idx];
+        yPrev = y[idx];
+        for (index = 0; index < size_t(indices.size()) - 1; index++) {  // NOTE(review): the last sorted position is never inspected -- confirm this is intended
+            idx = indices[index];
            //  Definition 2 Cut points are always on boundaries
-            if (y.at(*index) != yPrev && xPrev < X.at(*index)) {
-                cutPoint = round(divider * (X.at(*index) + xPrev) / 2) / divider;
+            if (y[idx] != yPrev && xPrev < X[idx]) {  // class changes AND the value strictly increases
+                if (first) {
+                    first = false;
+                    cutPoint.fromValue = std::numeric_limits<float>::lowest();  // first interval is open on the left
+                } else {
+                    cutPoint.fromValue = cutPts.back().toValue;  // intervals are contiguous: start where the previous one ended
+                }
+                cutPoint.start = idxPrev;
+                cutPoint.end = index;
+                cutPoint.classNumber = -1;  // no class assigned at this stage
+                cutPoint.toValue = round(divider * (X[idx] + xPrev) / 2) / divider;  // midpoint of the two values, rounded using divider (10^precision)
                if (debug) {
-                    std::cout << "Cut point: " << (xPrev + X.at(*index)) / 2 << " //";
-                    std::cout << X.at(*index) << " -> " << y.at(*index) << " yPrev= " << yPrev;
-                    std::cout << "* (" << X.at(*index) << ", " << xPrev << ")="
-                        << ((X.at(*index) + xPrev) / 2) << "idxPrev"
-                        << idxPrev << std::endl;
+                    std::cout << "Cut point: " << cutPoint << " //";
+                    std::cout << X[idx] << " -> " << y[idx] << " yPrev= "
+                        << yPrev << idxPrev << std::endl;
                }
+                idxPrev = index;  // the next interval starts at this boundary
                cutPts.push_back(cutPoint);
-                cutIdx.push_back(idxPrev);
            }
-            xPrev = X.at(*index);
-            yPrev = y.at(*index);
-            idxPrev = *index;
+            xPrev = X[idx];
+            yPrev = y[idx];
        }
-        // cutPoints = cutPts;
+        if (debug) std::cout << "Came to here" << first << std::endl;  // trace output only when debugging, like the other prints in this class
+        if (first) {  // no boundary found: emit a single interval covering every value
+            cutPoint.start = 0;
+            cutPoint.classNumber = -1;
+            cutPoint.fromValue = std::numeric_limits<float>::lowest();
+            cutPoint.toValue = std::numeric_limits<float>::max();
+            cutPts.push_back(cutPoint);  // must go into the local vector: back() below needs it and cutPoints is overwritten at the end
+        } else
+            cutPts.back().toValue = std::numeric_limits<float>::max();  // last interval is open on the right
+        cutPts.back().end = X.size();  // runs for BOTH branches: the last interval always ends at the sample count
+        if (debug)
+            for (const auto& cut : cutPts)  // const reference: no per-element copy, no shadowing of the local cutPoint
+                std::cout << "Cut point: " << cut << std::endl;
+        cutPoints = cutPts;
    }
    // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
    indices_t CPPFImdlp::sortIndices(samples& X)
--- a/fimdlp/CPPFImdlp.h
+++ b/fimdlp/CPPFImdlp.h
@@ -5,29 +5,30 @@
 namespace mdlp {
    class CPPFImdlp {
    private:
-        bool debug;
+        bool proposed; // proposed algorithm or original algorithm
        int precision;
+        bool debug;
        float divider;
        indices_t indices; // sorted indices to use with X and y
        samples X;
        labels y;
        labels xDiscretized;
        int numClasses;
-        std::vector<CutPoint_t> cutPoints;
+        cutPoints_t cutPoints;

    protected:
        indices_t sortIndices(samples&);
-        void computeCutPointsAnt();
-        void computeCutPoints();
-        bool evaluateCutPoint(CutPoint_t, CutPoint_t);
+        void computeCutPointsOriginal();
+        void computeCutPointsProposed();
+        bool evaluateCutPoint(cutPoint_t, cutPoint_t);
        void filterCutPoints();
        void applyCutPoints();

    public:
        CPPFImdlp();
-        CPPFImdlp(int, bool debug = false);
+        CPPFImdlp(bool, int, bool debug = false);
        ~CPPFImdlp();
-        std::vector<CutPoint_t> getCutPoints();
+        cutPoints_t getCutPoints();
        labels getDiscretizedValues();
        void debugPoints(samples&, labels&);
        void fit(samples&, labels&);
--- a/fimdlp/Metrics.cpp
+++ b/fimdlp/Metrics.cpp
@@ -41,7 +41,7 @@ namespace mdlp {
        entropy = Metrics::entropy(y, indices, start, end, nClasses);
        entropyLeft = Metrics::entropy(y, indices, start, cutPoint, nClassesLeft);
        entropyRight = Metrics::entropy(y, indices, cutPoint, end, nClassesRight);
-        iGain = entropy - (float)nElementsLeft / nElements * entropyLeft - (float)nElementsRight / nElements * entropyRight;
+        iGain = entropy - ((float)nElementsLeft * entropyLeft + (float)nElementsRight * entropyRight) / nElements;
        return iGain;
    }

--- a/fimdlp/cfimdlp.pyx
+++ b/fimdlp/cfimdlp.pyx
@@ -10,7 +10,7 @@ cdef extern from "CPPFImdlp.h" namespace "mdlp":
        float fromValue, toValue;
    cdef cppclass CPPFImdlp:
        CPPFImdlp() except + 
-        CPPFImdlp(int, bool) except + 
+        CPPFImdlp(bool, int, bool) except + 
        void fit(vector[float]&, vector[int]&)
        vector[int] transform(vector[float]&)
        vector[int] getDiscretizedValues()
@@ -18,7 +18,7 @@ cdef extern from "CPPFImdlp.h" namespace "mdlp":
        void debugPoints(vector[float]&, vector[int]&)
        

-class PCutPoint_t:
+class PcutPoint_t:
    def __init__(self, start, end, fromValue, toValue):
        self.start = start
        self.end = end
@@ -27,8 +27,9 @@ class PCutPoint_t:

 cdef class CFImdlp:
    cdef CPPFImdlp *thisptr
-    def __cinit__(self, precision=6, debug=False):
-        self.thisptr = new CPPFImdlp(precision, debug)
+    def __cinit__(self, precision=6, debug=False, proposed=True):
+        # Proposed or original algorithm
+        self.thisptr = new CPPFImdlp(proposed, precision, debug)
    def __dealloc__(self):
        del self.thisptr
    def fit(self, X, y):
--- a/fimdlp/cppfimdlp.cpython-310-darwin.so
+++ b/fimdlp/cppfimdlp.cpython-310-darwin.so
--- a/fimdlp/mdlp.py
+++ b/fimdlp/mdlp.py
@@ -61,7 +61,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
        self.n_features_ = X.shape[1]
        self.X_ = X
        self.y_ = y
-        self.discretizer_ = CFImdlp(debug=False)
+        self.discretizer_ = CFImdlp(debug=True, proposed=False)
        return self

    def transform(self, X):
@@ -104,19 +104,31 @@ class FImdlp(TransformerMixin, BaseEstimator):
        print("Cuts calculados en python: ", cuts)
        print("Cuts calculados en C++")
        print("Cut points for each feature in Iris dataset:")
-        for i in range(0, self.n_features_):
+        for i in range(0, 1):
            # datax = self.X_[np.argsort(self.X_[:, i]), i]
            # y_ = self.y_[np.argsort(self.X_[:, i])]
            datax = self.X_[:, i]
            y_ = self.y_
-            Xcutpoints = self.discretizer_.cut_points(datax, y_)
+            self.discretizer_.fit(datax, y_)
+            Xcutpoints = self.discretizer_.get_cut_points()
            print(
                f"New ({len(Xcutpoints)}):{self.features_[i]:20s}: "
-                f"{Xcutpoints}"
-            )
-            Xcutpoints = self.discretizer_.cut_points_ant(datax, y_)
-            print(
-                f"Ant ({len(Xcutpoints)}):{self.features_[i]:20s}: "
-                f"{Xcutpoints}"
+                f"{[i['toValue'] for i in Xcutpoints]}"
            )
+            X_translated = [
+                f"{i['classNumber']} - ({i['start']}, {i['end']}) - "
+                f"({i['fromValue']}, {i['toValue']})"
+                for i in Xcutpoints
+            ]
+            print(X_translated)
+            print("*******************************")
+            print("Disretized values:")
+            print(self.discretizer_.transform(datax))
+            print("*******************************")
+            print("indices:", np.argsort(X[:, 0]))
+            # Xcutpoints = self.discretizer_.cut_points_ant(datax, y_)
+            # print(
+            #     f"Ant ({len(Xcutpoints)}):{self.features_[i]:20s}: "
+            #     f"{Xcutpoints}"
+            # )
        return X
--- a/fimdlp/testcpp/FImdlp_unittest.cc
+++ b/fimdlp/testcpp/FImdlp_unittest.cc
@@ -30,7 +30,7 @@ namespace mdlp {
                prev = X[testSortedIndices[i]];
            }
        }
-        std::vector<CutPoint_t> testCutPoints(samples& X, indices_t& indices, labels& y)
+        std::vector<cutPoint_t> testCutPoints(samples& X, indices_t& indices, labels& y)
        {
            this->X = X;
            this->y = y;
@@ -56,7 +56,7 @@ namespace mdlp {
    }
    // TEST_F(TestMetrics, EvaluateCutPoint)
    // {
-    //     CutPoint_t rest, candidate;
+    //     cutPoint_t rest, candidate;
    //     rest.start = 0;
    //     rest.end = 10;
    //     candidate.start = 0;
@@ -64,13 +64,13 @@ namespace mdlp {
    //     float computed = evaluateCutPoint(rest, candidate);
    //     ASSERT_NEAR(0.468996, computed, precision_test);
    // }
-    TEST_F(TestMetrics, ComputeCutPoints)
+    TEST_F(TestMetrics, ComputeCutPointsOriginal)
    {
-        std::vector<CutPoint_t> computed, expected;
-        computeCutPoints();
+        std::vector<cutPoint_t> computed, expected;
+        computeCutPointsOriginal();
        computed = getCutPoints();
        for (auto cut : computed) {
-            std::cout << "(" << cut.start << ", " << cut.end << ") -> (" << cut.fromValue << ",  " << cut.toValue << ")" << std::endl;
+            std::cout << cut.classNumber << " -> (" << cut.start << ", " << cut.end << ") -> (" << cut.fromValue << ",  " << cut.toValue << ")" << std::endl;
        }
    }
 }
--- a/fimdlp/testcpp/test.sh
+++ b/fimdlp/testcpp/test.sh
@@ -1,12 +0,0 @@
-cmake -S . -B build -Wno-dev 
-if test $? -ne 0; then
-   echo "Error in creating build commands."
-   exit 1
-fi
-cmake --build build
-if test $? -ne 0; then
-   echo "Error in build command."
-   exit 1
-fi
-cd build
-ctest --output-on-failure
--- a/fimdlp/typesFImdlp.h
+++ b/fimdlp/typesFImdlp.h
@@ -2,14 +2,15 @@
 #define TYPES_H
 #include <vector>
 namespace mdlp {
-    typedef std::vector<float> samples;
-    typedef std::vector<int> labels;
-    typedef std::vector<size_t> indices_t;
    struct CutPointBody {
        size_t start, end;        // indices of the sorted vector
        int classNumber;          // class assigned to the cut point
        float fromValue, toValue;
    };
-    typedef CutPointBody CutPoint_t;
+    typedef CutPointBody cutPoint_t;
+    typedef std::vector<float> samples;
+    typedef std::vector<int> labels;
+    typedef std::vector<size_t> indices_t;
+    typedef std::vector<cutPoint_t> cutPoints_t;
 }
 #endif
--- a/sample.py
+++ b/sample.py
@@ -2,6 +2,59 @@ from sklearn.datasets import load_iris
 from fimdlp.mdlp import FImdlp
 from fimdlp.cppfimdlp import CFImdlp
 import numpy as np
+from math import log
+
+
+def entropy(y: np.array) -> float:
+    """Compute the base-2 entropy of a set of labels
+
+    Parameters
+    ----------
+    y : np.array
+        set of labels
+
+    Returns
+    -------
+    float
+        entropy (0 when there is at most one label or one distinct class)
+    """
+    n_labels = len(y)
+    if n_labels <= 1:  # zero or one label: nothing to measure
+        return 0
+    counts = np.bincount(y)  # occurrences per label value; assumes non-negative integer labels -- TODO confirm
+    proportions = counts / n_labels
+    n_classes = np.count_nonzero(proportions)  # number of label values that actually occur
+    if n_classes <= 1:  # a single class has zero entropy
+        return 0
+    entropy = 0.0  # local accumulator; shadows the function name inside this scope
+    # Accumulate -p * log2(p) over the label values that occur.
+    for prop in proportions:
+        if prop != 0.0:  # skip absent values: log(0) is undefined
+            entropy -= prop * log(prop, 2)
+    return entropy
+
+
+def information_gain(  # entropy of labels minus the size-weighted entropies of the two partitions
+    labels: np.array, labels_up: np.array, labels_dn: np.array
+) -> float:
+    imp_prev = entropy(labels)  # entropy before the split
+    card_up = card_dn = imp_up = imp_dn = 0  # a partition passed as None contributes size 0 and entropy 0
+    if labels_up is not None:
+        card_up = labels_up.shape[0]
+        imp_up = entropy(labels_up)
+    if labels_dn is not None:
+        card_dn = labels_dn.shape[0] if labels_dn is not None else 0  # the inner conditional is redundant inside this guard
+        imp_dn = entropy(labels_dn)
+    samples = card_up + card_dn  # weights use the partition sizes, not len(labels)
+    if samples == 0:  # both partitions empty or None: avoid dividing by zero
+        return 0.0
+    else:
+        result = (
+            imp_prev
+            - (card_up / samples) * imp_up
+            - (card_dn / samples) * imp_dn
+        )
+        return result


 data = load_iris()
@@ -10,26 +63,38 @@ y = data.target
 features = data.feature_names
 test = FImdlp()
 test.fit(X, y, features=features)
-# test.transform(X)
+test.transform(X)

-test = CFImdlp(debug=False)
-# k = test.cut_points(X[:, 0], y)
-# print(k)
-# k = test.cut_points_ant(X[:, 0], y)
-# print(k)
-# test.debug_points(X[:, 0], y)
-X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9]
-indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7]
-y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
-# test.fit(X[:, 0], y)
-test.fit(X, y)
-result = test.get_cut_points()
-for item in result:
-    print(
-        f"Class={item['classNumber']} - ({item['start']:3d}, {item['end']:3d})"
-        f" -> ({item['fromValue']:3.1f}, {item['toValue']:3.1f}]"
-    )
-print(test.get_discretized_values())
+# test = CFImdlp(debug=False)
+# # k = test.cut_points(X[:, 0], y)
+# # print(k)
+# # k = test.cut_points_ant(X[:, 0], y)
+# # print(k)
+# # test.debug_points(X[:, 0], y)
+# X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9]
+# indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7]
+# y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
+# # To check
+# indices2 = np.argsort(X)
+# Xs = np.array(X)[indices2]
+# ys = np.array(y)[indices2]
+# # test.fit(X[:, 0], y)
+# test.fit(X, y)
+# result = test.get_cut_points()
+# for item in result:
+#     print(
+#         f"Class={item['classNumber']} - ({item['start']:3d}, {item['end']:3d})"
+#         f" -> ({item['fromValue']:3.1f}, {item['toValue']:3.1f}]"
+#     )
+# print(test.get_discretized_values())
+
+# print(Xs, ys)
+# print("**********************")
+# test = [(0, 3), (4, 4), (5, 5), (6, 8), (9, 9)]
+# print(ys)
+# for start, end in test:
+#     print("Testing ", start, end, ys[:end], ys[end:])
+#     print("Information gain: ", information_gain(ys, ys[:end], ys[end:]))
 # print(test.transform(X))
 # print(X)
 # print(indices)
--- a/test1.xlsx
+++ b/test1.xlsx