Refactor to assemble the pieces

2025-08-16 16:05:52 +00:00 · 2022-12-04 14:05:00 +01:00
parent 34a69622bc
commit 31c9b8a3a3
8 changed files with 69 additions and 108 deletions
--- a/cppfimdlp.cpython-310-darwin.so
+++ b/cppfimdlp.cpython-310-darwin.so
--- a/fimdlp/CPPFImdlp.cpp
+++ b/fimdlp/CPPFImdlp.cpp
@@ -12,21 +12,24 @@ namespace mdlp {
        return os;

    }
-    CPPFImdlp::CPPFImdlp() : proposed(true), precision(6), debug(false)
+    CPPFImdlp::CPPFImdlp() : proposal(true), precision(6), debug(false)
    {
        divider = pow(10, precision);
        numClasses = 0;
    }
-    CPPFImdlp::CPPFImdlp(bool proposed, int precision, bool debug) : proposed(proposed), precision(precision), debug(debug)
+    CPPFImdlp::CPPFImdlp(bool proposal, int precision, bool debug) : proposal(proposal), precision(precision), debug(debug)
    {
        divider = pow(10, precision);
        numClasses = 0;
    }
    CPPFImdlp::~CPPFImdlp()
        = default;
-    std::vector<cutPoint_t> CPPFImdlp::getCutPoints()
+    samples CPPFImdlp::getCutPoints()
    {
-        return cutPoints;
+        samples output(cutPoints.size());
+        std::transform(cutPoints.begin(), cutPoints.end(), output.begin(),
+            [](cutPoint_t cut) { return cut.toValue; });
+        return output;
    }
    labels CPPFImdlp::getDiscretizedValues()
    {
@@ -48,28 +51,19 @@ namespace mdlp {
        this->xDiscretized = labels(X.size(), -1);
        this->numClasses = Metrics::numClasses(y, indices, 0, X.size());

-        if (proposed) {
-            computeCutPointsProposed();
+        if (proposal) {
+            computeCutPointsProposal();
        } else {
            computeCutPointsOriginal();
        }
        filterCutPoints();
-        applyCutPoints();
-        return *this;
-    }
-    labels& CPPFImdlp::transform(samples& X_)
-    {
-        indices_t indices_transform = sortIndices(X_);
-        applyCutPoints();
-        return xDiscretized;
-    }
-    void CPPFImdlp::applyCutPoints()
-    {
+        // Apply cut points to the input vector
        for (auto cut : cutPoints) {
            for (size_t i = cut.start; i < cut.end; i++) {
                xDiscretized[indices[i]] = cut.classNumber;
            }
        }
+        return *this;
    }
    bool CPPFImdlp::evaluateCutPoint(cutPoint_t rest, cutPoint_t candidate)
    {
@@ -142,7 +136,7 @@ namespace mdlp {
        }
        cutPoints = filtered;
    }
-    void CPPFImdlp::computeCutPointsProposed()
+    void CPPFImdlp::computeCutPointsProposal()
    {
        cutPoints_t cutPts;
        cutPoint_t cutPoint;
@@ -206,7 +200,7 @@ namespace mdlp {
        if (debug) {
            std::cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, numElements + 1, numClasses) << std::endl;
            for (auto cutPt : cutPts)
-                std::cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << " :Proposed: Cut point: " << cutPt;
+                std::cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << " :Proposal: Cut point: " << cutPt;
        }
        cutPoints = cutPts;
    }
--- a/fimdlp/CPPFImdlp.h
+++ b/fimdlp/CPPFImdlp.h
@@ -5,7 +5,7 @@
 namespace mdlp {
    class CPPFImdlp {
    protected:
-        bool proposed; // proposed algorithm or original algorithm
+        bool proposal; // proposed algorithm or original algorithm
        int precision;
        bool debug;
        float divider;
@@ -19,21 +19,20 @@ namespace mdlp {
        void setCutPoints(cutPoints_t);
        static indices_t sortIndices(samples&);
        void computeCutPointsOriginal();
-        void computeCutPointsProposed();
+        void computeCutPointsProposal();
        bool evaluateCutPoint(cutPoint_t, cutPoint_t);
        void filterCutPoints();
-        void applyCutPoints();

    public:
        CPPFImdlp();
        CPPFImdlp(bool, int, bool debug = false);
        ~CPPFImdlp();
-        cutPoints_t getCutPoints();
+        samples getCutPoints();
        indices_t getIndices();
        labels getDiscretizedValues();
        void debugPoints(samples&, labels&);
        CPPFImdlp& fit(samples&, labels&);
-        labels& transform(samples&);
+        labels transform(samples&);
    };
 }
 #endif
--- a/fimdlp/cfimdlp.pyx
+++ b/fimdlp/cfimdlp.pyx
@@ -12,9 +12,8 @@ cdef extern from "CPPFImdlp.h" namespace "mdlp":
        CPPFImdlp() except + 
        CPPFImdlp(bool, int, bool) except + 
        CPPFImdlp& fit(vector[float]&, vector[int]&)
-        vector[int] transform(vector[float]&)
        vector[int] getDiscretizedValues()
-        vector[CutPointBody] getCutPoints()
+        vector[float] getCutPoints()
        

 class PcutPoint_t:
@@ -26,16 +25,14 @@ class PcutPoint_t:

 cdef class CFImdlp:
    cdef CPPFImdlp *thisptr
-    def __cinit__(self, precision=6, debug=False, proposed=True):
-        # Proposed or original algorithm
-        self.thisptr = new CPPFImdlp(proposed, precision, debug)
+    def __cinit__(self, precision=6, debug=False, proposal=True):
+        # Proposal or original algorithm
+        self.thisptr = new CPPFImdlp(proposal, precision, debug)
    def __dealloc__(self):
        del self.thisptr
    def fit(self, X, y):
        self.thisptr.fit(X, y)
        return self
-    def transform(self, X):
-        return self.thisptr.transform(X)
    def get_discretized_values(self):
        return self.thisptr.getDiscretizedValues()
    def get_cut_points(self):
--- a/fimdlp/cppfimdlp.cpython-310-darwin.so
+++ b/fimdlp/cppfimdlp.cpython-310-darwin.so
--- a/fimdlp/mdlp.py
+++ b/fimdlp/mdlp.py
@@ -6,6 +6,9 @@ from sklearn.utils.validation import check_X_y, check_array, check_is_fitted


 class FImdlp(TransformerMixin, BaseEstimator):
+    def __init__(self, proposal=True):
+        self.proposal = proposal  # proposed algorithm or original algorithm
+
    """Fayyad - Irani MDLP discretization algorithm.

    Parameters
@@ -57,11 +60,18 @@ class FImdlp(TransformerMixin, BaseEstimator):
        X, y = self._check_params_fit(
            X, y, expected_args=["class_name", "features"], kwargs=kwargs
        )
-
        self.n_features_ = X.shape[1]
        self.X_ = X
        self.y_ = y
-        self.discretizer_ = CFImdlp(debug=True, proposed=False)
+        self.discretizer_ = [None] * self.n_features_
+        self.cut_points_ = [None] * self.n_features_
+        # Can do it in parallel
+        for feature in range(self.n_features_):
+            self.discretizer_[feature] = CFImdlp(proposal=self.proposal)
+            self.discretizer_[feature].fit(X[:, feature], y)
+            self.cut_points_[feature] = self.discretizer_[
+                feature
+            ].get_cut_points()
        return self

    def transform(self, X):
@@ -91,6 +101,15 @@ class FImdlp(TransformerMixin, BaseEstimator):
            raise ValueError(
                "Shape of input is different from what was seen in `fit`"
            )
+        result = np.zeros_like(X, dtype=np.int32) - 1
+        # Can do it in parallel
+        for feature in range(self.n_features_):
+            result[:, feature] = np.searchsorted(
+                self.cut_points_[feature], X[:, feature]
+            )
+        return result
+
+    def test(self):
        print("Calculating cut points in python for first feature")
        yz = self.y_.copy()
        xz = X[:, 0].copy()
@@ -102,7 +121,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
                print(f"Cut point: ({xz[i-1]}, {xz[i]}) ({yz[i-1]}, {yz[i]})")
                cuts.append((xz[i] + xz[i - 1]) / 2)
        print("Cuts calculados en python: ", cuts)
-        print("Cuts calculados en C++")
+        print("-- Cuts calculados en C++ --")
        print("Cut points for each feature in Iris dataset:")
        for i in range(0, 1):
            # datax = self.X_[np.argsort(self.X_[:, i]), i]
@@ -123,12 +142,6 @@ class FImdlp(TransformerMixin, BaseEstimator):
            print(X_translated)
            print("*******************************")
            print("Disretized values:")
-            print(self.discretizer_.transform(datax))
+            print(self.discretizer_.get_discretized_values())
            print("*******************************")
-            print("indices:", np.argsort(X[:, 0]))
-            # Xcutpoints = self.discretizer_.cut_points_ant(datax, y_)
-            # print(
-            #     f"Ant ({len(Xcutpoints)}):{self.features_[i]:20s}: "
-            #     f"{Xcutpoints}"
-            # )
        return X
--- a/fimdlp/testcpp/FImdlp_unittest.cc
+++ b/fimdlp/testcpp/FImdlp_unittest.cc
@@ -37,6 +37,19 @@ namespace mdlp {
                prev = X[testSortedIndices[i]];
            }
        }
+        void checkCutPoints(cutPoints_t& expected)
+        {
+            int expectedSize = expected.size();
+            EXPECT_EQ(cutPoints.size(), expectedSize);
+            for (auto i = 0; i < expectedSize; i++) {
+                EXPECT_EQ(cutPoints[i].start, expected[i].start);
+                EXPECT_EQ(cutPoints[i].end, expected[i].end);
+                EXPECT_EQ(cutPoints[i].classNumber, expected[i].classNumber);
+                EXPECT_NEAR(cutPoints[i].fromValue, expected[i].fromValue, precision);
+                EXPECT_NEAR(cutPoints[i].toValue, expected[i].toValue, precision);
+            }
+        }
+
    };
    TEST_F(TestFImdlp, SortIndices)
    {
@@ -60,22 +73,13 @@ namespace mdlp {
    TEST_F(TestFImdlp, ComputeCutPointsOriginal)
    {
        cutPoints_t computed, expected;
-        int expectedSize = 3;
        expected = {
            { 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 },
            { 6, 10, -1, 5.45, 3.4028234663852886e+38 }
        };
        setCutPoints(cutPoints_t());
        computeCutPointsOriginal();
-        computed = getCutPoints();
-        EXPECT_EQ(computed.size(), expectedSize);
-        for (auto i = 0; i < expectedSize; i++) {
-            EXPECT_EQ(computed[i].start, expected[i].start);
-            EXPECT_EQ(computed[i].end, expected[i].end);
-            EXPECT_EQ(computed[i].classNumber, expected[i].classNumber);
-            EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision);
-            EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision);
-        }
+        checkCutPoints(expected);
    }
    TEST_F(TestFImdlp, ComputeCutPointsOriginalGCase)
    {
@@ -83,22 +87,13 @@ namespace mdlp {
        expected = {
                { 0, 4, -1, -3.4028234663852886e+38, 3.4028234663852886e+38 },
        };
-        int expectedSize = 1;
        X = { 0, 1, 2, 2 };
        y = { 1, 1, 1, 2 };
        fit(X, y);
        computeCutPointsOriginal();
-        computed = getCutPoints();
-        EXPECT_EQ(computed.size(), expectedSize);
-        for (auto i = 0; i < expectedSize; i++) {
-            EXPECT_EQ(computed[i].start, expected[i].start);
-            EXPECT_EQ(computed[i].end, expected[i].end);
-            EXPECT_EQ(computed[i].classNumber, expected[i].classNumber);
-            EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision);
-            EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision);
-        }
+        checkCutPoints(expected);
    }
-    TEST_F(TestFImdlp, ComputeCutPointsProposed)
+    TEST_F(TestFImdlp, ComputeCutPointsProposal)
    {
        cutPoints_t computed, expected;
        expected = {
@@ -106,57 +101,20 @@ namespace mdlp {
            { 6, 9, -1, 5.4, 5.85 },
            { 9, 10, -1, 5.85, 3.4028234663852886e+38 }
        };
-        int expectedSize = 4;
-        computeCutPointsProposed();
-        computed = getCutPoints();
-        EXPECT_EQ(computed.size(), expectedSize);
-        for (auto i = 0; i < expectedSize; i++) {
-            EXPECT_EQ(computed[i].start, expected[i].start);
-            EXPECT_EQ(computed[i].end, expected[i].end);
-            EXPECT_EQ(computed[i].classNumber, expected[i].classNumber);
-            EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision);
-            EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision);
-        }
+        computeCutPointsProposal();
+        checkCutPoints(expected);
    }
-    TEST_F(TestFImdlp, ComputeCutPointsProposedGCase)
+    TEST_F(TestFImdlp, ComputeCutPointsProposalGCase)
    {
        cutPoints_t computed, expected;
        expected = {
                { 0, 3, -1, -3.4028234663852886e+38, 1.5 },
                { 3, 4, -1, 1.5, 3.4028234663852886e+38 }
        };
-        int expectedSize = 2;
        X = { 0, 1, 2, 2 };
        y = { 1, 1, 1, 2 };
        fit(X, y);
-        computeCutPointsProposed();
-        computed = getCutPoints();
-        EXPECT_EQ(computed.size(), expectedSize);
-        for (auto i = 0; i < expectedSize; i++) {
-            EXPECT_EQ(computed[i].start, expected[i].start);
-            EXPECT_EQ(computed[i].end, expected[i].end);
-            EXPECT_EQ(computed[i].classNumber, expected[i].classNumber);
-            EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision);
-            EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision);
-        }
-    }
-    TEST_F(TestFImdlp, ApplyCutPoints)
-    {
-        cutPoints_t expected = {
-            { 0, 4, 17, -3.4028234663852886e+38, 5.1 }, { 4, 6, 31, 5.1, 5.4 },
-            { 6, 8, 59, 5.4, 5.85 },
-            { 8, 10, 41, 5.85, 3.4028234663852886e+38 }
-        };
-        setCutPoints(expected);
-        applyCutPoints();
-        labels expected_x = getDiscretizedValues();
-        indices_t indices_x = getIndices();
-        for (auto i = 0; i < 5; i++) {
-            std::cout << "cutPoint[" << i << "].start = " << expected[i].start << std::endl;
-            for (auto j = expected[i].start; j < expected[i].end; j++) {
-                std::cout << expected_x[j] << expected[i].classNumber << std::endl;
-                EXPECT_EQ(expected_x[indices_x[j]], expected[i].classNumber);
-            }
-        }
+        computeCutPointsProposal();
+        checkCutPoints(expected);
    }
 }
--- a/sample.py
+++ b/sample.py
@@ -65,11 +65,11 @@ features = data.feature_names
 # test.fit(X, y, features=features)
 # test.transform(X)
 # test.get_cut_points()
-for proposed in [True, False]:
+for proposal in [True, False]:
    X = data.data
    y = data.target
-    print("*** Proposed: ", proposed)
-    test = CFImdlp(debug=True, proposed=proposed)
+    print("*** Proposal: ", proposal)
+    test = CFImdlp(debug=True, proposal=proposal)
    test.fit(X[:, 0], y)
    result = test.get_cut_points()
    for item in result:
@@ -118,7 +118,7 @@ for proposed in [True, False]:
 # X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9]
 # y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
 # indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7]
-# clf = CFImdlp(debug=True, proposed=False)
+# clf = CFImdlp(debug=True, proposal=False)
 # clf.fit(X, y)
 # print(clf.get_cut_points())
 # y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]