Add max_cuts hyperparameter as in mdlp

2025-08-16 16:05:52 +00:00 · 2023-03-13 18:14:56 +01:00
parent aa55d3a340
commit ccce9725b3
5 changed files with 30 additions and 12 deletions
--- a/src/cppmdlp
+++ b/src/cppmdlp
--- a/src/fimdlp/ArffFiles.cpp
+++ b/src/fimdlp/ArffFiles.cpp
@@ -40,11 +40,10 @@ vector<int>& ArffFiles::getY()
 void ArffFiles::load(string fileName, bool classLast)
 {
    ifstream file(fileName);
-    string keyword, attribute, type;
    if (file.is_open()) {
-        string line;
+        string line, keyword, attribute, type;
        while (getline(file, line)) {
-            if (line[0] == '%' || line.empty() || line == "\r" || line == " ") {
+            if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
                continue;
            }
            if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
@@ -79,7 +78,7 @@ void ArffFiles::generateDataset(bool classLast)
    X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
    vector<string> yy = vector<string>(lines.size(), "");
    int labelIndex = classLast ? attributes.size() : 0;
-    for (int i = 0; i < lines.size(); i++) {
+    for (size_t i = 0; i < lines.size(); i++) {
        stringstream ss(lines[i]);
        string value;
        int pos = 0, xIndex = 0;
--- a/src/fimdlp/cfimdlp.pyx
+++ b/src/fimdlp/cfimdlp.pyx
@@ -12,7 +12,7 @@ cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp":
    ctypedef float precision_t
    cdef cppclass CPPFImdlp:
        CPPFImdlp() except + 
-        CPPFImdlp(size_t, int) except + 
+        CPPFImdlp(size_t, int, float) except + 
        CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
        int get_depth()
        vector[precision_t] getCutPoints()
@@ -20,8 +20,8 @@ cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp":
        
 cdef class CFImdlp:
    cdef CPPFImdlp *thisptr
-    def __cinit__(self, size_t min_length=3, int max_depth=INT_MAX):
-        self.thisptr = new CPPFImdlp(min_length, max_depth)
+    def __cinit__(self, size_t min_length=3, int max_depth=INT_MAX, float max_cuts=0):
+        self.thisptr = new CPPFImdlp(min_length, max_depth, max_cuts)
    def __dealloc__(self):
        del self.thisptr
    def fit(self, X, y):
--- a/src/fimdlp/mdlp.py
+++ b/src/fimdlp/mdlp.py
@@ -10,10 +10,11 @@ from ._version import __version__


 class FImdlp(TransformerMixin, BaseEstimator):
-    def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6):
+    def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6, max_cuts=0):
        self.n_jobs = n_jobs
        self.min_length = min_length
        self.max_depth = max_depth
+        self.max_cuts = max_cuts

    """Fayyad - Irani MDLP discretization algorithm based implementation.

@@ -108,7 +109,9 @@ class FImdlp(TransformerMixin, BaseEstimator):
    def _fit_discretizer(self, feature):
        if feature in self.features_:
            self.discretizer_[feature] = CFImdlp(
-                min_length=self.min_length, max_depth=self.max_depth
+                min_length=self.min_length,
+                max_depth=self.max_depth,
+                max_cuts=self.max_cuts,
            )
            self.discretizer_[feature].fit(self.X_[:, feature], self.y_)
            self.cut_points_[feature] = self.discretizer_[
--- a/src/fimdlp/tests/FImdlp_test.py
+++ b/src/fimdlp/tests/FImdlp_test.py
@@ -7,8 +7,6 @@ from ..cppfimdlp import CFImdlp, factorize, CArffFiles
 from ..mdlp import FImdlp
 from .. import __version__

-# from .._version import __version__
-

 class FImdlpTest(unittest.TestCase):
    delta = 1e-6  # same tolerance as in C++ code
@@ -288,6 +286,24 @@ class FImdlpTest(unittest.TestCase):
            for e, c in zip(expected, computed):
                self.assertAlmostEqual(e, c, delta=self.delta)

+    def test_max_cuts(self):
+        clf = FImdlp(max_cuts=1)
+        X, y = load_iris(return_X_y=True)
+        clf.fit(X, y)
+        expected_cutpoints = [
+            [5.45],
+            [3.35],
+            [2.45],
+            [0.8],
+        ]
+        expected_depths = [1] * 4
+        self.assertListEqual(expected_depths, clf.get_depths())
+        for expected, computed in zip(
+            expected_cutpoints, clf.get_cut_points()
+        ):
+            for e, c in zip(expected, computed):
+                self.assertAlmostEqual(e, c, delta=self.delta)
+
    def test_ArffFiles(self):
        loader = CArffFiles()
        loader.load(b"src/cppmdlp/tests/datasets/iris.arff")