From ccce9725b36aaed0e223942139bcfcf845a1f375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Mon, 13 Mar 2023 18:14:56 +0100 Subject: [PATCH] Add max_cuts hyperparameter as in mdlp --- src/cppmdlp | 2 +- src/fimdlp/ArffFiles.cpp | 7 +++---- src/fimdlp/cfimdlp.pyx | 6 +++--- src/fimdlp/mdlp.py | 7 +++++-- src/fimdlp/tests/FImdlp_test.py | 20 ++++++++++++++++++-- 5 files changed, 30 insertions(+), 12 deletions(-) diff --git a/src/cppmdlp b/src/cppmdlp index a7d13f6..ed74336 160000 --- a/src/cppmdlp +++ b/src/cppmdlp @@ -1 +1 @@ -Subproject commit a7d13f602de3d347d1d5ad53bb654b6dedd4def1 +Subproject commit ed7433672d98745115fb5f0bc49fcbd7bf035427 diff --git a/src/fimdlp/ArffFiles.cpp b/src/fimdlp/ArffFiles.cpp index 470f5fa..4fbca78 100644 --- a/src/fimdlp/ArffFiles.cpp +++ b/src/fimdlp/ArffFiles.cpp @@ -40,11 +40,10 @@ vector& ArffFiles::getY() void ArffFiles::load(string fileName, bool classLast) { ifstream file(fileName); - string keyword, attribute, type; if (file.is_open()) { - string line; + string line, keyword, attribute, type; while (getline(file, line)) { - if (line[0] == '%' || line.empty() || line == "\r" || line == " ") { + if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { continue; } if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) { @@ -79,7 +78,7 @@ void ArffFiles::generateDataset(bool classLast) X = vector>(attributes.size(), vector(lines.size())); vector yy = vector(lines.size(), ""); int labelIndex = classLast ? 
attributes.size() : 0; - for (int i = 0; i < lines.size(); i++) { + for (size_t i = 0; i < lines.size(); i++) { stringstream ss(lines[i]); string value; int pos = 0, xIndex = 0; diff --git a/src/fimdlp/cfimdlp.pyx b/src/fimdlp/cfimdlp.pyx index fce850d..560bd13 100644 --- a/src/fimdlp/cfimdlp.pyx +++ b/src/fimdlp/cfimdlp.pyx @@ -12,7 +12,7 @@ cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp": ctypedef float precision_t cdef cppclass CPPFImdlp: CPPFImdlp() except + - CPPFImdlp(size_t, int) except + + CPPFImdlp(size_t, int, float) except + CPPFImdlp& fit(vector[precision_t]&, vector[int]&) int get_depth() vector[precision_t] getCutPoints() @@ -20,8 +20,8 @@ cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp": cdef class CFImdlp: cdef CPPFImdlp *thisptr - def __cinit__(self, size_t min_length=3, int max_depth=INT_MAX): - self.thisptr = new CPPFImdlp(min_length, max_depth) + def __cinit__(self, size_t min_length=3, int max_depth=INT_MAX, float max_cuts=0): + self.thisptr = new CPPFImdlp(min_length, max_depth, max_cuts) def __dealloc__(self): del self.thisptr def fit(self, X, y): diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py index 60521ec..f22be91 100644 --- a/src/fimdlp/mdlp.py +++ b/src/fimdlp/mdlp.py @@ -10,10 +10,11 @@ from ._version import __version__ class FImdlp(TransformerMixin, BaseEstimator): - def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6): + def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6, max_cuts=0): self.n_jobs = n_jobs self.min_length = min_length self.max_depth = max_depth + self.max_cuts = max_cuts """Fayyad - Irani MDLP discretization algorithm based implementation. 
@@ -108,7 +109,9 @@ class FImdlp(TransformerMixin, BaseEstimator): def _fit_discretizer(self, feature): if feature in self.features_: self.discretizer_[feature] = CFImdlp( - min_length=self.min_length, max_depth=self.max_depth + min_length=self.min_length, + max_depth=self.max_depth, + max_cuts=self.max_cuts, ) self.discretizer_[feature].fit(self.X_[:, feature], self.y_) self.cut_points_[feature] = self.discretizer_[ diff --git a/src/fimdlp/tests/FImdlp_test.py b/src/fimdlp/tests/FImdlp_test.py index afa92c9..111f960 100644 --- a/src/fimdlp/tests/FImdlp_test.py +++ b/src/fimdlp/tests/FImdlp_test.py @@ -7,8 +7,6 @@ from ..cppfimdlp import CFImdlp, factorize, CArffFiles from ..mdlp import FImdlp from .. import __version__ -# from .._version import __version__ - class FImdlpTest(unittest.TestCase): delta = 1e-6 # same tolerance as in C++ code @@ -288,6 +286,24 @@ class FImdlpTest(unittest.TestCase): for e, c in zip(expected, computed): self.assertAlmostEqual(e, c, delta=self.delta) + def test_max_cuts(self): + clf = FImdlp(max_cuts=1) + X, y = load_iris(return_X_y=True) + clf.fit(X, y) + expected_cutpoints = [ + [5.45], + [3.35], + [2.45], + [0.8], + ] + expected_depths = [1] * 4 + self.assertListEqual(expected_depths, clf.get_depths()) + for expected, computed in zip( + expected_cutpoints, clf.get_cut_points() + ): + for e, c in zip(expected, computed): + self.assertAlmostEqual(e, c, delta=self.delta) + def test_ArffFiles(self): loader = CArffFiles() loader.load(b"src/cppmdlp/tests/datasets/iris.arff")