Add max_cuts hyperparameter as in mdlp

This commit is contained in:
2023-03-13 18:14:56 +01:00
parent aa55d3a340
commit ccce9725b3
5 changed files with 30 additions and 12 deletions

View File

@@ -40,11 +40,10 @@ vector<int>& ArffFiles::getY()
void ArffFiles::load(string fileName, bool classLast) void ArffFiles::load(string fileName, bool classLast)
{ {
ifstream file(fileName); ifstream file(fileName);
string keyword, attribute, type;
if (file.is_open()) { if (file.is_open()) {
string line; string line, keyword, attribute, type;
while (getline(file, line)) { while (getline(file, line)) {
if (line[0] == '%' || line.empty() || line == "\r" || line == " ") { if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
continue; continue;
} }
if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) { if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
@@ -79,7 +78,7 @@ void ArffFiles::generateDataset(bool classLast)
X = vector<vector<float>>(attributes.size(), vector<float>(lines.size())); X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
vector<string> yy = vector<string>(lines.size(), ""); vector<string> yy = vector<string>(lines.size(), "");
int labelIndex = classLast ? attributes.size() : 0; int labelIndex = classLast ? attributes.size() : 0;
for (int i = 0; i < lines.size(); i++) { for (size_t i = 0; i < lines.size(); i++) {
stringstream ss(lines[i]); stringstream ss(lines[i]);
string value; string value;
int pos = 0, xIndex = 0; int pos = 0, xIndex = 0;

View File

@@ -12,7 +12,7 @@ cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp":
ctypedef float precision_t ctypedef float precision_t
cdef cppclass CPPFImdlp: cdef cppclass CPPFImdlp:
CPPFImdlp() except + CPPFImdlp() except +
CPPFImdlp(size_t, int) except + CPPFImdlp(size_t, int, float) except +
CPPFImdlp& fit(vector[precision_t]&, vector[int]&) CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
int get_depth() int get_depth()
vector[precision_t] getCutPoints() vector[precision_t] getCutPoints()
@@ -20,8 +20,8 @@ cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp":
cdef class CFImdlp: cdef class CFImdlp:
cdef CPPFImdlp *thisptr cdef CPPFImdlp *thisptr
def __cinit__(self, size_t min_length=3, int max_depth=INT_MAX): def __cinit__(self, size_t min_length=3, int max_depth=INT_MAX, float max_cuts=0):
self.thisptr = new CPPFImdlp(min_length, max_depth) self.thisptr = new CPPFImdlp(min_length, max_depth, max_cuts)
def __dealloc__(self): def __dealloc__(self):
del self.thisptr del self.thisptr
def fit(self, X, y): def fit(self, X, y):

View File

@@ -10,10 +10,11 @@ from ._version import __version__
class FImdlp(TransformerMixin, BaseEstimator): class FImdlp(TransformerMixin, BaseEstimator):
def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6): def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6, max_cuts=0):
self.n_jobs = n_jobs self.n_jobs = n_jobs
self.min_length = min_length self.min_length = min_length
self.max_depth = max_depth self.max_depth = max_depth
self.max_cuts = max_cuts
"""Fayyad - Irani MDLP discretization algorithm based implementation. """Fayyad - Irani MDLP discretization algorithm based implementation.
@@ -108,7 +109,9 @@ class FImdlp(TransformerMixin, BaseEstimator):
def _fit_discretizer(self, feature): def _fit_discretizer(self, feature):
if feature in self.features_: if feature in self.features_:
self.discretizer_[feature] = CFImdlp( self.discretizer_[feature] = CFImdlp(
min_length=self.min_length, max_depth=self.max_depth min_length=self.min_length,
max_depth=self.max_depth,
max_cuts=self.max_cuts,
) )
self.discretizer_[feature].fit(self.X_[:, feature], self.y_) self.discretizer_[feature].fit(self.X_[:, feature], self.y_)
self.cut_points_[feature] = self.discretizer_[ self.cut_points_[feature] = self.discretizer_[

View File

@@ -7,8 +7,6 @@ from ..cppfimdlp import CFImdlp, factorize, CArffFiles
from ..mdlp import FImdlp from ..mdlp import FImdlp
from .. import __version__ from .. import __version__
# from .._version import __version__
class FImdlpTest(unittest.TestCase): class FImdlpTest(unittest.TestCase):
delta = 1e-6 # same tolerance as in C++ code delta = 1e-6 # same tolerance as in C++ code
@@ -288,6 +286,24 @@ class FImdlpTest(unittest.TestCase):
for e, c in zip(expected, computed): for e, c in zip(expected, computed):
self.assertAlmostEqual(e, c, delta=self.delta) self.assertAlmostEqual(e, c, delta=self.delta)
def test_max_cuts(self):
    """Fit FImdlp with ``max_cuts=1`` on iris and check depths and cut points.

    The discretizer is expected to report a depth of 1 for each of the
    four features and exactly one cut point per feature, compared with
    the ``self.delta`` tolerance.
    """
    clf = FImdlp(max_cuts=1)
    X, y = load_iris(return_X_y=True)
    clf.fit(X, y)
    expected_cutpoints = [
        [5.45],
        [3.35],
        [2.45],
        [0.8],
    ]
    expected_depths = [1] * 4
    self.assertListEqual(expected_depths, clf.get_depths())
    computed_cutpoints = clf.get_cut_points()
    # zip() stops at the shortest input, so without explicit length checks
    # a missing feature or a missing/extra cut point would go unnoticed.
    self.assertEqual(len(expected_cutpoints), len(computed_cutpoints))
    for expected, computed in zip(expected_cutpoints, computed_cutpoints):
        self.assertEqual(len(expected), len(computed))
        for e, c in zip(expected, computed):
            self.assertAlmostEqual(e, c, delta=self.delta)
def test_ArffFiles(self): def test_ArffFiles(self):
loader = CArffFiles() loader = CArffFiles()
loader.load(b"src/cppmdlp/tests/datasets/iris.arff") loader.load(b"src/cppmdlp/tests/datasets/iris.arff")