mirror of
https://github.com/Doctorado-ML/FImdlp.git
synced 2025-08-17 08:25:51 +00:00
Add max_cuts hyperparameter as in mdlp
This commit is contained in:
Submodule src/cppmdlp updated: a7d13f602d...ed7433672d
@@ -40,11 +40,10 @@ vector<int>& ArffFiles::getY()
|
|||||||
void ArffFiles::load(string fileName, bool classLast)
|
void ArffFiles::load(string fileName, bool classLast)
|
||||||
{
|
{
|
||||||
ifstream file(fileName);
|
ifstream file(fileName);
|
||||||
string keyword, attribute, type;
|
|
||||||
if (file.is_open()) {
|
if (file.is_open()) {
|
||||||
string line;
|
string line, keyword, attribute, type;
|
||||||
while (getline(file, line)) {
|
while (getline(file, line)) {
|
||||||
if (line[0] == '%' || line.empty() || line == "\r" || line == " ") {
|
if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
|
if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
|
||||||
@@ -79,7 +78,7 @@ void ArffFiles::generateDataset(bool classLast)
|
|||||||
X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
|
X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
|
||||||
vector<string> yy = vector<string>(lines.size(), "");
|
vector<string> yy = vector<string>(lines.size(), "");
|
||||||
int labelIndex = classLast ? attributes.size() : 0;
|
int labelIndex = classLast ? attributes.size() : 0;
|
||||||
for (int i = 0; i < lines.size(); i++) {
|
for (size_t i = 0; i < lines.size(); i++) {
|
||||||
stringstream ss(lines[i]);
|
stringstream ss(lines[i]);
|
||||||
string value;
|
string value;
|
||||||
int pos = 0, xIndex = 0;
|
int pos = 0, xIndex = 0;
|
||||||
|
@@ -12,7 +12,7 @@ cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp":
|
|||||||
ctypedef float precision_t
|
ctypedef float precision_t
|
||||||
cdef cppclass CPPFImdlp:
|
cdef cppclass CPPFImdlp:
|
||||||
CPPFImdlp() except +
|
CPPFImdlp() except +
|
||||||
CPPFImdlp(size_t, int) except +
|
CPPFImdlp(size_t, int, float) except +
|
||||||
CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
|
CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
|
||||||
int get_depth()
|
int get_depth()
|
||||||
vector[precision_t] getCutPoints()
|
vector[precision_t] getCutPoints()
|
||||||
@@ -20,8 +20,8 @@ cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp":
|
|||||||
|
|
||||||
cdef class CFImdlp:
|
cdef class CFImdlp:
|
||||||
cdef CPPFImdlp *thisptr
|
cdef CPPFImdlp *thisptr
|
||||||
def __cinit__(self, size_t min_length=3, int max_depth=INT_MAX):
|
def __cinit__(self, size_t min_length=3, int max_depth=INT_MAX, float max_cuts=0):
|
||||||
self.thisptr = new CPPFImdlp(min_length, max_depth)
|
self.thisptr = new CPPFImdlp(min_length, max_depth, max_cuts)
|
||||||
def __dealloc__(self):
|
def __dealloc__(self):
|
||||||
del self.thisptr
|
del self.thisptr
|
||||||
def fit(self, X, y):
|
def fit(self, X, y):
|
||||||
|
@@ -10,10 +10,11 @@ from ._version import __version__
|
|||||||
|
|
||||||
|
|
||||||
class FImdlp(TransformerMixin, BaseEstimator):
|
class FImdlp(TransformerMixin, BaseEstimator):
|
||||||
def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6):
|
def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6, max_cuts=0):
|
||||||
self.n_jobs = n_jobs
|
self.n_jobs = n_jobs
|
||||||
self.min_length = min_length
|
self.min_length = min_length
|
||||||
self.max_depth = max_depth
|
self.max_depth = max_depth
|
||||||
|
self.max_cuts = max_cuts
|
||||||
|
|
||||||
"""Fayyad - Irani MDLP discretization algorithm based implementation.
|
"""Fayyad - Irani MDLP discretization algorithm based implementation.
|
||||||
|
|
||||||
@@ -108,7 +109,9 @@ class FImdlp(TransformerMixin, BaseEstimator):
|
|||||||
def _fit_discretizer(self, feature):
|
def _fit_discretizer(self, feature):
|
||||||
if feature in self.features_:
|
if feature in self.features_:
|
||||||
self.discretizer_[feature] = CFImdlp(
|
self.discretizer_[feature] = CFImdlp(
|
||||||
min_length=self.min_length, max_depth=self.max_depth
|
min_length=self.min_length,
|
||||||
|
max_depth=self.max_depth,
|
||||||
|
max_cuts=self.max_cuts,
|
||||||
)
|
)
|
||||||
self.discretizer_[feature].fit(self.X_[:, feature], self.y_)
|
self.discretizer_[feature].fit(self.X_[:, feature], self.y_)
|
||||||
self.cut_points_[feature] = self.discretizer_[
|
self.cut_points_[feature] = self.discretizer_[
|
||||||
|
@@ -7,8 +7,6 @@ from ..cppfimdlp import CFImdlp, factorize, CArffFiles
|
|||||||
from ..mdlp import FImdlp
|
from ..mdlp import FImdlp
|
||||||
from .. import __version__
|
from .. import __version__
|
||||||
|
|
||||||
# from .._version import __version__
|
|
||||||
|
|
||||||
|
|
||||||
class FImdlpTest(unittest.TestCase):
|
class FImdlpTest(unittest.TestCase):
|
||||||
delta = 1e-6 # same tolerance as in C++ code
|
delta = 1e-6 # same tolerance as in C++ code
|
||||||
@@ -288,6 +286,24 @@ class FImdlpTest(unittest.TestCase):
|
|||||||
for e, c in zip(expected, computed):
|
for e, c in zip(expected, computed):
|
||||||
self.assertAlmostEqual(e, c, delta=self.delta)
|
self.assertAlmostEqual(e, c, delta=self.delta)
|
||||||
|
|
||||||
|
def test_max_cuts(self):
|
||||||
|
clf = FImdlp(max_cuts=1)
|
||||||
|
X, y = load_iris(return_X_y=True)
|
||||||
|
clf.fit(X, y)
|
||||||
|
expected_cutpoints = [
|
||||||
|
[5.45],
|
||||||
|
[3.35],
|
||||||
|
[2.45],
|
||||||
|
[0.8],
|
||||||
|
]
|
||||||
|
expected_depths = [1] * 4
|
||||||
|
self.assertListEqual(expected_depths, clf.get_depths())
|
||||||
|
for expected, computed in zip(
|
||||||
|
expected_cutpoints, clf.get_cut_points()
|
||||||
|
):
|
||||||
|
for e, c in zip(expected, computed):
|
||||||
|
self.assertAlmostEqual(e, c, delta=self.delta)
|
||||||
|
|
||||||
def test_ArffFiles(self):
|
def test_ArffFiles(self):
|
||||||
loader = CArffFiles()
|
loader = CArffFiles()
|
||||||
loader.load(b"src/cppmdlp/tests/datasets/iris.arff")
|
loader.load(b"src/cppmdlp/tests/datasets/iris.arff")
|
||||||
|
Reference in New Issue
Block a user