Remove algorithm hyperparameter in discretizer

This commit is contained in:
2023-02-20 18:26:51 +01:00
parent 31d79a77fa
commit e0b7cae9a0
4 changed files with 126 additions and 129 deletions

View File

@@ -1,8 +1,4 @@
from ._version import __version__ from ._version import __version__
def version():
return __version__
all = ["FImdlp", "__version__"] all = ["FImdlp", "__version__"]

View File

@@ -6,17 +6,15 @@ from libcpp.string cimport string
cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp": cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp":
ctypedef float precision_t ctypedef float precision_t
cdef cppclass CPPFImdlp: cdef cppclass CPPFImdlp:
CPPFImdlp(int) except + CPPFImdlp() except +
CPPFImdlp& fit(vector[precision_t]&, vector[int]&) CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
vector[precision_t] getCutPoints() vector[precision_t] getCutPoints()
string version() string version()
cdef class CFImdlp: cdef class CFImdlp:
cdef CPPFImdlp *thisptr cdef CPPFImdlp *thisptr
cdef int algorithm def __cinit__(self):
def __cinit__(self, algorithm:int ): self.thisptr = new CPPFImdlp()
self.algorithm = algorithm
self.thisptr = new CPPFImdlp(algorithm)
def __dealloc__(self): def __dealloc__(self):
del self.thisptr del self.thisptr
def fit(self, X, y): def fit(self, X, y):
@@ -27,7 +25,7 @@ cdef class CFImdlp:
def get_version(self): def get_version(self):
return self.thisptr.version() return self.thisptr.version()
def __reduce__(self): def __reduce__(self):
return (CFImdlp, (self.algorithm,)) return (CFImdlp, ())
cdef extern from "Factorize.h" namespace "utils": cdef extern from "Factorize.h" namespace "utils":
vector[int] cppFactorize(vector[string] &input_vector) vector[int] cppFactorize(vector[string] &input_vector)

View File

@@ -4,22 +4,19 @@ from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.multiclass import unique_labels from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from joblib import Parallel, delayed from joblib import Parallel, delayed
from ._version import __version__
# from ._version import __version__
class FImdlp(TransformerMixin, BaseEstimator): class FImdlp(TransformerMixin, BaseEstimator):
def __init__(self, algorithm=0, n_jobs=-1): def __init__(self, n_jobs=-1):
self.algorithm = algorithm
self.n_jobs = n_jobs self.n_jobs = n_jobs
"""Fayyad - Irani MDLP discretization algorithm based implementation. """Fayyad - Irani MDLP discretization algorithm based implementation.
Parameters Parameters
---------- ----------
algorithm : int, default=0
The type of algorithm to use computing the cut points.
0 - Definitive implementation
1 - Alternative proposal
2 - Classic proposal
n_jobs : int, default=-1 n_jobs : int, default=-1
The number of jobs to run in parallel. :meth:`fit` and The number of jobs to run in parallel. :meth:`fit` and
:meth:`transform`, are parallelized over the features. ``-1`` means :meth:`transform`, are parallelized over the features. ``-1`` means
@@ -73,6 +70,10 @@ class FImdlp(TransformerMixin, BaseEstimator):
self.n_classes_ = self.classes_.shape[0] self.n_classes_ = self.classes_.shape[0]
self.n_features_in_ = X.shape[1] self.n_features_in_ = X.shape[1]
@staticmethod
def get_version():
    """Return the package version together with the wrapped library version.

    The result has the form ``<package version>(<library version>)``.
    The part in parentheses is whatever a freshly created ``CFImdlp``
    reports through its own ``get_version()``, decoded to ``str``.
    """
    return f"{__version__}({CFImdlp().get_version().decode()})"
def fit(self, X, y, **kwargs): def fit(self, X, y, **kwargs):
"""A reference implementation of a fitting function for a transformer. """A reference implementation of a fitting function for a transformer.
Parameters Parameters
@@ -104,7 +105,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
def _fit_discretizer(self, feature): def _fit_discretizer(self, feature):
if feature in self.features_: if feature in self.features_:
self.discretizer_[feature] = CFImdlp(algorithm=self.algorithm) self.discretizer_[feature] = CFImdlp()
self.discretizer_[feature].fit(self.X_[:, feature], self.y_) self.discretizer_[feature].fit(self.X_[:, feature], self.y_)
self.cut_points_[feature] = self.discretizer_[ self.cut_points_[feature] = self.discretizer_[
feature feature
@@ -205,6 +206,8 @@ class FImdlp(TransformerMixin, BaseEstimator):
index of the features to join with the labels index of the features to join with the labels
target : [int] target : [int]
index of the target variable to discretize index of the target variable to discretize
data: [array] shape (n_samples, n_features)
dataset that contains the features to join
Returns Returns
------- -------
@@ -227,11 +230,13 @@ class FImdlp(TransformerMixin, BaseEstimator):
raise ValueError( raise ValueError(
f"Target {target} not in range [0, {self.n_features_in_})" f"Target {target} not in range [0, {self.n_features_in_})"
) )
if target in features:
raise ValueError("Target cannot in features to join")
y_join = [ y_join = [
f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode() f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode()
for item_y, items_x in zip(self.y_, data[:, features]) for item_y, items_x in zip(self.y_, data[:, features])
] ]
self.y_join = y_join self.y_join_ = y_join
self.discretizer_[target].fit(self.X_[:, target], factorize(y_join)) self.discretizer_[target].fit(self.X_[:, target], factorize(y_join))
self.cut_points_[target] = self.discretizer_[target].get_cut_points() self.cut_points_[target] = self.discretizer_[target].get_cut_points()
# return the discretized target variable with the new cut points # return the discretized target variable with the new cut points

View File

@@ -3,67 +3,44 @@ import sklearn
import numpy as np import numpy as np
from sklearn.datasets import load_iris from sklearn.datasets import load_iris
from sklearn.utils.estimator_checks import check_estimator from sklearn.utils.estimator_checks import check_estimator
from ..cppfimdlp import factorize from ..cppfimdlp import CFImdlp, factorize
from ..mdlp import FImdlp from ..mdlp import FImdlp
from .. import version from .. import __version__
from .._version import __version__
# from .._version import __version__
class FImdlpTest(unittest.TestCase): class FImdlpTest(unittest.TestCase):
def test_version(self): def test_version(self):
self.assertEqual(version(), __version__) clf = FImdlp()
self.assertEqual(
clf.get_version(),
f"{__version__}({CFImdlp().get_version().decode()})",
)
def test_init(self): def test_init(self):
clf = FImdlp() clf = FImdlp()
self.assertEqual(-1, clf.n_jobs) self.assertEqual(-1, clf.n_jobs)
self.assertEqual(0, clf.algorithm) clf = FImdlp(n_jobs=7)
clf = FImdlp(algorithm=1, n_jobs=7)
self.assertEqual(1, clf.algorithm)
self.assertEqual(7, clf.n_jobs) self.assertEqual(7, clf.n_jobs)
def test_fit_definitive(self): def test_fit_definitive(self):
clf = FImdlp(algorithm=0) clf = FImdlp()
clf.fit([[1, 2], [3, 4]], [1, 2])
self.assertEqual(clf.n_features_in_, 2)
self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
self.assertListEqual(clf.y_.tolist(), [1, 2])
self.assertListEqual([[2.0], [3.0]], clf.get_cut_points())
X, y = load_iris(return_X_y=True) X, y = load_iris(return_X_y=True)
clf.fit(X, y) clf.fit(X, y)
self.assertEqual(clf.n_features_in_, 4) self.assertEqual(clf.n_features_in_, 4)
self.assertTrue(np.array_equal(X, clf.X_)) self.assertTrue(np.array_equal(X, clf.X_))
self.assertTrue(np.array_equal(y, clf.y_)) self.assertTrue(np.array_equal(y, clf.y_))
expected = [
[5.449999809265137, 6.25],
[2.8499999046325684, 3.0, 3.049999952316284, 3.3499999046325684],
[2.450000047683716, 4.75, 5.050000190734863],
[0.800000011920929, 1.4500000476837158, 1.75],
]
self.assertListEqual(expected, clf.get_cut_points())
self.assertListEqual([0, 1, 2, 3], clf.features_)
clf.fit(X, y, features=[0, 2, 3])
self.assertListEqual([0, 2, 3], clf.features_)
def test_fit_alternative(self):
clf = FImdlp(algorithm=1)
clf.fit([[1, 2], [3, 4]], [1, 2])
self.assertEqual(clf.n_features_in_, 2)
self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
self.assertListEqual(clf.y_.tolist(), [1, 2])
self.assertListEqual([[2], [3]], clf.get_cut_points())
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
self.assertEqual(clf.n_features_in_, 4)
self.assertTrue(np.array_equal(X, clf.X_))
self.assertTrue(np.array_equal(y, clf.y_))
expected = [ expected = [
[5.449999809265137, 5.75], [5.449999809265137, 5.75],
[2.8499999046325684, 3.3499999046325684], [2.75, 2.8499999046325684, 2.95, 3.05, 3.3499999046325684],
[2.450000047683716, 4.75], [2.45, 4.75, 5.050000190734863],
[0.800000011920929, 1.75], [0.8, 1.75],
] ]
self.assertListEqual(expected, clf.get_cut_points()) computed = clf.get_cut_points()
for item_computed, item_expected in zip(computed, expected):
for x_, y_ in zip(item_computed, item_expected):
self.assertAlmostEqual(x_, y_)
self.assertListEqual([0, 1, 2, 3], clf.features_) self.assertListEqual([0, 1, 2, 3], clf.features_)
clf.fit(X, y, features=[0, 2, 3]) clf.fit(X, y, features=[0, 2, 3])
self.assertListEqual([0, 2, 3], clf.features_) self.assertListEqual([0, 2, 3], clf.features_)
@@ -84,8 +61,12 @@ class FImdlpTest(unittest.TestCase):
clf.fit([[1, 2], [3, 4]], [1, 2], features=[0, 2]) clf.fit([[1, 2], [3, 4]], [1, 2], features=[0, 2])
def test_fit_features(self): def test_fit_features(self):
clf = FImdlp() clf = FImdlp(n_jobs=-1)
# Two samples don't have enough information to split
clf.fit([[1, -2], [3, 4]], [1, 2], features=[0]) clf.fit([[1, -2], [3, 4]], [1, 2], features=[0])
self.assertListEqual(clf.get_cut_points(), [[], []])
clf.fit([[1, -2], [3, 4], [5, 6]], [1, 2, 2], features=[0])
self.assertListEqual(clf.get_cut_points(), [[2], []])
res = clf.transform([[1, -2], [3, 4]]) res = clf.transform([[1, -2], [3, 4]])
self.assertListEqual(res.tolist(), [[0, -2], [1, 4]]) self.assertListEqual(res.tolist(), [[0, -2], [1, 4]])
X, y = load_iris(return_X_y=True) X, y = load_iris(return_X_y=True)
@@ -100,9 +81,9 @@ class FImdlpTest(unittest.TestCase):
) )
self.assertEqual(X_computed.dtype, np.float64) self.assertEqual(X_computed.dtype, np.float64)
def test_transform_definitive(self): def test_transform(self):
clf = FImdlp(algorithm=0) clf = FImdlp()
clf.fit([[1, 2], [3, 4]], [1, 2]) clf.fit([[1, 2], [3, 4], [5, 6]], [1, 2, 2])
self.assertEqual( self.assertEqual(
clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [1, 1]] clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [1, 1]]
) )
@@ -118,48 +99,18 @@ class FImdlpTest(unittest.TestCase):
self.assertEqual(X_transformed.dtype, np.int32) self.assertEqual(X_transformed.dtype, np.int32)
expected = [ expected = [
[1, 0, 1, 1], [1, 0, 1, 1],
[1, 1, 1, 1], [2, 3, 1, 1],
[1, 0, 1, 1],
[0, 0, 1, 1],
[1, 0, 1, 1],
[1, 1, 1, 1],
[1, 1, 1, 1],
]
self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
with self.assertRaises(ValueError):
clf.transform([[1, 2, 3], [4, 5, 6]])
with self.assertRaises(sklearn.exceptions.NotFittedError):
clf = FImdlp(algorithm=0)
clf.transform([[1, 2], [3, 4]])
def test_transform_alternative(self):
clf = FImdlp(algorithm=1)
clf.fit([[1, 2], [3, 4]], [1, 2])
self.assertEqual(
clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [1, 1]]
)
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
self.assertEqual(clf.n_features_in_, 4)
self.assertTrue(np.array_equal(X, clf.X_))
self.assertTrue(np.array_equal(y, clf.y_))
self.assertListEqual(
clf.transform(X).tolist(), clf.fit(X, y).transform(X).tolist()
)
expected = [
[1, 0, 1, 1],
[2, 1, 1, 1],
[2, 0, 1, 1], [2, 0, 1, 1],
[0, 0, 1, 1], [0, 0, 1, 1],
[1, 0, 1, 1], [1, 0, 1, 1],
[1, 1, 1, 1], [1, 3, 1, 1],
[1, 1, 1, 1], [1, 2, 1, 1],
] ]
self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected)) self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
clf.transform([[1, 2, 3], [4, 5, 6]]) clf.transform([[1, 2, 3], [4, 5, 6]])
with self.assertRaises(sklearn.exceptions.NotFittedError): with self.assertRaises(sklearn.exceptions.NotFittedError):
clf = FImdlp(algorithm=1) clf = FImdlp()
clf.transform([[1, 2], [3, 4]]) clf.transform([[1, 2], [3, 4]])
def test_cppfactorize(self): def test_cppfactorize(self):
@@ -180,40 +131,69 @@ class FImdlpTest(unittest.TestCase):
computed = factorize(source) computed = factorize(source)
self.assertListEqual(expected, computed) self.assertListEqual(expected, computed)
def test_join_transform(self): def test_join_fit(self):
y = ["f0", "f0", "f2", "f3", "f4"] y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"])
x = [ x = np.array(
[0, 1, 2, 3, 4], [
[0, 1, 2, 3, 4], [0, 1, 2, 3, 4],
[1, 2, 3, 4, 5], [0, 1, 2, 3, 4],
[2, 3, 4, 5, 6], [1, 2, 3, 4, 5],
[3, 4, 5, 6, 7], [2, 3, 4, 5, 6],
] [3, 4, 5, 6, 7],
expected = [ ]
[0, 0, 0, 0], )
[0, 0, 0, 0], expected = [0, 0, 1, 2, 2]
[1, 1, 1, 1],
[2, 2, 2, 2],
[2, 2, 2, 2],
]
clf = FImdlp() clf = FImdlp()
computed = clf.join_transform(x, y, 0) clf.fit(x, factorize(y))
for computed, expected in zip(computed, expected): computed = clf.join_fit([0, 2], 1, x)
self.assertListEqual(expected, computed.tolist()) self.assertListEqual(computed.tolist(), expected)
expected_y = [b"f00", b"f00", b"f21", b"f32", b"f43"] expected_y = [b"002", b"002", b"113", b"224", b"335"]
self.assertListEqual(expected_y, clf.y_join_) self.assertListEqual(expected_y, clf.y_join_)
def test_join_transform_error(self): def test_join_fit_error(self):
y = ["f0", "f0", "f2", "f3", "f4"] y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"])
x = [ x = np.array(
[0, 1, 2, 3, 4], [
[0, 1, 2, 3, 4], [0, 1, 2, 3, 4],
[1, 2, 3, 4, 5], [0, 1, 2, 3, 4],
[2, 3, 4, 5, 6], [1, 2, 3, 4, 5],
[3, 4, 5, 6, 7], [2, 3, 4, 5, 6],
] [3, 4, 5, 6, 7],
with self.assertRaises(ValueError): ]
FImdlp().join_transform(x, y, 5) )
clf = FImdlp()
clf.fit(x, factorize(y))
with self.assertRaises(ValueError) as exception:
clf.join_fit([], 1, x)
self.assertEqual(
str(exception.exception),
"Number of features must be in range [1, 5]",
)
with self.assertRaises(ValueError) as exception:
FImdlp().join_fit([0, 4], 1, x)
self.assertTrue(
str(exception.exception).startswith(
"This FImdlp instance is not fitted yet."
)
)
with self.assertRaises(ValueError) as exception:
clf.join_fit([0, 5], 1, x)
self.assertEqual(
str(exception.exception),
"Feature 5 not in range [0, 5)",
)
with self.assertRaises(ValueError) as exception:
clf.join_fit([0, 2], 5, x)
self.assertEqual(
str(exception.exception),
"Target 5 not in range [0, 5)",
)
with self.assertRaises(ValueError) as exception:
clf.join_fit([0, 2], 2, x)
self.assertEqual(
str(exception.exception),
"Target cannot in features to join",
)
def test_factorize(self): def test_factorize(self):
y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"]) y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"])
@@ -228,3 +208,21 @@ class FImdlpTest(unittest.TestCase):
def test_sklearn_transformer(self): def test_sklearn_transformer(self):
for check, test in check_estimator(FImdlp(), generate_only=True): for check, test in check_estimator(FImdlp(), generate_only=True):
test(check) test(check)
def test_states_feature(self):
    """Check the state list reported for every feature of the iris data.

    After fitting on iris, feature ``i`` is expected to report the states
    ``0 .. k_i - 1`` where ``k_i`` is taken from ``states_per_feature``.
    """
    X, y = load_iris(return_X_y=True)
    discretizer = FImdlp()
    discretizer.fit(X, y)
    # Expected number of discrete states per feature, in column order.
    states_per_feature = [3, 6, 4, 3]
    expected = [list(range(count)) for count in states_per_feature]
    for feature in range(X.shape[1]):
        self.assertListEqual(
            expected[feature], discretizer.get_states_feature(feature)
        )
def test_states_no_feature(self):
    """Check that asking for the states of feature index 4 returns None.

    The discretizer is fitted on the iris data before the query.
    """
    iris_X, iris_y = load_iris(return_X_y=True)
    discretizer = FImdlp()
    discretizer.fit(iris_X, iris_y)
    states = discretizer.get_states_feature(4)
    self.assertIsNone(states)