From e0b7cae9a0902b1af661b434963038e32752f969 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Mon, 20 Feb 2023 18:26:51 +0100 Subject: [PATCH] Remove algorithm hyperparameter in discretizer --- src/fimdlp/__init__.py | 4 - src/fimdlp/cfimdlp.pyx | 10 +- src/fimdlp/mdlp.py | 23 ++-- src/fimdlp/tests/FImdlp_test.py | 218 ++++++++++++++++---------------- 4 files changed, 126 insertions(+), 129 deletions(-) diff --git a/src/fimdlp/__init__.py b/src/fimdlp/__init__.py index 3a99d3b..0abf8ef 100644 --- a/src/fimdlp/__init__.py +++ b/src/fimdlp/__init__.py @@ -1,8 +1,4 @@ from ._version import __version__ -def version(): - return __version__ - - all = ["FImdlp", "__version__"] diff --git a/src/fimdlp/cfimdlp.pyx b/src/fimdlp/cfimdlp.pyx index 18e1d81..8892e8b 100644 --- a/src/fimdlp/cfimdlp.pyx +++ b/src/fimdlp/cfimdlp.pyx @@ -6,17 +6,15 @@ from libcpp.string cimport string cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp": ctypedef float precision_t cdef cppclass CPPFImdlp: - CPPFImdlp(int) except + + CPPFImdlp() except + CPPFImdlp& fit(vector[precision_t]&, vector[int]&) vector[precision_t] getCutPoints() string version() cdef class CFImdlp: cdef CPPFImdlp *thisptr - cdef int algorithm - def __cinit__(self, algorithm:int ): - self.algorithm = algorithm - self.thisptr = new CPPFImdlp(algorithm) + def __cinit__(self): + self.thisptr = new CPPFImdlp() def __dealloc__(self): del self.thisptr def fit(self, X, y): @@ -27,7 +25,7 @@ cdef class CFImdlp: def get_version(self): return self.thisptr.version() def __reduce__(self): - return (CFImdlp, (self.algorithm,)) + return (CFImdlp, ()) cdef extern from "Factorize.h" namespace "utils": vector[int] cppFactorize(vector[string] &input_vector) diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py index 808e75a..0378ebf 100644 --- a/src/fimdlp/mdlp.py +++ b/src/fimdlp/mdlp.py @@ -4,22 +4,19 @@ from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.multiclass import unique_labels from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from joblib import Parallel, delayed +from ._version import __version__ + +# from ._version import __version__ class FImdlp(TransformerMixin, BaseEstimator): - def __init__(self, algorithm=0, n_jobs=-1): - self.algorithm = algorithm + def __init__(self, n_jobs=-1): self.n_jobs = n_jobs """Fayyad - Irani MDLP discretization algorithm based implementation. Parameters ---------- - algorithm : int, default=0 - The type of algorithm to use computing the cut points. - 0 - Definitive implementation - 1 - Alternative proposal - 2 - Classic proposal n_jobs : int, default=-1 The number of jobs to run in parallel. :meth:`fit` and :meth:`transform`, are parallelized over the features. ``-1`` means @@ -73,6 +70,10 @@ class FImdlp(TransformerMixin, BaseEstimator): self.n_classes_ = self.classes_.shape[0] self.n_features_in_ = X.shape[1] + @staticmethod + def get_version(): + return f"{__version__}({CFImdlp().get_version().decode()})" + def fit(self, X, y, **kwargs): """A reference implementation of a fitting function for a transformer. Parameters @@ -104,7 +105,7 @@ class FImdlp(TransformerMixin, BaseEstimator): def _fit_discretizer(self, feature): if feature in self.features_: - self.discretizer_[feature] = CFImdlp(algorithm=self.algorithm) + self.discretizer_[feature] = CFImdlp() self.discretizer_[feature].fit(self.X_[:, feature], self.y_) self.cut_points_[feature] = self.discretizer_[ feature @@ -205,6 +206,8 @@ class FImdlp(TransformerMixin, BaseEstimator): index of the features to join with the labels target : [int] index of the target variable to discretize + data: [array] shape (n_samples, n_features) + dataset that contains the features to join Returns ------- @@ -227,11 +230,13 @@ class FImdlp(TransformerMixin, BaseEstimator): raise ValueError( f"Target {target} not in range [0, {self.n_features_in_})" ) + if target in features: + raise ValueError("Target cannot in features to join") y_join = [ f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode() for item_y, items_x in zip(self.y_, data[:, features]) ] - self.y_join = y_join + self.y_join_ = y_join self.discretizer_[target].fit(self.X_[:, target], factorize(y_join)) self.cut_points_[target] = self.discretizer_[target].get_cut_points() # return the discretized target variable with the new cut points diff --git a/src/fimdlp/tests/FImdlp_test.py b/src/fimdlp/tests/FImdlp_test.py index 5e522f3..fcfd888 100644 --- a/src/fimdlp/tests/FImdlp_test.py +++ b/src/fimdlp/tests/FImdlp_test.py @@ -3,67 +3,44 @@ import sklearn import numpy as np from sklearn.datasets import load_iris from sklearn.utils.estimator_checks import check_estimator -from ..cppfimdlp import factorize +from ..cppfimdlp import CFImdlp, factorize from ..mdlp import FImdlp -from .. import version -from .._version import __version__ +from .. import __version__ + +# from .._version import __version__ class FImdlpTest(unittest.TestCase): def test_version(self): - self.assertEqual(version(), __version__) + clf = FImdlp() + self.assertEqual( + clf.get_version(), + f"{__version__}({CFImdlp().get_version().decode()})", + ) def test_init(self): clf = FImdlp() self.assertEqual(-1, clf.n_jobs) - self.assertEqual(0, clf.algorithm) - clf = FImdlp(algorithm=1, n_jobs=7) - self.assertEqual(1, clf.algorithm) + clf = FImdlp(n_jobs=7) self.assertEqual(7, clf.n_jobs) def test_fit_definitive(self): - clf = FImdlp(algorithm=0) - clf.fit([[1, 2], [3, 4]], [1, 2]) - self.assertEqual(clf.n_features_in_, 2) - self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]]) - self.assertListEqual(clf.y_.tolist(), [1, 2]) - self.assertListEqual([[2.0], [3.0]], clf.get_cut_points()) + clf = FImdlp() X, y = load_iris(return_X_y=True) clf.fit(X, y) self.assertEqual(clf.n_features_in_, 4) self.assertTrue(np.array_equal(X, clf.X_)) self.assertTrue(np.array_equal(y, clf.y_)) - expected = [ - [5.449999809265137, 6.25], - [2.8499999046325684, 3.0, 3.049999952316284, 3.3499999046325684], - [2.450000047683716, 4.75, 5.050000190734863], - [0.800000011920929, 1.4500000476837158, 1.75], - ] - self.assertListEqual(expected, clf.get_cut_points()) - self.assertListEqual([0, 1, 2, 3], clf.features_) - clf.fit(X, y, features=[0, 2, 3]) - self.assertListEqual([0, 2, 3], clf.features_) - - def test_fit_alternative(self): - clf = FImdlp(algorithm=1) - clf.fit([[1, 2], [3, 4]], [1, 2]) - self.assertEqual(clf.n_features_in_, 2) - self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]]) - self.assertListEqual(clf.y_.tolist(), [1, 2]) - self.assertListEqual([[2], [3]], clf.get_cut_points()) - X, y = load_iris(return_X_y=True) - clf.fit(X, y) - self.assertEqual(clf.n_features_in_, 4) - self.assertTrue(np.array_equal(X, clf.X_)) - self.assertTrue(np.array_equal(y, clf.y_)) - expected = [ [5.449999809265137, 5.75], - [2.8499999046325684, 3.3499999046325684], - [2.450000047683716, 4.75], - [0.800000011920929, 1.75], + [2.75, 2.8499999046325684, 2.95, 3.05, 3.3499999046325684], + [2.45, 4.75, 5.050000190734863], + [0.8, 1.75], ] - self.assertListEqual(expected, clf.get_cut_points()) + computed = clf.get_cut_points() + for item_computed, item_expected in zip(computed, expected): + for x_, y_ in zip(item_computed, item_expected): + self.assertAlmostEqual(x_, y_) self.assertListEqual([0, 1, 2, 3], clf.features_) clf.fit(X, y, features=[0, 2, 3]) self.assertListEqual([0, 2, 3], clf.features_) @@ -84,8 +61,12 @@ class FImdlpTest(unittest.TestCase): clf.fit([[1, 2], [3, 4]], [1, 2], features=[0, 2]) def test_fit_features(self): - clf = FImdlp() + clf = FImdlp(n_jobs=-1) + # Two samples doesn't have enough information to split clf.fit([[1, -2], [3, 4]], [1, 2], features=[0]) + self.assertListEqual(clf.get_cut_points(), [[], []]) + clf.fit([[1, -2], [3, 4], [5, 6]], [1, 2, 2], features=[0]) + self.assertListEqual(clf.get_cut_points(), [[2], []]) res = clf.transform([[1, -2], [3, 4]]) self.assertListEqual(res.tolist(), [[0, -2], [1, 4]]) X, y = load_iris(return_X_y=True) @@ -100,9 +81,9 @@ class FImdlpTest(unittest.TestCase): ) self.assertEqual(X_computed.dtype, np.float64) - def test_transform_definitive(self): - clf = FImdlp(algorithm=0) - clf.fit([[1, 2], [3, 4]], [1, 2]) + def test_transform(self): + clf = FImdlp() + clf.fit([[1, 2], [3, 4], [5, 6]], [1, 2, 2]) self.assertEqual( clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [1, 1]] ) @@ -118,48 +99,18 @@ class FImdlpTest(unittest.TestCase): self.assertEqual(X_transformed.dtype, np.int32) expected = [ [1, 0, 1, 1], - [1, 1, 1, 1], - [1, 0, 1, 1], - [0, 0, 1, 1], - [1, 0, 1, 1], - [1, 1, 1, 1], - [1, 1, 1, 1], - ] - self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected)) - with self.assertRaises(ValueError): - clf.transform([[1, 2, 3], [4, 5, 6]]) - with self.assertRaises(sklearn.exceptions.NotFittedError): - clf = FImdlp(algorithm=0) - clf.transform([[1, 2], [3, 4]]) - - def test_transform_alternative(self): - clf = FImdlp(algorithm=1) - clf.fit([[1, 2], [3, 4]], [1, 2]) - self.assertEqual( - clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [1, 1]] - ) - X, y = load_iris(return_X_y=True) - clf.fit(X, y) - self.assertEqual(clf.n_features_in_, 4) - self.assertTrue(np.array_equal(X, clf.X_)) - self.assertTrue(np.array_equal(y, clf.y_)) - self.assertListEqual( - clf.transform(X).tolist(), clf.fit(X, y).transform(X).tolist() - ) - expected = [ - [1, 0, 1, 1], - [2, 1, 1, 1], + [2, 3, 1, 1], [2, 0, 1, 1], [0, 0, 1, 1], [1, 0, 1, 1], - [1, 1, 1, 1], - [1, 1, 1, 1], + [1, 3, 1, 1], + [1, 2, 1, 1], ] self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected)) with self.assertRaises(ValueError): clf.transform([[1, 2, 3], [4, 5, 6]]) with self.assertRaises(sklearn.exceptions.NotFittedError): - clf = FImdlp(algorithm=1) + clf = FImdlp() clf.transform([[1, 2], [3, 4]]) def test_cppfactorize(self): @@ -180,40 +131,69 @@ class FImdlpTest(unittest.TestCase): computed = factorize(source) self.assertListEqual(expected, computed) - def test_join_transform(self): - y = ["f0", "f0", "f2", "f3", "f4"] - x = [ - [0, 1, 2, 3, 4], - [0, 1, 2, 3, 4], - [1, 2, 3, 4, 5], - [2, 3, 4, 5, 6], - [3, 4, 5, 6, 7], - ] - expected = [ - [0, 0, 0, 0], - [0, 0, 0, 0], - [1, 1, 1, 1], - [2, 2, 2, 2], - [2, 2, 2, 2], - ] + def test_join_fit(self): + y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"]) + x = np.array( + [ + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [1, 2, 3, 4, 5], + [2, 3, 4, 5, 6], + [3, 4, 5, 6, 7], + ] + ) + expected = [0, 0, 1, 2, 2] clf = FImdlp() - computed = clf.join_transform(x, y, 0) - for computed, expected in zip(computed, expected): - self.assertListEqual(expected, computed.tolist()) - expected_y = [b"f00", b"f00", b"f21", b"f32", b"f43"] + clf.fit(x, factorize(y)) + computed = clf.join_fit([0, 2], 1, x) + self.assertListEqual(computed.tolist(), expected) + expected_y = [b"002", b"002", b"113", b"224", b"335"] self.assertListEqual(expected_y, clf.y_join_) - def test_join_transform_error(self): - y = ["f0", "f0", "f2", "f3", "f4"] - x = [ - [0, 1, 2, 3, 4], - [0, 1, 2, 3, 4], - [1, 2, 3, 4, 5], - [2, 3, 4, 5, 6], - [3, 4, 5, 6, 7], - ] - with self.assertRaises(ValueError): - FImdlp().join_transform(x, y, 5) + def test_join_fit_error(self): + y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"]) + x = np.array( + [ + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [1, 2, 3, 4, 5], + [2, 3, 4, 5, 6], + [3, 4, 5, 6, 7], + ] + ) + clf = FImdlp() + clf.fit(x, factorize(y)) + with self.assertRaises(ValueError) as exception: + clf.join_fit([], 1, x) + self.assertEqual( + str(exception.exception), + "Number of features must be in range [1, 5]", + ) + with self.assertRaises(ValueError) as exception: + FImdlp().join_fit([0, 4], 1, x) + self.assertTrue( + str(exception.exception).startswith( + "This FImdlp instance is not fitted yet." + ) + ) + with self.assertRaises(ValueError) as exception: + clf.join_fit([0, 5], 1, x) + self.assertEqual( + str(exception.exception), + "Feature 5 not in range [0, 5)", + ) + with self.assertRaises(ValueError) as exception: + clf.join_fit([0, 2], 5, x) + self.assertEqual( + str(exception.exception), + "Target 5 not in range [0, 5)", + ) + with self.assertRaises(ValueError) as exception: + clf.join_fit([0, 2], 2, x) + self.assertEqual( + str(exception.exception), + "Target cannot in features to join", + ) def test_factorize(self): y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"]) @@ -228,3 +208,21 @@ class FImdlpTest(unittest.TestCase): def test_sklearn_transformer(self): for check, test in check_estimator(FImdlp(), generate_only=True): test(check) + + def test_states_feature(self): + clf = FImdlp() + X, y = load_iris(return_X_y=True) + clf.fit(X, y) + expected = [] + for i in [3, 6, 4, 3]: + expected.append(list(range(i))) + for feature in range(X.shape[1]): + self.assertListEqual( + expected[feature], clf.get_states_feature(feature) + ) + + def test_states_no_feature(self): + clf = FImdlp() + X, y = load_iris(return_X_y=True) + clf.fit(X, y) + self.assertIsNone(clf.get_states_feature(4))