From 0768d68a36d02fe4b99bd12092ac01b5d0247644 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 8 Apr 2023 12:22:03 +0200 Subject: [PATCH] add join_fit target info --- k.py | 12 ------------ src/fimdlp/mdlp.py | 13 ++++++++++--- src/fimdlp/tests/FImdlp_test.py | 12 +++++++++++- 3 files changed, 21 insertions(+), 16 deletions(-) delete mode 100644 k.py diff --git a/k.py b/k.py deleted file mode 100644 index 47e0856..0000000 --- a/k.py +++ /dev/null @@ -1,12 +0,0 @@ -from sklearn.datasets import load_wine -from fimdlp.mdlp import FImdlp - -X, y = load_wine(return_X_y=True) -trans = FImdlp() -Xt = trans.join_transform(X, y, 12) -print("X shape = ", X.shape) -print("Xt.shape=", Xt.shape) -print("Xt ", Xt[:10]) -print("trans.X_ shape = ", trans.X_.shape) -print("trans.y_ ", trans.y_[:10]) -print("y_join ", trans.y_join_[:10]) diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py index 5a8ea8c..36ce3a0 100644 --- a/src/fimdlp/mdlp.py +++ b/src/fimdlp/mdlp.py @@ -6,8 +6,6 @@ from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from joblib import Parallel, delayed from ._version import __version__ -# from ._version import __version__ - class FImdlp(TransformerMixin, BaseEstimator): def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6, max_cuts=0): @@ -24,6 +22,12 @@ class FImdlp(TransformerMixin, BaseEstimator): The number of jobs to run in parallel. :meth:`fit` and :meth:`transform`, are parallelized over the features. ``-1`` means using all cores available. + min_length: int, default=3 + The minimum length of an interval to be considered to be discretized. + max_depth: int, default=1e6 + The maximum depth of the discretization process. + max_cuts: float, default=0 + The maximum number of cut points to be computed for each feature. Attributes ---------- @@ -109,6 +113,8 @@ class FImdlp(TransformerMixin, BaseEstimator): delayed(self._fit_discretizer)(feature) for feature in range(self.n_features_in_) ) + # target of every feature. Start with -1 => y (see join_fit) + self.target_ = [-1] * self.n_features_in_ return self def _fit_discretizer(self, feature): @@ -244,11 +250,12 @@ class FImdlp(TransformerMixin, BaseEstimator): f"Target {target} not in range [0, {self.n_features_in_})" ) if target in features: - raise ValueError("Target cannot in features to join") + raise ValueError("Target cannot be in features to join") y_join = [ f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode() for item_y, items_x in zip(self.y_, data[:, features]) ] + self.target_[target] = features + [-1] self.y_join_ = y_join self.discretizer_[target].fit(self.X_[:, target], factorize(y_join)) self.cut_points_[target] = self.discretizer_[target].get_cut_points() diff --git a/src/fimdlp/tests/FImdlp_test.py b/src/fimdlp/tests/FImdlp_test.py index 111f960..0215509 100644 --- a/src/fimdlp/tests/FImdlp_test.py +++ b/src/fimdlp/tests/FImdlp_test.py @@ -196,7 +196,7 @@ class FImdlpTest(unittest.TestCase): clf.join_fit([0, 2], 2, x) self.assertEqual( str(exception.exception), - "Target cannot in features to join", + "Target cannot be in features to join", ) def test_factorize(self): @@ -209,6 +209,16 @@ class FImdlpTest(unittest.TestCase): computed = clf.factorize(y) self.assertListEqual([0, 1, 1, 2, 3], computed) + def test_join_fit_info(self): + clf = FImdlp() + X, y = load_iris(return_X_y=True) + clf.fit(X, y) + clf.join_fit([0, 2], 1, X) + clf.join_fit([0, 3], 2, X) + clf.join_fit([1, 2], 3, X) + expected = [-1, [0, 2, -1], [0, 3, -1], [1, 2, -1]] + self.assertListEqual(expected, clf.target_) + @staticmethod def test_sklearn_transformer(): for check, test in check_estimator(FImdlp(), generate_only=True):