add join_fit target info

2023-04-08 12:22:03 +02:00
parent e44bca0420
commit 0768d68a36
3 changed files with 21 additions and 16 deletions

k.py

@@ -1,12 +0,0 @@
from sklearn.datasets import load_wine
from fimdlp.mdlp import FImdlp
X, y = load_wine(return_X_y=True)
trans = FImdlp()
Xt = trans.join_transform(X, y, 12)
print("X shape = ", X.shape)
print("Xt.shape=", Xt.shape)
print("Xt ", Xt[:10])
print("trans.X_ shape = ", trans.X_.shape)
print("trans.y_ ", trans.y_[:10])
print("y_join ", trans.y_join_[:10])


@@ -6,8 +6,6 @@ from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from joblib import Parallel, delayed
from ._version import __version__
# from ._version import __version__
class FImdlp(TransformerMixin, BaseEstimator):
def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6, max_cuts=0):
@@ -24,6 +22,12 @@ class FImdlp(TransformerMixin, BaseEstimator):
The number of jobs to run in parallel. :meth:`fit` and
:meth:`transform` are parallelized over the features. ``-1`` means
using all cores available.
min_length: int, default=3
The minimum length an interval must have to be considered for discretization.
max_depth: int, default=1e6
The maximum depth of the discretization process.
max_cuts: float, default=0
The maximum number of cut points to be computed for each feature.
Attributes
----------
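For orientation, a minimal sketch of how these constructor parameters are used. It assumes the import path shown in the deleted k.py above and relies on the stock fit_transform provided by sklearn's TransformerMixin; the argument values are just the documented defaults.
from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp
X, y = load_iris(return_X_y=True)
# n_jobs=1 keeps the per-feature fit single-threaded; the rest are the documented defaults.
trans = FImdlp(n_jobs=1, min_length=3, max_depth=1e6, max_cuts=0)
Xt = trans.fit_transform(X, y)
print(X.shape, Xt.shape)  # Xt holds the interval-coded features, same shape as X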
@@ -109,6 +113,8 @@ class FImdlp(TransformerMixin, BaseEstimator):
delayed(self._fit_discretizer)(feature)
for feature in range(self.n_features_in_)
)
# target of every feature. Start with -1 => y (see join_fit)
self.target_ = [-1] * self.n_features_in_
return self
def _fit_discretizer(self, feature):
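With the new target_ bookkeeping, a plain fit() leaves every entry at -1, meaning each feature was discretized against the original y. A small sketch of what that looks like on iris, mirroring the test added below:
from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp
X, y = load_iris(return_X_y=True)
clf = FImdlp().fit(X, y)
# Every feature starts out discretized against y itself, encoded as -1.
print(clf.target_)  # [-1, -1, -1, -1] for the four iris features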
@@ -244,11 +250,12 @@ class FImdlp(TransformerMixin, BaseEstimator):
f"Target {target} not in range [0, {self.n_features_in_})"
)
if target in features:
raise ValueError("Target cannot in features to join")
raise ValueError("Target cannot be in features to join")
y_join = [
f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode()
for item_y, items_x in zip(self.y_, data[:, features])
]
self.target_[target] = features + [-1]
self.y_join_ = y_join
self.discretizer_[target].fit(self.X_[:, target], factorize(y_join))
self.cut_points_[target] = self.discretizer_[target].get_cut_points()
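join_fit() re-fits the discretizer of the target feature against a synthetic label built from y joined with the chosen features, and records features + [-1] in target_. A hedged sketch of the call and the resulting bookkeeping, matching test_join_fit_info below:
from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp
X, y = load_iris(return_X_y=True)
clf = FImdlp().fit(X, y)
# Re-discretize feature 1 against y joined with the values of features 0 and 2.
clf.join_fit([0, 2], 1, X)
print(clf.target_[1])  # [0, 2, -1]: the joined features, then -1 for y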


@@ -196,7 +196,7 @@ class FImdlpTest(unittest.TestCase):
clf.join_fit([0, 2], 2, x)
self.assertEqual(
str(exception.exception),
"Target cannot in features to join",
"Target cannot be in features to join",
)
def test_factorize(self):
@@ -209,6 +209,16 @@ class FImdlpTest(unittest.TestCase):
computed = clf.factorize(y)
self.assertListEqual([0, 1, 1, 2, 3], computed)
def test_join_fit_info(self):
clf = FImdlp()
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
clf.join_fit([0, 2], 1, X)
clf.join_fit([0, 3], 2, X)
clf.join_fit([1, 2], 3, X)
expected = [-1, [0, 2, -1], [0, 3, -1], [1, 2, -1]]
self.assertListEqual(expected, clf.target_)
@staticmethod
def test_sklearn_transformer():
for check, test in check_estimator(FImdlp(), generate_only=True):