feat: ⚗️ Add join_transform method and cpp factorize

This commit is contained in:
2023-01-26 10:47:27 +01:00
parent 34cd54f77e
commit ca7d158ac8
7 changed files with 148 additions and 19 deletions

12
k.py Normal file
View File

@@ -0,0 +1,12 @@
# Smoke-test script: join feature 12 of the wine dataset with the class
# labels via FImdlp.join_transform and print the resulting shapes/values.
from sklearn.datasets import load_wine
from fimdlp.mdlp import FImdlp

features, labels = load_wine(return_X_y=True)
discretizer = FImdlp()
transformed = discretizer.join_transform(features, labels, 12)
print("X shape = ", features.shape)
print("Xt.shape=", transformed.shape)
print("Xt ", transformed[:10])
print("trans.X_ shape = ", discretizer.X_.shape)
print("trans.y_ ", discretizer.y_[:10])
print("y_join ", discretizer.y_join_[:10])

View File

@@ -14,10 +14,13 @@ setup(
"src/fimdlp/cfimdlp.pyx",
"src/cppmdlp/CPPFImdlp.cpp",
"src/cppmdlp/Metrics.cpp",
"src/fimdlp/Factorize.cpp",
],
language="c++",
include_dirs=["fimdlp"],
extra_compile_args=["-std=c++2a"],
extra_compile_args=[
"-std=c++11",
],
),
]
)

18
src/fimdlp/Factorize.cpp Normal file
View File

@@ -0,0 +1,18 @@
#include "Factorize.h"
namespace utils {
    // Encode a vector of string labels as integer codes, assigning codes
    // in order of first appearance (0 for the first distinct label, 1 for
    // the next new one, and so on).
    vector<int> cppFactorize(const vector<string>& labels_t)
    {
        vector<int> yy;
        yy.reserve(labels_t.size());
        map<string, int> labelMap;
        int nextCode = 0;
        // const reference: avoid copying every label string per iteration
        for (const string& label : labels_t) {
            // single lookup: insert() leaves the map untouched and returns
            // the existing entry when the key is already present
            auto result = labelMap.insert({ label, nextCode });
            if (result.second) {
                ++nextCode;
            }
            yy.push_back(result.first->second);
        }
        return yy;
    }
}

10
src/fimdlp/Factorize.h Normal file
View File

@@ -0,0 +1,10 @@
#ifndef FACTORIZE_H
#define FACTORIZE_H
// Declaration of the label-encoding helper: maps each distinct string
// label to an integer code, codes assigned in order of first appearance.
#include <vector>
#include <map>
#include <string>
namespace utils {
    // NOTE(review): a using-directive inside a header leaks std names into
    // every translation unit that includes it; the .cpp currently relies on
    // it — consider std:: qualification in a follow-up touching both files.
    using namespace std;
    vector<int> cppFactorize(const vector<string>&);
}
#endif

View File

@@ -24,3 +24,8 @@ cdef class CFImdlp:
return self.thisptr.getCutPoints()
def get_version(self):
    # Delegate to the wrapped C++ object's version string.
    return self.thisptr.version()
# Expose the C++ label-encoding helper declared in Factorize.h.
cdef extern from "Factorize.h" namespace "utils":
    vector[int] cppFactorize(vector[string] &input_vector)


def factorize(input_vector):
    """Encode a sequence of byte-string labels as integer codes.

    Codes are assigned in order of first appearance; repeated labels
    reuse the code of their first occurrence.
    """
    return cppFactorize(input_vector)

View File

@@ -1,5 +1,5 @@
import numpy as np
from .cppfimdlp import CFImdlp
from .cppfimdlp import CFImdlp, factorize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
@@ -33,21 +33,17 @@ class FImdlp(TransformerMixin, BaseEstimator):
The list of discretizers, one for each feature.
cut_points_ : list
The list of cut points for each feature.
X_ : array
the samples used to fit, shape (n_samples, n_features)
y_ : array
the labels used to fit, shape (n_samples,)
X_ : array, shape (n_samples, n_features)
the samples used to fit
y_ : array, shape(n_samples,)
the labels used to fit
features_ : list
the list of features to be discretized
"""
def _check_params_fit(self, X, y, expected_args, kwargs):
"""Check the common parameters passed to fit"""
def _check_args(self, X, y, expected_args, kwargs):
# Check that X and y have correct shape
X, y = check_X_y(X, y)
# Store the classes seen during fit
self.classes_ = unique_labels(y)
self.n_classes_ = self.classes_.shape[0]
# Default values
self.features_ = [i for i in range(X.shape[1])]
for key, value in kwargs.items():
@@ -68,15 +64,20 @@ class FImdlp(TransformerMixin, BaseEstimator):
raise ValueError("Feature index out of range")
return X, y
def _update_params(self, X, y):
    """Cache dataset-derived attributes: feature count and class set."""
    self.n_features_ = X.shape[1]
    # Record the distinct labels observed during fit
    self.classes_ = unique_labels(y)
    self.n_classes_ = len(self.classes_)
def fit(self, X, y, **kwargs):
"""A reference implementation of a fitting function for a transformer.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
X : array, shape (n_samples, n_features)
The training input samples.
y : None
There is no need of a target in a transformer, yet the pipeline API
requires this parameter.
y : array, shape (n_samples,)
the labels used to fit
features : list, default=[i for i in range(n_features)]
The list of features to be discretized.
Returns
@@ -84,10 +85,10 @@ class FImdlp(TransformerMixin, BaseEstimator):
self : object
Returns self.
"""
X, y = self._check_params_fit(
X, y = self._check_args(
X, y, expected_args=["features"], kwargs=kwargs
)
self.n_features_ = X.shape[1]
self._update_params(X, y)
self.X_ = X
self.y_ = y
self.discretizer_ = [None] * self.n_features_
@@ -119,7 +120,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
"""Discretize X values.
Parameters
----------
X : {array-like}, shape (n_samples, n_features)
X : array, shape (n_samples, n_features)
The input samples.
Returns
-------
@@ -146,6 +147,34 @@ class FImdlp(TransformerMixin, BaseEstimator):
)
return result
def join_transform(self, X, y, feature, **kwargs):
    """Join the selected feature with the labels and discretize the values
    join - fit - transform

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The input samples.
    y : array
        the labels used to fit
    feature : int
        index of the feature to join with the labels

    Returns
    -------
    array
        The discretized values of the remaining features (the joined
        feature column is removed before fit/transform).

    Raises
    ------
    ValueError
        If feature is not in the range [0, n_features).
    """
    # NOTE(review): X and y are validated again inside self.fit below, and
    # any `features` kwarg accepted here is recomputed by fit against the
    # reduced matrix — confirm the double validation is intended.
    X, y = self._check_args(
        X, y, expected_args=["features"], kwargs=kwargs
    )
    if feature < 0 or feature >= X.shape[1]:
        raise ValueError(
            f"Feature {feature} not in range [0, {X.shape[1]})"
        )
    # Build composite labels "<label><feature value>" as bytes, the
    # representation the C++ factorize helper expects.
    self.y_join_ = [
        f"{str(item_y)}{str(item_x)}".encode()
        for item_y, item_x in zip(y, X[:, feature])
    ]
    yy = factorize(self.y_join_)
    # Drop the joined column, then fit and discretize the rest.
    XX = np.delete(X, feature, axis=1)
    return self.fit(XX, yy).transform(XX)
def get_cut_points(self):
"""Get the cut points for each feature.
Returns

View File

@@ -1,7 +1,8 @@
import unittest
import sklearn
from sklearn.datasets import load_iris
import numpy as np
from sklearn.datasets import load_iris
from ..cppfimdlp import factorize
from ..mdlp import FImdlp
from .. import version
from .._version import __version__
@@ -159,3 +160,54 @@ class FImdlpTest(unittest.TestCase):
with self.assertRaises(sklearn.exceptions.NotFittedError):
clf = FImdlp(algorithm=1)
clf.transform([[1, 2], [3, 4]])
def test_factorize(self):
    """factorize assigns codes by first appearance and reuses them."""
    labels = [
        b"f0", b"f1", b"f2", b"f3", b"f4", b"f5",
        b"f6", b"f1", b"f1", b"f7", b"f8",
    ]
    expected_codes = [0, 1, 2, 3, 4, 5, 6, 1, 1, 7, 8]
    self.assertListEqual(expected_codes, factorize(labels))
def test_join_transform(self):
    """join_transform joins feature 0 with the labels and returns the
    discretized remaining four features."""
    y = ["f0", "f0", "f2", "f3", "f4"]
    x = [
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4],
        [1, 2, 3, 4, 5],
        [2, 3, 4, 5, 6],
        [3, 4, 5, 6, 7],
    ]
    expected = [
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [1, 1, 1, 1],
        [2, 2, 2, 2],
        [2, 2, 2, 2],
    ]
    clf = FImdlp()
    computed = clf.join_transform(x, y, 0)
    # Distinct loop variables: the original shadowed `computed` and
    # `expected` inside the zip, which only worked by accident.
    for computed_row, expected_row in zip(computed, expected):
        self.assertListEqual(expected_row, computed_row.tolist())
def test_join_transform_error(self):
    """join_transform raises ValueError for an out-of-range feature index."""
    labels = ["f0", "f0", "f2", "f3", "f4"]
    # Same fixture as the happy-path test: rows 0..4 shifted by 0,0,1,2,3.
    samples = [
        [base + offset for offset in range(5)]
        for base in (0, 0, 1, 2, 3)
    ]
    with self.assertRaises(ValueError):
        FImdlp().join_transform(samples, labels, 5)