feat: ⚗️ Add join_transform method and cpp factorize

This commit is contained in:
2023-01-26 10:47:27 +01:00
parent 34cd54f77e
commit ca7d158ac8
7 changed files with 148 additions and 19 deletions

12
k.py Normal file
View File

@@ -0,0 +1,12 @@
# Manual smoke test: run FImdlp.join_transform on the wine dataset and
# inspect the shapes and the first few rows of every derived attribute.
from sklearn.datasets import load_wine
from fimdlp.mdlp import FImdlp

features, labels = load_wine(return_X_y=True)
discretizer = FImdlp()
transformed = discretizer.join_transform(features, labels, 12)

print("X shape = ", features.shape)
print("Xt.shape=", transformed.shape)
print("Xt ", transformed[:10])
print("trans.X_ shape = ", discretizer.X_.shape)
print("trans.y_ ", discretizer.y_[:10])
print("y_join ", discretizer.y_join_[:10])

View File

@@ -14,10 +14,13 @@ setup(
"src/fimdlp/cfimdlp.pyx", "src/fimdlp/cfimdlp.pyx",
"src/cppmdlp/CPPFImdlp.cpp", "src/cppmdlp/CPPFImdlp.cpp",
"src/cppmdlp/Metrics.cpp", "src/cppmdlp/Metrics.cpp",
"src/fimdlp/Factorize.cpp",
], ],
language="c++", language="c++",
include_dirs=["fimdlp"], include_dirs=["fimdlp"],
extra_compile_args=["-std=c++2a"], extra_compile_args=[
"-std=c++11",
],
), ),
] ]
) )

18
src/fimdlp/Factorize.cpp Normal file
View File

@@ -0,0 +1,18 @@
#include "Factorize.h"
namespace utils {
    // Encode each distinct label string as a small integer id, assigned in
    // order of first appearance, and return one id per input element
    // (same contract as pandas.factorize codes, first-seen ordering).
    vector<int> cppFactorize(const vector<string>& labels_t)
    {
        vector<int> yy;
        yy.reserve(labels_t.size());
        map<string, int> labelMap;
        int nextCode = 0;
        // const reference: the original copied every string per iteration.
        for (const string& label : labels_t) {
            // Single lookup: emplace inserts nextCode only when the label is
            // new and otherwise returns the existing entry (the original did
            // a find plus up to two operator[] lookups).
            auto result = labelMap.emplace(label, nextCode);
            if (result.second) {
                ++nextCode;
            }
            yy.push_back(result.first->second);
        }
        return yy;
    }
}

10
src/fimdlp/Factorize.h Normal file
View File

@@ -0,0 +1,10 @@
// Declaration of the label-factorization helper exposed to the Cython wrapper.
#ifndef FACTORIZE_H
#define FACTORIZE_H
#include <vector>
#include <map>
#include <string>
namespace utils {
    // NOTE(review): `using namespace std;` in a header leaks std names into
    // every translation unit that includes it (inside namespace utils);
    // consider qualifying with std:: instead — left as-is because
    // Factorize.cpp currently relies on it.
    using namespace std;
    // Encode each distinct label with an integer id in order of first
    // appearance; returns one id per input element.
    vector<int> cppFactorize(const vector<string>&);
}
#endif

View File

@@ -24,3 +24,8 @@ cdef class CFImdlp:
return self.thisptr.getCutPoints() return self.thisptr.getCutPoints()
def get_version(self): def get_version(self):
return self.thisptr.version() return self.thisptr.version()
# Bridge to the C++ helper declared in src/fimdlp/Factorize.h.
cdef extern from "Factorize.h" namespace "utils":
    vector[int] cppFactorize(vector[string] &input_vector)

def factorize(input_vector):
    """Encode a sequence of byte strings as integer codes assigned in order
    of first appearance (thin wrapper over utils::cppFactorize)."""
    return cppFactorize(input_vector)

View File

@@ -1,5 +1,5 @@
import numpy as np import numpy as np
from .cppfimdlp import CFImdlp from .cppfimdlp import CFImdlp, factorize
from sklearn.base import BaseEstimator, TransformerMixin from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.multiclass import unique_labels from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
@@ -33,21 +33,17 @@ class FImdlp(TransformerMixin, BaseEstimator):
The list of discretizers, one for each feature. The list of discretizers, one for each feature.
cut_points_ : list cut_points_ : list
The list of cut points for each feature. The list of cut points for each feature.
X_ : array X_ : array, shape (n_samples, n_features)
the samples used to fit, shape (n_samples, n_features) the samples used to fit
y_ : array y_ : array, shape(n_samples,)
the labels used to fit, shape (n_samples,) the labels used to fit
features_ : list features_ : list
the list of features to be discretized the list of features to be discretized
""" """
def _check_params_fit(self, X, y, expected_args, kwargs): def _check_args(self, X, y, expected_args, kwargs):
"""Check the common parameters passed to fit"""
# Check that X and y have correct shape # Check that X and y have correct shape
X, y = check_X_y(X, y) X, y = check_X_y(X, y)
# Store the classes seen during fit
self.classes_ = unique_labels(y)
self.n_classes_ = self.classes_.shape[0]
# Default values # Default values
self.features_ = [i for i in range(X.shape[1])] self.features_ = [i for i in range(X.shape[1])]
for key, value in kwargs.items(): for key, value in kwargs.items():
@@ -68,15 +64,20 @@ class FImdlp(TransformerMixin, BaseEstimator):
raise ValueError("Feature index out of range") raise ValueError("Feature index out of range")
return X, y return X, y
def _update_params(self, X, y):
    """Cache fitted-dataset metadata: the classes seen during fit, their
    count, and the number of input features."""
    classes = unique_labels(y)
    self.classes_ = classes
    self.n_classes_ = classes.shape[0]
    self.n_features_ = X.shape[1]
def fit(self, X, y, **kwargs): def fit(self, X, y, **kwargs):
"""A reference implementation of a fitting function for a transformer. """A reference implementation of a fitting function for a transformer.
Parameters Parameters
---------- ----------
X : {array-like, sparse matrix}, shape (n_samples, n_features) X : array, shape (n_samples, n_features)
The training input samples. The training input samples.
y : None y : array, shape (n_samples,)
There is no need of a target in a transformer, yet the pipeline API the labels used to fit
requires this parameter.
features : list, default=[i for i in range(n_features)] features : list, default=[i for i in range(n_features)]
The list of features to be discretized. The list of features to be discretized.
Returns Returns
@@ -84,10 +85,10 @@ class FImdlp(TransformerMixin, BaseEstimator):
self : object self : object
Returns self. Returns self.
""" """
X, y = self._check_params_fit( X, y = self._check_args(
X, y, expected_args=["features"], kwargs=kwargs X, y, expected_args=["features"], kwargs=kwargs
) )
self.n_features_ = X.shape[1] self._update_params(X, y)
self.X_ = X self.X_ = X
self.y_ = y self.y_ = y
self.discretizer_ = [None] * self.n_features_ self.discretizer_ = [None] * self.n_features_
@@ -119,7 +120,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
"""Discretize X values. """Discretize X values.
Parameters Parameters
---------- ----------
X : {array-like}, shape (n_samples, n_features) X : array, shape (n_samples, n_features)
The input samples. The input samples.
Returns Returns
------- -------
@@ -146,6 +147,34 @@ class FImdlp(TransformerMixin, BaseEstimator):
) )
return result return result
def join_transform(self, X, y, feature, **kwargs):
    """Join the selected feature with the labels and discretize the values.

    join - fit - transform: column `feature` is concatenated with the
    labels to build a composite target, then the remaining columns are
    fitted against that target and discretized.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The input samples.
    y : array, shape (n_samples,)
        the labels used to fit
    feature : int
        index of the feature to join with the labels
    features : list, default=[i for i in range(n_features)]
        The list of features to be discretized.

    Returns
    -------
    array, shape (n_samples, n_features - 1)
        The discretized values of X with column `feature` removed.

    Raises
    ------
    ValueError
        If `feature` is not a valid column index of X.
    """
    X, y = self._check_args(
        X, y, expected_args=["features"], kwargs=kwargs
    )
    if feature < 0 or feature >= X.shape[1]:
        raise ValueError(
            f"Feature {feature} not in range [0, {X.shape[1]})"
        )
    # Composite target "<label><feature value>", encoded to bytes because
    # the C++ factorize implementation takes a vector of strings.
    self.y_join_ = [
        f"{str(item_y)}{str(item_x)}".encode()
        for item_y, item_x in zip(y, X[:, feature])
    ]
    yy = factorize(self.y_join_)
    # Fit and discretize the remaining columns against the joined target.
    XX = np.delete(X, feature, axis=1)
    return self.fit(XX, yy).transform(XX)
def get_cut_points(self): def get_cut_points(self):
"""Get the cut points for each feature. """Get the cut points for each feature.
Returns Returns

View File

@@ -1,7 +1,8 @@
import unittest import unittest
import sklearn import sklearn
from sklearn.datasets import load_iris
import numpy as np import numpy as np
from sklearn.datasets import load_iris
from ..cppfimdlp import factorize
from ..mdlp import FImdlp from ..mdlp import FImdlp
from .. import version from .. import version
from .._version import __version__ from .._version import __version__
@@ -159,3 +160,54 @@ class FImdlpTest(unittest.TestCase):
with self.assertRaises(sklearn.exceptions.NotFittedError): with self.assertRaises(sklearn.exceptions.NotFittedError):
clf = FImdlp(algorithm=1) clf = FImdlp(algorithm=1)
clf.transform([[1, 2], [3, 4]]) clf.transform([[1, 2], [3, 4]])
def test_factorize(self):
    """factorize assigns integer codes in order of first appearance."""
    expected = [0, 1, 2, 3, 4, 5, 6, 1, 1, 7, 8]
    # Same fixture as before, built from the expected codes: b"f0", b"f1", ...
    source = [f"f{code}".encode() for code in expected]
    computed = factorize(source)
    self.assertListEqual(expected, computed)
def test_join_transform(self):
    """join_transform on feature 0 discretizes the remaining columns."""
    y = ["f0", "f0", "f2", "f3", "f4"]
    x = [
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4],
        [1, 2, 3, 4, 5],
        [2, 3, 4, 5, 6],
        [3, 4, 5, 6, 7],
    ]
    expected = [
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [1, 1, 1, 1],
        [2, 2, 2, 2],
        [2, 2, 2, 2],
    ]
    clf = FImdlp()
    computed = clf.join_transform(x, y, 0)
    # Compare row by row. The original loop rebound `computed` and
    # `expected` as its loop variables, shadowing the lists being zipped.
    for computed_row, expected_row in zip(computed, expected):
        self.assertListEqual(expected_row, computed_row.tolist())
def test_join_transform_error(self):
    """An out-of-range feature index makes join_transform raise ValueError."""
    labels = ["f0", "f0", "f2", "f3", "f4"]
    samples = [
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4],
        [1, 2, 3, 4, 5],
        [2, 3, 4, 5, 6],
        [3, 4, 5, 6, 7],
    ]
    # Index 5 is one past the last valid column (0..4).
    with self.assertRaises(ValueError):
        FImdlp().join_transform(samples, labels, 5)