From e0b7cae9a0902b1af661b434963038e32752f969 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= <rmontanana@gmail.com>
Date: Mon, 20 Feb 2023 18:26:51 +0100
Subject: [PATCH] Remove algorithm hyperparameter in  discretizer

---
 src/fimdlp/__init__.py          |   4 -
 src/fimdlp/cfimdlp.pyx          |  10 +-
 src/fimdlp/mdlp.py              |  23 ++--
 src/fimdlp/tests/FImdlp_test.py | 218 ++++++++++++++++----------------
 4 files changed, 126 insertions(+), 129 deletions(-)

diff --git a/src/fimdlp/__init__.py b/src/fimdlp/__init__.py
index 3a99d3b..0abf8ef 100644
--- a/src/fimdlp/__init__.py
+++ b/src/fimdlp/__init__.py
@@ -1,8 +1,4 @@
 from ._version import __version__
 
 
-def version():
-    return __version__
-
-
 all = ["FImdlp", "__version__"]
diff --git a/src/fimdlp/cfimdlp.pyx b/src/fimdlp/cfimdlp.pyx
index 18e1d81..8892e8b 100644
--- a/src/fimdlp/cfimdlp.pyx
+++ b/src/fimdlp/cfimdlp.pyx
@@ -6,17 +6,15 @@ from libcpp.string cimport string
 cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp":
     ctypedef float precision_t
     cdef cppclass CPPFImdlp:
-        CPPFImdlp(int) except + 
+        CPPFImdlp() except + 
         CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
         vector[precision_t] getCutPoints()
         string version()
         
 cdef class CFImdlp:
     cdef CPPFImdlp *thisptr
-    cdef int algorithm
-    def __cinit__(self, algorithm:int ):
-        self.algorithm = algorithm
-        self.thisptr = new CPPFImdlp(algorithm)
+    def __cinit__(self):
+        self.thisptr = new CPPFImdlp()
     def __dealloc__(self):
         del self.thisptr
     def fit(self, X, y):
@@ -27,7 +25,7 @@ cdef class CFImdlp:
     def get_version(self):
         return self.thisptr.version()
     def __reduce__(self):
-        return (CFImdlp, (self.algorithm,))
+        return (CFImdlp, ())
 
 cdef extern from "Factorize.h" namespace "utils":
     vector[int] cppFactorize(vector[string] &input_vector)
diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py
index 808e75a..0378ebf 100644
--- a/src/fimdlp/mdlp.py
+++ b/src/fimdlp/mdlp.py
@@ -4,22 +4,19 @@ from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils.multiclass import unique_labels
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 from joblib import Parallel, delayed
+from ._version import __version__
+
+# from ._version import __version__
 
 
 class FImdlp(TransformerMixin, BaseEstimator):
-    def __init__(self, algorithm=0, n_jobs=-1):
-        self.algorithm = algorithm
+    def __init__(self, n_jobs=-1):
         self.n_jobs = n_jobs
 
     """Fayyad - Irani MDLP discretization algorithm based implementation.
 
     Parameters
     ----------
-    algorithm : int, default=0
-        The type of algorithm to use computing the cut points.
-        0 - Definitive implementation
-        1 - Alternative proposal
-        2 - Classic proposal
     n_jobs : int, default=-1
         The number of jobs to run in parallel. :meth:`fit` and
         :meth:`transform`, are parallelized over the features. ``-1`` means
@@ -73,6 +70,10 @@ class FImdlp(TransformerMixin, BaseEstimator):
         self.n_classes_ = self.classes_.shape[0]
         self.n_features_in_ = X.shape[1]
 
+    @staticmethod
+    def get_version():
+        return f"{__version__}({CFImdlp().get_version().decode()})"
+
     def fit(self, X, y, **kwargs):
         """A reference implementation of a fitting function for a transformer.
         Parameters
@@ -104,7 +105,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
 
     def _fit_discretizer(self, feature):
         if feature in self.features_:
-            self.discretizer_[feature] = CFImdlp(algorithm=self.algorithm)
+            self.discretizer_[feature] = CFImdlp()
             self.discretizer_[feature].fit(self.X_[:, feature], self.y_)
             self.cut_points_[feature] = self.discretizer_[
                 feature
@@ -205,6 +206,8 @@ class FImdlp(TransformerMixin, BaseEstimator):
             index of the features to join with the labels
         target : [int]
             index of the target variable to discretize
+        data: [array] shape (n_samples, n_features)
+            dataset that contains the features to join
 
         Returns
         -------
@@ -227,11 +230,13 @@ class FImdlp(TransformerMixin, BaseEstimator):
             raise ValueError(
                 f"Target {target} not in range [0, {self.n_features_in_})"
             )
+        if target in features:
+            raise ValueError("Target cannot in features to join")
         y_join = [
             f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode()
             for item_y, items_x in zip(self.y_, data[:, features])
         ]
-        self.y_join = y_join
+        self.y_join_ = y_join
         self.discretizer_[target].fit(self.X_[:, target], factorize(y_join))
         self.cut_points_[target] = self.discretizer_[target].get_cut_points()
         # return the discretized target variable with the new cut points
diff --git a/src/fimdlp/tests/FImdlp_test.py b/src/fimdlp/tests/FImdlp_test.py
index 5e522f3..fcfd888 100644
--- a/src/fimdlp/tests/FImdlp_test.py
+++ b/src/fimdlp/tests/FImdlp_test.py
@@ -3,67 +3,44 @@ import sklearn
 import numpy as np
 from sklearn.datasets import load_iris
 from sklearn.utils.estimator_checks import check_estimator
-from ..cppfimdlp import factorize
+from ..cppfimdlp import CFImdlp, factorize
 from ..mdlp import FImdlp
-from .. import version
-from .._version import __version__
+from .. import __version__
+
+# from .._version import __version__
 
 
 class FImdlpTest(unittest.TestCase):
     def test_version(self):
-        self.assertEqual(version(), __version__)
+        clf = FImdlp()
+        self.assertEqual(
+            clf.get_version(),
+            f"{__version__}({CFImdlp().get_version().decode()})",
+        )
 
     def test_init(self):
         clf = FImdlp()
         self.assertEqual(-1, clf.n_jobs)
-        self.assertEqual(0, clf.algorithm)
-        clf = FImdlp(algorithm=1, n_jobs=7)
-        self.assertEqual(1, clf.algorithm)
+        clf = FImdlp(n_jobs=7)
         self.assertEqual(7, clf.n_jobs)
 
     def test_fit_definitive(self):
-        clf = FImdlp(algorithm=0)
-        clf.fit([[1, 2], [3, 4]], [1, 2])
-        self.assertEqual(clf.n_features_in_, 2)
-        self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
-        self.assertListEqual(clf.y_.tolist(), [1, 2])
-        self.assertListEqual([[2.0], [3.0]], clf.get_cut_points())
+        clf = FImdlp()
         X, y = load_iris(return_X_y=True)
         clf.fit(X, y)
         self.assertEqual(clf.n_features_in_, 4)
         self.assertTrue(np.array_equal(X, clf.X_))
         self.assertTrue(np.array_equal(y, clf.y_))
-        expected = [
-            [5.449999809265137, 6.25],
-            [2.8499999046325684, 3.0, 3.049999952316284, 3.3499999046325684],
-            [2.450000047683716, 4.75, 5.050000190734863],
-            [0.800000011920929, 1.4500000476837158, 1.75],
-        ]
-        self.assertListEqual(expected, clf.get_cut_points())
-        self.assertListEqual([0, 1, 2, 3], clf.features_)
-        clf.fit(X, y, features=[0, 2, 3])
-        self.assertListEqual([0, 2, 3], clf.features_)
-
-    def test_fit_alternative(self):
-        clf = FImdlp(algorithm=1)
-        clf.fit([[1, 2], [3, 4]], [1, 2])
-        self.assertEqual(clf.n_features_in_, 2)
-        self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
-        self.assertListEqual(clf.y_.tolist(), [1, 2])
-        self.assertListEqual([[2], [3]], clf.get_cut_points())
-        X, y = load_iris(return_X_y=True)
-        clf.fit(X, y)
-        self.assertEqual(clf.n_features_in_, 4)
-        self.assertTrue(np.array_equal(X, clf.X_))
-        self.assertTrue(np.array_equal(y, clf.y_))
-
         expected = [
             [5.449999809265137, 5.75],
-            [2.8499999046325684, 3.3499999046325684],
-            [2.450000047683716, 4.75],
-            [0.800000011920929, 1.75],
+            [2.75, 2.8499999046325684, 2.95, 3.05, 3.3499999046325684],
+            [2.45, 4.75, 5.050000190734863],
+            [0.8, 1.75],
         ]
-        self.assertListEqual(expected, clf.get_cut_points())
+        computed = clf.get_cut_points()
+        for item_computed, item_expected in zip(computed, expected):
+            for x_, y_ in zip(item_computed, item_expected):
+                self.assertAlmostEqual(x_, y_)
         self.assertListEqual([0, 1, 2, 3], clf.features_)
         clf.fit(X, y, features=[0, 2, 3])
         self.assertListEqual([0, 2, 3], clf.features_)
@@ -84,8 +61,12 @@ class FImdlpTest(unittest.TestCase):
             clf.fit([[1, 2], [3, 4]], [1, 2], features=[0, 2])
 
     def test_fit_features(self):
-        clf = FImdlp()
+        clf = FImdlp(n_jobs=-1)
+        # Two samples doesn't have enough information to split
         clf.fit([[1, -2], [3, 4]], [1, 2], features=[0])
+        self.assertListEqual(clf.get_cut_points(), [[], []])
+        clf.fit([[1, -2], [3, 4], [5, 6]], [1, 2, 2], features=[0])
+        self.assertListEqual(clf.get_cut_points(), [[2], []])
         res = clf.transform([[1, -2], [3, 4]])
         self.assertListEqual(res.tolist(), [[0, -2], [1, 4]])
         X, y = load_iris(return_X_y=True)
@@ -100,9 +81,9 @@ class FImdlpTest(unittest.TestCase):
         )
         self.assertEqual(X_computed.dtype, np.float64)
 
-    def test_transform_definitive(self):
-        clf = FImdlp(algorithm=0)
-        clf.fit([[1, 2], [3, 4]], [1, 2])
+    def test_transform(self):
+        clf = FImdlp()
+        clf.fit([[1, 2], [3, 4], [5, 6]], [1, 2, 2])
         self.assertEqual(
             clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [1, 1]]
         )
@@ -118,48 +99,18 @@ class FImdlpTest(unittest.TestCase):
         self.assertEqual(X_transformed.dtype, np.int32)
         expected = [
             [1, 0, 1, 1],
-            [1, 1, 1, 1],
-            [1, 0, 1, 1],
-            [0, 0, 1, 1],
-            [1, 0, 1, 1],
-            [1, 1, 1, 1],
-            [1, 1, 1, 1],
-        ]
-        self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
-        with self.assertRaises(ValueError):
-            clf.transform([[1, 2, 3], [4, 5, 6]])
-        with self.assertRaises(sklearn.exceptions.NotFittedError):
-            clf = FImdlp(algorithm=0)
-            clf.transform([[1, 2], [3, 4]])
-
-    def test_transform_alternative(self):
-        clf = FImdlp(algorithm=1)
-        clf.fit([[1, 2], [3, 4]], [1, 2])
-        self.assertEqual(
-            clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [1, 1]]
-        )
-        X, y = load_iris(return_X_y=True)
-        clf.fit(X, y)
-        self.assertEqual(clf.n_features_in_, 4)
-        self.assertTrue(np.array_equal(X, clf.X_))
-        self.assertTrue(np.array_equal(y, clf.y_))
-        self.assertListEqual(
-            clf.transform(X).tolist(), clf.fit(X, y).transform(X).tolist()
-        )
-        expected = [
-            [1, 0, 1, 1],
-            [2, 1, 1, 1],
+            [2, 3, 1, 1],
             [2, 0, 1, 1],
             [0, 0, 1, 1],
             [1, 0, 1, 1],
-            [1, 1, 1, 1],
-            [1, 1, 1, 1],
+            [1, 3, 1, 1],
+            [1, 2, 1, 1],
         ]
         self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
         with self.assertRaises(ValueError):
             clf.transform([[1, 2, 3], [4, 5, 6]])
         with self.assertRaises(sklearn.exceptions.NotFittedError):
-            clf = FImdlp(algorithm=1)
+            clf = FImdlp()
             clf.transform([[1, 2], [3, 4]])
 
     def test_cppfactorize(self):
@@ -180,40 +131,69 @@ class FImdlpTest(unittest.TestCase):
         computed = factorize(source)
         self.assertListEqual(expected, computed)
 
-    def test_join_transform(self):
-        y = ["f0", "f0", "f2", "f3", "f4"]
-        x = [
-            [0, 1, 2, 3, 4],
-            [0, 1, 2, 3, 4],
-            [1, 2, 3, 4, 5],
-            [2, 3, 4, 5, 6],
-            [3, 4, 5, 6, 7],
-        ]
-        expected = [
-            [0, 0, 0, 0],
-            [0, 0, 0, 0],
-            [1, 1, 1, 1],
-            [2, 2, 2, 2],
-            [2, 2, 2, 2],
-        ]
+    def test_join_fit(self):
+        y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"])
+        x = np.array(
+            [
+                [0, 1, 2, 3, 4],
+                [0, 1, 2, 3, 4],
+                [1, 2, 3, 4, 5],
+                [2, 3, 4, 5, 6],
+                [3, 4, 5, 6, 7],
+            ]
+        )
+        expected = [0, 0, 1, 2, 2]
         clf = FImdlp()
-        computed = clf.join_transform(x, y, 0)
-        for computed, expected in zip(computed, expected):
-            self.assertListEqual(expected, computed.tolist())
-        expected_y = [b"f00", b"f00", b"f21", b"f32", b"f43"]
+        clf.fit(x, factorize(y))
+        computed = clf.join_fit([0, 2], 1, x)
+        self.assertListEqual(computed.tolist(), expected)
+        expected_y = [b"002", b"002", b"113", b"224", b"335"]
         self.assertListEqual(expected_y, clf.y_join_)
 
-    def test_join_transform_error(self):
-        y = ["f0", "f0", "f2", "f3", "f4"]
-        x = [
-            [0, 1, 2, 3, 4],
-            [0, 1, 2, 3, 4],
-            [1, 2, 3, 4, 5],
-            [2, 3, 4, 5, 6],
-            [3, 4, 5, 6, 7],
-        ]
-        with self.assertRaises(ValueError):
-            FImdlp().join_transform(x, y, 5)
+    def test_join_fit_error(self):
+        y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"])
+        x = np.array(
+            [
+                [0, 1, 2, 3, 4],
+                [0, 1, 2, 3, 4],
+                [1, 2, 3, 4, 5],
+                [2, 3, 4, 5, 6],
+                [3, 4, 5, 6, 7],
+            ]
+        )
+        clf = FImdlp()
+        clf.fit(x, factorize(y))
+        with self.assertRaises(ValueError) as exception:
+            clf.join_fit([], 1, x)
+        self.assertEqual(
+            str(exception.exception),
+            "Number of features must be in range [1, 5]",
+        )
+        with self.assertRaises(ValueError) as exception:
+            FImdlp().join_fit([0, 4], 1, x)
+        self.assertTrue(
+            str(exception.exception).startswith(
+                "This FImdlp instance is not fitted yet."
+            )
+        )
+        with self.assertRaises(ValueError) as exception:
+            clf.join_fit([0, 5], 1, x)
+        self.assertEqual(
+            str(exception.exception),
+            "Feature 5 not in range [0, 5)",
+        )
+        with self.assertRaises(ValueError) as exception:
+            clf.join_fit([0, 2], 5, x)
+        self.assertEqual(
+            str(exception.exception),
+            "Target 5 not in range [0, 5)",
+        )
+        with self.assertRaises(ValueError) as exception:
+            clf.join_fit([0, 2], 2, x)
+        self.assertEqual(
+            str(exception.exception),
+            "Target cannot in features to join",
+        )
 
     def test_factorize(self):
         y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"])
@@ -228,3 +208,21 @@ class FImdlpTest(unittest.TestCase):
     def test_sklearn_transformer(self):
         for check, test in check_estimator(FImdlp(), generate_only=True):
             test(check)
+
+    def test_states_feature(self):
+        clf = FImdlp()
+        X, y = load_iris(return_X_y=True)
+        clf.fit(X, y)
+        expected = []
+        for i in [3, 6, 4, 3]:
+            expected.append(list(range(i)))
+        for feature in range(X.shape[1]):
+            self.assertListEqual(
+                expected[feature], clf.get_states_feature(feature)
+            )
+
+    def test_states_no_feature(self):
+        clf = FImdlp()
+        X, y = load_iris(return_X_y=True)
+        clf.fit(X, y)
+        self.assertIsNone(clf.get_states_feature(4))