From ca7d158ac8fa99fe3a06542a71a18d5ac4049b1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Thu, 26 Jan 2023 10:47:27 +0100 Subject: [PATCH 01/14] feat: :alembic: Add join_transform method and cpp factorize --- k.py | 12 +++++++ setup.py | 5 ++- src/fimdlp/Factorize.cpp | 18 ++++++++++ src/fimdlp/Factorize.h | 10 ++++++ src/fimdlp/cfimdlp.pyx | 5 +++ src/fimdlp/mdlp.py | 63 ++++++++++++++++++++++++--------- src/fimdlp/tests/FImdlp_test.py | 54 +++++++++++++++++++++++++++- 7 files changed, 148 insertions(+), 19 deletions(-) create mode 100644 k.py create mode 100644 src/fimdlp/Factorize.cpp create mode 100644 src/fimdlp/Factorize.h diff --git a/k.py b/k.py new file mode 100644 index 0000000..47e0856 --- /dev/null +++ b/k.py @@ -0,0 +1,12 @@ +from sklearn.datasets import load_wine +from fimdlp.mdlp import FImdlp + +X, y = load_wine(return_X_y=True) +trans = FImdlp() +Xt = trans.join_transform(X, y, 12) +print("X shape = ", X.shape) +print("Xt.shape=", Xt.shape) +print("Xt ", Xt[:10]) +print("trans.X_ shape = ", trans.X_.shape) +print("trans.y_ ", trans.y_[:10]) +print("y_join ", trans.y_join_[:10]) diff --git a/setup.py b/setup.py index db8a696..0ba294e 100644 --- a/setup.py +++ b/setup.py @@ -14,10 +14,13 @@ setup( "src/fimdlp/cfimdlp.pyx", "src/cppmdlp/CPPFImdlp.cpp", "src/cppmdlp/Metrics.cpp", + "src/fimdlp/Factorize.cpp", ], language="c++", include_dirs=["fimdlp"], - extra_compile_args=["-std=c++2a"], + extra_compile_args=[ + "-std=c++11", + ], ), ] ) diff --git a/src/fimdlp/Factorize.cpp b/src/fimdlp/Factorize.cpp new file mode 100644 index 0000000..f814d6f --- /dev/null +++ b/src/fimdlp/Factorize.cpp @@ -0,0 +1,18 @@ +#include "Factorize.h" + +namespace utils { + vector cppFactorize(const vector& labels_t) + { + vector yy; + yy.reserve(labels_t.size()); + map labelMap; + int i = 0; + for (string label : labels_t) { + if (labelMap.find(label) == labelMap.end()) { + labelMap[label] = i++; + } + yy.push_back(labelMap[label]); + } + return yy; + } +} \ No newline at end of file diff --git a/src/fimdlp/Factorize.h b/src/fimdlp/Factorize.h new file mode 100644 index 0000000..28f4c74 --- /dev/null +++ b/src/fimdlp/Factorize.h @@ -0,0 +1,10 @@ +#ifndef FACTORIZE_H +#define FACTORIZE_H +#include +#include +#include +namespace utils { + using namespace std; + vector cppFactorize(const vector&); +} +#endif \ No newline at end of file diff --git a/src/fimdlp/cfimdlp.pyx b/src/fimdlp/cfimdlp.pyx index 9b548dd..c09d3e1 100644 --- a/src/fimdlp/cfimdlp.pyx +++ b/src/fimdlp/cfimdlp.pyx @@ -24,3 +24,8 @@ cdef class CFImdlp: return self.thisptr.getCutPoints() def get_version(self): return self.thisptr.version() + +cdef extern from "Factorize.h" namespace "utils": + vector[int] cppFactorize(vector[string] &input_vector) +def factorize(input_vector): + return cppFactorize(input_vector) \ No newline at end of file diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py index e4e4437..0010dfb 100644 --- a/src/fimdlp/mdlp.py +++ b/src/fimdlp/mdlp.py @@ -1,5 +1,5 @@ import numpy as np -from .cppfimdlp import CFImdlp +from .cppfimdlp import CFImdlp, factorize from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.multiclass import unique_labels from sklearn.utils.validation import check_X_y, check_array, check_is_fitted @@ -33,21 +33,17 @@ class FImdlp(TransformerMixin, BaseEstimator): The list of discretizers, one for each feature. cut_points_ : list The list of cut points for each feature. - X_ : array - the samples used to fit, shape (n_samples, n_features) - y_ : array - the labels used to fit, shape (n_samples,) + X_ : array, shape (n_samples, n_features) + the samples used to fit + y_ : array, shape(n_samples,) + the labels used to fit features_ : list the list of features to be discretized """ - def _check_params_fit(self, X, y, expected_args, kwargs): - """Check the common parameters passed to fit""" + def _check_args(self, X, y, expected_args, kwargs): # Check that X and y have correct shape X, y = check_X_y(X, y) - # Store the classes seen during fit - self.classes_ = unique_labels(y) - self.n_classes_ = self.classes_.shape[0] # Default values self.features_ = [i for i in range(X.shape[1])] for key, value in kwargs.items(): @@ -68,15 +64,20 @@ class FImdlp(TransformerMixin, BaseEstimator): raise ValueError("Feature index out of range") return X, y + def _update_params(self, X, y): + # Store the classes seen during fit + self.classes_ = unique_labels(y) + self.n_classes_ = self.classes_.shape[0] + self.n_features_ = X.shape[1] + def fit(self, X, y, **kwargs): """A reference implementation of a fitting function for a transformer. Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : array, shape (n_samples, n_features) The training input samples. - y : None - There is no need of a target in a transformer, yet the pipeline API - requires this parameter. + y : array, shape (n_samples,) + the labels used to fit features : list, default=[i for i in range(n_features)] The list of features to be discretized. Returns @@ -84,10 +85,10 @@ class FImdlp(TransformerMixin, BaseEstimator): self : object Returns self. """ - X, y = self._check_params_fit( + X, y = self._check_args( X, y, expected_args=["features"], kwargs=kwargs ) - self.n_features_ = X.shape[1] + self._update_params(X, y) self.X_ = X self.y_ = y self.discretizer_ = [None] * self.n_features_ @@ -119,7 +120,7 @@ class FImdlp(TransformerMixin, BaseEstimator): """Discretize X values. Parameters ---------- - X : {array-like}, shape (n_samples, n_features) + X : array, shape (n_samples, n_features) The input samples. Returns ------- @@ -146,6 +147,34 @@ class FImdlp(TransformerMixin, BaseEstimator): ) return result + def join_transform(self, X, y, feature, **kwargs): + """Join the selected feature with the labels and discretize the values + join - fit - transform + + Parameters + ---------- + X : array, shape (n_samples, n_features) + The input samples. + y : array + the labels used to fit + feature : int + index of the feature to join with the labels + """ + X, y = self._check_args( + X, y, expected_args=["features"], kwargs=kwargs + ) + if feature < 0 or feature >= X.shape[1]: + raise ValueError( + f"Feature {feature} not in range [0, {X.shape[1]})" + ) + self.y_join_ = [ + f"{str(item_y)}{str(item_x)}".encode() + for item_y, item_x in zip(y, X[:, feature]) + ] + yy = factorize(self.y_join_) + XX = np.delete(X, feature, axis=1) + return self.fit(XX, yy).transform(XX) + def get_cut_points(self): """Get the cut points for each feature. Returns diff --git a/src/fimdlp/tests/FImdlp_test.py b/src/fimdlp/tests/FImdlp_test.py index 99c5864..ec7096d 100644 --- a/src/fimdlp/tests/FImdlp_test.py +++ b/src/fimdlp/tests/FImdlp_test.py @@ -1,7 +1,8 @@ import unittest import sklearn -from sklearn.datasets import load_iris import numpy as np +from sklearn.datasets import load_iris +from ..cppfimdlp import factorize from ..mdlp import FImdlp from .. import version from .._version import __version__ @@ -159,3 +160,54 @@ class FImdlpTest(unittest.TestCase): with self.assertRaises(sklearn.exceptions.NotFittedError): clf = FImdlp(algorithm=1) clf.transform([[1, 2], [3, 4]]) + + def test_factorize(self): + source = [ + b"f0", + b"f1", + b"f2", + b"f3", + b"f4", + b"f5", + b"f6", + b"f1", + b"f1", + b"f7", + b"f8", + ] + expected = [0, 1, 2, 3, 4, 5, 6, 1, 1, 7, 8] + computed = factorize(source) + self.assertListEqual(expected, computed) + + def test_join_transform(self): + y = ["f0", "f0", "f2", "f3", "f4"] + x = [ + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [1, 2, 3, 4, 5], + [2, 3, 4, 5, 6], + [3, 4, 5, 6, 7], + ] + expected = [ + [0, 0, 0, 0], + [0, 0, 0, 0], + [1, 1, 1, 1], + [2, 2, 2, 2], + [2, 2, 2, 2], + ] + clf = FImdlp() + computed = clf.join_transform(x, y, 0) + for computed, expected in zip(computed, expected): + self.assertListEqual(expected, computed.tolist()) + + def test_join_transform_error(self): + y = ["f0", "f0", "f2", "f3", "f4"] + x = [ + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [1, 2, 3, 4, 5], + [2, 3, 4, 5, 6], + [3, 4, 5, 6, 7], + ] + with self.assertRaises(ValueError): + FImdlp().join_transform(x, y, 5) From 16b31ec29333417038ba544e64047f2837bec5fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Thu, 26 Jan 2023 11:17:10 +0100 Subject: [PATCH 02/14] test: :white_check_mark: Complete join_transform test --- src/fimdlp/tests/FImdlp_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/fimdlp/tests/FImdlp_test.py b/src/fimdlp/tests/FImdlp_test.py index ec7096d..315c8b8 100644 --- a/src/fimdlp/tests/FImdlp_test.py +++ b/src/fimdlp/tests/FImdlp_test.py @@ -199,6 +199,8 @@ class FImdlpTest(unittest.TestCase): computed = clf.join_transform(x, y, 0) for computed, expected in zip(computed, expected): self.assertListEqual(expected, computed.tolist()) + expected_y = [b"f00", b"f00", b"f21", b"f32", b"f43"] + self.assertListEqual(expected_y, clf.y_join_) def test_join_transform_error(self): y = ["f0", "f0", "f2", "f3", "f4"] From 29fc88cecc0bad78aefa427affda895adb68ff7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Thu, 26 Jan 2023 23:20:51 +0100 Subject: [PATCH 03/14] test: :zap: Add scikit learn compatibility check_estimator test --- src/fimdlp/cfimdlp.pyx | 6 +++++- src/fimdlp/mdlp.py | 23 +++++++++++++---------- src/fimdlp/tests/FImdlp_test.py | 17 +++++++++++------ 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/src/fimdlp/cfimdlp.pyx b/src/fimdlp/cfimdlp.pyx index c09d3e1..18e1d81 100644 --- a/src/fimdlp/cfimdlp.pyx +++ b/src/fimdlp/cfimdlp.pyx @@ -13,7 +13,9 @@ cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp": cdef class CFImdlp: cdef CPPFImdlp *thisptr - def __cinit__(self, algorithm): + cdef int algorithm + def __cinit__(self, algorithm:int ): + self.algorithm = algorithm self.thisptr = new CPPFImdlp(algorithm) def __dealloc__(self): del self.thisptr @@ -24,6 +26,8 @@ cdef class CFImdlp: return self.thisptr.getCutPoints() def get_version(self): return self.thisptr.version() + def __reduce__(self): + return (CFImdlp, (self.algorithm,)) cdef extern from "Factorize.h" namespace "utils": vector[int] cppFactorize(vector[string] &input_vector) diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py index 0010dfb..a9ec2b8 100644 --- a/src/fimdlp/mdlp.py +++ b/src/fimdlp/mdlp.py @@ -27,7 +27,7 @@ class FImdlp(TransformerMixin, BaseEstimator): Attributes ---------- - n_features_ : int + n_features_in_ : int The number of features of the data passed to :meth:`fit`. discretizer_ : list The list of discretizers, one for each feature. @@ -41,6 +41,9 @@ class FImdlp(TransformerMixin, BaseEstimator): the list of features to be discretized """ + def _more_tags(self): + return {"preserves_dtype": [np.int32], "requires_y": True} + def _check_args(self, X, y, expected_args, kwargs): # Check that X and y have correct shape X, y = check_X_y(X, y) @@ -68,7 +71,7 @@ class FImdlp(TransformerMixin, BaseEstimator): # Store the classes seen during fit self.classes_ = unique_labels(y) self.n_classes_ = self.classes_.shape[0] - self.n_features_ = X.shape[1] + self.n_features_in_ = X.shape[1] def fit(self, X, y, **kwargs): """A reference implementation of a fitting function for a transformer. @@ -91,11 +94,11 @@ class FImdlp(TransformerMixin, BaseEstimator): self._update_params(X, y) self.X_ = X self.y_ = y - self.discretizer_ = [None] * self.n_features_ - self.cut_points_ = [None] * self.n_features_ + self.discretizer_ = [None] * self.n_features_in_ + self.cut_points_ = [None] * self.n_features_in_ Parallel(n_jobs=self.n_jobs, prefer="threads")( delayed(self._fit_discretizer)(feature) - for feature in range(self.n_features_) + for feature in range(self.n_features_in_) ) return self @@ -128,22 +131,22 @@ class FImdlp(TransformerMixin, BaseEstimator): The array containing the discretized values of ``X``. """ # Check is fit had been called - check_is_fitted(self, "n_features_") + check_is_fitted(self, "n_features_in_") # Input validation X = check_array(X) # Check that the input is of the same shape as the one passed # during fit. - if X.shape[1] != self.n_features_: + if X.shape[1] != self.n_features_in_: raise ValueError( "Shape of input is different from what was seen in `fit`" ) - if len(self.features_) == self.n_features_: + if len(self.features_) == self.n_features_in_: result = np.zeros_like(X, dtype=np.int32) - 1 else: result = np.zeros_like(X) - 1 Parallel(n_jobs=self.n_jobs, prefer="threads")( delayed(self._discretize_feature)(feature, X[:, feature], result) - for feature in range(self.n_features_) + for feature in range(self.n_features_in_) ) return result @@ -183,6 +186,6 @@ class FImdlp(TransformerMixin, BaseEstimator): The list of cut points for each feature. """ result = [] - for feature in range(self.n_features_): + for feature in range(self.n_features_in_): result.append(self.cut_points_[feature]) return result diff --git a/src/fimdlp/tests/FImdlp_test.py b/src/fimdlp/tests/FImdlp_test.py index 315c8b8..2de67ab 100644 --- a/src/fimdlp/tests/FImdlp_test.py +++ b/src/fimdlp/tests/FImdlp_test.py @@ -2,6 +2,7 @@ import unittest import sklearn import numpy as np from sklearn.datasets import load_iris +from sklearn.utils.estimator_checks import check_estimator from ..cppfimdlp import factorize from ..mdlp import FImdlp from .. import version @@ -23,13 +24,13 @@ class FImdlpTest(unittest.TestCase): def test_fit_definitive(self): clf = FImdlp(algorithm=0) clf.fit([[1, 2], [3, 4]], [1, 2]) - self.assertEqual(clf.n_features_, 2) + self.assertEqual(clf.n_features_in_, 2) self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]]) self.assertListEqual(clf.y_.tolist(), [1, 2]) self.assertListEqual([[2.0], [3.0]], clf.get_cut_points()) X, y = load_iris(return_X_y=True) clf.fit(X, y) - self.assertEqual(clf.n_features_, 4) + self.assertEqual(clf.n_features_in_, 4) self.assertTrue(np.array_equal(X, clf.X_)) self.assertTrue(np.array_equal(y, clf.y_)) expected = [ @@ -46,13 +47,13 @@ class FImdlpTest(unittest.TestCase): def test_fit_alternative(self): clf = FImdlp(algorithm=1) clf.fit([[1, 2], [3, 4]], [1, 2]) - self.assertEqual(clf.n_features_, 2) + self.assertEqual(clf.n_features_in_, 2) self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]]) self.assertListEqual(clf.y_.tolist(), [1, 2]) self.assertListEqual([[2], [3]], clf.get_cut_points()) X, y = load_iris(return_X_y=True) clf.fit(X, y) - self.assertEqual(clf.n_features_, 4) + self.assertEqual(clf.n_features_in_, 4) self.assertTrue(np.array_equal(X, clf.X_)) self.assertTrue(np.array_equal(y, clf.y_)) @@ -107,7 +108,7 @@ class FImdlpTest(unittest.TestCase): ) X, y = load_iris(return_X_y=True) clf.fit(X, y) - self.assertEqual(clf.n_features_, 4) + self.assertEqual(clf.n_features_in_, 4) self.assertTrue(np.array_equal(X, clf.X_)) self.assertTrue(np.array_equal(y, clf.y_)) X_transformed = clf.transform(X) @@ -139,7 +140,7 @@ class FImdlpTest(unittest.TestCase): ) X, y = load_iris(return_X_y=True) clf.fit(X, y) - self.assertEqual(clf.n_features_, 4) + self.assertEqual(clf.n_features_in_, 4) self.assertTrue(np.array_equal(X, clf.X_)) self.assertTrue(np.array_equal(y, clf.y_)) self.assertListEqual( @@ -213,3 +214,7 @@ class FImdlpTest(unittest.TestCase): ] with self.assertRaises(ValueError): FImdlp().join_transform(x, y, 5) + + def test_sklearn_transformer(self): + for check, test in check_estimator(FImdlp(), generate_only=True): + test(check) From 050b9236316c4fc49873e29f82ce5df4780c53b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 28 Jan 2023 10:35:07 +0100 Subject: [PATCH 04/14] feat: :zap: Add factorize method to transformer --- src/fimdlp/mdlp.py | 17 ++++++++++++++++- src/fimdlp/tests/FImdlp_test.py | 12 +++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py index a9ec2b8..415705b 100644 --- a/src/fimdlp/mdlp.py +++ b/src/fimdlp/mdlp.py @@ -150,6 +150,21 @@ class FImdlp(TransformerMixin, BaseEstimator): ) return result + def factorize(self, yy): + """Factorize the input labels + + Parameters + ---------- + yy : array, shape (n_samples,) + Labels to be factorized, MUST be bytes, i.e. b"0", b"1", ... + + Returns + ------- + array, shape (n_samples,) + Factorized labels + """ + return factorize(yy) + def join_transform(self, X, y, feature, **kwargs): """Join the selected feature with the labels and discretize the values join - fit - transform @@ -174,7 +189,7 @@ class FImdlp(TransformerMixin, BaseEstimator): f"{str(item_y)}{str(item_x)}".encode() for item_y, item_x in zip(y, X[:, feature]) ] - yy = factorize(self.y_join_) + yy = self.factorize(self.y_join_) XX = np.delete(X, feature, axis=1) return self.fit(XX, yy).transform(XX) diff --git a/src/fimdlp/tests/FImdlp_test.py b/src/fimdlp/tests/FImdlp_test.py index 2de67ab..5e522f3 100644 --- a/src/fimdlp/tests/FImdlp_test.py +++ b/src/fimdlp/tests/FImdlp_test.py @@ -162,7 +162,7 @@ class FImdlpTest(unittest.TestCase): clf = FImdlp(algorithm=1) clf.transform([[1, 2], [3, 4]]) - def test_factorize(self): + def test_cppfactorize(self): source = [ b"f0", b"f1", @@ -215,6 +215,16 @@ class FImdlpTest(unittest.TestCase): with self.assertRaises(ValueError): FImdlp().join_transform(x, y, 5) + def test_factorize(self): + y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"]) + clf = FImdlp() + computed = clf.factorize(y) + self.assertListEqual([0, 0, 1, 2, 3], computed) + y = [b"f4", b"f0", b"f0", b"f2", b"f3"] + clf = FImdlp() + computed = clf.factorize(y) + self.assertListEqual([0, 1, 1, 2, 3], computed) + def test_sklearn_transformer(self): for check, test in check_estimator(FImdlp(), generate_only=True): test(check) From 7913f5151ee0a4843b7c04974aa730ea4c8521ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 28 Jan 2023 19:14:32 +0100 Subject: [PATCH 05/14] Add version command to Makefile --- Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile b/Makefile index 53b8218..ffc0216 100644 --- a/Makefile +++ b/Makefile @@ -37,6 +37,11 @@ install: ## Build extension audit: ## Audit pip pip-audit +version: + @echo "Current Python version .: $(shell python --version)" + @echo "Current FImdlp version .: $(shell python -c "from fimdlp import _version; print(_version.__version__)")" + @echo "Installed FImdlp version: $(shell pip show fimdlp | grep Version | cut -d' ' -f2)" + help: ## Show help message @IFS=$$'\n' ; \ help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \ From 1186e4ad5361a7a63c1a120f412c7e8b0364871e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 28 Jan 2023 19:15:26 +0100 Subject: [PATCH 06/14] chore: :bookmark: Upgrade version number to 0.9.3 --- src/fimdlp/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fimdlp/_version.py b/src/fimdlp/_version.py index a2fecb4..c598173 100644 --- a/src/fimdlp/_version.py +++ b/src/fimdlp/_version.py @@ -1 +1 @@ -__version__ = "0.9.2" +__version__ = "0.9.3" From cf09d92cccea4404acb0da442958d6b18bae549e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 4 Feb 2023 17:45:36 +0100 Subject: [PATCH 07/14] add MultiDiscretizer --- src/fimdlp/mdlp.py | 104 +++++++++++++++++++++++++++++++++------------ 1 file changed, 76 insertions(+), 28 deletions(-) diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py index 415705b..438f2db 100644 --- a/src/fimdlp/mdlp.py +++ b/src/fimdlp/mdlp.py @@ -165,34 +165,6 @@ class FImdlp(TransformerMixin, BaseEstimator): """ return factorize(yy) - def join_transform(self, X, y, feature, **kwargs): - """Join the selected feature with the labels and discretize the values - join - fit - transform - - Parameters - ---------- - X : array, shape (n_samples, n_features) - The input samples. - y : array - the labels used to fit - feature : int - index of the feature to join with the labels - """ - X, y = self._check_args( - X, y, expected_args=["features"], kwargs=kwargs - ) - if feature < 0 or feature >= X.shape[1]: - raise ValueError( - f"Feature {feature} not in range [0, {X.shape[1]})" - ) - self.y_join_ = [ - f"{str(item_y)}{str(item_x)}".encode() - for item_y, item_x in zip(y, X[:, feature]) - ] - yy = self.factorize(self.y_join_) - XX = np.delete(X, feature, axis=1) - return self.fit(XX, yy).transform(XX) - def get_cut_points(self): """Get the cut points for each feature. Returns @@ -204,3 +176,79 @@ class FImdlp(TransformerMixin, BaseEstimator): for feature in range(self.n_features_in_): result.append(self.cut_points_[feature]) return result + + +class MultiDiscretizer: + def __init__(self, algorithm=0, n_jobs=-1): + self.algorithm = algorithm + self.n_jobs = n_jobs + + def fit_transform(self, X, y, **kwargs): + X, y = check_X_y(X, y) + self.X_ = X + self.y_ = y + self.n_features_in_ = X.shape[1] + self.discretizer_ = FImdlp( + algorithm=self.algorithm, n_jobs=self.n_jobs + ) + self.discretizers_ = [None] * self.n_features_in_ + self.discretized_ = [None] * self.n_features_in_ + self.yy_ = [None] * self.n_features_in_ + self.X_d_ = self.discretizer_.fit_transform(X, y, **kwargs) + return self.X_d_ + + def transform(self, X): + X = check_array(X) + if not hasattr(self, "discretizer_"): + raise ValueError("Must call fit_transform first") + return self.discretizer_.transform(X) + + def join_transform(self, features, target): + """Join the selected features with the labels and discretize the values + of the target variable + join - fit - transform + + Parameters + ---------- + features : [list] + index of the features to join with the labels + target : [int] + index of the target variable to discretize + """ + # Check is fit had been called + check_is_fitted(self, "n_features_in_") + if len(features) < 1 or len(features) > self.n_features_in_: + raise ValueError( + "Number of features must be in range [1, " + f"{self.n_features_in_}]" + ) + for feature in features: + if feature < 0 or feature >= self.n_features_in_: + raise ValueError( + f"Feature {feature} not in range [0, " + f"{self.n_features_in_})" + ) + if target < 0 or target >= self.n_features_in_: + raise ValueError( + f"Target {target} not in range [0, {self.n_features_in_})" + ) + y_join = [ + f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode() + for item_y, items_x in zip(self.y_, self.X_d_[:, features]) + ] + self.yy_[target] = self.discretizer_.factorize(y_join) + self.discretizers_[target] = FImdlp( + algorithm=self.algorithm, n_jobs=self.n_jobs + ) + self.discretized_[target] = self.discretizers_[target].fit_transform( + self.X_[:, target].reshape(-1, 1), self.yy_[target] + ) + return self.discretized_[target] + + +# from sklearn.datasets import load_wine +# X, y = load_wine(return_X_y=True) +# from fimdlp.mdlp import MultiDiscretizer +# clf = MultiDiscretizer() +# clf.fit(X, y) +# clf.join_transform([1, 3, 5], 7) From f20496203e36571eb5597dbd891e7569b5057935 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 4 Feb 2023 19:23:15 +0100 Subject: [PATCH 08/14] refactor Multidiscretizer to use one per column --- src/fimdlp/mdlp.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py index 438f2db..9b55cf0 100644 --- a/src/fimdlp/mdlp.py +++ b/src/fimdlp/mdlp.py @@ -183,25 +183,38 @@ class MultiDiscretizer: self.algorithm = algorithm self.n_jobs = n_jobs - def fit_transform(self, X, y, **kwargs): + def initial_fit_transform(self, X, y): X, y = check_X_y(X, y) self.X_ = X self.y_ = y self.n_features_in_ = X.shape[1] - self.discretizer_ = FImdlp( - algorithm=self.algorithm, n_jobs=self.n_jobs - ) self.discretizers_ = [None] * self.n_features_in_ self.discretized_ = [None] * self.n_features_in_ - self.yy_ = [None] * self.n_features_in_ - self.X_d_ = self.discretizer_.fit_transform(X, y, **kwargs) + # self.yy_ = [None] * self.n_features_in_ + self.X_d_ = np.zeros_like(X, dtype=np.int32) - 1 + for feature in range(self.n_features_in_): + self.discretizers_[feature] = FImdlp( + algorithm=self.algorithm, n_jobs=self.n_jobs + ) + self.discretized_[feature] = self.discretizers_[ + feature + ].fit_transform(X[:, feature].reshape(-1, 1), y) + # self.yy_[feature] = self.discretizers_[feature].factorize(y) + self.X_d_[:, feature] = self.discretized_[feature].ravel() return self.X_d_ def transform(self, X): X = check_array(X) - if not hasattr(self, "discretizer_"): + if not hasattr(self, "discretizers_"): raise ValueError("Must call fit_transform first") - return self.discretizer_.transform(X) + result = np.zeros_like(X, dtype=np.int32) - 1 + for feature in range(self.n_features_in_): + result[:, feature] = ( + self.discretizers_[feature] + .transform(X[:, feature].reshape(-1, 1)) + .ravel() + ) + return result def join_transform(self, features, target): """Join the selected features with the labels and discretize the values From 9899781640911f8acdacb8e5de4b16aeadc578b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sun, 5 Feb 2023 00:30:03 +0100 Subject: [PATCH 09/14] Complete join_fit and remove MultiDiscretizer --- src/fimdlp/mdlp.py | 71 ++++++++-------------------------------------- 1 file changed, 12 insertions(+), 59 deletions(-) diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py index 9b55cf0..b119688 100644 --- a/src/fimdlp/mdlp.py +++ b/src/fimdlp/mdlp.py @@ -177,47 +177,8 @@ class FImdlp(TransformerMixin, BaseEstimator): result.append(self.cut_points_[feature]) return result - -class MultiDiscretizer: - def __init__(self, algorithm=0, n_jobs=-1): - self.algorithm = algorithm - self.n_jobs = n_jobs - - def initial_fit_transform(self, X, y): - X, y = check_X_y(X, y) - self.X_ = X - self.y_ = y - self.n_features_in_ = X.shape[1] - self.discretizers_ = [None] * self.n_features_in_ - self.discretized_ = [None] * self.n_features_in_ - # self.yy_ = [None] * self.n_features_in_ - self.X_d_ = np.zeros_like(X, dtype=np.int32) - 1 - for feature in range(self.n_features_in_): - self.discretizers_[feature] = FImdlp( - algorithm=self.algorithm, n_jobs=self.n_jobs - ) - self.discretized_[feature] = self.discretizers_[ - feature - ].fit_transform(X[:, feature].reshape(-1, 1), y) - # self.yy_[feature] = self.discretizers_[feature].factorize(y) - self.X_d_[:, feature] = self.discretized_[feature].ravel() - return self.X_d_ - - def transform(self, X): - X = check_array(X) - if not hasattr(self, "discretizers_"): - raise ValueError("Must call fit_transform first") - result = np.zeros_like(X, dtype=np.int32) - 1 - for feature in range(self.n_features_in_): - result[:, feature] = ( - self.discretizers_[feature] - .transform(X[:, feature].reshape(-1, 1)) - .ravel() - ) - return result - - def join_transform(self, features, target): - """Join the selected features with the labels and discretize the values + def join_fit(self, features, target, data): + """Join the selected features with the labels and fit the discretizer of the target variable join - fit - transform @@ -227,8 +188,12 @@ class MultiDiscretizer: index of the features to join with the labels target : [int] index of the target variable to discretize + + Returns + ------- + result: np.array + The target variable newly discretized """ - # Check is fit had been called check_is_fitted(self, "n_features_in_") if len(features) < 1 or len(features) > self.n_features_in_: raise ValueError( @@ -247,21 +212,9 @@ class MultiDiscretizer: ) y_join = [ f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode() - for item_y, items_x in zip(self.y_, self.X_d_[:, features]) + for item_y, items_x in zip(self.y_, data[:, features]) ] - self.yy_[target] = self.discretizer_.factorize(y_join) - self.discretizers_[target] = FImdlp( - algorithm=self.algorithm, n_jobs=self.n_jobs - ) - self.discretized_[target] = self.discretizers_[target].fit_transform( - self.X_[:, target].reshape(-1, 1), self.yy_[target] - ) - return self.discretized_[target] - - -# from sklearn.datasets import load_wine -# X, y = load_wine(return_X_y=True) -# from fimdlp.mdlp import MultiDiscretizer -# clf = MultiDiscretizer() -# clf.fit(X, y) -# clf.join_transform([1, 3, 5], 7) + self.discretizer_[target].fit(self.X_[:, target], factorize(y_join)) + self.cut_points_[target] = self.discretizer_[target].get_cut_points() + # return the discretized target variable with the new cut points + return np.searchsorted(self.cut_points_[target], self.X_[:, target]) From 2d495293bb9ba462bbdbbbcad1fa8b04b4cdccac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Mon, 13 Feb 2023 16:15:50 +0100 Subject: [PATCH 10/14] Add range_features method --- src/fimdlp/mdlp.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py index b119688..ac0dd05 100644 --- a/src/fimdlp/mdlp.py +++ b/src/fimdlp/mdlp.py @@ -119,6 +119,15 @@ class FImdlp(TransformerMixin, BaseEstimator): else: result[:, feature] = X + def range_features(self): + res = [] + for i in range(self.n_features_in_): + if i in self.features_: + res.append(list(range(len(self.cut_points_[i])))) + else: + res.append([]) + return res + def transform(self, X): """Discretize X values. Parameters @@ -214,6 +223,7 @@ class FImdlp(TransformerMixin, BaseEstimator): f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode() for item_y, items_x in zip(self.y_, data[:, features]) ] + self.y_join = y_join self.discretizer_[target].fit(self.X_[:, target], factorize(y_join)) self.cut_points_[target] = self.discretizer_[target].get_cut_points() # return the discretized target variable with the new cut points From 31d79a77fa82fb1bc0869cbf3eeec5814a5d8b5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Mon, 13 Feb 2023 17:34:50 +0100 Subject: [PATCH 11/14] Add get_states_feature method --- src/fimdlp/mdlp.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py index ac0dd05..808e75a 100644 --- a/src/fimdlp/mdlp.py +++ b/src/fimdlp/mdlp.py @@ -119,15 +119,6 @@ class FImdlp(TransformerMixin, BaseEstimator): else: result[:, feature] = X - def range_features(self): - res = [] - for i in range(self.n_features_in_): - if i in self.features_: - res.append(list(range(len(self.cut_points_[i])))) - else: - res.append([]) - return res - def transform(self, X): """Discretize X values. Parameters @@ -186,6 +177,23 @@ class FImdlp(TransformerMixin, BaseEstimator): result.append(self.cut_points_[feature]) return result + def get_states_feature(self, feature): + """Return the states a feature can take + + Parameters + ---------- + feature : int + feature to get the states + + Returns + ------- + list + states of the feature + """ + if feature in self.features_: + return list(range(len(self.cut_points_[feature]) + 1)) + return None + def join_fit(self, features, target, data): """Join the selected features with the labels and fit the discretizer of the target variable From e0b7cae9a0902b1af661b434963038e32752f969 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Mon, 20 Feb 2023 18:26:51 +0100 Subject: [PATCH 12/14] Remove algorithm hyperparameter in discretizer --- src/fimdlp/__init__.py | 4 - src/fimdlp/cfimdlp.pyx | 10 +- src/fimdlp/mdlp.py | 23 ++-- src/fimdlp/tests/FImdlp_test.py | 218 ++++++++++++++++---------------- 4 files changed, 126 insertions(+), 129 deletions(-) diff --git a/src/fimdlp/__init__.py b/src/fimdlp/__init__.py index 3a99d3b..0abf8ef 100644 --- a/src/fimdlp/__init__.py +++ b/src/fimdlp/__init__.py @@ -1,8 +1,4 @@ from ._version import __version__ -def version(): - return __version__ - - all = ["FImdlp", "__version__"] diff --git a/src/fimdlp/cfimdlp.pyx b/src/fimdlp/cfimdlp.pyx index 18e1d81..8892e8b 100644 --- a/src/fimdlp/cfimdlp.pyx +++ b/src/fimdlp/cfimdlp.pyx @@ -6,17 +6,15 @@ from libcpp.string cimport string cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp": ctypedef float precision_t cdef cppclass CPPFImdlp: - CPPFImdlp(int) except + + CPPFImdlp() except + CPPFImdlp& fit(vector[precision_t]&, vector[int]&) vector[precision_t] getCutPoints() string version() cdef class CFImdlp: cdef CPPFImdlp *thisptr - cdef int algorithm - def __cinit__(self, algorithm:int ): - self.algorithm = algorithm - self.thisptr = new CPPFImdlp(algorithm) + def __cinit__(self): + self.thisptr = new CPPFImdlp() def __dealloc__(self): del self.thisptr def fit(self, X, y): @@ -27,7 +25,7 @@ cdef class CFImdlp: def get_version(self): return self.thisptr.version() def __reduce__(self): - return (CFImdlp, (self.algorithm,)) + return (CFImdlp, ()) cdef extern from "Factorize.h" namespace "utils": vector[int] cppFactorize(vector[string] &input_vector) diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py index 808e75a..0378ebf 100644 --- a/src/fimdlp/mdlp.py +++ b/src/fimdlp/mdlp.py @@ -4,22 +4,19 @@ from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.multiclass import unique_labels from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from joblib import Parallel, delayed +from ._version import __version__ + +# from ._version import __version__ class FImdlp(TransformerMixin, BaseEstimator): - def __init__(self, algorithm=0, n_jobs=-1): - self.algorithm = algorithm + def __init__(self, n_jobs=-1): self.n_jobs = n_jobs """Fayyad - Irani MDLP discretization algorithm based implementation. Parameters ---------- - algorithm : int, default=0 - The type of algorithm to use computing the cut points. - 0 - Definitive implementation - 1 - Alternative proposal - 2 - Classic proposal n_jobs : int, default=-1 The number of jobs to run in parallel. :meth:`fit` and :meth:`transform`, are parallelized over the features. ``-1`` means @@ -73,6 +70,10 @@ class FImdlp(TransformerMixin, BaseEstimator): self.n_classes_ = self.classes_.shape[0] self.n_features_in_ = X.shape[1] + @staticmethod + def get_version(): + return f"{__version__}({CFImdlp().get_version().decode()})" + def fit(self, X, y, **kwargs): """A reference implementation of a fitting function for a transformer. Parameters @@ -104,7 +105,7 @@ class FImdlp(TransformerMixin, BaseEstimator): def _fit_discretizer(self, feature): if feature in self.features_: - self.discretizer_[feature] = CFImdlp(algorithm=self.algorithm) + self.discretizer_[feature] = CFImdlp() self.discretizer_[feature].fit(self.X_[:, feature], self.y_) self.cut_points_[feature] = self.discretizer_[ feature @@ -205,6 +206,8 @@ class FImdlp(TransformerMixin, BaseEstimator): index of the features to join with the labels target : [int] index of the target variable to discretize + data: [array] shape (n_samples, n_features) + dataset that contains the features to join Returns ------- @@ -227,11 +230,13 @@ class FImdlp(TransformerMixin, BaseEstimator): raise ValueError( f"Target {target} not in range [0, {self.n_features_in_})" ) + if target in features: + raise ValueError("Target cannot in features to join") y_join = [ f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode() for item_y, items_x in zip(self.y_, data[:, features]) ] - self.y_join = y_join + self.y_join_ = y_join self.discretizer_[target].fit(self.X_[:, target], factorize(y_join)) self.cut_points_[target] = self.discretizer_[target].get_cut_points() # return the discretized target variable with the new cut points diff --git a/src/fimdlp/tests/FImdlp_test.py b/src/fimdlp/tests/FImdlp_test.py index 5e522f3..fcfd888 100644 --- a/src/fimdlp/tests/FImdlp_test.py +++ b/src/fimdlp/tests/FImdlp_test.py @@ -3,67 +3,44 @@ import sklearn import numpy as np from sklearn.datasets import load_iris from sklearn.utils.estimator_checks import check_estimator -from ..cppfimdlp import factorize +from ..cppfimdlp import CFImdlp, factorize from ..mdlp import FImdlp -from .. import version -from .._version import __version__ +from .. import __version__ + +# from .._version import __version__ class FImdlpTest(unittest.TestCase): def test_version(self): - self.assertEqual(version(), __version__) + clf = FImdlp() + self.assertEqual( + clf.get_version(), + f"{__version__}({CFImdlp().get_version().decode()})", + ) def test_init(self): clf = FImdlp() self.assertEqual(-1, clf.n_jobs) - self.assertEqual(0, clf.algorithm) - clf = FImdlp(algorithm=1, n_jobs=7) - self.assertEqual(1, clf.algorithm) + clf = FImdlp(n_jobs=7) self.assertEqual(7, clf.n_jobs) def test_fit_definitive(self): - clf = FImdlp(algorithm=0) - clf.fit([[1, 2], [3, 4]], [1, 2]) - self.assertEqual(clf.n_features_in_, 2) - self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]]) - self.assertListEqual(clf.y_.tolist(), [1, 2]) - self.assertListEqual([[2.0], [3.0]], clf.get_cut_points()) + clf = FImdlp() X, y = load_iris(return_X_y=True) clf.fit(X, y) self.assertEqual(clf.n_features_in_, 4) self.assertTrue(np.array_equal(X, clf.X_)) self.assertTrue(np.array_equal(y, clf.y_)) - expected = [ - [5.449999809265137, 6.25], - [2.8499999046325684, 3.0, 3.049999952316284, 3.3499999046325684], - [2.450000047683716, 4.75, 5.050000190734863], - [0.800000011920929, 1.4500000476837158, 1.75], - ] - self.assertListEqual(expected, clf.get_cut_points()) - self.assertListEqual([0, 1, 2, 3], clf.features_) - clf.fit(X, y, features=[0, 2, 3]) - self.assertListEqual([0, 2, 3], clf.features_) - - def test_fit_alternative(self): - clf = FImdlp(algorithm=1) - clf.fit([[1, 2], [3, 4]], [1, 2]) - self.assertEqual(clf.n_features_in_, 2) - self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]]) - self.assertListEqual(clf.y_.tolist(), [1, 2]) - self.assertListEqual([[2], [3]], clf.get_cut_points()) - X, y = load_iris(return_X_y=True) - clf.fit(X, y) - self.assertEqual(clf.n_features_in_, 4) - self.assertTrue(np.array_equal(X, clf.X_)) - self.assertTrue(np.array_equal(y, clf.y_)) - expected = [ [5.449999809265137, 5.75], - [2.8499999046325684, 3.3499999046325684], - [2.450000047683716, 4.75], - [0.800000011920929, 1.75], + [2.75, 2.8499999046325684, 2.95, 3.05, 3.3499999046325684], + [2.45, 4.75, 5.050000190734863], + [0.8, 1.75], ] - self.assertListEqual(expected, clf.get_cut_points()) + computed = clf.get_cut_points() + for item_computed, item_expected in zip(computed, expected): + for x_, y_ in zip(item_computed, item_expected): + self.assertAlmostEqual(x_, y_) self.assertListEqual([0, 1, 2, 3], clf.features_) clf.fit(X, y, features=[0, 2, 3]) self.assertListEqual([0, 2, 3], clf.features_) @@ -84,8 +61,12 @@ class FImdlpTest(unittest.TestCase): clf.fit([[1, 2], [3, 4]], [1, 2], features=[0, 2]) def test_fit_features(self): - clf = FImdlp() + clf = FImdlp(n_jobs=-1) + # Two samples doesn't have enough information to split clf.fit([[1, -2], [3, 4]], [1, 2], features=[0]) + self.assertListEqual(clf.get_cut_points(), [[], []]) + clf.fit([[1, -2], [3, 4], [5, 6]], [1, 2, 2], features=[0]) + self.assertListEqual(clf.get_cut_points(), [[2], []]) res = clf.transform([[1, -2], [3, 4]]) self.assertListEqual(res.tolist(), [[0, -2], [1, 4]]) X, y = load_iris(return_X_y=True) @@ -100,9 +81,9 @@ class FImdlpTest(unittest.TestCase): ) self.assertEqual(X_computed.dtype, np.float64) - def test_transform_definitive(self): - clf = FImdlp(algorithm=0) - clf.fit([[1, 2], [3, 4]], [1, 2]) + def test_transform(self): + clf = FImdlp() + clf.fit([[1, 2], [3, 4], [5, 6]], [1, 2, 2]) self.assertEqual( clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [1, 1]] ) @@ -118,48 +99,18 @@ class FImdlpTest(unittest.TestCase): self.assertEqual(X_transformed.dtype, np.int32) expected = [ [1, 0, 1, 1], - [1, 1, 1, 1], - [1, 0, 1, 1], - [0, 0, 1, 1], - [1, 0, 1, 1], - [1, 1, 1, 1], - [1, 1, 1, 1], - ] - self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected)) - with self.assertRaises(ValueError): - clf.transform([[1, 2, 3], [4, 5, 6]]) - with self.assertRaises(sklearn.exceptions.NotFittedError): - clf = FImdlp(algorithm=0) - clf.transform([[1, 2], [3, 4]]) - - def test_transform_alternative(self): - clf = FImdlp(algorithm=1) - clf.fit([[1, 2], [3, 4]], [1, 2]) - self.assertEqual( - clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [1, 1]] - ) - X, y = load_iris(return_X_y=True) - clf.fit(X, y) - self.assertEqual(clf.n_features_in_, 4) - self.assertTrue(np.array_equal(X, clf.X_)) - self.assertTrue(np.array_equal(y, clf.y_)) - self.assertListEqual( - clf.transform(X).tolist(), clf.fit(X, y).transform(X).tolist() - ) - expected = [ - [1, 0, 1, 1], - [2, 1, 1, 1], + [2, 3, 1, 1], [2, 0, 1, 1], [0, 0, 1, 1], [1, 0, 1, 1], - [1, 1, 1, 1], - [1, 1, 1, 1], + [1, 3, 1, 1], + [1, 2, 1, 1], ] self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected)) with self.assertRaises(ValueError): clf.transform([[1, 2, 3], [4, 5, 6]]) with self.assertRaises(sklearn.exceptions.NotFittedError): - clf = FImdlp(algorithm=1) + clf = FImdlp() clf.transform([[1, 2], [3, 4]]) def test_cppfactorize(self): @@ -180,40 +131,69 @@ class FImdlpTest(unittest.TestCase): computed = factorize(source) self.assertListEqual(expected, computed) - def test_join_transform(self): - y = ["f0", "f0", "f2", "f3", "f4"] - x = [ - [0, 1, 2, 3, 4], - [0, 1, 2, 3, 4], - [1, 2, 3, 4, 5], - [2, 3, 4, 5, 6], - [3, 4, 5, 6, 7], - ] - expected = [ - [0, 0, 0, 0], - [0, 0, 0, 0], - [1, 1, 1, 1], - [2, 2, 2, 2], - [2, 2, 2, 2], - ] + def test_join_fit(self): + y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"]) + x = np.array( + [ + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [1, 2, 3, 4, 5], + [2, 3, 4, 5, 6], + [3, 4, 5, 6, 7], + ] + ) + expected = [0, 0, 1, 2, 2] clf = FImdlp() - computed = clf.join_transform(x, y, 0) - for computed, expected in zip(computed, expected): - self.assertListEqual(expected, computed.tolist()) - expected_y = [b"f00", b"f00", b"f21", b"f32", b"f43"] + clf.fit(x, factorize(y)) + computed = clf.join_fit([0, 2], 1, x) + self.assertListEqual(computed.tolist(), expected) + expected_y = [b"002", b"002", b"113", b"224", b"335"] self.assertListEqual(expected_y, clf.y_join_) - def test_join_transform_error(self): - y = ["f0", "f0", "f2", "f3", "f4"] - x = [ - [0, 1, 2, 3, 4], - [0, 1, 2, 3, 4], - [1, 2, 3, 4, 5], - [2, 3, 4, 5, 6], - [3, 4, 5, 6, 7], - ] - with self.assertRaises(ValueError): - FImdlp().join_transform(x, y, 5) + def test_join_fit_error(self): + y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"]) + x = np.array( + [ + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [1, 2, 3, 4, 5], + [2, 3, 4, 5, 6], + [3, 4, 5, 6, 7], + ] + ) + clf = FImdlp() + clf.fit(x, factorize(y)) + with self.assertRaises(ValueError) as exception: + clf.join_fit([], 1, x) + self.assertEqual( + str(exception.exception), + "Number of features must be in range [1, 5]", + ) + with self.assertRaises(ValueError) as exception: + FImdlp().join_fit([0, 4], 1, x) + self.assertTrue( + str(exception.exception).startswith( + "This FImdlp instance is not fitted yet." + ) + ) + with self.assertRaises(ValueError) as exception: + clf.join_fit([0, 5], 1, x) + self.assertEqual( + str(exception.exception), + "Feature 5 not in range [0, 5)", + ) + with self.assertRaises(ValueError) as exception: + clf.join_fit([0, 2], 5, x) + self.assertEqual( + str(exception.exception), + "Target 5 not in range [0, 5)", + ) + with self.assertRaises(ValueError) as exception: + clf.join_fit([0, 2], 2, x) + self.assertEqual( + str(exception.exception), + "Target cannot in features to join", + ) def test_factorize(self): y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"]) @@ -228,3 +208,21 @@ class FImdlpTest(unittest.TestCase): def test_sklearn_transformer(self): for check, test in check_estimator(FImdlp(), generate_only=True): test(check) + + def test_states_feature(self): + clf = FImdlp() + X, y = load_iris(return_X_y=True) + clf.fit(X, y) + expected = [] + for i in [3, 6, 4, 3]: + expected.append(list(range(i))) + for feature in range(X.shape[1]): + self.assertListEqual( + expected[feature], clf.get_states_feature(feature) + ) + + def test_states_no_feature(self): + clf = FImdlp() + X, y = load_iris(return_X_y=True) + clf.fit(X, y) + self.assertIsNone(clf.get_states_feature(4)) From 718c9d0e63872b4797c5d3f1ebbfd9dc449ef089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Mon, 20 Feb 2023 20:12:36 +0100 Subject: [PATCH 13/14] make static methods factorize and test_sklrn_trans --- src/fimdlp/mdlp.py | 3 ++- src/fimdlp/tests/FImdlp_test.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py index 0378ebf..2a2114a 100644 --- a/src/fimdlp/mdlp.py +++ b/src/fimdlp/mdlp.py @@ -151,7 +151,8 @@ class FImdlp(TransformerMixin, BaseEstimator): ) return result - def factorize(self, yy): + @staticmethod + def factorize(yy): """Factorize the input labels Parameters diff --git a/src/fimdlp/tests/FImdlp_test.py b/src/fimdlp/tests/FImdlp_test.py index fcfd888..068a4e8 100644 --- a/src/fimdlp/tests/FImdlp_test.py +++ b/src/fimdlp/tests/FImdlp_test.py @@ -205,7 +205,8 @@ class FImdlpTest(unittest.TestCase): computed = clf.factorize(y) self.assertListEqual([0, 1, 1, 2, 3], computed) - def test_sklearn_transformer(self): + @staticmethod + def test_sklearn_transformer(): for check, test in check_estimator(FImdlp(), generate_only=True): test(check) From 40871f128d61fc897ce117d9adf00008a840a43c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Wed, 22 Feb 2023 10:15:33 +0100 Subject: [PATCH 14/14] Add 1.1.0 version of mdlp --- src/cppmdlp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cppmdlp b/src/cppmdlp index 1b89f59..e97aea2 160000 --- a/src/cppmdlp +++ b/src/cppmdlp @@ -1 +1 @@ -Subproject commit 1b89f5927c3add921b19fe29094d354780f98b5f +Subproject commit e97aea2a4de7e4e4a24e87744d8987b899b1a239