diff --git a/k.py b/k.py
new file mode 100644
index 0000000..47e0856
--- /dev/null
+++ b/k.py
@@ -0,0 +1,12 @@
+from sklearn.datasets import load_wine
+from fimdlp.mdlp import FImdlp
+
+X, y = load_wine(return_X_y=True)
+trans = FImdlp()
+Xt = trans.join_transform(X, y, 12)
+print("X shape = ", X.shape)
+print("Xt.shape=", Xt.shape)
+print("Xt ", Xt[:10])
+print("trans.X_ shape = ", trans.X_.shape)
+print("trans.y_ ", trans.y_[:10])
+print("y_join ", trans.y_join_[:10])
diff --git a/setup.py b/setup.py
index db8a696..0ba294e 100644
--- a/setup.py
+++ b/setup.py
@@ -14,10 +14,13 @@ setup(
             "src/fimdlp/cfimdlp.pyx",
             "src/cppmdlp/CPPFImdlp.cpp",
             "src/cppmdlp/Metrics.cpp",
+            "src/fimdlp/Factorize.cpp",
         ],
         language="c++",
         include_dirs=["fimdlp"],
-        extra_compile_args=["-std=c++2a"],
+        extra_compile_args=[
+            "-std=c++11",
+        ],
     ),
 ]
)
diff --git a/src/fimdlp/Factorize.cpp b/src/fimdlp/Factorize.cpp
new file mode 100644
index 0000000..f814d6f
--- /dev/null
+++ b/src/fimdlp/Factorize.cpp
@@ -0,0 +1,18 @@
+#include "Factorize.h"
+
+namespace utils {
+    vector<int> cppFactorize(const vector<string>& labels_t)
+    {
+        vector<int> yy;
+        yy.reserve(labels_t.size());
+        map<string, int> labelMap;
+        int i = 0;
+        for (string label : labels_t) {
+            if (labelMap.find(label) == labelMap.end()) {
+                labelMap[label] = i++;
+            }
+            yy.push_back(labelMap[label]);
+        }
+        return yy;
+    }
+}
\ No newline at end of file
diff --git a/src/fimdlp/Factorize.h b/src/fimdlp/Factorize.h
new file mode 100644
index 0000000..28f4c74
--- /dev/null
+++ b/src/fimdlp/Factorize.h
@@ -0,0 +1,10 @@
+#ifndef FACTORIZE_H
+#define FACTORIZE_H
+#include <string>
+#include <vector>
+#include <map>
+namespace utils {
+    using namespace std;
+    vector<int> cppFactorize(const vector<string>&);
+}
+#endif
\ No newline at end of file
diff --git a/src/fimdlp/cfimdlp.pyx b/src/fimdlp/cfimdlp.pyx
index 9b548dd..c09d3e1 100644
--- a/src/fimdlp/cfimdlp.pyx
+++ b/src/fimdlp/cfimdlp.pyx
@@ -24,3 +24,8 @@ cdef class CFImdlp:
         return self.thisptr.getCutPoints()
     def get_version(self):
         return self.thisptr.version()
+
+cdef extern from "Factorize.h" namespace "utils":
+    vector[int] cppFactorize(vector[string] &input_vector)
+def factorize(input_vector):
+    return cppFactorize(input_vector)
\ No newline at end of file
diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py
index e4e4437..0010dfb 100644
--- a/src/fimdlp/mdlp.py
+++ b/src/fimdlp/mdlp.py
@@ -1,5 +1,5 @@
 import numpy as np
-from .cppfimdlp import CFImdlp
+from .cppfimdlp import CFImdlp, factorize
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils.multiclass import unique_labels
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
@@ -33,21 +33,17 @@ class FImdlp(TransformerMixin, BaseEstimator):
         The list of discretizers, one for each feature.
     cut_points_ : list
         The list of cut points for each feature.
-    X_ : array
-        the samples used to fit, shape (n_samples, n_features)
-    y_ : array
-        the labels used to fit, shape (n_samples,)
+    X_ : array, shape (n_samples, n_features)
+        the samples used to fit
+    y_ : array, shape (n_samples,)
+        the labels used to fit
     features_ : list
         the list of features to be discretized
     """

-    def _check_params_fit(self, X, y, expected_args, kwargs):
-        """Check the common parameters passed to fit"""
+    def _check_args(self, X, y, expected_args, kwargs):
         # Check that X and y have correct shape
         X, y = check_X_y(X, y)
-        # Store the classes seen during fit
-        self.classes_ = unique_labels(y)
-        self.n_classes_ = self.classes_.shape[0]
         # Default values
         self.features_ = [i for i in range(X.shape[1])]
         for key, value in kwargs.items():
@@ -68,15 +64,20 @@ class FImdlp(TransformerMixin, BaseEstimator):
                     raise ValueError("Feature index out of range")
         return X, y

+    def _update_params(self, X, y):
+        # Store the classes seen during fit
+        self.classes_ = unique_labels(y)
+        self.n_classes_ = self.classes_.shape[0]
+        self.n_features_ = X.shape[1]
+
     def fit(self, X, y, **kwargs):
         """A reference implementation of a fitting function for a transformer.
         Parameters
         ----------
-        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+        X : array, shape (n_samples, n_features)
             The training input samples.
-        y : None
-            There is no need of a target in a transformer, yet the pipeline API
-            requires this parameter.
+        y : array, shape (n_samples,)
+            the labels used to fit
         features : list, default=[i for i in range(n_features)]
             The list of features to be discretized.
         Returns
@@ -84,10 +85,10 @@ class FImdlp(TransformerMixin, BaseEstimator):
         self : object
             Returns self.
         """
-        X, y = self._check_params_fit(
+        X, y = self._check_args(
             X, y, expected_args=["features"], kwargs=kwargs
         )
-        self.n_features_ = X.shape[1]
+        self._update_params(X, y)
         self.X_ = X
         self.y_ = y
         self.discretizer_ = [None] * self.n_features_
@@ -119,7 +120,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
         """Discretize X values.
         Parameters
         ----------
-        X : {array-like}, shape (n_samples, n_features)
+        X : array, shape (n_samples, n_features)
             The input samples.
         Returns
         -------
@@ -146,6 +147,34 @@ class FImdlp(TransformerMixin, BaseEstimator):
             )
         return result

+    def join_transform(self, X, y, feature, **kwargs):
+        """Join the selected feature with the labels and discretize the values
+        join - fit - transform
+
+        Parameters
+        ----------
+        X : array, shape (n_samples, n_features)
+            The input samples.
+        y : array
+            the labels used to fit
+        feature : int
+            index of the feature to join with the labels
+        """
+        X, y = self._check_args(
+            X, y, expected_args=["features"], kwargs=kwargs
+        )
+        if feature < 0 or feature >= X.shape[1]:
+            raise ValueError(
+                f"Feature {feature} not in range [0, {X.shape[1]})"
+            )
+        self.y_join_ = [
+            f"{str(item_y)}{str(item_x)}".encode()
+            for item_y, item_x in zip(y, X[:, feature])
+        ]
+        yy = factorize(self.y_join_)
+        XX = np.delete(X, feature, axis=1)
+        return self.fit(XX, yy).transform(XX)
+
     def get_cut_points(self):
         """Get the cut points for each feature.
         Returns
diff --git a/src/fimdlp/tests/FImdlp_test.py b/src/fimdlp/tests/FImdlp_test.py
index 99c5864..ec7096d 100644
--- a/src/fimdlp/tests/FImdlp_test.py
+++ b/src/fimdlp/tests/FImdlp_test.py
@@ -1,7 +1,8 @@
 import unittest
 import sklearn
-from sklearn.datasets import load_iris
 import numpy as np
+from sklearn.datasets import load_iris
+from ..cppfimdlp import factorize
 from ..mdlp import FImdlp
 from .. import version
 from .._version import __version__
@@ -159,3 +160,54 @@ class FImdlpTest(unittest.TestCase):
         with self.assertRaises(sklearn.exceptions.NotFittedError):
             clf = FImdlp(algorithm=1)
             clf.transform([[1, 2], [3, 4]])
+
+    def test_factorize(self):
+        source = [
+            b"f0",
+            b"f1",
+            b"f2",
+            b"f3",
+            b"f4",
+            b"f5",
+            b"f6",
+            b"f1",
+            b"f1",
+            b"f7",
+            b"f8",
+        ]
+        expected = [0, 1, 2, 3, 4, 5, 6, 1, 1, 7, 8]
+        computed = factorize(source)
+        self.assertListEqual(expected, computed)
+
+    def test_join_transform(self):
+        y = ["f0", "f0", "f2", "f3", "f4"]
+        x = [
+            [0, 1, 2, 3, 4],
+            [0, 1, 2, 3, 4],
+            [1, 2, 3, 4, 5],
+            [2, 3, 4, 5, 6],
+            [3, 4, 5, 6, 7],
+        ]
+        expected = [
+            [0, 0, 0, 0],
+            [0, 0, 0, 0],
+            [1, 1, 1, 1],
+            [2, 2, 2, 2],
+            [2, 2, 2, 2],
+        ]
+        clf = FImdlp()
+        computed = clf.join_transform(x, y, 0)
+        for computed_row, expected_row in zip(computed, expected):
+            self.assertListEqual(expected_row, computed_row.tolist())
+
+    def test_join_transform_error(self):
+        y = ["f0", "f0", "f2", "f3", "f4"]
+        x = [
+            [0, 1, 2, 3, 4],
+            [0, 1, 2, 3, 4],
+            [1, 2, 3, 4, 5],
+            [2, 3, 4, 5, 6],
+            [3, 4, 5, 6, 7],
+        ]
+        with self.assertRaises(ValueError):
+            FImdlp().join_transform(x, y, 5)
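
For reference, a minimal pure-Python sketch of the join-and-factorize path introduced by this patch; the helper names factorize_py and join_transform_py are hypothetical and only numpy is assumed. It mirrors utils::cppFactorize (first-appearance integer coding) and the body of FImdlp.join_transform, not the compiled implementation itself.

import numpy as np

def factorize_py(labels):
    # First-appearance coding, as in utils::cppFactorize: each distinct
    # label gets the next unused integer code.
    codes, mapping = [], {}
    for label in labels:
        if label not in mapping:
            mapping[label] = len(mapping)
        codes.append(mapping[label])
    return codes

def join_transform_py(X, y, feature):
    # Mirror of FImdlp.join_transform: build the composite label
    # "<class label><feature value>" per sample, factorize the composites,
    # and drop the joined column from X.
    X = np.asarray(X)
    joined = [f"{label}{value}".encode() for label, value in zip(y, X[:, feature])]
    yy = factorize_py(joined)           # composite labels -> integer codes
    XX = np.delete(X, feature, axis=1)  # remaining features
    return XX, yy                       # FImdlp then runs fit(XX, yy).transform(XX)

With the fixture from test_factorize, factorize_py returns [0, 1, 2, 3, 4, 5, 6, 1, 1, 7, 8], the same codes the test expects from the compiled factorize.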