feat: ⚗️ Add join_transform method and cpp factorize

This commit is contained in:
2023-01-26 10:47:27 +01:00
parent 34cd54f77e
commit ca7d158ac8
7 changed files with 148 additions and 19 deletions

12
k.py Normal file
View File

@@ -0,0 +1,12 @@
# Manual smoke test: run FImdlp.join_transform on the wine dataset and
# inspect the shapes and the first few rows of every derived attribute.
from sklearn.datasets import load_wine
from fimdlp.mdlp import FImdlp

features, labels = load_wine(return_X_y=True)
discretizer = FImdlp()
transformed = discretizer.join_transform(features, labels, 12)

print("X shape = ", features.shape)
print("Xt.shape=", transformed.shape)
print("Xt ", transformed[:10])
print("trans.X_ shape = ", discretizer.X_.shape)
print("trans.y_ ", discretizer.y_[:10])
print("y_join ", discretizer.y_join_[:10])

View File

@@ -14,10 +14,13 @@ setup(
"src/fimdlp/cfimdlp.pyx", "src/fimdlp/cfimdlp.pyx",
"src/cppmdlp/CPPFImdlp.cpp", "src/cppmdlp/CPPFImdlp.cpp",
"src/cppmdlp/Metrics.cpp", "src/cppmdlp/Metrics.cpp",
"src/fimdlp/Factorize.cpp",
], ],
language="c++", language="c++",
include_dirs=["fimdlp"], include_dirs=["fimdlp"],
extra_compile_args=["-std=c++2a"], extra_compile_args=[
"-std=c++11",
],
), ),
] ]
) )

18
src/fimdlp/Factorize.cpp Normal file
View File

@@ -0,0 +1,18 @@
#include "Factorize.h"
namespace utils {
    // Encode each distinct label string as a small integer id, assigned in
    // order of first appearance, and return one id per input element
    // (same contract as pandas.factorize codes, first-seen ordering).
    vector<int> cppFactorize(const vector<string>& labels_t)
    {
        vector<int> yy;
        yy.reserve(labels_t.size());
        map<string, int> labelMap;
        int nextCode = 0;
        // const reference: the original copied every string per iteration.
        for (const string& label : labels_t) {
            // Single lookup: emplace inserts nextCode only when the label is
            // new and otherwise returns the existing entry (the original did
            // a find plus up to two operator[] lookups).
            auto result = labelMap.emplace(label, nextCode);
            if (result.second) {
                ++nextCode;
            }
            yy.push_back(result.first->second);
        }
        return yy;
    }
}

10
src/fimdlp/Factorize.h Normal file
View File

@@ -0,0 +1,10 @@
// Declaration of the label-factorization helper exposed to the Cython wrapper.
#ifndef FACTORIZE_H
#define FACTORIZE_H
#include <vector>
#include <map>
#include <string>
namespace utils {
    // NOTE(review): `using namespace std;` in a header leaks std names into
    // every translation unit that includes it (inside namespace utils);
    // consider qualifying with std:: instead — left as-is because
    // Factorize.cpp currently relies on it.
    using namespace std;
    // Encode each distinct label with an integer id in order of first
    // appearance; returns one id per input element.
    vector<int> cppFactorize(const vector<string>&);
}
#endif

View File

@@ -24,3 +24,8 @@ cdef class CFImdlp:
return self.thisptr.getCutPoints() return self.thisptr.getCutPoints()
def get_version(self): def get_version(self):
return self.thisptr.version() return self.thisptr.version()
# Bridge to the C++ helper declared in src/fimdlp/Factorize.h.
cdef extern from "Factorize.h" namespace "utils":
    vector[int] cppFactorize(vector[string] &input_vector)

def factorize(input_vector):
    """Encode a sequence of byte strings as integer codes assigned in order
    of first appearance (thin wrapper over utils::cppFactorize)."""
    return cppFactorize(input_vector)

View File

@@ -1,5 +1,5 @@
import numpy as np import numpy as np
from .cppfimdlp import CFImdlp from .cppfimdlp import CFImdlp, factorize
from sklearn.base import BaseEstimator, TransformerMixin from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.multiclass import unique_labels from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
@@ -33,21 +33,17 @@ class FImdlp(TransformerMixin, BaseEstimator):
The list of discretizers, one for each feature. The list of discretizers, one for each feature.
cut_points_ : list cut_points_ : list
The list of cut points for each feature. The list of cut points for each feature.
X_ : array X_ : array, shape (n_samples, n_features)
the samples used to fit, shape (n_samples, n_features) the samples used to fit
y_ : array y_ : array, shape(n_samples,)
the labels used to fit, shape (n_samples,) the labels used to fit
features_ : list features_ : list
the list of features to be discretized the list of features to be discretized
""" """
def _check_params_fit(self, X, y, expected_args, kwargs): def _check_args(self, X, y, expected_args, kwargs):
"""Check the common parameters passed to fit"""
# Check that X and y have correct shape # Check that X and y have correct shape
X, y = check_X_y(X, y) X, y = check_X_y(X, y)
# Store the classes seen during fit
self.classes_ = unique_labels(y)
self.n_classes_ = self.classes_.shape[0]
# Default values # Default values
self.features_ = [i for i in range(X.shape[1])] self.features_ = [i for i in range(X.shape[1])]
for key, value in kwargs.items(): for key, value in kwargs.items():
@@ -68,15 +64,20 @@ class FImdlp(TransformerMixin, BaseEstimator):
raise ValueError("Feature index out of range") raise ValueError("Feature index out of range")
return X, y return X, y
def _update_params(self, X, y):
    """Cache fitted-dataset metadata: the classes seen during fit, their
    count, and the number of input features."""
    classes = unique_labels(y)
    self.classes_ = classes
    self.n_classes_ = classes.shape[0]
    self.n_features_ = X.shape[1]
def fit(self, X, y, **kwargs): def fit(self, X, y, **kwargs):
"""A reference implementation of a fitting function for a transformer. """A reference implementation of a fitting function for a transformer.
Parameters Parameters
---------- ----------
X : {array-like, sparse matrix}, shape (n_samples, n_features) X : array, shape (n_samples, n_features)
The training input samples. The training input samples.
y : None y : array, shape (n_samples,)
There is no need of a target in a transformer, yet the pipeline API the labels used to fit
requires this parameter.
features : list, default=[i for i in range(n_features)] features : list, default=[i for i in range(n_features)]
The list of features to be discretized. The list of features to be discretized.
Returns Returns
@@ -84,10 +85,10 @@ class FImdlp(TransformerMixin, BaseEstimator):
self : object self : object
Returns self. Returns self.
""" """
X, y = self._check_params_fit( X, y = self._check_args(
X, y, expected_args=["features"], kwargs=kwargs X, y, expected_args=["features"], kwargs=kwargs
) )
self.n_features_ = X.shape[1] self._update_params(X, y)
self.X_ = X self.X_ = X
self.y_ = y self.y_ = y
self.discretizer_ = [None] * self.n_features_ self.discretizer_ = [None] * self.n_features_
@@ -119,7 +120,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
"""Discretize X values. """Discretize X values.
Parameters Parameters
---------- ----------
X : {array-like}, shape (n_samples, n_features) X : array, shape (n_samples, n_features)
The input samples. The input samples.
Returns Returns
------- -------
@@ -146,6 +147,34 @@ class FImdlp(TransformerMixin, BaseEstimator):
) )
return result return result
def join_transform(self, X, y, feature, **kwargs):
    """Join the selected feature with the labels and discretize the values.

    join - fit - transform: column `feature` is concatenated with the
    labels to build a composite target, then the remaining columns are
    fitted against that target and discretized.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The input samples.
    y : array, shape (n_samples,)
        the labels used to fit
    feature : int
        index of the feature to join with the labels
    features : list, default=[i for i in range(n_features)]
        The list of features to be discretized.

    Returns
    -------
    array, shape (n_samples, n_features - 1)
        The discretized values of X with column `feature` removed.

    Raises
    ------
    ValueError
        If `feature` is not a valid column index of X.
    """
    X, y = self._check_args(
        X, y, expected_args=["features"], kwargs=kwargs
    )
    if feature < 0 or feature >= X.shape[1]:
        raise ValueError(
            f"Feature {feature} not in range [0, {X.shape[1]})"
        )
    # Composite target "<label><feature value>", encoded to bytes because
    # the C++ factorize implementation takes a vector of strings.
    self.y_join_ = [
        f"{str(item_y)}{str(item_x)}".encode()
        for item_y, item_x in zip(y, X[:, feature])
    ]
    yy = factorize(self.y_join_)
    # Fit and discretize the remaining columns against the joined target.
    XX = np.delete(X, feature, axis=1)
    return self.fit(XX, yy).transform(XX)
def get_cut_points(self): def get_cut_points(self):
"""Get the cut points for each feature. """Get the cut points for each feature.
Returns Returns

View File

@@ -1,7 +1,8 @@
import unittest import unittest
import sklearn import sklearn
from sklearn.datasets import load_iris
import numpy as np import numpy as np
from sklearn.datasets import load_iris
from ..cppfimdlp import factorize
from ..mdlp import FImdlp from ..mdlp import FImdlp
from .. import version from .. import version
from .._version import __version__ from .._version import __version__
@@ -159,3 +160,54 @@ class FImdlpTest(unittest.TestCase):
with self.assertRaises(sklearn.exceptions.NotFittedError): with self.assertRaises(sklearn.exceptions.NotFittedError):
clf = FImdlp(algorithm=1) clf = FImdlp(algorithm=1)
clf.transform([[1, 2], [3, 4]]) clf.transform([[1, 2], [3, 4]])
def test_factorize(self):
    """factorize assigns integer codes in order of first appearance."""
    expected = [0, 1, 2, 3, 4, 5, 6, 1, 1, 7, 8]
    # Same fixture as before, built from the expected codes: b"f0", b"f1", ...
    source = [f"f{code}".encode() for code in expected]
    computed = factorize(source)
    self.assertListEqual(expected, computed)
def test_join_transform(self):
    """join_transform on feature 0 discretizes the remaining columns."""
    y = ["f0", "f0", "f2", "f3", "f4"]
    x = [
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4],
        [1, 2, 3, 4, 5],
        [2, 3, 4, 5, 6],
        [3, 4, 5, 6, 7],
    ]
    expected = [
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [1, 1, 1, 1],
        [2, 2, 2, 2],
        [2, 2, 2, 2],
    ]
    clf = FImdlp()
    computed = clf.join_transform(x, y, 0)
    # Compare row by row. The original loop rebound `computed` and
    # `expected` as its loop variables, shadowing the lists being zipped.
    for computed_row, expected_row in zip(computed, expected):
        self.assertListEqual(expected_row, computed_row.tolist())
def test_join_transform_error(self):
    """An out-of-range feature index makes join_transform raise ValueError."""
    labels = ["f0", "f0", "f2", "f3", "f4"]
    samples = [
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4],
        [1, 2, 3, 4, 5],
        [2, 3, 4, 5, 6],
        [3, 4, 5, 6, 7],
    ]
    # Index 5 is one past the last valid column (0..4).
    with self.assertRaises(ValueError):
        FImdlp().join_transform(samples, labels, 5)