feat: ⚗️ Add join_transform method and cpp factorize

This commit is contained in:
2023-01-26 10:47:27 +01:00
parent 34cd54f77e
commit ca7d158ac8
7 changed files with 148 additions and 19 deletions

12
k.py Normal file
View File

@@ -0,0 +1,12 @@
# Smoke-test script: join feature 12 of the wine dataset with the class
# labels via FImdlp.join_transform and print the resulting shapes/values.
from sklearn.datasets import load_wine
from fimdlp.mdlp import FImdlp

features, labels = load_wine(return_X_y=True)
discretizer = FImdlp()
transformed = discretizer.join_transform(features, labels, 12)
print("X shape = ", features.shape)
print("Xt.shape=", transformed.shape)
print("Xt ", transformed[:10])
print("trans.X_ shape = ", discretizer.X_.shape)
print("trans.y_ ", discretizer.y_[:10])
print("y_join ", discretizer.y_join_[:10])

View File

@@ -14,10 +14,13 @@ setup(
"src/fimdlp/cfimdlp.pyx",
"src/cppmdlp/CPPFImdlp.cpp",
"src/cppmdlp/Metrics.cpp",
"src/fimdlp/Factorize.cpp",
],
language="c++",
include_dirs=["fimdlp"],
extra_compile_args=["-std=c++2a"],
extra_compile_args=[
"-std=c++11",
],
),
]
)

18
src/fimdlp/Factorize.cpp Normal file
View File

@@ -0,0 +1,18 @@
#include "Factorize.h"
namespace utils {
    // Encode a vector of string labels as integer codes, assigning codes
    // in order of first appearance (0 for the first distinct label, 1 for
    // the next new one, and so on).
    vector<int> cppFactorize(const vector<string>& labels_t)
    {
        vector<int> yy;
        yy.reserve(labels_t.size());
        map<string, int> labelMap;
        int nextCode = 0;
        // const reference: avoid copying every label string per iteration
        for (const string& label : labels_t) {
            // single lookup: insert() leaves the map untouched and returns
            // the existing entry when the key is already present
            auto result = labelMap.insert({ label, nextCode });
            if (result.second) {
                ++nextCode;
            }
            yy.push_back(result.first->second);
        }
        return yy;
    }
}

10
src/fimdlp/Factorize.h Normal file
View File

@@ -0,0 +1,10 @@
#ifndef FACTORIZE_H
#define FACTORIZE_H
// Declaration of the label-encoding helper: maps each distinct string
// label to an integer code, codes assigned in order of first appearance.
#include <vector>
#include <map>
#include <string>
namespace utils {
    // NOTE(review): a using-directive inside a header leaks std names into
    // every translation unit that includes it; the .cpp currently relies on
    // it — consider std:: qualification in a follow-up touching both files.
    using namespace std;
    vector<int> cppFactorize(const vector<string>&);
}
#endif

View File

@@ -24,3 +24,8 @@ cdef class CFImdlp:
return self.thisptr.getCutPoints()
def get_version(self):
    # Delegate to the wrapped C++ object's version string.
    return self.thisptr.version()
# Expose the C++ label-encoding helper declared in Factorize.h.
cdef extern from "Factorize.h" namespace "utils":
    vector[int] cppFactorize(vector[string] &input_vector)


def factorize(input_vector):
    """Encode a sequence of byte-string labels as integer codes.

    Codes are assigned in order of first appearance; repeated labels
    reuse the code of their first occurrence.
    """
    return cppFactorize(input_vector)

View File

@@ -1,5 +1,5 @@
import numpy as np
from .cppfimdlp import CFImdlp
from .cppfimdlp import CFImdlp, factorize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
@@ -33,21 +33,17 @@ class FImdlp(TransformerMixin, BaseEstimator):
The list of discretizers, one for each feature.
cut_points_ : list
The list of cut points for each feature.
X_ : array
the samples used to fit, shape (n_samples, n_features)
y_ : array
the labels used to fit, shape (n_samples,)
X_ : array, shape (n_samples, n_features)
the samples used to fit
y_ : array, shape(n_samples,)
the labels used to fit
features_ : list
the list of features to be discretized
"""
def _check_params_fit(self, X, y, expected_args, kwargs):
"""Check the common parameters passed to fit"""
def _check_args(self, X, y, expected_args, kwargs):
# Check that X and y have correct shape
X, y = check_X_y(X, y)
# Store the classes seen during fit
self.classes_ = unique_labels(y)
self.n_classes_ = self.classes_.shape[0]
# Default values
self.features_ = [i for i in range(X.shape[1])]
for key, value in kwargs.items():
@@ -68,15 +64,20 @@ class FImdlp(TransformerMixin, BaseEstimator):
raise ValueError("Feature index out of range")
return X, y
def _update_params(self, X, y):
    """Cache dataset-derived attributes: feature count and class set."""
    self.n_features_ = X.shape[1]
    # Record the distinct labels observed during fit
    self.classes_ = unique_labels(y)
    self.n_classes_ = len(self.classes_)
def fit(self, X, y, **kwargs):
"""A reference implementation of a fitting function for a transformer.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
X : array, shape (n_samples, n_features)
The training input samples.
y : None
There is no need of a target in a transformer, yet the pipeline API
requires this parameter.
y : array, shape (n_samples,)
the labels used to fit
features : list, default=[i for i in range(n_features)]
The list of features to be discretized.
Returns
@@ -84,10 +85,10 @@ class FImdlp(TransformerMixin, BaseEstimator):
self : object
Returns self.
"""
X, y = self._check_params_fit(
X, y = self._check_args(
X, y, expected_args=["features"], kwargs=kwargs
)
self.n_features_ = X.shape[1]
self._update_params(X, y)
self.X_ = X
self.y_ = y
self.discretizer_ = [None] * self.n_features_
@@ -119,7 +120,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
"""Discretize X values.
Parameters
----------
X : {array-like}, shape (n_samples, n_features)
X : array, shape (n_samples, n_features)
The input samples.
Returns
-------
@@ -146,6 +147,34 @@ class FImdlp(TransformerMixin, BaseEstimator):
)
return result
def join_transform(self, X, y, feature, **kwargs):
    """Join the selected feature with the labels and discretize the values
    join - fit - transform

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The input samples.
    y : array
        the labels used to fit
    feature : int
        index of the feature to join with the labels

    Returns
    -------
    array
        The discretized values of the remaining features (the joined
        feature column is removed before fit/transform).

    Raises
    ------
    ValueError
        If feature is not in the range [0, n_features).
    """
    # NOTE(review): X and y are validated again inside self.fit below, and
    # any `features` kwarg accepted here is recomputed by fit against the
    # reduced matrix — confirm the double validation is intended.
    X, y = self._check_args(
        X, y, expected_args=["features"], kwargs=kwargs
    )
    if feature < 0 or feature >= X.shape[1]:
        raise ValueError(
            f"Feature {feature} not in range [0, {X.shape[1]})"
        )
    # Build composite labels "<label><feature value>" as bytes, the
    # representation the C++ factorize helper expects.
    self.y_join_ = [
        f"{str(item_y)}{str(item_x)}".encode()
        for item_y, item_x in zip(y, X[:, feature])
    ]
    yy = factorize(self.y_join_)
    # Drop the joined column, then fit and discretize the rest.
    XX = np.delete(X, feature, axis=1)
    return self.fit(XX, yy).transform(XX)
def get_cut_points(self):
"""Get the cut points for each feature.
Returns

View File

@@ -1,7 +1,8 @@
import unittest
import sklearn
from sklearn.datasets import load_iris
import numpy as np
from sklearn.datasets import load_iris
from ..cppfimdlp import factorize
from ..mdlp import FImdlp
from .. import version
from .._version import __version__
@@ -159,3 +160,54 @@ class FImdlpTest(unittest.TestCase):
with self.assertRaises(sklearn.exceptions.NotFittedError):
clf = FImdlp(algorithm=1)
clf.transform([[1, 2], [3, 4]])
def test_factorize(self):
    """factorize assigns codes by first appearance and reuses them."""
    labels = [
        b"f0", b"f1", b"f2", b"f3", b"f4", b"f5",
        b"f6", b"f1", b"f1", b"f7", b"f8",
    ]
    expected_codes = [0, 1, 2, 3, 4, 5, 6, 1, 1, 7, 8]
    self.assertListEqual(expected_codes, factorize(labels))
def test_join_transform(self):
    """join_transform joins feature 0 with the labels and returns the
    discretized remaining four features."""
    y = ["f0", "f0", "f2", "f3", "f4"]
    x = [
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4],
        [1, 2, 3, 4, 5],
        [2, 3, 4, 5, 6],
        [3, 4, 5, 6, 7],
    ]
    expected = [
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [1, 1, 1, 1],
        [2, 2, 2, 2],
        [2, 2, 2, 2],
    ]
    clf = FImdlp()
    computed = clf.join_transform(x, y, 0)
    # Distinct loop variables: the original shadowed `computed` and
    # `expected` inside the zip, which only worked by accident.
    for computed_row, expected_row in zip(computed, expected):
        self.assertListEqual(expected_row, computed_row.tolist())
def test_join_transform_error(self):
    """join_transform raises ValueError for an out-of-range feature index."""
    labels = ["f0", "f0", "f2", "f3", "f4"]
    # Same fixture as the happy-path test: rows 0..4 shifted by 0,0,1,2,3.
    samples = [
        [base + offset for offset in range(5)]
        for base in (0, 0, 1, 2, 3)
    ]
    with self.assertRaises(ValueError):
        FImdlp().join_transform(samples, labels, 5)