mirror of https://github.com/Doctorado-ML/FImdlp.git (synced 2025-08-16 16:05:52 +00:00)
feat: ⚗️ Add join_transform method and cpp factorize
k.py (new file, 12 lines added)
@@ -0,0 +1,12 @@
+from sklearn.datasets import load_wine
+from fimdlp.mdlp import FImdlp
+
+X, y = load_wine(return_X_y=True)
+trans = FImdlp()
+Xt = trans.join_transform(X, y, 12)
+print("X shape = ", X.shape)
+print("Xt.shape=", Xt.shape)
+print("Xt ", Xt[:10])
+print("trans.X_ shape = ", trans.X_.shape)
+print("trans.y_ ", trans.y_[:10])
+print("y_join ", trans.y_join_[:10])
setup.py (5 lines changed)
@@ -14,10 +14,13 @@ setup(
             "src/fimdlp/cfimdlp.pyx",
             "src/cppmdlp/CPPFImdlp.cpp",
             "src/cppmdlp/Metrics.cpp",
+            "src/fimdlp/Factorize.cpp",
         ],
         language="c++",
         include_dirs=["fimdlp"],
-        extra_compile_args=["-std=c++2a"],
+        extra_compile_args=[
+            "-std=c++11",
+        ],
     ),
 ]
 )
src/fimdlp/Factorize.cpp (new file, 18 lines added)
@@ -0,0 +1,18 @@
+#include "Factorize.h"
+
+namespace utils {
+    vector<int> cppFactorize(const vector<string>& labels_t)
+    {
+        vector<int> yy;
+        yy.reserve(labels_t.size());
+        map<string, int> labelMap;
+        int i = 0;
+        for (string label : labels_t) {
+            if (labelMap.find(label) == labelMap.end()) {
+                labelMap[label] = i++;
+            }
+            yy.push_back(labelMap[label]);
+        }
+        return yy;
+    }
+}
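For reference, cppFactorize assigns each distinct label the integer index of its first appearance, in order of appearance. A minimal Python sketch of the same logic (an illustration only, not part of this commit):

def py_factorize(labels):
    # Each unseen label gets the next integer code, in order of first appearance.
    codes = {}
    return [codes.setdefault(label, len(codes)) for label in labels]

print(py_factorize(["f0", "f1", "f1", "f2"]))  # [0, 1, 1, 2]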
src/fimdlp/Factorize.h (new file, 10 lines added)
@@ -0,0 +1,10 @@
+#ifndef FACTORIZE_H
+#define FACTORIZE_H
+#include <vector>
+#include <map>
+#include <string>
+namespace utils {
+    using namespace std;
+    vector<int> cppFactorize(const vector<string>&);
+}
+#endif
src/fimdlp/cfimdlp.pyx
@@ -24,3 +24,8 @@ cdef class CFImdlp:
         return self.thisptr.getCutPoints()
     def get_version(self):
         return self.thisptr.version()
+
+cdef extern from "Factorize.h" namespace "utils":
+    vector[int] cppFactorize(vector[string] &input_vector)
+def factorize(input_vector):
+    return cppFactorize(input_vector)
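The new factorize binding takes a sequence of byte strings (vector[string] on the C++ side) and returns the integer codes as a Python list, as the test added further down exercises. A quick usage sketch, assuming the extension has been built and is importable as fimdlp.cppfimdlp:

from fimdlp.cppfimdlp import factorize

codes = factorize([b"f0", b"f1", b"f1", b"f2"])
print(codes)  # expected: [0, 1, 1, 2]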
src/fimdlp/mdlp.py
@@ -1,5 +1,5 @@
 import numpy as np
-from .cppfimdlp import CFImdlp
+from .cppfimdlp import CFImdlp, factorize
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils.multiclass import unique_labels
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
@@ -33,21 +33,17 @@ class FImdlp(TransformerMixin, BaseEstimator):
         The list of discretizers, one for each feature.
     cut_points_ : list
         The list of cut points for each feature.
-    X_ : array
-        the samples used to fit, shape (n_samples, n_features)
-    y_ : array
-        the labels used to fit, shape (n_samples,)
+    X_ : array, shape (n_samples, n_features)
+        the samples used to fit
+    y_ : array, shape(n_samples,)
+        the labels used to fit
     features_ : list
         the list of features to be discretized
     """

-    def _check_params_fit(self, X, y, expected_args, kwargs):
-        """Check the common parameters passed to fit"""
+    def _check_args(self, X, y, expected_args, kwargs):
         # Check that X and y have correct shape
         X, y = check_X_y(X, y)
-        # Store the classes seen during fit
-        self.classes_ = unique_labels(y)
-        self.n_classes_ = self.classes_.shape[0]
         # Default values
         self.features_ = [i for i in range(X.shape[1])]
         for key, value in kwargs.items():
@@ -68,15 +64,20 @@ class FImdlp(TransformerMixin, BaseEstimator):
             raise ValueError("Feature index out of range")
         return X, y

+    def _update_params(self, X, y):
+        # Store the classes seen during fit
+        self.classes_ = unique_labels(y)
+        self.n_classes_ = self.classes_.shape[0]
+        self.n_features_ = X.shape[1]
+
     def fit(self, X, y, **kwargs):
         """A reference implementation of a fitting function for a transformer.
         Parameters
         ----------
-        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+        X : array, shape (n_samples, n_features)
             The training input samples.
-        y : None
-            There is no need of a target in a transformer, yet the pipeline API
-            requires this parameter.
+        y : array, shape (n_samples,)
+            the labels used to fit
         features : list, default=[i for i in range(n_features)]
             The list of features to be discretized.
         Returns
@@ -84,10 +85,10 @@ class FImdlp(TransformerMixin, BaseEstimator):
         self : object
             Returns self.
         """
-        X, y = self._check_params_fit(
+        X, y = self._check_args(
             X, y, expected_args=["features"], kwargs=kwargs
         )
-        self.n_features_ = X.shape[1]
+        self._update_params(X, y)
         self.X_ = X
         self.y_ = y
         self.discretizer_ = [None] * self.n_features_
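As the revised docstring states, fit takes the labels y plus an optional features keyword listing the column indices to discretize (defaulting to all columns). A hedged usage sketch, not taken from the repository:

from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp

X, y = load_iris(return_X_y=True)
trans = FImdlp()
# 'features' selects which columns are discretized, per the docstring above.
Xt = trans.fit(X, y, features=[0, 2]).transform(X)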
@@ -119,7 +120,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
         """Discretize X values.
         Parameters
         ----------
-        X : {array-like}, shape (n_samples, n_features)
+        X : array, shape (n_samples, n_features)
             The input samples.
         Returns
         -------
@@ -146,6 +147,34 @@ class FImdlp(TransformerMixin, BaseEstimator):
             )
         return result

+    def join_transform(self, X, y, feature, **kwargs):
+        """Join the selected feature with the labels and discretize the values
+        join - fit - transform
+
+        Parameters
+        ----------
+        X : array, shape (n_samples, n_features)
+            The input samples.
+        y : array
+            the labels used to fit
+        feature : int
+            index of the feature to join with the labels
+        """
+        X, y = self._check_args(
+            X, y, expected_args=["features"], kwargs=kwargs
+        )
+        if feature < 0 or feature >= X.shape[1]:
+            raise ValueError(
+                f"Feature {feature} not in range [0, {X.shape[1]})"
+            )
+        self.y_join_ = [
+            f"{str(item_y)}{str(item_x)}".encode()
+            for item_y, item_x in zip(y, X[:, feature])
+        ]
+        yy = factorize(self.y_join_)
+        XX = np.delete(X, feature, axis=1)
+        return self.fit(XX, yy).transform(XX)
+
     def get_cut_points(self):
         """Get the cut points for each feature.
         Returns
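In short, join_transform concatenates each label with the value of the selected feature into a combined byte-string label, factorizes those combined labels into integer classes (the joined strings are kept in y_join_ and their codes become the new target), removes the joined column with np.delete, and then fits and transforms the remaining columns. A tiny plain-Python illustration of the label-join step (not part of the commit):

y = ["a", "a", "b"]
col = [1.5, 2.0, 2.0]  # values of the feature being joined
joined = [f"{label}{value}".encode() for label, value in zip(y, col)]
print(joined)  # [b'a1.5', b'a2.0', b'b2.0']
codes = {}
print([codes.setdefault(j, len(codes)) for j in joined])  # [0, 1, 2] after factorization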
@@ -1,7 +1,8 @@
 import unittest
 import sklearn
-from sklearn.datasets import load_iris
 import numpy as np
+from sklearn.datasets import load_iris
+from ..cppfimdlp import factorize
 from ..mdlp import FImdlp
 from .. import version
 from .._version import __version__
@@ -159,3 +160,54 @@ class FImdlpTest(unittest.TestCase):
         with self.assertRaises(sklearn.exceptions.NotFittedError):
             clf = FImdlp(algorithm=1)
             clf.transform([[1, 2], [3, 4]])
+
+    def test_factorize(self):
+        source = [
+            b"f0",
+            b"f1",
+            b"f2",
+            b"f3",
+            b"f4",
+            b"f5",
+            b"f6",
+            b"f1",
+            b"f1",
+            b"f7",
+            b"f8",
+        ]
+        expected = [0, 1, 2, 3, 4, 5, 6, 1, 1, 7, 8]
+        computed = factorize(source)
+        self.assertListEqual(expected, computed)
+
+    def test_join_transform(self):
+        y = ["f0", "f0", "f2", "f3", "f4"]
+        x = [
+            [0, 1, 2, 3, 4],
+            [0, 1, 2, 3, 4],
+            [1, 2, 3, 4, 5],
+            [2, 3, 4, 5, 6],
+            [3, 4, 5, 6, 7],
+        ]
+        expected = [
+            [0, 0, 0, 0],
+            [0, 0, 0, 0],
+            [1, 1, 1, 1],
+            [2, 2, 2, 2],
+            [2, 2, 2, 2],
+        ]
+        clf = FImdlp()
+        computed = clf.join_transform(x, y, 0)
+        for computed, expected in zip(computed, expected):
+            self.assertListEqual(expected, computed.tolist())
+
+    def test_join_transform_error(self):
+        y = ["f0", "f0", "f2", "f3", "f4"]
+        x = [
+            [0, 1, 2, 3, 4],
+            [0, 1, 2, 3, 4],
+            [1, 2, 3, 4, 5],
+            [2, 3, 4, 5, 6],
+            [3, 4, 5, 6, 7],
+        ]
+        with self.assertRaises(ValueError):
+            FImdlp().join_transform(x, y, 5)