mirror of
https://github.com/Doctorado-ML/FImdlp.git
synced 2025-08-17 16:35:52 +00:00
feat: ⚗️ Add join_transform method and cpp factorize
This commit is contained in:
12
k.py
Normal file
12
k.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
"""Demo: discretize the wine dataset via FImdlp.join_transform.

Joins feature 12 with the labels, fits the discretizer on the remaining
features, and prints shapes plus a few sample values for inspection.
"""
from sklearn.datasets import load_wine

from fimdlp.mdlp import FImdlp

# Wine dataset: 178 samples, 13 continuous features.
features, labels = load_wine(return_X_y=True)
discretizer = FImdlp()
# Fold feature 12 into the labels, then fit+transform the rest.
discretized = discretizer.join_transform(features, labels, 12)
print("X shape = ", features.shape)
print("Xt.shape=", discretized.shape)
print("Xt ", discretized[:10])
print("trans.X_ shape = ", discretizer.X_.shape)
print("trans.y_ ", discretizer.y_[:10])
print("y_join ", discretizer.y_join_[:10])
|
5
setup.py
5
setup.py
@@ -14,10 +14,13 @@ setup(
|
|||||||
"src/fimdlp/cfimdlp.pyx",
|
"src/fimdlp/cfimdlp.pyx",
|
||||||
"src/cppmdlp/CPPFImdlp.cpp",
|
"src/cppmdlp/CPPFImdlp.cpp",
|
||||||
"src/cppmdlp/Metrics.cpp",
|
"src/cppmdlp/Metrics.cpp",
|
||||||
|
"src/fimdlp/Factorize.cpp",
|
||||||
],
|
],
|
||||||
language="c++",
|
language="c++",
|
||||||
include_dirs=["fimdlp"],
|
include_dirs=["fimdlp"],
|
||||||
extra_compile_args=["-std=c++2a"],
|
extra_compile_args=[
|
||||||
|
"-std=c++11",
|
||||||
|
],
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
18
src/fimdlp/Factorize.cpp
Normal file
18
src/fimdlp/Factorize.cpp
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
#include "Factorize.h"
|
||||||
|
|
||||||
|
namespace utils {
    // Encode string labels as consecutive integer ids assigned in order of
    // first appearance (pandas.factorize-style). Returns one id per label.
    std::vector<int> cppFactorize(const std::vector<std::string>& labels_t)
    {
        std::vector<int> yy;
        yy.reserve(labels_t.size());
        std::map<std::string, int> labelMap; // label -> id of first appearance
        int next_id = 0;
        // const&: the original copied every string by value per iteration.
        for (const std::string& label : labels_t) {
            // insert() returns {iterator, inserted}: one lookup replaces the
            // original find() + two operator[] lookups per label.
            auto ins = labelMap.insert({label, next_id});
            if (ins.second) {
                ++next_id; // new label consumed the next id
            }
            yy.push_back(ins.first->second);
        }
        return yy;
    }
}
|
10
src/fimdlp/Factorize.h
Normal file
10
src/fimdlp/Factorize.h
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
#ifndef FACTORIZE_H
#define FACTORIZE_H
// Label factorization utility: maps string labels to integer codes.
#include <vector>
#include <map>
#include <string>
namespace utils {
    // NOTE(review): a using-directive in a header leaks std names into every
    // translation unit that includes this file (inside namespace utils).
    // Factorize.cpp relies on it, so it is kept — confirm before removing.
    using namespace std;
    // Return one int per input label; ids are assigned in order of first
    // appearance of each distinct label.
    vector<int> cppFactorize(const vector<string>&);
}
#endif
|
@@ -24,3 +24,8 @@ cdef class CFImdlp:
|
|||||||
return self.thisptr.getCutPoints()
|
return self.thisptr.getCutPoints()
|
||||||
def get_version(self):
|
def get_version(self):
|
||||||
return self.thisptr.version()
|
return self.thisptr.version()
|
||||||
|
|
||||||
|
# Bridge to the C++ label factorizer declared in Factorize.h.
cdef extern from "Factorize.h" namespace "utils":
    vector[int] cppFactorize(vector[string] &input_vector)

def factorize(input_vector):
    """Encode an iterable of byte-string labels as a list of ints.

    Ids are assigned in order of first appearance of each distinct
    label; delegates to the C++ implementation.
    """
    return cppFactorize(input_vector)
|
@@ -1,5 +1,5 @@
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from .cppfimdlp import CFImdlp
|
from .cppfimdlp import CFImdlp, factorize
|
||||||
from sklearn.base import BaseEstimator, TransformerMixin
|
from sklearn.base import BaseEstimator, TransformerMixin
|
||||||
from sklearn.utils.multiclass import unique_labels
|
from sklearn.utils.multiclass import unique_labels
|
||||||
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
|
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
|
||||||
@@ -33,21 +33,17 @@ class FImdlp(TransformerMixin, BaseEstimator):
|
|||||||
The list of discretizers, one for each feature.
|
The list of discretizers, one for each feature.
|
||||||
cut_points_ : list
|
cut_points_ : list
|
||||||
The list of cut points for each feature.
|
The list of cut points for each feature.
|
||||||
X_ : array
|
X_ : array, shape (n_samples, n_features)
|
||||||
the samples used to fit, shape (n_samples, n_features)
|
the samples used to fit
|
||||||
y_ : array
|
y_ : array, shape(n_samples,)
|
||||||
the labels used to fit, shape (n_samples,)
|
the labels used to fit
|
||||||
features_ : list
|
features_ : list
|
||||||
the list of features to be discretized
|
the list of features to be discretized
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def _check_params_fit(self, X, y, expected_args, kwargs):
|
def _check_args(self, X, y, expected_args, kwargs):
|
||||||
"""Check the common parameters passed to fit"""
|
|
||||||
# Check that X and y have correct shape
|
# Check that X and y have correct shape
|
||||||
X, y = check_X_y(X, y)
|
X, y = check_X_y(X, y)
|
||||||
# Store the classes seen during fit
|
|
||||||
self.classes_ = unique_labels(y)
|
|
||||||
self.n_classes_ = self.classes_.shape[0]
|
|
||||||
# Default values
|
# Default values
|
||||||
self.features_ = [i for i in range(X.shape[1])]
|
self.features_ = [i for i in range(X.shape[1])]
|
||||||
for key, value in kwargs.items():
|
for key, value in kwargs.items():
|
||||||
@@ -68,15 +64,20 @@ class FImdlp(TransformerMixin, BaseEstimator):
|
|||||||
raise ValueError("Feature index out of range")
|
raise ValueError("Feature index out of range")
|
||||||
return X, y
|
return X, y
|
||||||
|
|
||||||
|
def _update_params(self, X, y):
|
||||||
|
# Store the classes seen during fit
|
||||||
|
self.classes_ = unique_labels(y)
|
||||||
|
self.n_classes_ = self.classes_.shape[0]
|
||||||
|
self.n_features_ = X.shape[1]
|
||||||
|
|
||||||
def fit(self, X, y, **kwargs):
|
def fit(self, X, y, **kwargs):
|
||||||
"""A reference implementation of a fitting function for a transformer.
|
"""A reference implementation of a fitting function for a transformer.
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
X : {array-like, sparse matrix}, shape (n_samples, n_features)
|
X : array, shape (n_samples, n_features)
|
||||||
The training input samples.
|
The training input samples.
|
||||||
y : None
|
y : array, shape (n_samples,)
|
||||||
There is no need of a target in a transformer, yet the pipeline API
|
the labels used to fit
|
||||||
requires this parameter.
|
|
||||||
features : list, default=[i for i in range(n_features)]
|
features : list, default=[i for i in range(n_features)]
|
||||||
The list of features to be discretized.
|
The list of features to be discretized.
|
||||||
Returns
|
Returns
|
||||||
@@ -84,10 +85,10 @@ class FImdlp(TransformerMixin, BaseEstimator):
|
|||||||
self : object
|
self : object
|
||||||
Returns self.
|
Returns self.
|
||||||
"""
|
"""
|
||||||
X, y = self._check_params_fit(
|
X, y = self._check_args(
|
||||||
X, y, expected_args=["features"], kwargs=kwargs
|
X, y, expected_args=["features"], kwargs=kwargs
|
||||||
)
|
)
|
||||||
self.n_features_ = X.shape[1]
|
self._update_params(X, y)
|
||||||
self.X_ = X
|
self.X_ = X
|
||||||
self.y_ = y
|
self.y_ = y
|
||||||
self.discretizer_ = [None] * self.n_features_
|
self.discretizer_ = [None] * self.n_features_
|
||||||
@@ -119,7 +120,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
|
|||||||
"""Discretize X values.
|
"""Discretize X values.
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
X : {array-like}, shape (n_samples, n_features)
|
X : array, shape (n_samples, n_features)
|
||||||
The input samples.
|
The input samples.
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@@ -146,6 +147,34 @@ class FImdlp(TransformerMixin, BaseEstimator):
|
|||||||
)
|
)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def join_transform(self, X, y, feature, **kwargs):
|
||||||
|
"""Join the selected feature with the labels and discretize the values
|
||||||
|
join - fit - transform
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
X : array, shape (n_samples, n_features)
|
||||||
|
The input samples.
|
||||||
|
y : array
|
||||||
|
the labels used to fit
|
||||||
|
feature : int
|
||||||
|
index of the feature to join with the labels
|
||||||
|
"""
|
||||||
|
X, y = self._check_args(
|
||||||
|
X, y, expected_args=["features"], kwargs=kwargs
|
||||||
|
)
|
||||||
|
if feature < 0 or feature >= X.shape[1]:
|
||||||
|
raise ValueError(
|
||||||
|
f"Feature {feature} not in range [0, {X.shape[1]})"
|
||||||
|
)
|
||||||
|
self.y_join_ = [
|
||||||
|
f"{str(item_y)}{str(item_x)}".encode()
|
||||||
|
for item_y, item_x in zip(y, X[:, feature])
|
||||||
|
]
|
||||||
|
yy = factorize(self.y_join_)
|
||||||
|
XX = np.delete(X, feature, axis=1)
|
||||||
|
return self.fit(XX, yy).transform(XX)
|
||||||
|
|
||||||
def get_cut_points(self):
|
def get_cut_points(self):
|
||||||
"""Get the cut points for each feature.
|
"""Get the cut points for each feature.
|
||||||
Returns
|
Returns
|
||||||
|
@@ -1,7 +1,8 @@
|
|||||||
import unittest
|
import unittest
|
||||||
import sklearn
|
import sklearn
|
||||||
from sklearn.datasets import load_iris
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from sklearn.datasets import load_iris
|
||||||
|
from ..cppfimdlp import factorize
|
||||||
from ..mdlp import FImdlp
|
from ..mdlp import FImdlp
|
||||||
from .. import version
|
from .. import version
|
||||||
from .._version import __version__
|
from .._version import __version__
|
||||||
@@ -159,3 +160,54 @@ class FImdlpTest(unittest.TestCase):
|
|||||||
with self.assertRaises(sklearn.exceptions.NotFittedError):
|
with self.assertRaises(sklearn.exceptions.NotFittedError):
|
||||||
clf = FImdlp(algorithm=1)
|
clf = FImdlp(algorithm=1)
|
||||||
clf.transform([[1, 2], [3, 4]])
|
clf.transform([[1, 2], [3, 4]])
|
||||||
|
|
||||||
|
def test_factorize(self):
|
||||||
|
source = [
|
||||||
|
b"f0",
|
||||||
|
b"f1",
|
||||||
|
b"f2",
|
||||||
|
b"f3",
|
||||||
|
b"f4",
|
||||||
|
b"f5",
|
||||||
|
b"f6",
|
||||||
|
b"f1",
|
||||||
|
b"f1",
|
||||||
|
b"f7",
|
||||||
|
b"f8",
|
||||||
|
]
|
||||||
|
expected = [0, 1, 2, 3, 4, 5, 6, 1, 1, 7, 8]
|
||||||
|
computed = factorize(source)
|
||||||
|
self.assertListEqual(expected, computed)
|
||||||
|
|
||||||
|
def test_join_transform(self):
|
||||||
|
y = ["f0", "f0", "f2", "f3", "f4"]
|
||||||
|
x = [
|
||||||
|
[0, 1, 2, 3, 4],
|
||||||
|
[0, 1, 2, 3, 4],
|
||||||
|
[1, 2, 3, 4, 5],
|
||||||
|
[2, 3, 4, 5, 6],
|
||||||
|
[3, 4, 5, 6, 7],
|
||||||
|
]
|
||||||
|
expected = [
|
||||||
|
[0, 0, 0, 0],
|
||||||
|
[0, 0, 0, 0],
|
||||||
|
[1, 1, 1, 1],
|
||||||
|
[2, 2, 2, 2],
|
||||||
|
[2, 2, 2, 2],
|
||||||
|
]
|
||||||
|
clf = FImdlp()
|
||||||
|
computed = clf.join_transform(x, y, 0)
|
||||||
|
for computed, expected in zip(computed, expected):
|
||||||
|
self.assertListEqual(expected, computed.tolist())
|
||||||
|
|
||||||
|
def test_join_transform_error(self):
|
||||||
|
y = ["f0", "f0", "f2", "f3", "f4"]
|
||||||
|
x = [
|
||||||
|
[0, 1, 2, 3, 4],
|
||||||
|
[0, 1, 2, 3, 4],
|
||||||
|
[1, 2, 3, 4, 5],
|
||||||
|
[2, 3, 4, 5, 6],
|
||||||
|
[3, 4, 5, 6, 7],
|
||||||
|
]
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
FImdlp().join_transform(x, y, 5)
|
||||||
|
Reference in New Issue
Block a user