add join_fit target info

2023-04-08 12:22:03 +02:00
parent e44bca0420
commit 0768d68a36
3 changed files with 21 additions and 16 deletions

k.py

@@ -1,12 +0,0 @@
from sklearn.datasets import load_wine
from fimdlp.mdlp import FImdlp
X, y = load_wine(return_X_y=True)
trans = FImdlp()
Xt = trans.join_transform(X, y, 12)
print("X shape = ", X.shape)
print("Xt.shape=", Xt.shape)
print("Xt ", Xt[:10])
print("trans.X_ shape = ", trans.X_.shape)
print("trans.y_ ", trans.y_[:10])
print("y_join ", trans.y_join_[:10])


@@ -6,8 +6,6 @@ from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from joblib import Parallel, delayed
from ._version import __version__
# from ._version import __version__
class FImdlp(TransformerMixin, BaseEstimator):
def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6, max_cuts=0):
@@ -24,6 +22,12 @@ class FImdlp(TransformerMixin, BaseEstimator):
The number of jobs to run in parallel. :meth:`fit` and
:meth:`transform` are parallelized over the features. ``-1`` means
using all cores available.
min_length: int, default=3
The minimum length an interval must have to be considered for discretization.
max_depth: int, default=1e6
The maximum depth of the discretization process.
max_cuts: float, default=0
The maximum number of cut points to be computed for each feature.
Attributes
----------
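For orientation, a minimal sketch of how these constructor parameters are used. It assumes the import path shown in the deleted k.py above and relies on the stock fit_transform provided by sklearn's TransformerMixin; the argument values are just the documented defaults.
from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp
X, y = load_iris(return_X_y=True)
# n_jobs=1 keeps the per-feature fit single-threaded; the rest are the documented defaults.
trans = FImdlp(n_jobs=1, min_length=3, max_depth=1e6, max_cuts=0)
Xt = trans.fit_transform(X, y)
print(X.shape, Xt.shape)  # Xt holds the interval-coded features, same shape as X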
@@ -109,6 +113,8 @@ class FImdlp(TransformerMixin, BaseEstimator):
delayed(self._fit_discretizer)(feature)
for feature in range(self.n_features_in_)
)
# target of every feature. Start with -1 => y (see join_fit)
self.target_ = [-1] * self.n_features_in_
return self
def _fit_discretizer(self, feature):
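With the new target_ bookkeeping, a plain fit() leaves every entry at -1, meaning each feature was discretized against the original y. A small sketch of what that looks like on iris, mirroring the test added below:
from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp
X, y = load_iris(return_X_y=True)
clf = FImdlp().fit(X, y)
# Every feature starts out discretized against y itself, encoded as -1.
print(clf.target_)  # [-1, -1, -1, -1] for the four iris features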
@@ -244,11 +250,12 @@ class FImdlp(TransformerMixin, BaseEstimator):
f"Target {target} not in range [0, {self.n_features_in_})"
)
if target in features:
raise ValueError("Target cannot in features to join")
raise ValueError("Target cannot be in features to join")
y_join = [
f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode()
for item_y, items_x in zip(self.y_, data[:, features])
]
self.target_[target] = features + [-1]
self.y_join_ = y_join
self.discretizer_[target].fit(self.X_[:, target], factorize(y_join))
self.cut_points_[target] = self.discretizer_[target].get_cut_points()
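join_fit() re-fits the discretizer of the target feature against a synthetic label built from y joined with the chosen features, and records features + [-1] in target_. A hedged sketch of the call and the resulting bookkeeping, matching test_join_fit_info below:
from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp
X, y = load_iris(return_X_y=True)
clf = FImdlp().fit(X, y)
# Re-discretize feature 1 against y joined with the values of features 0 and 2.
clf.join_fit([0, 2], 1, X)
print(clf.target_[1])  # [0, 2, -1]: the joined features, then -1 for y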


@@ -196,7 +196,7 @@ class FImdlpTest(unittest.TestCase):
clf.join_fit([0, 2], 2, x)
self.assertEqual(
str(exception.exception),
"Target cannot in features to join",
"Target cannot be in features to join",
)
def test_factorize(self):
@@ -209,6 +209,16 @@ class FImdlpTest(unittest.TestCase):
computed = clf.factorize(y)
self.assertListEqual([0, 1, 1, 2, 3], computed)
def test_join_fit_info(self):
clf = FImdlp()
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
clf.join_fit([0, 2], 1, X)
clf.join_fit([0, 3], 2, X)
clf.join_fit([1, 2], 3, X)
expected = [-1, [0, 2, -1], [0, 3, -1], [1, 2, -1]]
self.assertListEqual(expected, clf.target_)
@staticmethod
def test_sklearn_transformer():
for check, test in check_estimator(FImdlp(), generate_only=True):