25 Commits

Author SHA1 Message Date
f65efe3dfd Update the c++ sources with new version 2023-02-24 11:04:06 +01:00
e9d19d41da Add changed submodule 2023-02-22 11:56:39 +01:00
6450ccb9bd Add changed submodule 2023-02-22 11:34:27 +01:00
5d2f32bb0e Add needed header file to MANIFEST 2023-02-22 11:33:26 +01:00
Ricardo Montañana Gómez 8249e55b0c Merge pull request #6 from Doctorado-ML/joinfeatures
- Add a join_fit method that can update a fitted discretizer, making it possible to discretize a variable taking into account the label and a list of other features of the dataset; used in local discretization with Bayesian estimators (see the sketch after the commit list).
- Add a factorize method that mimics the pandas factorize method.
- Remove the algorithm hyperparameter as it is no longer needed.
- Add a get_states_feature method to obtain the list of states of any feature, based on the number of cut points computed while fitting the discretizer.
2023-02-22 10:44:43 +01:00
40871f128d Add 1.1.0 version of mdlp 2023-02-22 10:15:33 +01:00
718c9d0e63 make static methods factorize and test_sklrn_trans 2023-02-20 20:12:36 +01:00
e0b7cae9a0 Remove algorithm hyperparameter in discretizer 2023-02-20 18:26:51 +01:00
31d79a77fa Add get_states_feature method 2023-02-13 17:34:50 +01:00
2d495293bb Add range_features method 2023-02-13 16:15:50 +01:00
9899781640 Complete join_fit and remove MultiDiscretizer 2023-02-05 00:30:03 +01:00
f20496203e refactor Multidiscretizer to use one per column 2023-02-04 19:23:15 +01:00
cf09d92ccc add MultiDiscretizer 2023-02-04 17:45:36 +01:00
1186e4ad53 chore: 🔖 Upgrade version number to 0.9.3 2023-01-28 19:15:26 +01:00
7913f5151e Add version command to Makefile 2023-01-28 19:14:32 +01:00
050b923631 feat: Add factorize method to transformer 2023-01-28 10:35:07 +01:00
29fc88cecc test: Add scikit learn compatibility check_estimator test 2023-01-26 23:20:51 +01:00
16b31ec293 test: Complete join_transform test 2023-01-26 11:17:10 +01:00
ca7d158ac8 feat: ⚗️ Add join_transform method and cpp factorize 2023-01-26 10:47:27 +01:00
Ricardo Montañana Gómez 34cd54f77e feat: ♻️ Add Classic algorithm as number 2 to compare performance 2023-01-13 11:47:01 +01:00
70bf03155c Add scikit-learn as requirement 2022-12-23 14:07:36 +01:00
77b571af71 Update README to include link to pypi 2022-12-22 19:41:55 +01:00
ff7a91a7ec build: 🚀 2022-12-22 19:39:05 +01:00
621c19d00c style: 🎨 Remove unused variable in c++ module 2022-12-22 11:02:16 +01:00
Ricardo Montañana Gómez 790da5cc60 Merge pull request #5 from Doctorado-ML/fix_sdist
fix: 🐛 Fix a bug when pip install tries to build the package of F…
2022-12-22 10:29:46 +01:00
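
A hedged sketch of the join_fit workflow introduced by PR #6, mirroring the data used in test_join_fit further down; treat it as an illustration, not canonical usage:

```python
import numpy as np
from fimdlp.mdlp import FImdlp
from fimdlp.cppfimdlp import factorize

# Toy data taken from test_join_fit; labels must be bytes for factorize
y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"])
X = np.array([
    [0, 1, 2, 3, 4],
    [0, 1, 2, 3, 4],
    [1, 2, 3, 4, 5],
    [2, 3, 4, 5, 6],
    [3, 4, 5, 6, 7],
])
clf = FImdlp().fit(X, factorize(y))
# Re-discretize column 1, joining the label with columns 0 and 2
print(clf.join_fit(features=[0, 2], target=1, data=X))  # -> [0 0 1 2 2]
```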
15 changed files with 337 additions and 130 deletions

.gitignore vendored

@@ -136,3 +136,4 @@ cmake-build-debug/**
 **/x/*
 **/*.so
 **/CMakeFiles
+wheelhouse

MANIFEST.in

@@ -1 +1,4 @@
 include src/cppmdlp/CPPFImdlp.h
+include src/cppmdlp/typesFImdlp.h
+include src/cppmdlp/Metrics.h
+include src/fimdlp/Factorize.h

Makefile

@@ -37,6 +37,11 @@ install: ## Build extension
 audit: ## Audit pip
 	pip-audit
+version:
+	@echo "Current Python version .: $(shell python --version)"
+	@echo "Current FImdlp version .: $(shell python -c "from fimdlp import _version; print(_version.__version__)")"
+	@echo "Installed FImdlp version: $(shell pip show fimdlp | grep Version | cut -d' ' -f2)"
 help: ## Show help message
 	@IFS=$$'\n' ; \
 	help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \

README.md

@@ -3,7 +3,7 @@
 [![CodeQL](https://github.com/Doctorado-ML/FImdlp/actions/workflows/codeql.yml/badge.svg)](https://github.com/Doctorado-ML/FImdlp/actions/workflows/codeql.yml)
 [![Codacy Badge](https://app.codacy.com/project/badge/Grade/8b4d784fee13401588aa8c06532a2f6d)](https://www.codacy.com/gh/Doctorado-ML/FImdlp/dashboard?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/FImdlp&utm_campaign=Badge_Grade)
 [![codecov](https://codecov.io/gh/Doctorado-ML/FImdlp/branch/main/graph/badge.svg?token=W8I45B5Z3J)](https://codecov.io/gh/Doctorado-ML/FImdlp)
-[![pypy](https://img.shields.io/pypi/v/FImdlp?color=g)](https://img.shields.io/pypi/v/FImdlp?color=g)
+[![pypy](https://img.shields.io/pypi/v/FImdlp?color=g)](https://pypi.org/project/FImdlp)
 ![https://img.shields.io/badge/python-3.9%2B-blue](https://img.shields.io/badge/python-3.9%2B-brightgreen)
 Discretization algorithm based on the paper by Usama M. Fayyad and Keki B. Irani

k.py Normal file

@@ -0,0 +1,12 @@
from sklearn.datasets import load_wine
from fimdlp.mdlp import FImdlp
X, y = load_wine(return_X_y=True)
trans = FImdlp()
Xt = trans.join_transform(X, y, 12)
print("X shape = ", X.shape)
print("Xt.shape=", Xt.shape)
print("Xt ", Xt[:10])
print("trans.X_ shape = ", trans.X_.shape)
print("trans.y_ ", trans.y_[:10])
print("y_join ", trans.y_join_[:10])

pyproject.toml

@@ -18,7 +18,7 @@ authors = [
     { name = "Ricardo Montañana", email = "ricardo.montanana@alu.uclm.es" },
 ]
 dynamic = ['version']
-dependencies = ["numpy", "joblib"]
+dependencies = ["numpy", "joblib", "scikit-learn"]
 requires-python = ">=3.9"
 classifiers = [
     "Development Status :: 3 - Alpha",

setup.py

@@ -14,10 +14,13 @@ setup(
             "src/fimdlp/cfimdlp.pyx",
             "src/cppmdlp/CPPFImdlp.cpp",
             "src/cppmdlp/Metrics.cpp",
+            "src/fimdlp/Factorize.cpp",
         ],
         language="c++",
         include_dirs=["fimdlp"],
-        extra_compile_args=["-std=c++2a"],
+        extra_compile_args=[
+            "-std=c++11",
+        ],
     ),
 ]
 )

src/fimdlp/Factorize.cpp Normal file

@@ -0,0 +1,18 @@
#include "Factorize.h"
namespace utils {
vector<int> cppFactorize(const vector<string>& labels_t)
{
vector<int> yy;
yy.reserve(labels_t.size());
map<string, int> labelMap;
int i = 0;
for (string label : labels_t) {
if (labelMap.find(label) == labelMap.end()) {
labelMap[label] = i++;
}
yy.push_back(labelMap[label]);
}
return yy;
}
}
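
For reference, a minimal pure-Python sketch of what utils::cppFactorize computes: each distinct label gets the next integer code in order of first appearance. The name py_factorize is illustrative only, not part of the package.

```python
def py_factorize(labels):
    # Same first-appearance encoding as utils::cppFactorize
    codes = {}
    return [codes.setdefault(label, len(codes)) for label in labels]

print(py_factorize([b"f0", b"f1", b"f1", b"f2"]))  # -> [0, 1, 1, 2]
```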

src/fimdlp/Factorize.h Normal file

@@ -0,0 +1,10 @@
#ifndef FACTORIZE_H
#define FACTORIZE_H
#include <vector>
#include <map>
#include <string>
namespace utils {
using namespace std;
vector<int> cppFactorize(const vector<string>&);
}
#endif

src/fimdlp/__init__.py

@@ -1,8 +1,4 @@
 from ._version import __version__
-def version():
-    return __version__
 __all__ = ["FImdlp", "__version__"]

src/fimdlp/_version.py

@@ -1 +1 @@
-__version__ = "0.9.2"
+__version__ = "0.9.3"

src/fimdlp/cfimdlp.pyx

@@ -6,16 +6,15 @@ from libcpp.string cimport string
 cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp":
     ctypedef float precision_t
     cdef cppclass CPPFImdlp:
-        CPPFImdlp(int) except +
+        CPPFImdlp() except +
         CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
         vector[precision_t] getCutPoints()
         string version()
 cdef class CFImdlp:
     cdef CPPFImdlp *thisptr
-    def __cinit__(self, algorithm):
-        self.thisptr = new CPPFImdlp(algorithm)
+    def __cinit__(self):
+        self.thisptr = new CPPFImdlp()
     def __dealloc__(self):
         del self.thisptr
     def fit(self, X, y):
@@ -25,4 +24,10 @@ cdef class CFImdlp:
         return self.thisptr.getCutPoints()
     def get_version(self):
         return self.thisptr.version()
+    def __reduce__(self):
+        return (CFImdlp, ())
+cdef extern from "Factorize.h" namespace "utils":
+    vector[int] cppFactorize(vector[string] &input_vector)
+def factorize(input_vector):
+    return cppFactorize(input_vector)

src/fimdlp/mdlp.py

@@ -1,24 +1,22 @@
 import numpy as np
-from .cppfimdlp import CFImdlp
+from .cppfimdlp import CFImdlp, factorize
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils.multiclass import unique_labels
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 from joblib import Parallel, delayed
-# from ._version import __version__
+from ._version import __version__
 class FImdlp(TransformerMixin, BaseEstimator):
-    def __init__(self, algorithm=0, n_jobs=-1):
-        self.algorithm = algorithm
+    def __init__(self, n_jobs=-1):
         self.n_jobs = n_jobs
     """Fayyad - Irani MDLP discretization algorithm based implementation.
     Parameters
     ----------
-    algorithm : int, default=0
-        The type of algorithm to use computing the cut points.
-        0 - Definitive implementation
-        1 - Alternative proposal
     n_jobs : int, default=-1
         The number of jobs to run in parallel. :meth:`fit` and
         :meth:`transform`, are parallelized over the features. ``-1`` means
@@ -26,27 +24,26 @@ class FImdlp(TransformerMixin, BaseEstimator):
     Attributes
     ----------
-    n_features_ : int
+    n_features_in_ : int
         The number of features of the data passed to :meth:`fit`.
     discretizer_ : list
         The list of discretizers, one for each feature.
     cut_points_ : list
         The list of cut points for each feature.
-    X_ : array
-        the samples used to fit, shape (n_samples, n_features)
-    y_ : array
-        the labels used to fit, shape (n_samples,)
+    X_ : array, shape (n_samples, n_features)
+        the samples used to fit
+    y_ : array, shape (n_samples,)
+        the labels used to fit
     features_ : list
         the list of features to be discretized
     """
-    def _check_params_fit(self, X, y, expected_args, kwargs):
-        """Check the common parameters passed to fit"""
+    def _more_tags(self):
+        return {"preserves_dtype": [np.int32], "requires_y": True}
+    def _check_args(self, X, y, expected_args, kwargs):
         # Check that X and y have correct shape
         X, y = check_X_y(X, y)
-        # Store the classes seen during fit
-        self.classes_ = unique_labels(y)
-        self.n_classes_ = self.classes_.shape[0]
         # Default values
         self.features_ = [i for i in range(X.shape[1])]
         for key, value in kwargs.items():
@@ -67,15 +64,24 @@ class FImdlp(TransformerMixin, BaseEstimator):
                 raise ValueError("Feature index out of range")
         return X, y
+    def _update_params(self, X, y):
+        # Store the classes seen during fit
+        self.classes_ = unique_labels(y)
+        self.n_classes_ = self.classes_.shape[0]
+        self.n_features_in_ = X.shape[1]
+    @staticmethod
+    def get_version():
+        return f"{__version__}({CFImdlp().get_version().decode()})"
     def fit(self, X, y, **kwargs):
         """A reference implementation of a fitting function for a transformer.
         Parameters
         ----------
-        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+        X : array, shape (n_samples, n_features)
             The training input samples.
-        y : None
-            There is no need of a target in a transformer, yet the pipeline API
-            requires this parameter.
+        y : array, shape (n_samples,)
+            the labels used to fit
         features : list, default=[i for i in range(n_features)]
             The list of features to be discretized.
         Returns
@@ -83,23 +89,23 @@ class FImdlp(TransformerMixin, BaseEstimator):
         self : object
             Returns self.
         """
-        X, y = self._check_params_fit(
+        X, y = self._check_args(
             X, y, expected_args=["features"], kwargs=kwargs
         )
-        self.n_features_ = X.shape[1]
+        self._update_params(X, y)
         self.X_ = X
         self.y_ = y
-        self.discretizer_ = [None] * self.n_features_
-        self.cut_points_ = [None] * self.n_features_
+        self.discretizer_ = [None] * self.n_features_in_
+        self.cut_points_ = [None] * self.n_features_in_
         Parallel(n_jobs=self.n_jobs, prefer="threads")(
             delayed(self._fit_discretizer)(feature)
-            for feature in range(self.n_features_)
+            for feature in range(self.n_features_in_)
         )
         return self
     def _fit_discretizer(self, feature):
         if feature in self.features_:
-            self.discretizer_[feature] = CFImdlp(algorithm=self.algorithm)
+            self.discretizer_[feature] = CFImdlp()
             self.discretizer_[feature].fit(self.X_[:, feature], self.y_)
             self.cut_points_[feature] = self.discretizer_[
                 feature
@@ -118,7 +124,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
         """Discretize X values.
         Parameters
         ----------
-        X : {array-like}, shape (n_samples, n_features)
+        X : array, shape (n_samples, n_features)
             The input samples.
         Returns
         -------
@@ -126,25 +132,41 @@ class FImdlp(TransformerMixin, BaseEstimator):
             The array containing the discretized values of ``X``.
         """
         # Check is fit had been called
-        check_is_fitted(self, "n_features_")
+        check_is_fitted(self, "n_features_in_")
         # Input validation
         X = check_array(X)
         # Check that the input is of the same shape as the one passed
         # during fit.
-        if X.shape[1] != self.n_features_:
+        if X.shape[1] != self.n_features_in_:
             raise ValueError(
                 "Shape of input is different from what was seen in `fit`"
             )
-        if len(self.features_) == self.n_features_:
+        if len(self.features_) == self.n_features_in_:
             result = np.zeros_like(X, dtype=np.int32) - 1
         else:
             result = np.zeros_like(X) - 1
         Parallel(n_jobs=self.n_jobs, prefer="threads")(
             delayed(self._discretize_feature)(feature, X[:, feature], result)
-            for feature in range(self.n_features_)
+            for feature in range(self.n_features_in_)
         )
         return result
+    @staticmethod
+    def factorize(yy):
+        """Factorize the input labels
+        Parameters
+        ----------
+        yy : array, shape (n_samples,)
+            Labels to be factorized, MUST be bytes, i.e. b"0", b"1", ...
+        Returns
+        -------
+        array, shape (n_samples,)
+            Factorized labels
+        """
+        return factorize(yy)
     def get_cut_points(self):
         """Get the cut points for each feature.
         Returns
@@ -153,6 +175,70 @@ class FImdlp(TransformerMixin, BaseEstimator):
             The list of cut points for each feature.
         """
         result = []
-        for feature in range(self.n_features_):
+        for feature in range(self.n_features_in_):
             result.append(self.cut_points_[feature])
         return result
+    def get_states_feature(self, feature):
+        """Return the states a feature can take
+        Parameters
+        ----------
+        feature : int
+            feature to get the states
+        Returns
+        -------
+        list
+            states of the feature
+        """
+        if feature in self.features_:
+            return list(range(len(self.cut_points_[feature]) + 1))
+        return None
+    def join_fit(self, features, target, data):
+        """Join the selected features with the labels and fit the discretizer
+        of the target variable (join - fit - transform)
+        Parameters
+        ----------
+        features : list
+            indices of the features to join with the labels
+        target : int
+            index of the target variable to discretize
+        data : array, shape (n_samples, n_features)
+            dataset that contains the features to join
+        Returns
+        -------
+        result : np.array
+            The target variable newly discretized
+        """
+        check_is_fitted(self, "n_features_in_")
+        if len(features) < 1 or len(features) > self.n_features_in_:
+            raise ValueError(
+                "Number of features must be in range [1, "
+                f"{self.n_features_in_}]"
+            )
+        for feature in features:
+            if feature < 0 or feature >= self.n_features_in_:
+                raise ValueError(
+                    f"Feature {feature} not in range [0, "
+                    f"{self.n_features_in_})"
+                )
+        if target < 0 or target >= self.n_features_in_:
+            raise ValueError(
+                f"Target {target} not in range [0, {self.n_features_in_})"
+            )
+        if target in features:
+            raise ValueError("Target cannot be in features to join")
+        y_join = [
+            f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode()
+            for item_y, items_x in zip(self.y_, data[:, features])
+        ]
+        self.y_join_ = y_join
+        self.discretizer_[target].fit(self.X_[:, target], factorize(y_join))
+        self.cut_points_[target] = self.discretizer_[target].get_cut_points()
+        # return the discretized target variable with the new cut points
+        return np.searchsorted(self.cut_points_[target], self.X_[:, target])
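
A short, hedged sketch of the new helpers on iris; the expected values below match the test suite that follows:

```python
from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp

X, y = load_iris(return_X_y=True)
clf = FImdlp().fit(X, y)
# Feature 0 gets two cut points on iris, hence three discrete states
print(clf.get_states_feature(0))             # -> [0, 1, 2]
# The staticmethod wraps the C++ factorize; labels must be bytes
print(FImdlp.factorize([b"b", b"a", b"b"]))  # -> [0, 1, 0]
```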

src/fimdlp/tests/FImdlp_test.py

@@ -1,67 +1,46 @@
 import unittest
 import sklearn
-from sklearn.datasets import load_iris
 import numpy as np
+from sklearn.datasets import load_iris
+from sklearn.utils.estimator_checks import check_estimator
+from ..cppfimdlp import CFImdlp, factorize
 from ..mdlp import FImdlp
-from .. import version
-from .._version import __version__
+from .. import __version__
+# from .._version import __version__
 class FImdlpTest(unittest.TestCase):
     def test_version(self):
-        self.assertEqual(version(), __version__)
+        clf = FImdlp()
+        self.assertEqual(
+            clf.get_version(),
+            f"{__version__}({CFImdlp().get_version().decode()})",
+        )
     def test_init(self):
         clf = FImdlp()
         self.assertEqual(-1, clf.n_jobs)
-        self.assertEqual(0, clf.algorithm)
-        clf = FImdlp(algorithm=1, n_jobs=7)
-        self.assertEqual(1, clf.algorithm)
+        clf = FImdlp(n_jobs=7)
         self.assertEqual(7, clf.n_jobs)
     def test_fit_definitive(self):
-        clf = FImdlp(algorithm=0)
-        clf.fit([[1, 2], [3, 4]], [1, 2])
-        self.assertEqual(clf.n_features_, 2)
-        self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
-        self.assertListEqual(clf.y_.tolist(), [1, 2])
-        self.assertListEqual([[2.0], [3.0]], clf.get_cut_points())
+        clf = FImdlp()
         X, y = load_iris(return_X_y=True)
         clf.fit(X, y)
-        self.assertEqual(clf.n_features_, 4)
+        self.assertEqual(clf.n_features_in_, 4)
         self.assertTrue(np.array_equal(X, clf.X_))
         self.assertTrue(np.array_equal(y, clf.y_))
         expected = [
-            [5.449999809265137, 6.25],
-            [2.8499999046325684, 3.0, 3.049999952316284, 3.3499999046325684],
-            [2.450000047683716, 4.75, 5.050000190734863],
-            [0.800000011920929, 1.4500000476837158, 1.75],
-        ]
-        self.assertListEqual(expected, clf.get_cut_points())
-        self.assertListEqual([0, 1, 2, 3], clf.features_)
-        clf.fit(X, y, features=[0, 2, 3])
-        self.assertListEqual([0, 2, 3], clf.features_)
-    def test_fit_alternative(self):
-        clf = FImdlp(algorithm=1)
-        clf.fit([[1, 2], [3, 4]], [1, 2])
-        self.assertEqual(clf.n_features_, 2)
-        self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
-        self.assertListEqual(clf.y_.tolist(), [1, 2])
-        self.assertListEqual([[2], [3]], clf.get_cut_points())
-        X, y = load_iris(return_X_y=True)
-        clf.fit(X, y)
-        self.assertEqual(clf.n_features_, 4)
-        self.assertTrue(np.array_equal(X, clf.X_))
-        self.assertTrue(np.array_equal(y, clf.y_))
-        expected = [
             [5.449999809265137, 5.75],
-            [2.8499999046325684, 3.3499999046325684],
-            [2.450000047683716, 4.75],
-            [0.800000011920929, 1.75],
+            [2.75, 2.8499999046325684, 2.95, 3.05, 3.3499999046325684],
+            [2.45, 4.75, 5.050000190734863],
+            [0.8, 1.75],
         ]
-        self.assertListEqual(expected, clf.get_cut_points())
+        computed = clf.get_cut_points()
+        for item_computed, item_expected in zip(computed, expected):
+            for x_, y_ in zip(item_computed, item_expected):
+                self.assertAlmostEqual(x_, y_)
         self.assertListEqual([0, 1, 2, 3], clf.features_)
         clf.fit(X, y, features=[0, 2, 3])
         self.assertListEqual([0, 2, 3], clf.features_)
@@ -82,8 +61,12 @@ class FImdlpTest(unittest.TestCase):
             clf.fit([[1, 2], [3, 4]], [1, 2], features=[0, 2])
     def test_fit_features(self):
-        clf = FImdlp()
+        clf = FImdlp(n_jobs=-1)
+        # Two samples don't have enough information to split
         clf.fit([[1, -2], [3, 4]], [1, 2], features=[0])
+        self.assertListEqual(clf.get_cut_points(), [[], []])
+        clf.fit([[1, -2], [3, 4], [5, 6]], [1, 2, 2], features=[0])
+        self.assertListEqual(clf.get_cut_points(), [[2], []])
         res = clf.transform([[1, -2], [3, 4]])
         self.assertListEqual(res.tolist(), [[0, -2], [1, 4]])
         X, y = load_iris(return_X_y=True)
@@ -98,15 +81,15 @@ class FImdlpTest(unittest.TestCase):
         )
         self.assertEqual(X_computed.dtype, np.float64)
-    def test_transform_definitive(self):
-        clf = FImdlp(algorithm=0)
-        clf.fit([[1, 2], [3, 4]], [1, 2])
+    def test_transform(self):
+        clf = FImdlp()
+        clf.fit([[1, 2], [3, 4], [5, 6]], [1, 2, 2])
         self.assertEqual(
             clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [1, 1]]
         )
         X, y = load_iris(return_X_y=True)
         clf.fit(X, y)
-        self.assertEqual(clf.n_features_, 4)
+        self.assertEqual(clf.n_features_in_, 4)
         self.assertTrue(np.array_equal(X, clf.X_))
         self.assertTrue(np.array_equal(y, clf.y_))
         X_transformed = clf.transform(X)
@@ -116,46 +99,131 @@ class FImdlpTest(unittest.TestCase):
         self.assertEqual(X_transformed.dtype, np.int32)
         expected = [
             [1, 0, 1, 1],
-            [1, 1, 1, 1],
-            [1, 0, 1, 1],
-            [0, 0, 1, 1],
-            [1, 0, 1, 1],
-            [1, 1, 1, 1],
-            [1, 1, 1, 1],
-        ]
-        self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
-        with self.assertRaises(ValueError):
-            clf.transform([[1, 2, 3], [4, 5, 6]])
-        with self.assertRaises(sklearn.exceptions.NotFittedError):
-            clf = FImdlp(algorithm=0)
-            clf.transform([[1, 2], [3, 4]])
-    def test_transform_alternative(self):
-        clf = FImdlp(algorithm=1)
-        clf.fit([[1, 2], [3, 4]], [1, 2])
-        self.assertEqual(
-            clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [1, 1]]
-        )
-        X, y = load_iris(return_X_y=True)
-        clf.fit(X, y)
-        self.assertEqual(clf.n_features_, 4)
-        self.assertTrue(np.array_equal(X, clf.X_))
-        self.assertTrue(np.array_equal(y, clf.y_))
-        self.assertListEqual(
-            clf.transform(X).tolist(), clf.fit(X, y).transform(X).tolist()
-        )
-        expected = [
-            [1, 0, 1, 1],
-            [2, 1, 1, 1],
+            [2, 3, 1, 1],
             [2, 0, 1, 1],
             [0, 0, 1, 1],
             [1, 0, 1, 1],
-            [1, 1, 1, 1],
-            [1, 1, 1, 1],
+            [1, 3, 1, 1],
+            [1, 2, 1, 1],
         ]
         self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
         with self.assertRaises(ValueError):
             clf.transform([[1, 2, 3], [4, 5, 6]])
         with self.assertRaises(sklearn.exceptions.NotFittedError):
-            clf = FImdlp(algorithm=1)
+            clf = FImdlp()
             clf.transform([[1, 2], [3, 4]])
+    def test_cppfactorize(self):
+        source = [
+            b"f0",
+            b"f1",
+            b"f2",
+            b"f3",
+            b"f4",
+            b"f5",
+            b"f6",
+            b"f1",
+            b"f1",
+            b"f7",
+            b"f8",
+        ]
+        expected = [0, 1, 2, 3, 4, 5, 6, 1, 1, 7, 8]
+        computed = factorize(source)
+        self.assertListEqual(expected, computed)
+    def test_join_fit(self):
+        y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"])
+        x = np.array(
+            [
+                [0, 1, 2, 3, 4],
+                [0, 1, 2, 3, 4],
+                [1, 2, 3, 4, 5],
+                [2, 3, 4, 5, 6],
+                [3, 4, 5, 6, 7],
+            ]
+        )
+        expected = [0, 0, 1, 2, 2]
+        clf = FImdlp()
+        clf.fit(x, factorize(y))
+        computed = clf.join_fit([0, 2], 1, x)
+        self.assertListEqual(computed.tolist(), expected)
+        expected_y = [b"002", b"002", b"113", b"224", b"335"]
+        self.assertListEqual(expected_y, clf.y_join_)
+    def test_join_fit_error(self):
+        y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"])
+        x = np.array(
+            [
+                [0, 1, 2, 3, 4],
+                [0, 1, 2, 3, 4],
+                [1, 2, 3, 4, 5],
+                [2, 3, 4, 5, 6],
+                [3, 4, 5, 6, 7],
+            ]
+        )
+        clf = FImdlp()
+        clf.fit(x, factorize(y))
+        with self.assertRaises(ValueError) as exception:
+            clf.join_fit([], 1, x)
+        self.assertEqual(
+            str(exception.exception),
+            "Number of features must be in range [1, 5]",
+        )
+        with self.assertRaises(ValueError) as exception:
+            FImdlp().join_fit([0, 4], 1, x)
+        self.assertTrue(
+            str(exception.exception).startswith(
+                "This FImdlp instance is not fitted yet."
+            )
+        )
+        with self.assertRaises(ValueError) as exception:
+            clf.join_fit([0, 5], 1, x)
+        self.assertEqual(
+            str(exception.exception),
+            "Feature 5 not in range [0, 5)",
+        )
+        with self.assertRaises(ValueError) as exception:
+            clf.join_fit([0, 2], 5, x)
+        self.assertEqual(
+            str(exception.exception),
+            "Target 5 not in range [0, 5)",
+        )
+        with self.assertRaises(ValueError) as exception:
+            clf.join_fit([0, 2], 2, x)
+        self.assertEqual(
+            str(exception.exception),
+            "Target cannot be in features to join",
+        )
+    def test_factorize(self):
+        y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"])
+        clf = FImdlp()
+        computed = clf.factorize(y)
+        self.assertListEqual([0, 0, 1, 2, 3], computed)
+        y = [b"f4", b"f0", b"f0", b"f2", b"f3"]
+        clf = FImdlp()
+        computed = clf.factorize(y)
+        self.assertListEqual([0, 1, 1, 2, 3], computed)
+    @staticmethod
+    def test_sklearn_transformer():
+        for check, test in check_estimator(FImdlp(), generate_only=True):
+            test(check)
+    def test_states_feature(self):
+        clf = FImdlp()
+        X, y = load_iris(return_X_y=True)
+        clf.fit(X, y)
+        expected = []
+        for i in [3, 6, 4, 3]:
+            expected.append(list(range(i)))
+        for feature in range(X.shape[1]):
+            self.assertListEqual(
+                expected[feature], clf.get_states_feature(feature)
+            )
+    def test_states_no_feature(self):
+        clf = FImdlp()
+        X, y = load_iris(return_X_y=True)
+        clf.fit(X, y)
+        self.assertIsNone(clf.get_states_feature(4))