Build sklearn transformer

This commit is contained in:
2022-11-27 11:35:21 +01:00
parent 6f4c650af9
commit f843c938fc
12 changed files with 152 additions and 44 deletions

2
.gitignore vendored
View File

@@ -4,7 +4,7 @@ __pycache__/
*$py.class
# C extensions
*.so
build/**/*.so
# Distribution / packaging
.Python

View File

@@ -1 +1 @@
include fimdlp/FImdlp.h
include fimdlp/CPPFImdlp.h

View File

@@ -16,9 +16,13 @@ push: ## Push code with tags
build: ## Build package
rm -fr dist/*
rm -fr build/*
#python setup.py build_ext
python -m build
buildext: ## Build extension
rm -fr dist/*
rm -fr build/*
python setup.py build_ext
audit: ## Audit pip
pip-audit

View File

@@ -1,13 +1,13 @@
#include "FImdlp.h"
namespace FImdlp
#include "CPPFImdlp.h"
namespace CPPFImdlp
{
FImdlp::FImdlp()
CPPFImdlp::CPPFImdlp()
{
}
FImdlp::~FImdlp()
CPPFImdlp::~CPPFImdlp()
{
}
std::vector<float> FImdlp::cutPoints(std::vector<int> &X, std::vector<int> &y)
std::vector<float> CPPFImdlp::cutPoints(std::vector<int> &X, std::vector<int> &y)
{
std::vector<float> cutPts;
int i, ant = X.at(0);

View File

@@ -1,14 +1,14 @@
#ifndef FIMDLP_H
#define FIMDLP_H
#ifndef CPPFIMDLP_H
#define CPPFIMDLP_H
#include <vector>
#include <Python.h>
namespace FImdlp
namespace CPPFImdlp
{
class FImdlp
class CPPFImdlp
{
public:
FImdlp();
~FImdlp();
CPPFImdlp();
~CPPFImdlp();
std::vector<float> cutPoints(std::vector<int> &, std::vector<int> &);
};
}

View File

@@ -1 +1,3 @@
from ._version import __version__
from ._version import __version__
all = ["FImdlp", "__version__"]

View File

@@ -2,15 +2,15 @@
# cython: language_level = 3
from libcpp.vector cimport vector
cdef extern from "FImdlp.h" namespace "FImdlp":
cdef cppclass FImdlp:
FImdlp() except +
cdef extern from "CPPFImdlp.h" namespace "CPPFImdlp":
cdef cppclass CPPFImdlp:
CPPFImdlp() except +
vector[float] cutPoints(vector[int]&, vector[int]&)
cdef class CFImdlp:
cdef FImdlp *thisptr
cdef CPPFImdlp *thisptr
def __cinit__(self):
self.thisptr = new FImdlp()
self.thisptr = new CPPFImdlp()
def __dealloc__(self):
del self.thisptr
def cut_points(self, X, y):

Binary file not shown.

103
fimdlp/mdlp.py Normal file
View File

@@ -0,0 +1,103 @@
import numpy as np
from .cppfimdlp import CFImdlp
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
class FImdlp(TransformerMixin, BaseEstimator):
    """Fayyad - Irani MDLP discretization algorithm.

    The estimator takes no constructor parameters. Optional metadata
    (``class_name`` and ``features``) is supplied as keyword arguments
    to :meth:`fit`.

    Attributes
    ----------
    n_features_ : int
        The number of features of the data passed to :meth:`fit`.
    classes_ : ndarray
        The distinct labels found in ``y`` during :meth:`fit`.
    n_classes_ : int
        The number of distinct labels in ``classes_``.
    class_name_ : str
        Name of the target variable; ``"class"`` unless overridden via
        the ``class_name`` keyword of :meth:`fit`.
    features_ : list
        One name per column of ``X``; ``feature_0 .. feature_{n-1}``
        unless overridden via the ``features`` keyword of :meth:`fit`.
    X_, y_ : ndarray
        The validated training data stored by :meth:`fit`.
    discretizer_ : CFImdlp
        The compiled discretizer instance created by :meth:`fit`.
    """

    def __init__(self):
        pass

    def _check_params_fit(self, X, y, expected_args, kwargs):
        """Validate ``X``/``y`` and store the optional fit arguments.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The training input samples.
        y : array-like, shape (n_samples,)
            The target values.
        expected_args : list of str
            Names of the keyword arguments that :meth:`fit` accepts.
        kwargs : dict
            Keyword arguments received by :meth:`fit`. Each accepted key
            ``k`` is stored on the estimator as attribute ``k_``.

        Returns
        -------
        X, y : ndarray
            The arrays as returned by ``check_X_y``.

        Raises
        ------
        ValueError
            If ``kwargs`` holds a key not listed in ``expected_args`` or
            the number of feature names differs from the columns of X.
        """
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        self.n_classes_ = self.classes_.shape[0]
        # Default values, possibly overridden by the keyword arguments
        self.class_name_ = "class"
        self.features_ = [f"feature_{i}" for i in range(X.shape[1])]
        for key, value in kwargs.items():
            if key not in expected_args:
                raise ValueError(f"Unexpected argument: {key}")
            setattr(self, f"{key}_", value)
        if len(self.features_) != X.shape[1]:
            raise ValueError(
                "Number of features does not match the number of columns in X"
            )
        return X, y

    def fit(self, X, y, **kwargs):
        """Store the training data and create the discretizer.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The training input samples.
        y : array-like, shape (n_samples,)
            The target values. They are required: they are validated
            together with ``X`` and later handed to the discretizer.
        **kwargs
            Optional ``class_name`` and ``features`` (one name per
            column of ``X``). Any other key raises ``ValueError``.

        Returns
        -------
        self : object
            Returns self.
        """
        X, y = self._check_params_fit(
            X, y, expected_args=["class_name", "features"], kwargs=kwargs
        )
        self.n_features_ = X.shape[1]
        self.X_ = X
        self.y_ = y
        self.discretizer_ = CFImdlp()
        return self

    def transform(self, X):
        """Compute and print the cut points of every feature of X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input samples. They must be the same values that were
            passed to :meth:`fit`.

        Returns
        -------
        X : ndarray, shape (n_samples, n_features)
            The validated input. The values are returned as received;
            the cut points are only printed.

        Raises
        ------
        ValueError
            If the number of columns differs from the one seen in
            :meth:`fit`, or if ``X`` is not equal to the training data.
        """
        # Check that fit had been called
        check_is_fitted(self, "n_features_")
        # Input validation
        X = check_array(X)
        # The shape is verified first so that a mismatch is reported
        # with its own message instead of failing inside the
        # element-wise comparison below.
        if X.shape[1] != self.n_features_:
            raise ValueError(
                "Shape of input is different from what was seen in `fit`"
            )
        # array_equal also copes with a different number of rows.
        if not np.array_equal(X, self.X_):
            raise ValueError(
                "X values are not the same as the ones used to fit the model."
            )
        print("Cut points for each feature in Iris dataset:")
        for i, name in enumerate(self.features_):
            # NOTE(review): the column is sorted on its own while y_
            # keeps its original order, so X/y pairs are no longer
            # aligned -- confirm this is what cut_points expects.
            data = np.sort(X[:, i])
            Xcutpoints = self.discretizer_.cut_points(data, self.y_)
            print(f"{name:20s}: {Xcutpoints}")
        return X

View File

@@ -37,3 +37,21 @@ classifiers = [
[project.urls]
Home = "https://github.com/doctorado-ml/FImdlp"
[tool.black]
line-length = 79
target_version = ['py38', 'py39', 'py310']
include = '\.pyi?$'
exclude = '''
/(
\.git
| \.hg
| \.mypy_cache
| \.tox
| \.venv
| _build
| buck-out
| build
| dist
)/
'''

View File

@@ -1,14 +1,9 @@
import numpy as np
from sklearn.datasets import load_iris
from fimdlp import CFImdlp
from fimdlp.mdlp import FImdlp
data = load_iris()
X = data.data
y = data.target
features = data.feature_names
test = CFImdlp()
print("Cut points for each feature in Iris dataset:")
for i in range(0, X.shape[1]):
data = np.sort(X[:, i])
Xcutpoints = test.cut_points(data, y)
print(f"{features[i]:20s}: {Xcutpoints}")
test = FImdlp()
Xcutpoints = test.fit(X, y, features=features).transform(X)

View File

@@ -9,24 +9,10 @@ from setuptools import Extension, setup
setup(
ext_modules=[
Extension(
name="fimdlp",
sources=["fimdlp/cfimdlp.pyx", "fimdlp/FImdlp.cpp"],
name="cppfimdlp",
sources=["fimdlp/cfimdlp.pyx", "fimdlp/CPPFImdlp.cpp"],
language="c++",
include_dirs=["fimdlp"],
),
]
)
# from Cython.Build import cythonize
# setup(
# ext_modules=cythonize(
# Extension(
# "fimdlp",
# sources=["fimdlp/cfimdlp.pyx", "fimdlp/FImdlp.cpp"],
# language="c++",
# include_dirs=["fimdlp"],
# ),
# include_path=["./fimdlp"],
# )
# )