diff --git a/.gitignore b/.gitignore index cbabd70..0318fe0 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,7 @@ __pycache__/ *$py.class # C extensions -*.so +build/**/*.so # Distribution / packaging .Python diff --git a/MANIFEST.in b/MANIFEST.in index 152f4c3..4d6a74c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1 @@ -include fimdlp/FImdlp.h +include fimdlp/CPPFImdlp.h diff --git a/Makefile b/Makefile index 80acec7..49674a6 100644 --- a/Makefile +++ b/Makefile @@ -16,9 +16,13 @@ push: ## Push code with tags build: ## Build package rm -fr dist/* rm -fr build/* - #python setup.py build_ext python -m build +buildext: ## Build extension + rm -fr dist/* + rm -fr build/* + python setup.py build_ext + audit: ## Audit pip pip-audit diff --git a/fimdlp/FImdlp.cpp b/fimdlp/CPPFImdlp.cpp similarity index 65% rename from fimdlp/FImdlp.cpp rename to fimdlp/CPPFImdlp.cpp index 19241c7..3600959 100644 --- a/fimdlp/FImdlp.cpp +++ b/fimdlp/CPPFImdlp.cpp @@ -1,13 +1,13 @@ -#include "FImdlp.h" -namespace FImdlp +#include "CPPFImdlp.h" +namespace CPPFImdlp { - FImdlp::FImdlp() + CPPFImdlp::CPPFImdlp() { } - FImdlp::~FImdlp() + CPPFImdlp::~CPPFImdlp() { } - std::vector FImdlp::cutPoints(std::vector &X, std::vector &y) + std::vector CPPFImdlp::cutPoints(std::vector &X, std::vector &y) { std::vector cutPts; int i, ant = X.at(0); diff --git a/fimdlp/FImdlp.h b/fimdlp/CPPFImdlp.h similarity index 55% rename from fimdlp/FImdlp.h rename to fimdlp/CPPFImdlp.h index d15cf8b..81f589a 100644 --- a/fimdlp/FImdlp.h +++ b/fimdlp/CPPFImdlp.h @@ -1,14 +1,14 @@ -#ifndef FIMDLP_H -#define FIMDLP_H +#ifndef CPPFIMDLP_H +#define CPPFIMDLP_H #include #include -namespace FImdlp +namespace CPPFImdlp { - class FImdlp + class CPPFImdlp { public: - FImdlp(); - ~FImdlp(); + CPPFImdlp(); + ~CPPFImdlp(); std::vector cutPoints(std::vector &, std::vector &); }; } diff --git a/fimdlp/__init__.py b/fimdlp/__init__.py index 7df9f7a..d1675e1 100644 --- a/fimdlp/__init__.py +++ b/fimdlp/__init__.py @@ -1 +1,3 
@@ -from ._version import __version__ \ No newline at end of file +from ._version import __version__ + +all = ["FImdlp", "__version__"] diff --git a/fimdlp/cfimdlp.pyx b/fimdlp/cfimdlp.pyx index cfa00b2..8317808 100644 --- a/fimdlp/cfimdlp.pyx +++ b/fimdlp/cfimdlp.pyx @@ -2,15 +2,15 @@ # cython: language_level = 3 from libcpp.vector cimport vector -cdef extern from "FImdlp.h" namespace "FImdlp": - cdef cppclass FImdlp: - FImdlp() except + +cdef extern from "CPPFImdlp.h" namespace "CPPFImdlp": + cdef cppclass CPPFImdlp: + CPPFImdlp() except + vector[float] cutPoints(vector[int]&, vector[int]&) cdef class CFImdlp: - cdef FImdlp *thisptr + cdef CPPFImdlp *thisptr def __cinit__(self): - self.thisptr = new FImdlp() + self.thisptr = new CPPFImdlp() def __dealloc__(self): del self.thisptr def cut_points(self, X, y): diff --git a/fimdlp/cppfimdlp.cpython-310-darwin.so b/fimdlp/cppfimdlp.cpython-310-darwin.so new file mode 100755 index 0000000..c553ecc Binary files /dev/null and b/fimdlp/cppfimdlp.cpython-310-darwin.so differ diff --git a/fimdlp/mdlp.py b/fimdlp/mdlp.py new file mode 100644 index 0000000..34584d0 --- /dev/null +++ b/fimdlp/mdlp.py @@ -0,0 +1,103 @@ +import numpy as np +from .cppfimdlp import CFImdlp +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.multiclass import unique_labels +from sklearn.utils.validation import check_X_y, check_array, check_is_fitted + + +class FImdlp(TransformerMixin, BaseEstimator): + """Fayyad - Irani MDLP discretization algorithm. + + Parameters + ---------- + None + The constructor takes no arguments; options are given to :meth:`fit`. + + Attributes + ---------- + n_features_ : int + The number of features of the data passed to :meth:`fit`. 
+ """ + + def __init__(self): + pass + + def _check_params_fit(self, X, y, expected_args, kwargs): + """Check the common parameters passed to fit""" + # Check that X and y have correct shape + X, y = check_X_y(X, y) + # Store the classes seen during fit + self.classes_ = unique_labels(y) + self.n_classes_ = self.classes_.shape[0] + # Default values + self.class_name_ = "class" + self.features_ = [f"feature_{i}" for i in range(X.shape[1])] + for key, value in kwargs.items(): + if key in expected_args: + setattr(self, f"{key}_", value) + else: + raise ValueError(f"Unexpected argument: {key}") + if len(self.features_) != X.shape[1]: + raise ValueError( + "Number of features does not match the number of columns in X" + ) + return X, y + + def fit(self, X, y, **kwargs): + """Validate the training data and store it for :meth:`transform`. + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The training input samples. + y : array-like, shape (n_samples,) + The class labels, validated together with X and stored for + :meth:`transform`. + Returns + ------- + self : object + Returns self. + """ + X, y = self._check_params_fit( + X, y, expected_args=["class_name", "features"], kwargs=kwargs + ) + + self.n_features_ = X.shape[1] + self.X_ = X + self.y_ = y + self.discretizer_ = CFImdlp() + + return self + + def transform(self, X): + """Print the cut points found for each feature of X. + Parameters + ---------- + X : {array-like}, shape (n_samples, n_features) + The input samples; must equal the data passed to :meth:`fit`. + Returns + ------- + X_transformed : array, shape (n_samples, n_features) + The validated ``X``, unchanged (values are not discretized yet). + """ + # Check if fit has been called + check_is_fitted(self, "n_features_") + + # Input validation + X = check_array(X) + + # Check that the input is of the same shape as the one passed + # during fit. 
+ if X.shape[1] != self.n_features_: + raise ValueError( + "Shape of input is different from what was seen in `fit`" + ) + if X.shape != self.X_.shape or (X != self.X_).any(): + raise ValueError( + "X values are not the same as the ones used to fit the model." + ) + print("Cut points for each feature in Iris dataset:") + for i in range(0, self.n_features_): + data = np.sort(X[:, i]) + Xcutpoints = self.discretizer_.cut_points(data, self.y_) + print(f"{self.features_[i]:20s}: {Xcutpoints}") + return X diff --git a/pyproject.toml b/pyproject.toml index a706dee..6617025 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,3 +37,21 @@ classifiers = [ [project.urls] Home = "https://github.com/doctorado-ml/FImdlp" + +[tool.black] +line-length = 79 +target_version = ['py38', 'py39', 'py310'] +include = '\.pyi?$' +exclude = ''' +/( + \.git + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | _build + | buck-out + | build + | dist +)/ +''' diff --git a/sample.py b/sample.py index 43df59e..85f43d8 100644 --- a/sample.py +++ b/sample.py @@ -1,14 +1,9 @@ -import numpy as np from sklearn.datasets import load_iris -from fimdlp import CFImdlp +from fimdlp.mdlp import FImdlp data = load_iris() X = data.data y = data.target features = data.feature_names -test = CFImdlp() -print("Cut points for each feature in Iris dataset:") -for i in range(0, X.shape[1]): - data = np.sort(X[:, i]) - Xcutpoints = test.cut_points(data, y) - print(f"{features[i]:20s}: {Xcutpoints}") +test = FImdlp() +Xcutpoints = test.fit(X, y, features=features).transform(X) diff --git a/setup.py b/setup.py index 66d5539..6e9a25d 100644 --- a/setup.py +++ b/setup.py @@ -9,24 +9,10 @@ from setuptools import Extension, setup setup( ext_modules=[ Extension( - name="fimdlp", - sources=["fimdlp/cfimdlp.pyx", "fimdlp/FImdlp.cpp"], + name="cppfimdlp", + sources=["fimdlp/cfimdlp.pyx", "fimdlp/CPPFImdlp.cpp"], language="c++", include_dirs=["fimdlp"], ), ] ) - -# from Cython.Build import cythonize -# setup( -# ext_modules=cythonize( -# Extension( -# "fimdlp", -# sources=["fimdlp/cfimdlp.pyx", "fimdlp/FImdlp.cpp"], -# language="c++", -# include_dirs=["fimdlp"], -# 
), -# include_path=["./fimdlp"], -# ) -# ) -