42 Commits
ci ... v0.9.3

Author SHA1 Message Date
f65efe3dfd Update the c++ sources with new version 2023-02-24 11:04:06 +01:00
e9d19d41da Add changed submodule 2023-02-22 11:56:39 +01:00
6450ccb9bd Add changed submodule 2023-02-22 11:34:27 +01:00
5d2f32bb0e Add needed header file to MANIFEST 2023-02-22 11:33:26 +01:00
Ricardo Montañana Gómez
8249e55b0c Merge pull request #6 from Doctorado-ML/joinfeatures
- Add a join_fit feature that can update a fitted discretizer. Making it possible to discretize a variable by taking into account the label and a list of other features of the dataset. Used in local discretization with bayesian estimators.
- Add factorize method to be able to simulate the pandas factorize method.
- Remove the algorithm hyperparameter as it is no longer needed
- Add get_states_feature method to obtain a list of states of any feature based on the number of cut points computed while fitting the discretizer
2023-02-22 10:44:43 +01:00
40871f128d Add 1.1.0 version of mdlp 2023-02-22 10:15:33 +01:00
718c9d0e63 make static methods factorize and test_sklrn_trans 2023-02-20 20:12:36 +01:00
e0b7cae9a0 Remove algorithm hyperparameter in discretizer 2023-02-20 18:26:51 +01:00
31d79a77fa Add get_states_feature method 2023-02-13 17:34:50 +01:00
2d495293bb Add range_features method 2023-02-13 16:15:50 +01:00
9899781640 Complete join_fit and remove MultiDiscretizer 2023-02-05 00:30:03 +01:00
f20496203e refactor Multidiscretizer to use one per column 2023-02-04 19:23:15 +01:00
cf09d92ccc add MultiDiscretizer 2023-02-04 17:45:36 +01:00
1186e4ad53 chore: 🔖 Upgrade version number to 0.9.3 2023-01-28 19:15:26 +01:00
7913f5151e Add version command to Makefile 2023-01-28 19:14:32 +01:00
050b923631 feat: Add factorize method to transformer 2023-01-28 10:35:07 +01:00
29fc88cecc test: Add scikit learn compatibility check_estimator test 2023-01-26 23:20:51 +01:00
16b31ec293 test: Complete join_transform test 2023-01-26 11:17:10 +01:00
ca7d158ac8 feat: ⚗️ Add join_transform method and cpp factorize 2023-01-26 10:47:27 +01:00
Ricardo Montañana Gómez
34cd54f77e feat: ♻️ Add Classic algorithm as number 2 to compare performance 2023-01-13 11:47:01 +01:00
70bf03155c Add scikit-learn as requirement 2022-12-23 14:07:36 +01:00
77b571af71 Update README to include link to pypi 2022-12-22 19:41:55 +01:00
ff7a91a7ec build: 🚀 2022-12-22 19:39:05 +01:00
621c19d00c style: 🎨 Remove unused variable in c++ module 2022-12-22 11:02:16 +01:00
Ricardo Montañana Gómez
790da5cc60 Merge pull request #5 from Doctorado-ML/fix_sdist
fix: 🐛 Fix a bug when pip install tries to build the package of F…
2022-12-22 10:29:46 +01:00
2775698063 test: 2022-12-21 19:05:24 +01:00
9db16d9d3c feat: Add version method to cppfimdlp 2022-12-20 01:11:39 +01:00
edd464311f fix: 🐛 Fix Tests and sample mistake 2022-12-15 12:18:10 +01:00
fe32ed4b2a Update algorithm type to compute cut points 2022-12-15 12:12:44 +01:00
1d95311a7d fix: 🐛 Fix a bug when pip install tries to build the package of File not Found
#4
2022-12-14 12:23:07 +01:00
d8066ea274 Update branch name from master to main in CI 2022-12-13 18:46:15 +01:00
a2c1b07525 Update Badges and README 2022-12-13 18:40:53 +01:00
05c12561ac Add submodule to ci 2022-12-13 18:18:12 +01:00
8f4bdd262a Update ci 2022-12-13 18:09:26 +01:00
0740d1f515 Update submodule command 2022-12-13 17:57:44 +01:00
eb7f3dc092 Command to update submodule and update it 2022-12-13 17:56:12 +01:00
cfc18adf06 Fix c++ sample 2022-12-13 17:18:38 +01:00
3ae0d67884 Fix tests because stable_sort in c++ 2022-12-13 17:16:23 +01:00
0ca507c692 Add submodule 2022-12-13 17:05:11 +01:00
Ricardo Montañana Gómez
70b3af94cc Merge pull request #3 from Doctorado-ML/ci
Ci
2022-12-13 17:01:08 +01:00
a5dc2d7162 Remove submodule to fix it 2022-12-13 16:54:37 +01:00
67726bf219 Added submodule fimdlp/cppmdlp 2022-12-13 15:57:52 +01:00
23 changed files with 411 additions and 302 deletions

View File

@@ -32,13 +32,15 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
language: ["cpp", "python"] language: ["python"]
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
# Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@v3 uses: actions/checkout@v3
with:
submodules: recursive
# Initializes the CodeQL tools for scanning. # Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL - name: Initialize CodeQL
@@ -61,6 +63,8 @@ jobs:
- if: matrix.language == 'cpp' - if: matrix.language == 'cpp'
name: Build CPP name: Build CPP
run: | run: |
pip install -q --upgrade pip
pip install -q scikit-learn cython
make install make install
# Command-line programs to run using the OS shell. # Command-line programs to run using the OS shell.
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun

View File

@@ -2,9 +2,9 @@ name: CI
on: on:
push: push:
branches: [master, ci] branches: [main]
pull_request: pull_request:
branches: [master] branches: [main]
workflow_dispatch: workflow_dispatch:
jobs: jobs:
@@ -16,7 +16,9 @@ jobs:
python: ["3.10"] python: ["3.10"]
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v3
with:
submodules: recursive
- name: Set up Python ${{ matrix.python }} - name: Set up Python ${{ matrix.python }}
uses: actions/setup-python@v2 uses: actions/setup-python@v2
with: with:
@@ -24,7 +26,7 @@ jobs:
- name: Install dependencies - name: Install dependencies
run: | run: |
pip install -q --upgrade pip pip install -q --upgrade pip
pip install -q scikit-learn pip install -q scikit-learn cython
pip install -q --upgrade codecov coverage black flake8 codacy-coverage pip install -q --upgrade codecov coverage black flake8 codacy-coverage
- name: Build and install - name: Build and install
run: | run: |

8
.gitignore vendored
View File

@@ -33,8 +33,8 @@ MANIFEST
*.manifest *.manifest
*.spec *.spec
# Installer log2s # Installer logs
pip-log2.txt pip-log.txt
pip-delete-this-directory.txt pip-delete-this-directory.txt
# Unit test / coverage reports # Unit test / coverage reports
@@ -56,7 +56,7 @@ coverage.xml
*.pot *.pot
# Django stuff: # Django stuff:
*.log2 *.log
local_settings.py local_settings.py
db.sqlite3 db.sqlite3
db.sqlite3-journal db.sqlite3-journal
@@ -135,3 +135,5 @@ cmake-build-debug/**
**/lcoverage/** **/lcoverage/**
**/x/* **/x/*
**/*.so **/*.so
**/CMakeFiles
wheelhouse

3
.gitmodules vendored Normal file
View File

@@ -0,0 +1,3 @@
[submodule "src/cppmdlp"]
path = src/cppmdlp
url = https://github.com/rmontanana/mdlp.git

4
MANIFEST.in Normal file
View File

@@ -0,0 +1,4 @@
include src/cppmdlp/CPPFImdlp.h
include src/cppmdlp/typesFImdlp.h
include src/cppmdlp/Metrics.h
include src/fimdlp/Factorize.h

View File

@@ -15,6 +15,10 @@ coverage:
make test make test
coverage report -m coverage report -m
submodule:
git submodule update --remote src/cppmdlp
git submodule update --merge
lint: ## Lint and static-check lint: ## Lint and static-check
black src black src
flake8 --per-file-ignores="__init__.py:F401" src flake8 --per-file-ignores="__init__.py:F401" src
@@ -33,6 +37,11 @@ install: ## Build extension
audit: ## Audit pip audit: ## Audit pip
pip-audit pip-audit
version:
@echo "Current Python version .: $(shell python --version)"
@echo "Current FImdlp version .: $(shell python -c "from fimdlp import _version; print(_version.__version__)")"
@echo "Installed FImdlp version: $(shell pip show fimdlp | grep Version | cut -d' ' -f2)"
help: ## Show help message help: ## Show help message
@IFS=$$'\n' ; \ @IFS=$$'\n' ; \
help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \ help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \

View File

@@ -1,11 +1,21 @@
# FImdlp # FImdlp
[![CI](https://github.com/Doctorado-ML/FImdlp/actions/workflows/main.yml/badge.svg)](https://github.com/Doctorado-ML/FImdlp/actions/workflows/main.yml)
[![CodeQL](https://github.com/Doctorado-ML/FImdlp/actions/workflows/codeql.yml/badge.svg)](https://github.com/Doctorado-ML/FImdlp/actions/workflows/codeql.yml)
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/8b4d784fee13401588aa8c06532a2f6d)](https://www.codacy.com/gh/Doctorado-ML/FImdlp/dashboard?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/FImdlp&utm_campaign=Badge_Grade) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/8b4d784fee13401588aa8c06532a2f6d)](https://www.codacy.com/gh/Doctorado-ML/FImdlp/dashboard?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/FImdlp&utm_campaign=Badge_Grade)
[![codecov](https://codecov.io/gh/Doctorado-ML/FImdlp/branch/main/graph/badge.svg?token=W8I45B5Z3J)](https://codecov.io/gh/Doctorado-ML/FImdlp)
[![pypy](https://img.shields.io/pypi/v/FImdlp?color=g)](https://pypi.org/project/FImdlp)
![https://img.shields.io/badge/python-3.9%2B-blue](https://img.shields.io/badge/python-3.9%2B-brightgreen)
Discretization algorithm based on the paper by Usama M. Fayyad and Keki B. Irani Discretization algorithm based on the paper by Usama M. Fayyad and Keki B. Irani
```
Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning. In Proceedings of the 13th International Joint Conference on Artificial Intelligence (IJCAI-95), pages 1022-1027, Montreal, Canada, August 1995. Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning. In Proceedings of the 13th International Joint Conference on Artificial Intelligence (IJCAI-95), pages 1022-1027, Montreal, Canada, August 1995.
## Installation
```bash
git clone --recurse-submodules https://github.com/doctorado-ml/FImdlp.git
``` ```
## Build and usage sample ## Build and usage sample
@@ -14,8 +24,8 @@ Multi-Interval Discretization of Continuous-Valued Attributes for Classification
```bash ```bash
pip install -e . pip install -e .
python samples/sample.py iris --original python samples/sample.py iris
python samples/sample.py iris --proposal python samples/sample.py iris --alternative
python samples/sample.py -h # for more options python samples/sample.py -h # for more options
``` ```

12
k.py Normal file
View File

@@ -0,0 +1,12 @@
from sklearn.datasets import load_wine
from fimdlp.mdlp import FImdlp
# Scratch script: discretize the wine dataset and print the shapes and the
# first rows of the results for visual inspection.
X, y = load_wine(return_X_y=True)
trans = FImdlp()
# NOTE(review): FImdlp.join_transform is not among the methods visible in
# this change set (join_fit is) -- confirm this call still exists; the
# meaning of the third argument (12) cannot be determined from here.
Xt = trans.join_transform(X, y, 12)
print("X shape = ", X.shape)
print("Xt.shape=", Xt.shape)
print("Xt ", Xt[:10])
# X_, y_ and y_join_ are attributes stored on the transformer itself.
print("trans.X_ shape = ", trans.X_.shape)
print("trans.y_ ", trans.y_[:10])
print("y_join ", trans.y_join_[:10])

View File

@@ -18,7 +18,7 @@ authors = [
{ name = "Ricardo Montañana", email = "ricardo.montanana@alu.uclm.es" }, { name = "Ricardo Montañana", email = "ricardo.montanana@alu.uclm.es" },
] ]
dynamic = ['version'] dynamic = ['version']
dependencies = ["numpy", "joblib"] dependencies = ["numpy", "joblib", "scikit-learn"]
requires-python = ">=3.9" requires-python = ">=3.9"
classifiers = [ classifiers = [
"Development Status :: 3 - Alpha", "Development Status :: 3 - Alpha",

View File

@@ -1,117 +0,0 @@
#include "ArffFiles.h"
#include <fstream>
#include <sstream>
#include <map>
#include <iostream>
using namespace std;
// Default constructor: nothing to do, every member starts default-constructed.
ArffFiles::ArffFiles()
{
}
// Returns a copy of the data lines stored by load(); comment lines, blank
// lines and '@' directives are never stored.
vector<string> ArffFiles::getLines()
{
return lines;
}
// Returns the number of stored data lines.
unsigned long int ArffFiles::getSize()
{
return lines.size();
}
// Returns a copy of the (name, type) pairs of the attributes; load() removes
// the class attribute from this list.
vector<tuple<string, string>> ArffFiles::getAttributes()
{
return attributes;
}
// Returns the name of the attribute selected as class by load().
string ArffFiles::getClassName()
{
return className;
}
// Returns the declared type of the attribute selected as class by load().
string ArffFiles::getClassType()
{
return classType;
}
// Returns a reference (no copy) to the feature matrix, stored feature-major:
// X[attribute][sample].
vector<vector<float>>& ArffFiles::getX()
{
return X;
}
// Returns a reference (no copy) to the integer-encoded class labels produced
// by factorize().
vector<int>& ArffFiles::getY()
{
return y;
}
void ArffFiles::load(string fileName, bool classLast)
{
    // Reads an ARFF file: collects the declared attributes and the raw data
    // lines, takes the class attribute from the last (classLast == true) or
    // first declared attribute, and then builds X and y.
    // Throws invalid_argument when the file cannot be opened or declares no
    // attributes.
    ifstream input(fileName);
    if (!input.is_open()) {
        throw invalid_argument("Unable to open file");
    }
    // Declared once, outside the loop, exactly as the tokens are reused for
    // every attribute line.
    string keyword, attribute, type;
    string line;
    while (getline(input, line)) {
        const bool skippable = line[0] == '%' || line.empty() || line == "\r" || line == " ";
        if (skippable) {
            continue;
        }
        const bool declaresAttribute = line.find("@attribute") != string::npos
            || line.find("@ATTRIBUTE") != string::npos;
        if (declaresAttribute) {
            stringstream tokens(line);
            tokens >> keyword >> attribute >> type;
            attributes.push_back(make_tuple(attribute, type));
            continue;
        }
        // Any other '@' directive is ignored.
        if (line[0] == '@') {
            continue;
        }
        lines.push_back(line);
    }
    input.close();
    if (attributes.empty()) {
        throw invalid_argument("No attributes found");
    }
    // Detach the class attribute from the feature list.
    const tuple<string, string> classAttribute = classLast ? attributes.back() : attributes.front();
    className = get<0>(classAttribute);
    classType = get<1>(classAttribute);
    if (classLast) {
        attributes.pop_back();
    } else {
        attributes.erase(attributes.begin());
    }
    generateDataset(classLast);
}
void ArffFiles::generateDataset(bool classLast)
{
X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
vector<string> yy = vector<string>(lines.size(), "");
int labelIndex = classLast ? attributes.size() : 0;
for (int i = 0; i < lines.size(); i++) {
stringstream ss(lines[i]);
string value;
int pos = 0, xIndex = 0;
while (getline(ss, value, ',')) {
if (pos++ == labelIndex) {
yy[i] = value;
} else {
X[xIndex++][i] = stof(value);
}
}
}
y = factorize(yy);
}
string ArffFiles::trim(const string& source)
{
    // Returns a copy of source without leading and trailing blanks, newlines,
    // carriage returns and tabs; the result is empty when nothing else is left.
    const string blanks = " \n\r\t";
    const size_t first = source.find_first_not_of(blanks);
    if (first == string::npos) {
        return string();
    }
    const size_t last = source.find_last_not_of(blanks);
    return source.substr(first, last - first + 1);
}
vector<int> ArffFiles::factorize(const vector<string>& labels_t)
{
    // Encodes every label as an integer: distinct labels are numbered
    // 0, 1, 2, ... in order of first appearance.
    // Returns one code per input label, in input order.
    vector<int> codes;
    codes.reserve(labels_t.size());
    map<string, int> labelMap;
    int next = 0;
    // Iterate by const reference (no per-label copy) and do a single map
    // operation per label: insert() leaves an existing entry untouched.
    for (const string& label : labels_t) {
        auto result = labelMap.insert(make_pair(label, next));
        if (result.second) {
            ++next;
        }
        codes.push_back(result.first->second);
    }
    return codes;
}

View File

@@ -1,28 +0,0 @@
#ifndef ARFFFILES_H
#define ARFFFILES_H
#include <string>
#include <vector>
#include <tuple>
using namespace std;
// Reader for ARFF data files: keeps the declared attributes and the raw data
// lines, and exposes the data as a float feature matrix plus integer labels.
class ArffFiles {
private:
// Raw data lines (comments, blank lines and '@' directives excluded).
vector<string> lines;
// (name, type) of every declared attribute except the class attribute.
vector<tuple<string, string>> attributes;
// Name and declared type of the class attribute.
string className, classType;
// Feature matrix, feature-major: X[attribute][sample].
vector<vector<float>> X;
// Integer-encoded class labels, one per data line.
vector<int> y;
// Fills X and y from lines; the flag tells whether the label is the last field.
void generateDataset(bool);
public:
ArffFiles();
// Loads the given file; the flag (default true) tells whether the class is
// the last declared attribute (otherwise the first one is used).
void load(string, bool = true);
vector<string> getLines();
unsigned long int getSize();
string getClassName();
string getClassType();
// Returns a copy of the argument without leading/trailing whitespace.
string trim(const string&);
// getX and getY return references to the internal containers.
vector<vector<float>>& getX();
vector<int>& getY();
vector<tuple<string, string>> getAttributes();
// Maps each distinct label to an integer in order of first appearance.
vector<int> factorize(const vector<string>& labels_t);
};
#endif

View File

@@ -3,4 +3,4 @@ project(main)
set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD 14)
add_executable(sample sample.cpp ArffFiles.cpp ../src/cppmdlp/Metrics.cpp ../src/cppmdlp/CPPFImdlp.cpp) add_executable(sample sample.cpp ../src/cppmdlp/tests/ArffFiles.cpp ../src/cppmdlp/Metrics.cpp ../src/cppmdlp/CPPFImdlp.cpp)

View File

@@ -1,4 +1,4 @@
#include "ArffFiles.h" #include "../src/cppmdlp/tests/ArffFiles.h"
#include <iostream> #include <iostream>
#include <vector> #include <vector>
#include <iomanip> #include <iomanip>
@@ -41,7 +41,7 @@ int main(int argc, char** argv)
} }
cout << y[i] << endl; cout << y[i] << endl;
} }
mdlp::CPPFImdlp test = mdlp::CPPFImdlp(false); mdlp::CPPFImdlp test = mdlp::CPPFImdlp(0);
for (auto i = 0; i < attributes.size(); i++) { for (auto i = 0; i < attributes.size(); i++) {
cout << "Cut points for " << get<0>(attributes[i]) << endl; cout << "Cut points for " << get<0>(attributes[i]) << endl;
cout << "--------------------------" << setprecision(3) << endl; cout << "--------------------------" << setprecision(3) << endl;
@@ -50,9 +50,5 @@ int main(int argc, char** argv)
cout << item << endl; cout << item << endl;
} }
} }
mdlp::indices_t indices = test.sortIndices(X[0]);
for (auto item : indices) {
cout << setw(3) << item << " " << X[0][item] << " " << y[item] << endl;
}
return 0; return 0;
} }

View File

@@ -14,8 +14,9 @@ datasets = {
} }
ap = argparse.ArgumentParser() ap = argparse.ArgumentParser()
ap.add_argument("--proposal", action="store_true") ap.add_argument(
ap.add_argument("--original", dest="proposal", action="store_false") "--alternative", dest="proposal", action="store_const", const=1
)
ap.add_argument("dataset", type=str, choices=datasets.keys()) ap.add_argument("dataset", type=str, choices=datasets.keys())
args = ap.parse_args() args = ap.parse_args()
relative = "" if os.path.isdir("src") else ".." relative = "" if os.path.isdir("src") else ".."
@@ -29,7 +30,7 @@ class_name = df.columns.to_list()[class_column]
X = df.drop(class_name, axis=1) X = df.drop(class_name, axis=1)
y, _ = pd.factorize(df[class_name]) y, _ = pd.factorize(df[class_name])
X = X.to_numpy() X = X.to_numpy()
test = FImdlp(proposal=args.proposal) test = FImdlp(algorithm=args.proposal if args.proposal is not None else 0)
now = time.time() now = time.time()
test.fit(X, y) test.fit(X, y)
fit_time = time.time() fit_time = time.time()

View File

@@ -14,10 +14,13 @@ setup(
"src/fimdlp/cfimdlp.pyx", "src/fimdlp/cfimdlp.pyx",
"src/cppmdlp/CPPFImdlp.cpp", "src/cppmdlp/CPPFImdlp.cpp",
"src/cppmdlp/Metrics.cpp", "src/cppmdlp/Metrics.cpp",
"src/fimdlp/Factorize.cpp",
], ],
language="c++", language="c++",
include_dirs=["fimdlp"], include_dirs=["fimdlp"],
extra_compile_args=["-std=c++2a"], extra_compile_args=[
"-std=c++11",
],
), ),
] ]
) )

1
src/cppmdlp Submodule

Submodule src/cppmdlp added at 32a6fd9ba0

18
src/fimdlp/Factorize.cpp Normal file
View File

@@ -0,0 +1,18 @@
#include "Factorize.h"
namespace utils {
    // Encodes every label as an integer: distinct labels are numbered
    // 0, 1, 2, ... in order of first appearance.
    // Returns one code per input label, in input order; an empty input yields
    // an empty vector.
    std::vector<int> cppFactorize(const std::vector<std::string>& labels_t)
    {
        std::vector<int> codes;
        codes.reserve(labels_t.size());
        std::map<std::string, int> labelMap;
        int next = 0;
        // Iterate by const reference (no per-label copy) and do a single map
        // operation per label: insert() leaves an existing entry untouched.
        for (const std::string& label : labels_t) {
            auto result = labelMap.insert(std::make_pair(label, next));
            if (result.second) {
                ++next;
            }
            codes.push_back(result.first->second);
        }
        return codes;
    }
}

10
src/fimdlp/Factorize.h Normal file
View File

@@ -0,0 +1,10 @@
#ifndef FACTORIZE_H
#define FACTORIZE_H
#include <vector>
#include <map>
#include <string>
namespace utils {
// NOTE(review): this using-directive makes the std names visible inside
// utils for every file that includes this header; Factorize.cpp relies on it
// for its unqualified vector/string/map, so it cannot simply be dropped.
using namespace std;
// Maps each distinct label to an integer in order of first appearance and
// returns one code per input label.
vector<int> cppFactorize(const vector<string>&);
}
#endif

View File

@@ -1,8 +1,4 @@
from ._version import __version__ from ._version import __version__
def version():
return __version__
all = ["FImdlp", "__version__"] all = ["FImdlp", "__version__"]

View File

@@ -1 +1 @@
__version__ = "0.9.1" __version__ = "0.9.3"

View File

@@ -1,20 +1,20 @@
# distutils: language = c++ # distutils: language = c++
# cython: language_level = 3 # cython: language_level = 3
from libcpp.vector cimport vector from libcpp.vector cimport vector
from libcpp cimport bool from libcpp.string cimport string
cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp": cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp":
ctypedef float precision_t ctypedef float precision_t
cdef cppclass CPPFImdlp: cdef cppclass CPPFImdlp:
CPPFImdlp(bool) except + CPPFImdlp() except +
CPPFImdlp& fit(vector[precision_t]&, vector[int]&) CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
vector[precision_t] getCutPoints() vector[precision_t] getCutPoints()
string version()
cdef class CFImdlp: cdef class CFImdlp:
cdef CPPFImdlp *thisptr cdef CPPFImdlp *thisptr
def __cinit__(self, proposal): def __cinit__(self):
self.thisptr = new CPPFImdlp(proposal) self.thisptr = new CPPFImdlp()
def __dealloc__(self): def __dealloc__(self):
del self.thisptr del self.thisptr
def fit(self, X, y): def fit(self, X, y):
@@ -22,4 +22,12 @@ cdef class CFImdlp:
return self return self
def get_cut_points(self): def get_cut_points(self):
return self.thisptr.getCutPoints() return self.thisptr.getCutPoints()
# Return the value of version() from the wrapped C++ CPPFImdlp object
# (declared as returning a C++ string in the extern block of this file).
def get_version(self):
return self.thisptr.version()
# Pickle support: the object is rebuilt by calling CFImdlp() with no
# arguments; the returned tuple carries no state of the C++ object.
def __reduce__(self):
return (CFImdlp, ())
cdef extern from "Factorize.h" namespace "utils":
vector[int] cppFactorize(vector[string] &input_vector)
# Python-callable wrapper: forwards input_vector to the C++ cppFactorize
# (declared as taking vector[string] and returning vector[int]).
def factorize(input_vector):
return cppFactorize(input_vector)

View File

@@ -1,15 +1,17 @@
import numpy as np import numpy as np
from .cppfimdlp import CFImdlp from .cppfimdlp import CFImdlp, factorize
from sklearn.base import BaseEstimator, TransformerMixin from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.multiclass import unique_labels from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from joblib import Parallel, delayed from joblib import Parallel, delayed
from ._version import __version__
# from ._version import __version__
class FImdlp(TransformerMixin, BaseEstimator): class FImdlp(TransformerMixin, BaseEstimator):
def __init__(self, n_jobs=-1, proposal=False): def __init__(self, n_jobs=-1):
self.n_jobs = n_jobs self.n_jobs = n_jobs
self.proposal = proposal
"""Fayyad - Irani MDLP discretization algorithm based implementation. """Fayyad - Irani MDLP discretization algorithm based implementation.
@@ -22,27 +24,26 @@ class FImdlp(TransformerMixin, BaseEstimator):
Attributes Attributes
---------- ----------
n_features_ : int n_features_in_ : int
The number of features of the data passed to :meth:`fit`. The number of features of the data passed to :meth:`fit`.
discretizer_ : list discretizer_ : list
The list of discretizers, one for each feature. The list of discretizers, one for each feature.
cut_points_ : list cut_points_ : list
The list of cut points for each feature. The list of cut points for each feature.
X_ : array X_ : array, shape (n_samples, n_features)
the samples used to fit, shape (n_samples, n_features) the samples used to fit
y_ : array y_ : array, shape(n_samples,)
the labels used to fit, shape (n_samples,) the labels used to fit
features_ : list features_ : list
the list of features to be discretized the list of features to be discretized
""" """
def _check_params_fit(self, X, y, expected_args, kwargs): def _more_tags(self):
"""Check the common parameters passed to fit""" return {"preserves_dtype": [np.int32], "requires_y": True}
def _check_args(self, X, y, expected_args, kwargs):
# Check that X and y have correct shape # Check that X and y have correct shape
X, y = check_X_y(X, y) X, y = check_X_y(X, y)
# Store the classes seen during fit
self.classes_ = unique_labels(y)
self.n_classes_ = self.classes_.shape[0]
# Default values # Default values
self.features_ = [i for i in range(X.shape[1])] self.features_ = [i for i in range(X.shape[1])]
for key, value in kwargs.items(): for key, value in kwargs.items():
@@ -63,15 +64,24 @@ class FImdlp(TransformerMixin, BaseEstimator):
raise ValueError("Feature index out of range") raise ValueError("Feature index out of range")
return X, y return X, y
def _update_params(self, X, y):
    """Record the label set and the number of features seen in fit.

    Sets ``classes_`` (unique labels of ``y``), ``n_classes_`` (how many
    there are) and ``n_features_in_`` (number of columns of ``X``).
    """
    seen_labels = unique_labels(y)
    self.classes_ = seen_labels
    self.n_classes_ = seen_labels.shape[0]
    self.n_features_in_ = X.shape[1]
# Return "<python package version>(<C++ library version>)"; the second part
# is read from a throwaway CFImdlp instance and decoded from bytes.
@staticmethod
def get_version():
return f"{__version__}({CFImdlp().get_version().decode()})"
def fit(self, X, y, **kwargs): def fit(self, X, y, **kwargs):
"""A reference implementation of a fitting function for a transformer. """A reference implementation of a fitting function for a transformer.
Parameters Parameters
---------- ----------
X : {array-like, sparse matrix}, shape (n_samples, n_features) X : array, shape (n_samples, n_features)
The training input samples. The training input samples.
y : None y : array, shape (n_samples,)
There is no need of a target in a transformer, yet the pipeline API the labels used to fit
requires this parameter.
features : list, default=[i for i in range(n_features)] features : list, default=[i for i in range(n_features)]
The list of features to be discretized. The list of features to be discretized.
Returns Returns
@@ -79,24 +89,30 @@ class FImdlp(TransformerMixin, BaseEstimator):
self : object self : object
Returns self. Returns self.
""" """
X, y = self._check_params_fit( X, y = self._check_args(
X, y, expected_args=["features"], kwargs=kwargs X, y, expected_args=["features"], kwargs=kwargs
) )
self.n_features_ = X.shape[1] self._update_params(X, y)
self.X_ = X self.X_ = X
self.y_ = y self.y_ = y
self.discretizer_ = [None] * self.n_features_ self.discretizer_ = [None] * self.n_features_in_
self.cut_points_ = [None] * self.n_features_ self.cut_points_ = [None] * self.n_features_in_
Parallel(n_jobs=self.n_jobs, prefer="threads")( Parallel(n_jobs=self.n_jobs, prefer="threads")(
delayed(self._fit_discretizer)(feature) delayed(self._fit_discretizer)(feature)
for feature in range(self.n_features_) for feature in range(self.n_features_in_)
) )
return self return self
def _fit_discretizer(self, feature): def _fit_discretizer(self, feature):
self.discretizer_[feature] = CFImdlp(proposal=self.proposal) if feature in self.features_:
self.discretizer_[feature] = CFImdlp()
self.discretizer_[feature].fit(self.X_[:, feature], self.y_) self.discretizer_[feature].fit(self.X_[:, feature], self.y_)
self.cut_points_[feature] = self.discretizer_[feature].get_cut_points() self.cut_points_[feature] = self.discretizer_[
feature
].get_cut_points()
else:
self.discretizer_[feature] = None
self.cut_points_[feature] = []
def _discretize_feature(self, feature, X, result): def _discretize_feature(self, feature, X, result):
if feature in self.features_: if feature in self.features_:
@@ -108,7 +124,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
"""Discretize X values. """Discretize X values.
Parameters Parameters
---------- ----------
X : {array-like}, shape (n_samples, n_features) X : array, shape (n_samples, n_features)
The input samples. The input samples.
Returns Returns
------- -------
@@ -116,22 +132,41 @@ class FImdlp(TransformerMixin, BaseEstimator):
The array containing the discretized values of ``X``. The array containing the discretized values of ``X``.
""" """
# Check is fit had been called # Check is fit had been called
check_is_fitted(self, "n_features_") check_is_fitted(self, "n_features_in_")
# Input validation # Input validation
X = check_array(X) X = check_array(X)
# Check that the input is of the same shape as the one passed # Check that the input is of the same shape as the one passed
# during fit. # during fit.
if X.shape[1] != self.n_features_: if X.shape[1] != self.n_features_in_:
raise ValueError( raise ValueError(
"Shape of input is different from what was seen in `fit`" "Shape of input is different from what was seen in `fit`"
) )
if len(self.features_) == self.n_features_in_:
result = np.zeros_like(X, dtype=np.int32) - 1 result = np.zeros_like(X, dtype=np.int32) - 1
else:
result = np.zeros_like(X) - 1
Parallel(n_jobs=self.n_jobs, prefer="threads")( Parallel(n_jobs=self.n_jobs, prefer="threads")(
delayed(self._discretize_feature)(feature, X[:, feature], result) delayed(self._discretize_feature)(feature, X[:, feature], result)
for feature in range(self.n_features_) for feature in range(self.n_features_in_)
) )
return result return result
@staticmethod
def factorize(yy):
"""Factorize the input labels, i.e. encode them as integers
Parameters
----------
yy : array, shape (n_samples,)
Labels to be factorized, MUST be bytes, i.e. b"0", b"1", ...
Returns
-------
array, shape (n_samples,)
Factorized labels, one integer code per input label
"""
# Inside the method body this name is the module-level factorize imported
# from .cppfimdlp, not this static method.
return factorize(yy)
def get_cut_points(self): def get_cut_points(self):
"""Get the cut points for each feature. """Get the cut points for each feature.
Returns Returns
@@ -140,6 +175,70 @@ class FImdlp(TransformerMixin, BaseEstimator):
The list of cut points for each feature. The list of cut points for each feature.
""" """
result = [] result = []
for feature in range(self.n_features_): for feature in range(self.n_features_in_):
result.append(self.cut_points_[feature]) result.append(self.cut_points_[feature])
return result return result
def get_states_feature(self, feature):
    """Return the states a feature can take

    A feature with ``k`` cut points has the states ``0 .. k``.

    Parameters
    ----------
    feature : int
        feature to get the states
    Returns
    -------
    list
        states of the feature, or None if the feature is not one of the
        discretized features
    """
    if feature not in self.features_:
        return None
    n_states = len(self.cut_points_[feature]) + 1
    return list(range(n_states))
def join_fit(self, features, target, data):
    """Join the selected features with the labels and fit the discretizer
    of the target variable
    join - fit - transform

    The label of every sample is concatenated with the values that the
    selected features take in ``data``; the discretizer of ``target`` is
    then refitted against those joined labels and its cut points updated.

    Parameters
    ----------
    features : [list]
        index of the features to join with the labels
    target : [int]
        index of the target variable to discretize
    data: [array] shape (n_samples, n_features)
        dataset that contains the features to join
    Returns
    -------
    result: np.array
        The target variable newly discretized
    Raises
    ------
    ValueError
        if the number of features or any index is out of range, if the
        target is one of the features, if the target was left out of the
        features discretized in fit, or if data does not have one row per
        fitted sample
    """
    check_is_fitted(self, "n_features_in_")
    if len(features) < 1 or len(features) > self.n_features_in_:
        raise ValueError(
            "Number of features must be in range [1, "
            f"{self.n_features_in_}]"
        )
    for feature in features:
        if feature < 0 or feature >= self.n_features_in_:
            raise ValueError(
                f"Feature {feature} not in range [0, "
                f"{self.n_features_in_})"
            )
    if target < 0 or target >= self.n_features_in_:
        raise ValueError(
            f"Target {target} not in range [0, {self.n_features_in_})"
        )
    if target in features:
        # Wording kept verbatim: callers may match on this message.
        raise ValueError("Target cannot in features to join")
    # Features excluded in fit have no discretizer (None); fail with a clear
    # error instead of an AttributeError on None.
    if self.discretizer_[target] is None:
        raise ValueError(
            f"Target {target} was not discretized in fit, cannot refit it"
        )
    data = np.asarray(data)
    # zip() below would silently truncate on a row-count mismatch and hand
    # the discretizer fewer labels than samples.
    if data.shape[0] != len(self.y_):
        raise ValueError(
            f"data has {data.shape[0]} samples, expected {len(self.y_)}"
        )
    y_join = [
        f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode()
        for item_y, items_x in zip(self.y_, data[:, features])
    ]
    self.y_join_ = y_join
    discretizer = self.discretizer_[target]
    discretizer.fit(self.X_[:, target], factorize(y_join))
    self.cut_points_[target] = discretizer.get_cut_points()
    # return the discretized target variable with the new cut points
    return np.searchsorted(self.cut_points_[target], self.X_[:, target])

View File

@@ -1,72 +1,46 @@
import unittest import unittest
import sklearn import sklearn
from sklearn.datasets import load_iris
import numpy as np import numpy as np
from sklearn.datasets import load_iris
from sklearn.utils.estimator_checks import check_estimator
from ..cppfimdlp import CFImdlp, factorize
from ..mdlp import FImdlp from ..mdlp import FImdlp
from .. import version from .. import __version__
from .._version import __version__
# from .._version import __version__
class FImdlpTest(unittest.TestCase): class FImdlpTest(unittest.TestCase):
def test_version(self): def test_version(self):
self.assertEqual(version(), __version__) clf = FImdlp()
self.assertEqual(
clf.get_version(),
f"{__version__}({CFImdlp().get_version().decode()})",
)
def test_init(self): def test_init(self):
clf = FImdlp() clf = FImdlp()
self.assertEqual(-1, clf.n_jobs) self.assertEqual(-1, clf.n_jobs)
self.assertFalse(clf.proposal) clf = FImdlp(n_jobs=7)
clf = FImdlp(proposal=True, n_jobs=7)
self.assertTrue(clf.proposal)
self.assertEqual(7, clf.n_jobs) self.assertEqual(7, clf.n_jobs)
def test_fit_proposal(self): def test_fit_definitive(self):
clf = FImdlp(proposal=True) clf = FImdlp()
clf.fit([[1, 2], [3, 4]], [1, 2])
self.assertEqual(clf.n_features_, 2)
self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
self.assertListEqual(clf.y_.tolist(), [1, 2])
self.assertListEqual([[], []], clf.get_cut_points())
X, y = load_iris(return_X_y=True) X, y = load_iris(return_X_y=True)
clf.fit(X, y) clf.fit(X, y)
self.assertEqual(clf.n_features_, 4) self.assertEqual(clf.n_features_in_, 4)
self.assertTrue(np.array_equal(X, clf.X_)) self.assertTrue(np.array_equal(X, clf.X_))
self.assertTrue(np.array_equal(y, clf.y_)) self.assertTrue(np.array_equal(y, clf.y_))
expected = [ expected = [
[ [5.449999809265137, 5.75],
4.900000095367432, [2.75, 2.8499999046325684, 2.95, 3.05, 3.3499999046325684],
5.0, [2.45, 4.75, 5.050000190734863],
5.099999904632568, [0.8, 1.75],
5.400000095367432,
5.699999809265137,
],
[2.6999998092651367, 2.9000000953674316, 3.1999998092651367],
[2.3499999046325684, 4.5, 4.800000190734863],
[0.75, 1.399999976158142, 1.5, 1.7000000476837158],
] ]
self.assertListEqual(expected, clf.get_cut_points()) computed = clf.get_cut_points()
self.assertListEqual([0, 1, 2, 3], clf.features_) for item_computed, item_expected in zip(computed, expected):
clf.fit(X, y, features=[0, 2, 3]) for x_, y_ in zip(item_computed, item_expected):
self.assertListEqual([0, 2, 3], clf.features_) self.assertAlmostEqual(x_, y_)
def test_fit_original(self):
clf = FImdlp(proposal=False)
clf.fit([[1, 2], [3, 4]], [1, 2])
self.assertEqual(clf.n_features_, 2)
self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
self.assertListEqual(clf.y_.tolist(), [1, 2])
self.assertListEqual([[], []], clf.get_cut_points())
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
self.assertEqual(clf.n_features_, 4)
self.assertTrue(np.array_equal(X, clf.X_))
self.assertTrue(np.array_equal(y, clf.y_))
expected = [
[5.5, 5.800000190734863],
[3.0999999046325684],
[2.450000047683716, 4.800000190734863, 5.099999904632568],
[0.800000011920929, 1.7000000476837158],
]
self.assertListEqual(expected, clf.get_cut_points())
self.assertListEqual([0, 1, 2, 3], clf.features_) self.assertListEqual([0, 1, 2, 3], clf.features_)
clf.fit(X, y, features=[0, 2, 3]) clf.fit(X, y, features=[0, 2, 3])
self.assertListEqual([0, 2, 3], clf.features_) self.assertListEqual([0, 2, 3], clf.features_)
@@ -87,67 +61,169 @@ class FImdlpTest(unittest.TestCase):
clf.fit([[1, 2], [3, 4]], [1, 2], features=[0, 2]) clf.fit([[1, 2], [3, 4]], [1, 2], features=[0, 2])
def test_fit_features(self): def test_fit_features(self):
clf = FImdlp(n_jobs=-1)
# Two samples don't have enough information to split
clf.fit([[1, -2], [3, 4]], [1, 2], features=[0])
self.assertListEqual(clf.get_cut_points(), [[], []])
clf.fit([[1, -2], [3, 4], [5, 6]], [1, 2, 2], features=[0])
self.assertListEqual(clf.get_cut_points(), [[2], []])
res = clf.transform([[1, -2], [3, 4]])
self.assertListEqual(res.tolist(), [[0, -2], [1, 4]])
X, y = load_iris(return_X_y=True)
X_expected = X[:, [0, 2]].copy()
clf.fit(X, y, features=[1, 3])
X_computed = clf.transform(X)
self.assertListEqual(
X_expected[:, 0].tolist(), X_computed[:, 0].tolist()
)
self.assertListEqual(
X_expected[:, 1].tolist(), X_computed[:, 2].tolist()
)
self.assertEqual(X_computed.dtype, np.float64)
def test_transform(self):
clf = FImdlp() clf = FImdlp()
clf.fit([[1, 2], [3, 4]], [1, 2], features=[0]) clf.fit([[1, 2], [3, 4], [5, 6]], [1, 2, 2])
res = clf.transform([[1, 2], [3, 4]])
self.assertListEqual(res.tolist(), [[0, 2], [0, 4]])
def test_transform_original(self):
clf = FImdlp(proposal=False)
clf.fit([[1, 2], [3, 4]], [1, 2])
self.assertEqual( self.assertEqual(
clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [0, 0]] clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [1, 1]]
) )
X, y = load_iris(return_X_y=True) X, y = load_iris(return_X_y=True)
clf.fit(X, y) clf.fit(X, y)
self.assertEqual(clf.n_features_, 4) self.assertEqual(clf.n_features_in_, 4)
self.assertTrue(np.array_equal(X, clf.X_)) self.assertTrue(np.array_equal(X, clf.X_))
self.assertTrue(np.array_equal(y, clf.y_)) self.assertTrue(np.array_equal(y, clf.y_))
X_transformed = clf.transform(X)
self.assertListEqual( self.assertListEqual(
clf.transform(X).tolist(), clf.fit(X, y).transform(X).tolist() X_transformed.tolist(), clf.fit(X, y).transform(X).tolist()
) )
self.assertEqual(X_transformed.dtype, np.int32)
expected = [ expected = [
[0, 0, 1, 1], [1, 0, 1, 1],
[2, 3, 1, 1],
[2, 0, 1, 1], [2, 0, 1, 1],
[1, 0, 1, 1],
[0, 0, 1, 1], [0, 0, 1, 1],
[1, 0, 1, 1], [1, 0, 1, 1],
[1, 0, 1, 1], [1, 3, 1, 1],
[1, 0, 1, 1], [1, 2, 1, 1],
] ]
self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected)) self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
clf.transform([[1, 2, 3], [4, 5, 6]]) clf.transform([[1, 2, 3], [4, 5, 6]])
with self.assertRaises(sklearn.exceptions.NotFittedError): with self.assertRaises(sklearn.exceptions.NotFittedError):
clf = FImdlp(proposal=False) clf = FImdlp()
clf.transform([[1, 2], [3, 4]]) clf.transform([[1, 2], [3, 4]])
def test_transform_proposal(self): def test_cppfactorize(self):
clf = FImdlp(proposal=True) source = [
clf.fit([[1, 2], [3, 4]], [1, 2]) b"f0",
self.assertEqual( b"f1",
clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [0, 0]] b"f2",
b"f3",
b"f4",
b"f5",
b"f6",
b"f1",
b"f1",
b"f7",
b"f8",
]
expected = [0, 1, 2, 3, 4, 5, 6, 1, 1, 7, 8]
computed = factorize(source)
self.assertListEqual(expected, computed)
def test_join_fit(self):
# Fit a discretizer on a 5x5 integer matrix with factorized byte labels,
# then call join_fit with features [0, 2] and target 1 and check both the
# returned values and the y_join_ attribute left on the estimator.
y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"])
x = np.array(
[
[0, 1, 2, 3, 4],
[0, 1, 2, 3, 4],
[1, 2, 3, 4, 5],
[2, 3, 4, 5, 6],
[3, 4, 5, 6, 7],
]
# NOTE(review): the next line appears to carry an extra `)` from the other
# column of the side-by-side diff this text was extracted from - confirm.
) )
expected = [0, 0, 1, 2, 2]
clf = FImdlp()
clf.fit(x, factorize(y))
computed = clf.join_fit([0, 2], 1, x)
self.assertListEqual(computed.tolist(), expected)
# Each expected entry reads as the factorized label followed by the values
# of columns 0 and 2 of the same row (e.g. label 1, values 1 and 3 ->
# b"113"); presumably this is how join_fit builds the joint target.
expected_y = [b"002", b"002", b"113", b"224", b"335"]
self.assertListEqual(expected_y, clf.y_join_)
def test_join_fit_error(self):
# Exercise the ValueError paths of join_fit on a discretizer fitted with a
# 5x5 integer matrix and factorized byte labels.
# NOTE(review): indentation is not visible in this view. If the message
# assertions below are nested inside their `with self.assertRaises(...)`
# blocks they never execute, because the raising call leaves the block
# first; confirm each one sits after its `with` block.
y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"])
x = np.array(
[
[0, 1, 2, 3, 4],
[0, 1, 2, 3, 4],
[1, 2, 3, 4, 5],
[2, 3, 4, 5, 6],
[3, 4, 5, 6, 7],
]
)
clf = FImdlp()
clf.fit(x, factorize(y))
# Case 1: empty list of features to join.
with self.assertRaises(ValueError) as exception:
clf.join_fit([], 1, x)
self.assertEqual(
str(exception.exception),
"Number of features must be in range [1, 5]",
)
# Case 2: join_fit on a fresh, unfitted instance; only the start of the
# message is compared.
with self.assertRaises(ValueError) as exception:
FImdlp().join_fit([0, 4], 1, x)
self.assertTrue(
str(exception.exception).startswith(
"This FImdlp instance is not fitted yet."
)
)
# Case 3: feature index 5 on data with 5 columns (valid indices 0..4).
with self.assertRaises(ValueError) as exception:
clf.join_fit([0, 5], 1, x)
self.assertEqual(
str(exception.exception),
"Feature 5 not in range [0, 5)",
)
# Case 4: target index 5 on data with 5 columns (valid indices 0..4).
with self.assertRaises(ValueError) as exception:
clf.join_fit([0, 2], 5, x)
self.assertEqual(
str(exception.exception),
"Target 5 not in range [0, 5)",
)
# Case 5: the target (2) is also listed among the features to join.
with self.assertRaises(ValueError) as exception:
clf.join_fit([0, 2], 2, x)
self.assertEqual(
str(exception.exception),
"Target cannot in features to join",
)
def test_factorize(self):
    """Check the integer codes FImdlp.factorize returns for byte labels.

    Two inputs are used, a NumPy array and a plain list. In both expected
    results, equal labels share a code and codes follow the order in which
    each label first appears.
    """
    labels_array = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"])
    codes_from_array = FImdlp().factorize(labels_array)
    self.assertListEqual([0, 0, 1, 2, 3], codes_from_array)

    # Same check with a plain list whose first label is not the smallest.
    labels_list = [b"f4", b"f0", b"f0", b"f2", b"f3"]
    codes_from_list = FImdlp().factorize(labels_list)
    self.assertListEqual([0, 1, 1, 2, 3], codes_from_list)
@staticmethod
def test_sklearn_transformer():
    """Run the scikit-learn estimator checks against a default FImdlp.

    With generate_only=True, check_estimator hands back the individual
    checks instead of running them; each pair is unpacked and the second
    element is called with the first, so a failing check raises here.
    """
    # NOTE(review): `generate_only` is reportedly deprecated in recent
    # scikit-learn releases - confirm against the pinned version.
    generated_checks = check_estimator(FImdlp(), generate_only=True)
    for estimator, check in generated_checks:
        check(estimator)
def test_states_feature(self):
clf = FImdlp()
X, y = load_iris(return_X_y=True) X, y = load_iris(return_X_y=True)
clf.fit(X, y) clf.fit(X, y)
self.assertEqual(clf.n_features_, 4) expected = []
self.assertTrue(np.array_equal(X, clf.X_)) for i in [3, 6, 4, 3]:
self.assertTrue(np.array_equal(y, clf.y_)) expected.append(list(range(i)))
for feature in range(X.shape[1]):
self.assertListEqual( self.assertListEqual(
clf.transform(X).tolist(), clf.fit(X, y).transform(X).tolist() expected[feature], clf.get_states_feature(feature)
) )
expected = [
[4, 0, 1, 1], def test_states_no_feature(self):
[5, 2, 2, 2], clf = FImdlp()
[5, 0, 1, 1], X, y = load_iris(return_X_y=True)
[1, 0, 1, 1], clf.fit(X, y)
[4, 1, 1, 1], self.assertIsNone(clf.get_states_feature(4))
[5, 2, 1, 1],
[5, 1, 1, 1],
]
self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
with self.assertRaises(ValueError):
clf.transform([[1, 2, 3], [4, 5, 6]])
with self.assertRaises(sklearn.exceptions.NotFittedError):
clf = FImdlp(proposal=True)
clf.transform([[1, 2], [3, 4]])