Update the c++ sources with new version

Add changed submodule
2025-08-18 08:55:51 +00:00 · 2023-02-24 11:04:06 +01:00 · 2023-02-22 11:56:39 +01:00 · 2023-02-22 11:34:27 +01:00 · 2023-02-22 11:33:26 +01:00 · 2023-02-22 10:44:43 +01:00
49 changed files with 491 additions and 34153 deletions
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -0,0 +1,82 @@
 # For most projects, this workflow file will not need changing; you simply need
 # to commit it to your repository.
 #
 # You may wish to alter this file to override the set of languages analyzed,
 # or to provide custom queries or build logic.
 #
 # ******** NOTE ********
 # We have attempted to detect the languages in your repository. Please check
 # the `language` matrix defined below to confirm you have the correct set of
 # supported CodeQL languages.
 #
 name: "CodeQL"
 on:
  push:
    branches: ["main"]
  pull_request:
    # The branches below must be a subset of the branches above
    branches: ["main"]
  schedule:
    - cron: "16 22 * * 0"
 jobs:
  analyze:
    name: Analyze
    runs-on: ubuntu-latest
    permissions:
      actions: read
      contents: read
      security-events: write
    strategy:
      fail-fast: false
      matrix:
        language: ["python"]
        # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
        # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          submodules: recursive
      # Initializes the CodeQL tools for scanning.
      - name: Initialize CodeQL
        uses: github/codeql-action/init@v2
        with:
          languages: ${{ matrix.language }}
          # If you wish to specify custom queries, you can do so here or in a config file.
          # By default, queries listed here will override any specified in a config file.
          # Prefix the list here with "+" to use these queries and those in the config file.
          # For details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
          # queries: security-extended,security-and-quality
      # Autobuild attempts to build any compiled languages  (C/C++, C#, Go, or Java).
      # If this step fails, then you should remove it and run the build manually (see below)
      - if: matrix.language == 'python'
        name: Autobuild
        uses: github/codeql-action/autobuild@v2
      - if: matrix.language == 'cpp'
        name: Build CPP
        run: |
          pip install -q --upgrade pip
          pip install -q scikit-learn cython
          make install
      # ℹ️ Command-line programs to run using the OS shell.
      # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
      #   If the Autobuild fails above, remove it and uncomment the following three lines.
      #   Modify them (or add more) to build your code. Please refer to the EXAMPLE below for guidance.
      # - run: |
      #   echo "Run, Build Application using script"
      #   ./location_of_script_within_repo/buildscript.sh
      - name: Perform CodeQL Analysis
        uses: github/codeql-action/analyze@v2
        with:
          category: "/language:${{matrix.language}}"
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -2,9 +2,9 @@ name: CI
 on:
  push:
-    branches: [master]
+    branches: [main]
  pull_request:
-    branches: [master]
+    branches: [main]
  workflow_dispatch:
 jobs:
@@ -12,11 +12,13 @@ jobs:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
-        os: [macos-latest, ubuntu-latest, windows-latest]
+        os: [ubuntu-latest]
-        python: [3.9, "3.10"]
+        python: ["3.10"]
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
        with:
          submodules: recursive
      - name: Set up Python ${{ matrix.python }}
        uses: actions/setup-python@v2
        with:
@@ -24,10 +26,10 @@ jobs:
      - name: Install dependencies
        run: |
          pip install -q --upgrade pip
          pip install -q scikit-learn cython
          pip install -q --upgrade codecov coverage black flake8 codacy-coverage
      - name: Build and install
        run: |
          cd FImdlp
          make install
      - name: Lint
        run: |
--- a/.gitignore
+++ b/.gitignore
@@ -33,8 +33,8 @@ MANIFEST
 *.manifest
 *.spec
-# Installer log2s
+# Installer logs
-pip-log2.txt
+pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
@@ -56,7 +56,7 @@ coverage.xml
 *.pot
 # Django stuff:
-*.log2
+*.log
 local_settings.py
 db.sqlite3
 db.sqlite3-journal
@@ -135,3 +135,5 @@ cmake-build-debug/**
 **/lcoverage/**
 **/x/*
 **/*.so
 **/CMakeFiles
 wheelhouse
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
-[submodule "fimdlp/cppmdlp"]
+[submodule "src/cppmdlp"]
-	path = src/cppfimdlp
+	path = src/cppmdlp
-	url = https://github.com/rmontanana/mdlp
+	url = https://github.com/rmontanana/mdlp.git
--- a/Ejemplo.xlsx
+++ b/Ejemplo.xlsx
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -0,0 +1,4 @@
 include src/cppmdlp/CPPFImdlp.h
 include src/cppmdlp/typesFImdlp.h
 include src/cppmdlp/Metrics.h
 include src/fimdlp/Factorize.h
--- a/9
+++ b/9
@@ -15,6 +15,10 @@ coverage:
 	make test
 	coverage report -m
 submodule:
 	git submodule update --remote src/cppmdlp
 	git submodule update --merge
 lint:  ## Lint and static-check
 	black src
 	flake8 --per-file-ignores="__init__.py:F401" src
@@ -33,6 +37,11 @@ install:  ## Build extension
 audit: ## Audit pip
 	pip-audit
 version:
 	@echo "Current Python version .: $(shell python --version)"
 	@echo "Current FImdlp version .: $(shell python -c "from fimdlp import _version; print(_version.__version__)")"
 	@echo "Installed FImdlp version: $(shell pip show fimdlp | grep Version | cut -d' ' -f2)"
 help: ## Show help message
 	@IFS=$$'\n' ; \
 	help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
--- a/README.md
+++ b/README.md
@@ -1,11 +1,21 @@
 # FImdlp
-
+[![CI](https://github.com/Doctorado-ML/FImdlp/actions/workflows/main.yml/badge.svg)](https://github.com/Doctorado-ML/FImdlp/actions/workflows/main.yml)
 [![CodeQL](https://github.com/Doctorado-ML/FImdlp/actions/workflows/codeql.yml/badge.svg)](https://github.com/Doctorado-ML/FImdlp/actions/workflows/codeql.yml)
 [![Codacy Badge](https://app.codacy.com/project/badge/Grade/8b4d784fee13401588aa8c06532a2f6d)](https://www.codacy.com/gh/Doctorado-ML/FImdlp/dashboard?utm_source=github.com&amp;utm_medium=referral&amp;utm_content=Doctorado-ML/FImdlp&amp;utm_campaign=Badge_Grade)
 [![codecov](https://codecov.io/gh/Doctorado-ML/FImdlp/branch/main/graph/badge.svg?token=W8I45B5Z3J)](https://codecov.io/gh/Doctorado-ML/FImdlp)
 [![pypi](https://img.shields.io/pypi/v/FImdlp?color=g)](https://pypi.org/project/FImdlp)
 ![Python 3.9+](https://img.shields.io/badge/python-3.9%2B-brightgreen)
 Discretization algorithm based on the paper by Usama M. Fayyad and Keki B. Irani 
-```
+
 Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning. In Proceedings of the 13th International Joint Conference on Artificial Intelligence (IJCAI-95), pages 1022-1027, Montreal, Canada, August 1995.
 ## Installation
 ```bash
 git clone --recurse-submodules https://github.com/doctorado-ml/FImdlp.git
 ```
 ## Build and usage sample
@@ -14,8 +24,8 @@ Multi-Interval Discretization of Continuous-Valued Attributes for Classification
 ```bash
 pip install -e .
-python samples/sample.py iris --original 
+python samples/sample.py iris  
-python samples/sample.py iris --proposal
+python samples/sample.py iris --alternative
 python samples/sample.py -h # for more options
 ```
--- a/feature0.txt
+++ b/feature0.txt
@@ -1,152 +0,0 @@
 +++++++++++++++++++++++
 (  0,  13) -> (4.3, 0)
 (  1,   8) -> (4.4, 0)
 (  2,  38) -> (4.4, 0)
 (  3,  42) -> (4.4, 0)
 (  4,  41) -> (4.5, 0)
 (  5,   3) -> (4.6, 0)
 (  6,   6) -> (4.6, 0)
 (  7,  22) -> (4.6, 0)
 (  8,  47) -> (4.6, 0)
 (  9,   2) -> (4.7, 0)
 ( 10,  29) -> (4.7, 0)
 ( 11,  11) -> (4.8, 0)
 ( 12,  12) -> (4.8, 0)
 ( 13,  24) -> (4.8, 0)
 ( 14,  30) -> (4.8, 0)
 ( 15,  45) -> (4.8, 0)
 ( 16,   1) -> (4.9, 0)
 ( 17,   9) -> (4.9, 0)
 ( 18,  34) -> (4.9, 0)
 ( 19,  37) -> (4.9, 0)
 ( 20,  57) -> (4.9, 1)
 ( 21, 106) -> (4.9, 2)
 ( 22,   4) -> (5.0, 0)
 ( 23,   7) -> (5.0, 0)
 ( 24,  25) -> (5.0, 0)
 ( 25,  26) -> (5.0, 0)
 ( 26,  35) -> (5.0, 0)
 ( 27,  40) -> (5.0, 0)
 ( 28,  43) -> (5.0, 0)
 ( 29,  49) -> (5.0, 0)
 ( 30,  60) -> (5.0, 1)
 ( 31,  93) -> (5.0, 1)
 ( 32,   0) -> (5.1, 0)
 ( 33,  17) -> (5.1, 0)
 ( 34,  19) -> (5.1, 0)
 ( 35,  21) -> (5.1, 0)
 ( 36,  23) -> (5.1, 0)
 ( 37,  39) -> (5.1, 0)
 ( 38,  44) -> (5.1, 0)
 ( 39,  46) -> (5.1, 0)
 ( 40,  98) -> (5.1, 1)
 ( 41,  27) -> (5.2, 0)
 ( 42,  28) -> (5.2, 0)
 ( 43,  32) -> (5.2, 0)
 ( 44,  59) -> (5.2, 1)
 ( 45,  48) -> (5.3, 0)
 ( 46,   5) -> (5.4, 0)
 ( 47,  10) -> (5.4, 0)
 ( 48,  16) -> (5.4, 0)
 ( 49,  20) -> (5.4, 0)
 ( 50,  31) -> (5.4, 0)
 ( 51,  84) -> (5.4, 1)
 ( 52,  33) -> (5.5, 0)
 ( 53,  36) -> (5.5, 0)
 ( 54,  53) -> (5.5, 1)
 ( 55,  80) -> (5.5, 1)
 ( 56,  81) -> (5.5, 1)
 ( 57,  89) -> (5.5, 1)
 ( 58,  90) -> (5.5, 1)
 ( 59,  64) -> (5.6, 1)
 ( 60,  66) -> (5.6, 1)
 ( 61,  69) -> (5.6, 1)
 ( 62,  88) -> (5.6, 1)
 ( 63,  94) -> (5.6, 1)
 ( 64, 121) -> (5.6, 2)
 ( 65,  15) -> (5.7, 0)
 ( 66,  18) -> (5.7, 0)
 ( 67,  55) -> (5.7, 1)
 ( 68,  79) -> (5.7, 1)
 ( 69,  95) -> (5.7, 1)
 ( 70,  96) -> (5.7, 1)
 ( 71,  99) -> (5.7, 1)
 ( 72, 113) -> (5.7, 2)
 ( 73,  14) -> (5.8, 0)
 ( 74,  67) -> (5.8, 1)
 ( 75,  82) -> (5.8, 1)
 ( 76,  92) -> (5.8, 1)
 ( 77, 101) -> (5.8, 2)
 ( 78, 114) -> (5.8, 2)
 ( 79, 142) -> (5.8, 2)
 ( 80,  61) -> (5.9, 1)
 ( 81,  70) -> (5.9, 1)
 ( 82, 149) -> (5.9, 2)
 ( 83,  62) -> (6.0, 1)
 ( 84,  78) -> (6.0, 1)
 ( 85,  83) -> (6.0, 1)
 ( 86,  85) -> (6.0, 1)
 ( 87, 119) -> (6.0, 2)
 ( 88, 138) -> (6.0, 2)
 ( 89,  63) -> (6.1, 1)
 ( 90,  71) -> (6.1, 1)
 ( 91,  73) -> (6.1, 1)
 ( 92,  91) -> (6.1, 1)
 ( 93, 127) -> (6.1, 2)
 ( 94, 134) -> (6.1, 2)
 ( 95,  68) -> (6.2, 1)
 ( 96,  97) -> (6.2, 1)
 ( 97, 126) -> (6.2, 2)
 ( 98, 148) -> (6.2, 2)
 ( 99,  56) -> (6.3, 1)
 (100,  72) -> (6.3, 1)
 (101,  87) -> (6.3, 1)
 (102, 100) -> (6.3, 2)
 (103, 103) -> (6.3, 2)
 (104, 123) -> (6.3, 2)
 (105, 133) -> (6.3, 2)
 (106, 136) -> (6.3, 2)
 (107, 146) -> (6.3, 2)
 (108,  51) -> (6.4, 1)
 (109,  74) -> (6.4, 1)
 (110, 111) -> (6.4, 2)
 (111, 115) -> (6.4, 2)
 (112, 128) -> (6.4, 2)
 (113, 132) -> (6.4, 2)
 (114, 137) -> (6.4, 2)
 (115,  54) -> (6.5, 1)
 (116, 104) -> (6.5, 2)
 (117, 110) -> (6.5, 2)
 (118, 116) -> (6.5, 2)
 (119, 147) -> (6.5, 2)
 (120,  58) -> (6.6, 1)
 (121,  75) -> (6.6, 1)
 (122,  65) -> (6.7, 1)
 (123,  77) -> (6.7, 1)
 (124,  86) -> (6.7, 1)
 (125, 108) -> (6.7, 2)
 (126, 124) -> (6.7, 2)
 (127, 140) -> (6.7, 2)
 (128, 144) -> (6.7, 2)
 (129, 145) -> (6.7, 2)
 (130,  76) -> (6.8, 1)
 (131, 112) -> (6.8, 2)
 (132, 143) -> (6.8, 2)
 (133,  52) -> (6.9, 1)
 (134, 120) -> (6.9, 2)
 (135, 139) -> (6.9, 2)
 (136, 141) -> (6.9, 2)
 (137,  50) -> (7.0, 1)
 (138, 102) -> (7.1, 2)
 (139, 109) -> (7.2, 2)
 (140, 125) -> (7.2, 2)
 (141, 129) -> (7.2, 2)
 (142, 107) -> (7.3, 2)
 (143, 130) -> (7.4, 2)
 (144, 105) -> (7.6, 2)
 (145, 117) -> (7.7, 2)
 (146, 118) -> (7.7, 2)
 (147, 122) -> (7.7, 2)
 (148, 135) -> (7.7, 2)
 (149, 131) -> (7.9, 2)
 +++++++++++++++++++++++
--- a/k.py
+++ b/k.py
@@ -0,0 +1,12 @@
 # Ad-hoc demo: runs FImdlp.join_transform on the scikit-learn wine dataset
 # and prints the resulting shapes and the first rows/labels.
 from sklearn.datasets import load_wine
 from fimdlp.mdlp import FImdlp
 # Load the wine features (X) and class labels (y).
 X, y = load_wine(return_X_y=True)
 trans = FImdlp()
 # NOTE(review): the third argument (12) is presumably the index of the
 # feature/target joined with y -- confirm against FImdlp.join_transform.
 Xt = trans.join_transform(X, y, 12)
 # Compare the input and transformed shapes, then peek at the first 10 rows.
 print("X shape = ", X.shape)
 print("Xt.shape=", Xt.shape)
 print("Xt ", Xt[:10])
 # Attributes stored on the transformer by the call above.
 print("trans.X_ shape = ", trans.X_.shape)
 print("trans.y_ ", trans.y_[:10])
 print("y_join ", trans.y_join_[:10])
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,7 +18,7 @@ authors = [
    { name = "Ricardo Montañana", email = "ricardo.montanana@alu.uclm.es" },
 ]
 dynamic = ['version']
-dependencies = ["numpy", "joblib"]
+dependencies = ["numpy", "joblib", "scikit-learn"]
 requires-python = ">=3.9"
 classifiers = [
    "Development Status :: 3 - Alpha",
--- a/samples/ArffFiles.cpp
+++ b/samples/ArffFiles.cpp
@@ -1,117 +0,0 @@
 #include "ArffFiles.h"
 #include <fstream>
 #include <sstream>
 #include <map>
 #include <iostream>
 using namespace std;
 // Default constructor: does nothing; the containers stay empty until load() runs.
 ArffFiles::ArffFiles()
 {
 }
 // Returns a copy of the raw data lines kept by load() (comment and '@' lines excluded).
 vector<string> ArffFiles::getLines()
 {
    return lines;
 }
 // Returns the number of data lines kept by load().
 unsigned long int ArffFiles::getSize()
 {
    return lines.size();
 }
 // Returns a copy of the (name, type) attribute list; after load() the class
 // attribute has already been removed from it.
 vector<tuple<string, string>> ArffFiles::getAttributes()
 {
    return attributes;
 }
 // Returns the name of the attribute that load() selected as the class.
 string ArffFiles::getClassName()
 {
    return className;
 }
 // Returns the declared type of the class attribute.
 string ArffFiles::getClassType()
 {
    return classType;
 }
 // Returns a reference to the feature matrix, indexed as X[attribute][sample].
 vector<vector<float>>& ArffFiles::getX()
 {
    return X;
 }
 // Returns a reference to the integer-encoded class labels, one per data line.
 vector<int>& ArffFiles::getY()
 {
    return y;
 }
 // Parses an ARFF file: collects the @attribute declarations and the data
 // lines, separates the class attribute and builds X / y via generateDataset().
 //   fileName  - path of the ARFF file to read
 //   classLast - true when the class is the last declared attribute,
 //               false when it is the first one
 // Throws invalid_argument when the file cannot be opened or declares no
 // attributes.
 void ArffFiles::load(string fileName, bool classLast)
 {
    ifstream file(fileName);
    if (!file.is_open())
        throw invalid_argument("Unable to open file");
    // Start from a clean state so that a second load() replaces, rather than
    // appends to, the content of a previously loaded file.
    lines.clear();
    attributes.clear();
    string keyword, attribute, type;
    string line;
    while (getline(file, line)) {
        // Emptiness is tested first so line[0] is only read on non-empty lines.
        if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
            continue;
        }
        if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
            stringstream ss(line);
            ss >> keyword >> attribute >> type;
            attributes.push_back(make_tuple(attribute, type));
            continue;
        }
        // Any other directive (@relation, @data, ...) carries no data.
        if (line[0] == '@') {
            continue;
        }
        lines.push_back(line);
    }
    file.close();
    if (attributes.empty())
        throw invalid_argument("No attributes found");
    // Detach the class attribute so `attributes` only describes features.
    if (classLast) {
        className = get<0>(attributes.back());
        classType = get<1>(attributes.back());
        attributes.pop_back();
    } else {
        className = get<0>(attributes.front());
        classType = get<1>(attributes.front());
        attributes.erase(attributes.begin());
    }
    generateDataset(classLast);
 }
 void ArffFiles::generateDataset(bool classLast)
 {
    X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
    vector<string> yy = vector<string>(lines.size(), "");
    int labelIndex = classLast ? attributes.size() : 0;
    for (int i = 0; i < lines.size(); i++) {
        stringstream ss(lines[i]);
        string value;
        int pos = 0, xIndex = 0;
        while (getline(ss, value, ',')) {
            if (pos++ == labelIndex) {
                yy[i] = value;
            } else {
                X[xIndex++][i] = stof(value);
            }
        }
    }
    y = factorize(yy);
 }
 // Returns a copy of `source` without leading and trailing blanks
 // (space, newline, carriage return, tab). An all-blank input yields "".
 string ArffFiles::trim(const string& source)
 {
    const char* blanks = " \n\r\t";
    const auto first = source.find_first_not_of(blanks);
    if (first == string::npos) {
        return string();
    }
    const auto last = source.find_last_not_of(blanks);
    return source.substr(first, last - first + 1);
 }
 // Encodes string labels as integers: each distinct label receives the next
 // code (0, 1, 2, ...) in order of first appearance. The result has one
 // entry per input label.
 vector<int> ArffFiles::factorize(const vector<string>& labels_t)
 {
    map<string, int> codes;
    vector<int> encoded;
    encoded.reserve(labels_t.size());
    int nextCode = 0;
    for (const string& label : labels_t) {
        auto known = codes.find(label);
        if (known == codes.end()) {
            // First time this label is seen: hand out a fresh code.
            known = codes.emplace(label, nextCode++).first;
        }
        encoded.push_back(known->second);
    }
    return encoded;
 }
--- a/samples/ArffFiles.h
+++ b/samples/ArffFiles.h
@@ -1,28 +0,0 @@
 #ifndef ARFFFILES_H
 #define ARFFFILES_H
 #include <string>
 #include <vector>
 #include <tuple>
 using namespace std;
 // Small ARFF reader: load() parses a file into a feature matrix (X) and
 // integer-encoded class labels (y) that the getters expose.
 class ArffFiles {
 private:
    // Raw data lines of the file (comment and '@' directive lines are skipped).
    vector<string> lines;
    // (name, type) of each feature attribute; the class attribute is removed by load().
    vector<tuple<string, string>> attributes;
    // Name and declared type of the class attribute.
    string className, classType;
    // Feature values indexed as X[attribute][sample].
    vector<vector<float>> X;
    // Integer code of the class label of every sample.
    vector<int> y;
    // Fills X and y from `lines`; the flag tells whether the label is the last field.
    void generateDataset(bool);
 public:
    ArffFiles();
    // load(fileName, classLast): classLast defaults to true.
    void load(string, bool = true);
    vector<string> getLines();
    unsigned long int getSize();
    string getClassName();
    string getClassType();
    // Returns a copy of the argument without leading/trailing whitespace.
    string trim(const string&);
    vector<vector<float>>& getX();
    vector<int>& getY();
    vector<tuple<string, string>> getAttributes();
    // Maps each distinct label to an integer code in order of first appearance.
    vector<int> factorize(const vector<string>& labels_t);
 };
 #endif
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -3,4 +3,4 @@ project(main)
 set(CMAKE_CXX_STANDARD 14)
-add_executable(sample sample.cpp ArffFiles.cpp ../src/fimdlp/cppmdlp/Metrics.cpp ../src/fimdlp/cppmdlp/CPPFImdlp.cpp)
+add_executable(sample sample.cpp ../src/cppmdlp/tests/ArffFiles.cpp ../src/cppmdlp/Metrics.cpp ../src/cppmdlp/CPPFImdlp.cpp)
--- a/samples/sample.cpp
+++ b/samples/sample.cpp
@@ -1,4 +1,4 @@
-#include "ArffFiles.h"
+#include "../src/cppmdlp/tests/ArffFiles.h"
 #include <iostream>
 #include <vector>
 #include <iomanip>
@@ -41,7 +41,7 @@ int main(int argc, char** argv)
        }
        cout << y[i] << endl;
    }
-    mdlp::CPPFImdlp test = mdlp::CPPFImdlp(false);
+    mdlp::CPPFImdlp test = mdlp::CPPFImdlp(0);
    for (auto i = 0; i < attributes.size(); i++) {
        cout << "Cut points for " << get<0>(attributes[i]) << endl;
        cout << "--------------------------" << setprecision(3) << endl;
--- a/samples/sample.py
+++ b/samples/sample.py
@@ -14,8 +14,9 @@ datasets = {
 }
 ap = argparse.ArgumentParser()
-ap.add_argument("--proposal", action="store_true")
+ap.add_argument(
-ap.add_argument("--original", dest="proposal", action="store_false")
+    "--alternative", dest="proposal", action="store_const", const=1
 )
 ap.add_argument("dataset", type=str, choices=datasets.keys())
 args = ap.parse_args()
 relative = "" if os.path.isdir("src") else ".."
@@ -29,7 +30,7 @@ class_name = df.columns.to_list()[class_column]
 X = df.drop(class_name, axis=1)
 y, _ = pd.factorize(df[class_name])
 X = X.to_numpy()
-test = FImdlp(proposal=args.proposal)
+test = FImdlp(algorithm=args.proposal if args.proposal is not None else 0)
 now = time.time()
 test.fit(X, y)
 fit_time = time.time()
--- a/setup.py
+++ b/setup.py
@@ -14,10 +14,13 @@ setup(
                "src/fimdlp/cfimdlp.pyx",
                "src/cppmdlp/CPPFImdlp.cpp",
                "src/cppmdlp/Metrics.cpp",
                "src/fimdlp/Factorize.cpp",
            ],
            language="c++",
            include_dirs=["fimdlp"],
-            extra_compile_args=["-std=c++2a"],
+            extra_compile_args=[
                "-std=c++11",
            ],
        ),
    ]
 )
--- a/src/cppmdlp
+++ b/src/cppmdlp
--- a/src/cppmdlp/.gitignore
+++ b/src/cppmdlp/.gitignore
@@ -1,36 +0,0 @@
 # Prerequisites
 *.d
 # Compiled Object files
 *.slo
 *.lo
 *.o
 *.obj
 # Precompiled Headers
 *.gch
 *.pch
 # Compiled Dynamic libraries
 *.so
 *.dylib
 *.dll
 # Fortran module files
 *.mod
 *.smod
 # Compiled Static libraries
 *.lai
 *.la
 *.a
 *.lib
 # Executables
 *.exe
 *.out
 *.app
 **/build
 **/lcoverage
 .idea
 cmake-*
--- a/src/cppmdlp/CMakeLists.txt
+++ b/src/cppmdlp/CMakeLists.txt
@@ -1,7 +0,0 @@
 cmake_minimum_required(VERSION 3.24)
 project(mdlp)
 set(CMAKE_CXX_STANDARD 17)
 add_library(mdlp CPPFImdlp.cpp Metrics.cpp)
--- a/src/cppmdlp/CPPFImdlp.cpp
+++ b/src/cppmdlp/CPPFImdlp.cpp
@@ -1,160 +0,0 @@
 #include <numeric>
 #include <algorithm>
 #include <set>
 #include <cmath>
 #include "CPPFImdlp.h"
 #include "Metrics.h"
 namespace mdlp {
    // Builds an unfitted discretizer. `proposal` selects the routine fit()
    // runs: computeCutPointsProposal() when true, computeCutPoints() otherwise.
    // indices, X and y start empty, and metrics is constructed from the y and
    // indices members.
    CPPFImdlp::CPPFImdlp(bool proposal):proposal(proposal), indices(indices_t()), X(samples_t()), y(labels_t()), metrics(Metrics(y, indices))
    {
    }
    // Nothing to release explicitly: the compiler-generated destructor is used.
    CPPFImdlp::~CPPFImdlp()
        = default;
    // Computes the cut points for one feature.
    //   X_ - sample values of the feature
    //   y_ - class label of each sample (same length as X_)
    // Returns *this so calls can be chained.
    // Throws invalid_argument when X_ and y_ differ in size or are empty.
    CPPFImdlp& CPPFImdlp::fit(samples_t& X_, labels_t& y_)
    {
        // Validate before touching any member, so a rejected call leaves the
        // result of a previous successful fit() intact.
        if (X_.size() != y_.size()) {
            throw invalid_argument("X and y must have the same size");
        }
        if (X_.size() == 0 || y_.size() == 0) {
            throw invalid_argument("X and y must have at least one element");
        }
        X = X_;
        y = y_;
        cutPoints.clear();
        indices = sortIndices(X_);
        // Refresh the metrics data and drop its caches for the new dataset.
        metrics.setData(y, indices);
        if (proposal)
            computeCutPointsProposal();
        else
            computeCutPoints(0, X.size());
        return *this;
    }
    // Recursively splits the sorted interval [start, end) at the candidate
    // returned by getCandidate() while mdlp() accepts the split. When the
    // interval cannot be split any further, its two borders are recorded as
    // cut points (unless a border is the start/end of the whole dataset).
    void CPPFImdlp::computeCutPoints(size_t start, size_t end)
    {
        if (end - start < 2)
            return;
        // Same type as getCandidate()'s return value, so the index is not narrowed.
        const long int cut = getCandidate(start, end);
        if (cut == -1 || !mdlp(start, cut, end)) {
            // cut == -1 means that there is no candidate in the interval.
            // No boundary found, so we add both ends of the interval as cutpoints
            // because they were selected by the algorithm before
            if (start != 0)
                cutPoints.push_back((X[indices[start]] + X[indices[start - 1]]) / 2);
            if (end != X.size())
                cutPoints.push_back((X[indices[end]] + X[indices[end - 1]]) / 2);
            return;
        }
        computeCutPoints(start, cut);
        computeCutPoints(cut, end);
    }
    // Recursive variant that records the midpoint around the candidate index
    // whenever mdlp() accepts it, and then descends into both halves.
    // NOTE(review): the recursion continues even when mdlp() rejects the
    // candidate -- confirm this is the intended behaviour.
    void CPPFImdlp::computeCutPointsOriginal(size_t start, size_t end)
    {
        if (end - start < 2)
            return;
        // The candidate is an index: keep it integral (it used to be stored in
        // a precision_t, which loses exactness and was then used as a subscript).
        const long int cut = getCandidate(start, end);
        if (cut == -1)
            return;
        if (mdlp(start, cut, end)) {
            cutPoints.push_back((X[indices[cut]] + X[indices[cut - 1]]) / 2);
        }
        computeCutPointsOriginal(start, cut);
        computeCutPointsOriginal(cut, end);
    }
    // Single left-to-right pass over the samples in sorted order that groups
    // equal X values and appends a cut point whenever the class changes and
    // mdlp() accepts the split.
    void CPPFImdlp::computeCutPointsProposal()
    {
        precision_t xPrev, xCur, xPivot, cutPoint;
        int yPrev, yCur, yPivot;
        size_t idx, numElements, start;
        xCur = xPrev = X[indices[0]];
        yCur = yPrev = y[indices[0]];
        // numElements holds the last valid position (size - 1), not the count.
        numElements = indices.size() - 1;
        idx = start = 0;
        while (idx < numElements) {
            // The pivot is the value/label at the start of the current run.
            xPivot = xCur;
            yPivot = yCur;
            // Read the same values and check class changes
            do {
                idx++;
                xCur = X[indices[idx]];
                yCur = y[indices[idx]];
                // -1 marks a run of equal X values that mixes several classes.
                if (yCur != yPivot && xCur == xPivot) {
                    yPivot = -1;
                }
            }
            while (idx < numElements && xCur == xPivot);
            // Check if the class changed and there are more than 1 element
            if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur) && mdlp(start, idx, indices.size())) {
                start = idx;
                // NOTE(review): the midpoint uses xPrev (pivot of the previous
                // iteration), not xPivot -- confirm this is intended.
                cutPoint = (xPrev + xCur) / 2;
                cutPoints.push_back(cutPoint);
            }
            yPrev = yPivot;
            xPrev = xPivot;
        }
    }
    long int CPPFImdlp::getCandidate(size_t start, size_t end)
    {
        long int candidate = -1, elements = end - start;
        precision_t entropy_left, entropy_right, minEntropy = numeric_limits<precision_t>::max();
        for (auto idx = start + 1; idx < end; idx++) {
            // Cutpoints are always on boudndaries
            if (y[indices[idx]] == y[indices[idx - 1]])
                continue;
            entropy_left = precision_t(idx - start) / elements * metrics.entropy(start, idx);
            entropy_right = precision_t(end - idx) / elements * metrics.entropy(idx, end);
            if (entropy_left + entropy_right < minEntropy) {
                minEntropy = entropy_left + entropy_right;
                candidate = idx;
            }
        }
        return candidate;
    }
    bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end)
    {
        int k, k1, k2;
        precision_t ig, delta;
        precision_t ent, ent1, ent2;
        auto N = precision_t(end - start);
        if (N < 2) {
            return false;
        }
        k = metrics.computeNumClasses(start, end);
        k1 = metrics.computeNumClasses(start, cut);
        k2 = metrics.computeNumClasses(cut, end);
        ent = metrics.entropy(start, end);
        ent1 = metrics.entropy(start, cut);
        ent2 = metrics.entropy(cut, end);
        ig = metrics.informationGain(start, cut, end);
        delta = log2(pow(3, precision_t(k)) - 2) -
            (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2);
        precision_t term = 1 / N * (log2(N - 1) + delta);
        return ig > term;
    }
    // Returns the cut points found by fit(), without duplicates and in
    // ascending order.
    cutPoints_t CPPFImdlp::getCutPoints()
    {
        // A std::set both removes duplicates and iterates in ascending order,
        // so no pre-sized buffer and no extra sort are needed.
        set<precision_t> unique(cutPoints.begin(), cutPoints.end());
        cutPoints_t output;
        output.assign(unique.begin(), unique.end());
        return output;
    }
    // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
    // Returns the positions 0..n-1 of X_ ordered by ascending sample value.
    // The relative order of equal values is unspecified (std::sort).
    indices_t CPPFImdlp::sortIndices(samples_t& X_)
    {
        indices_t idx(X_.size());
        iota(idx.begin(), idx.end(), 0);
        // One sort is enough: it used to be repeated once per element, which
        // made the call O(n^2 log n) for no benefit.
        sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2)
            { return X_[i1] < X_[i2]; });
        return idx;
    }
 }
--- a/src/cppmdlp/CPPFImdlp.h
+++ b/src/cppmdlp/CPPFImdlp.h
@@ -1,33 +0,0 @@
 #ifndef CPPFIMDLP_H
 #define CPPFIMDLP_H
 #include "typesFImdlp.h"
 #include "Metrics.h"
 #include <utility>
 namespace mdlp {
    // Discretizer for a single feature: fit() computes the cut points and
    // getCutPoints() returns them.
    class CPPFImdlp {
    protected:
        // Selects the cut-point routine that fit() runs.
        bool proposal;
        indices_t indices; // sorted indices to use with X and y
        // Copies of the samples and labels passed to fit().
        samples_t X;
        labels_t y;
        // Entropy / information-gain helper working on y and indices.
        Metrics metrics;
        // Cut points accumulated by the compute* routines (may hold duplicates).
        cutPoints_t cutPoints;
        static indices_t sortIndices(samples_t&);
        void computeCutPoints(size_t, size_t);
        long int getCandidate(size_t, size_t);
        bool mdlp(size_t, size_t, size_t);
        // Original algorithm
        void computeCutPointsOriginal(size_t, size_t);
        // NOTE(review): no definition of goodCut appears in the CPPFImdlp.cpp
        // shown alongside this header -- confirm whether it is still needed.
        bool goodCut(size_t, size_t, size_t);
        void computeCutPointsProposal();
    public:
        CPPFImdlp(bool);
        ~CPPFImdlp();
        CPPFImdlp& fit(samples_t&, labels_t&);
        // NOTE(review): declared as samples_t while the definition returns
        // cutPoints_t; presumably both name the same type -- confirm.
        samples_t getCutPoints();
    };
 }
 #endif
--- a/src/cppmdlp/LICENSE
+++ b/src/cppmdlp/LICENSE
@@ -1,21 +0,0 @@
 MIT License
 Copyright (c) 2022 Ricardo Montañana Gómez
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
--- a/src/cppmdlp/Metrics.cpp
+++ b/src/cppmdlp/Metrics.cpp
@@ -1,65 +0,0 @@
 #include "Metrics.h"
 #include <set>
 #include <cmath>
 using namespace std;
 namespace mdlp {
    // Binds the y and indices reference members to the caller's containers,
    // counts the distinct classes over all of indices, and starts with empty
    // entropy / information-gain caches.
    Metrics::Metrics(labels_t& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t())
    {
    }
    int Metrics::computeNumClasses(size_t start, size_t end)
    {
        set<int> nClasses;
        for (auto i = start; i < end; ++i) {
            nClasses.insert(y[indices[i]]);
        }
        return nClasses.size();
    }
    // Replaces the data the metrics work on and invalidates both caches.
    void Metrics::setData(labels_t& y_, indices_t& indices_)
    {
        // y and indices are reference members, so these assignments copy the
        // arguments into the objects bound at construction (no rebinding).
        indices = indices_;
        y = y_;
        // Recount the classes over the whole new dataset.
        numClasses = computeNumClasses(0, indices.size());
        // Cached values belong to the previous data.
        entropyCache.clear();
        igCache.clear();
    }
    // Returns the class entropy (in bits) of the samples at sorted positions
    // [start, end). Intervals with fewer than two elements yield 0. Results
    // are memoised in entropyCache, keyed by (start, end).
    // Labels are used as subscripts into a table of numClasses + 1 counters,
    // so they are assumed to lie in [0, numClasses] -- TODO confirm with callers.
    precision_t Metrics::entropy(size_t start, size_t end)
    {
        if (end - start < 2)
            return 0;
        // Single cache lookup, done before any allocation.
        const auto key = make_tuple(start, end);
        const auto cached = entropyCache.find(key);
        if (cached != entropyCache.end()) {
            return cached->second;
        }
        labels_t counts(numClasses + 1, 0);
        int nElements = 0;
        // Iterate by position: taking &indices[end] would subscript one past
        // the last element whenever end == indices.size().
        for (auto i = start; i < end; ++i) {
            counts[y[indices[i]]]++;
            nElements++;
        }
        precision_t ventropy = 0;
        for (auto count : counts) {
            if (count > 0) {
                const precision_t p = (precision_t)count / nElements;
                ventropy -= p * log2(p);
            }
        }
        entropyCache[key] = ventropy;
        return ventropy;
    }
    // Returns the information gain of splitting [start, end) at `cut`: the
    // entropy of the whole interval minus the size-weighted entropies of the
    // two sides. Results are memoised in igCache, keyed by (start, cut, end).
    precision_t Metrics::informationGain(size_t start, size_t cut, size_t end)
    {
        const auto key = make_tuple(start, cut, end);
        const auto hit = igCache.find(key);
        if (hit != igCache.end()) {
            return hit->second;
        }
        const int leftCount = cut - start;
        const int rightCount = end - cut;
        const int total = end - start;
        const precision_t whole = entropy(start, end);
        const precision_t left = entropy(start, cut);
        const precision_t right = entropy(cut, end);
        const precision_t gain = whole - ((precision_t)leftCount * left + (precision_t)rightCount * right) / total;
        igCache[key] = gain;
        return gain;
    }
 }
--- a/src/cppmdlp/Metrics.h
+++ b/src/cppmdlp/Metrics.h
@@ -1,20 +0,0 @@
 #ifndef CCMETRICS_H
 #define CCMETRICS_H
 #include "typesFImdlp.h"
 namespace mdlp {
    // Entropy and information-gain calculator over the labels of a dataset
    // visited in sorted order; results are cached per interval.
    class Metrics {
    protected:
        // References to the label and sorted-index containers supplied to the
        // constructor; this class does not own them.
        labels_t& y;
        indices_t& indices;
        // Number of distinct classes over the whole dataset.
        int numClasses;
        // Memoised entropy() and informationGain() results.
        cacheEnt_t entropyCache;
        cacheIg_t igCache;
    public:
        Metrics(labels_t&, indices_t&);
        // Replaces the data and clears both caches.
        void setData(labels_t&, indices_t&);
        // Distinct classes at sorted positions [start, end).
        int computeNumClasses(size_t, size_t);
        // Entropy of the sorted interval [start, end).
        precision_t entropy(size_t, size_t);
        // Gain of splitting [start, end) at cut; arguments are (start, cut, end).
        precision_t informationGain(size_t, size_t, size_t);
    };
 }
 #endif
--- a/src/cppmdlp/README.md
+++ b/src/cppmdlp/README.md
@@ -1,2 +0,0 @@
 # mdlp
 Discretization algorithm based on the paper by Fayyad &amp; Irani Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning
--- a/src/cppmdlp/sample/ArffFiles.cpp
+++ b/src/cppmdlp/sample/ArffFiles.cpp
@@ -1,117 +0,0 @@
 #include "ArffFiles.h"
 #include <fstream>
 #include <sstream>
 #include <map>
 #include <iostream>
 using namespace std;
 // Default constructor: does nothing; the containers stay empty until load() runs.
 ArffFiles::ArffFiles()
 {
 }
 // Returns a copy of the raw data lines kept by load() (comment and '@' lines excluded).
 vector<string> ArffFiles::getLines()
 {
    return lines;
 }
 // Returns the number of data lines kept by load().
 unsigned long int ArffFiles::getSize()
 {
    return lines.size();
 }
 // Returns a copy of the (name, type) attribute list; after load() the class
 // attribute has already been removed from it.
 vector<tuple<string, string>> ArffFiles::getAttributes()
 {
    return attributes;
 }
 // Returns the name of the attribute that load() selected as the class.
 string ArffFiles::getClassName()
 {
    return className;
 }
 // Returns the declared type of the class attribute.
 string ArffFiles::getClassType()
 {
    return classType;
 }
 // Returns a reference to the feature matrix, indexed as X[attribute][sample].
 vector<vector<float>>& ArffFiles::getX()
 {
    return X;
 }
 // Returns a reference to the integer-encoded class labels, one per data line.
 vector<int>& ArffFiles::getY()
 {
    return y;
 }
 // Parses an ARFF file: collects the @attribute declarations and the data
 // lines, separates the class attribute and builds X / y via generateDataset().
 //   fileName  - path of the ARFF file to read
 //   classLast - true when the class is the last declared attribute,
 //               false when it is the first one
 // Throws invalid_argument when the file cannot be opened or declares no
 // attributes.
 void ArffFiles::load(string fileName, bool classLast)
 {
    ifstream file(fileName);
    if (!file.is_open())
        throw invalid_argument("Unable to open file");
    // Start from a clean state so that a second load() replaces, rather than
    // appends to, the content of a previously loaded file.
    lines.clear();
    attributes.clear();
    string keyword, attribute, type;
    string line;
    while (getline(file, line)) {
        // Emptiness is tested first so line[0] is only read on non-empty lines.
        if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
            continue;
        }
        if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
            stringstream ss(line);
            ss >> keyword >> attribute >> type;
            attributes.push_back(make_tuple(attribute, type));
            continue;
        }
        // Any other directive (@relation, @data, ...) carries no data.
        if (line[0] == '@') {
            continue;
        }
        lines.push_back(line);
    }
    file.close();
    if (attributes.empty())
        throw invalid_argument("No attributes found");
    // Detach the class attribute so `attributes` only describes features.
    if (classLast) {
        className = get<0>(attributes.back());
        classType = get<1>(attributes.back());
        attributes.pop_back();
    } else {
        className = get<0>(attributes.front());
        classType = get<1>(attributes.front());
        attributes.erase(attributes.begin());
    }
    generateDataset(classLast);
 }
 void ArffFiles::generateDataset(bool classLast)
 {
    X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
    vector<string> yy = vector<string>(lines.size(), "");
    int labelIndex = classLast ? attributes.size() : 0;
    for (int i = 0; i < lines.size(); i++) {
        stringstream ss(lines[i]);
        string value;
        int pos = 0, xIndex = 0;
        while (getline(ss, value, ',')) {
            if (pos++ == labelIndex) {
                yy[i] = value;
            } else {
                X[xIndex++][i] = stof(value);
            }
        }
    }
    y = factorize(yy);
 }
 // Returns a copy of `source` without leading and trailing blanks
 // (space, newline, carriage return, tab). An all-blank input yields "".
 string ArffFiles::trim(const string& source)
 {
    const char* blanks = " \n\r\t";
    const auto first = source.find_first_not_of(blanks);
    if (first == string::npos) {
        return string();
    }
    const auto last = source.find_last_not_of(blanks);
    return source.substr(first, last - first + 1);
 }
 // Encodes string labels as integers: each distinct label receives the next
 // code (0, 1, 2, ...) in order of first appearance. The result has one
 // entry per input label.
 vector<int> ArffFiles::factorize(const vector<string>& labels_t)
 {
    map<string, int> codes;
    vector<int> encoded;
    encoded.reserve(labels_t.size());
    int nextCode = 0;
    for (const string& label : labels_t) {
        auto known = codes.find(label);
        if (known == codes.end()) {
            // First time this label is seen: hand out a fresh code.
            known = codes.emplace(label, nextCode++).first;
        }
        encoded.push_back(known->second);
    }
    return encoded;
 }
--- a/src/cppmdlp/sample/ArffFiles.h
+++ b/src/cppmdlp/sample/ArffFiles.h
@@ -1,28 +0,0 @@
 #ifndef ARFFFILES_H
 #define ARFFFILES_H
 #include <string>
 #include <vector>
 #include <tuple>
 using namespace std;
 // Reader for ARFF dataset files: keeps the attribute declarations, the raw
 // data lines, the numeric feature matrix and the factorized class labels.
 class ArffFiles {
 private:
    // Raw data lines collected by load().
    vector<string> lines;
    // (name, type) of every attribute; load() removes the class attribute
    // from this list before the dataset is generated.
    vector<tuple<string, string>> attributes;
    // Name and type of the class attribute taken out of `attributes`.
    string className, classType;
    // Feature matrix: one inner vector per attribute, one entry per data line.
    vector<vector<float>> X;
    // Integer class label of every data line, produced by factorize().
    vector<int> y;
    // Fills X and y from `lines`; the flag tells whether the class value is
    // the last field of each line (true) or the first one (false).
    void generateDataset(bool);
 public:
    ArffFiles();
    // Loads the file at the given path. The second argument is true when the
    // class is the last attribute, false when it is the first one.
    // Throws invalid_argument when the file cannot be opened or declares no
    // attributes.
    void load(string, bool = true);
    // NOTE(review): the accessors below are defined outside this view; they
    // presumably return the member with the matching name - confirm.
    vector<string> getLines();
    // NOTE(review): presumably the number of data lines - confirm.
    unsigned long int getSize();
    string getClassName();
    string getClassType();
    // Returns a copy of the argument without leading/trailing " \n\r\t".
    string trim(const string&);
    vector<vector<float>>& getX();
    vector<int>& getY();
    vector<tuple<string, string>> getAttributes();
    // Maps each label to an integer code assigned in order of first appearance.
    vector<int> factorize(const vector<string>& labels_t);
 };
 #endif
--- a/src/cppmdlp/sample/CMakeLists.txt
+++ b/src/cppmdlp/sample/CMakeLists.txt
@@ -1,6 +0,0 @@
 # Build script for the `sample` command line demo.
 cmake_minimum_required(VERSION 3.24)
 project(main)
 # Ask for C++17 when compiling the demo.
 set(CMAKE_CXX_STANDARD 17)
 # The demo is compiled together with the discretizer sources that live in
 # the parent directory.
 add_executable(sample sample.cpp ArffFiles.cpp ../Metrics.cpp ../CPPFImdlp.cpp)
--- a/src/cppmdlp/sample/sample.cpp
+++ b/src/cppmdlp/sample/sample.cpp
@@ -1,54 +0,0 @@
 #include "ArffFiles.h"
 #include <iostream>
 #include <vector>
 #include <iomanip>
 #include "../CPPFImdlp.h"
 using namespace std;
 int main(int argc, char** argv)
 {
    // Demo driver: loads one of the bundled ARFF datasets, prints its
    // attributes and first samples, then prints the cut points computed by
    // CPPFImdlp for every feature.
    // Returns 1 (after printing the usage) when the dataset name is missing
    // or unknown, 0 otherwise.
    ArffFiles file;
    string path = "../../tests/datasets/";
    // dataset name -> true when the class is the last attribute of the file
    map<string, bool > datasets = {
        {"mfeat-factors", true},
        {"iris", true},
        {"letter", true},
        {"kdd_JapaneseVowels", false}
    };
    if (argc != 2 || datasets.find(argv[1]) == datasets.end()) {
        cout << "Usage: " << argv[0] << " {mfeat-factors, iris, letter, kdd_JapaneseVowels}" << endl;
        return 1;
    }
    file.load(path + argv[1] + ".arff", datasets[argv[1]]);
    auto attributes = file.getAttributes();
    int items = file.getSize();
    cout << "Number of lines: " << items << endl;
    cout << "Attributes: " << endl;
    for (const auto& attribute : attributes) {
        cout << "Name: " << get<0>(attribute) << " Type: " << get<1>(attribute) << endl;
    }
    cout << "Class name: " << file.getClassName() << endl;
    cout << "Class type: " << file.getClassType() << endl;
    cout << "Data: " << endl;
    vector<vector<float>>& X = file.getX();
    vector<int>& y = file.getY();
    // Print at most the first 50 samples, never more than were loaded.
    const size_t shown = y.size() < 50 ? y.size() : 50;
    for (size_t i = 0; i < shown; i++) {
        // Iterate by reference: copying a whole feature column per printed
        // row is needless work.
        for (const auto& feature : X) {
            cout << fixed << setprecision(1) << feature[i] << " ";
        }
        cout << y[i] << endl;
    }
    mdlp::CPPFImdlp test = mdlp::CPPFImdlp(false);
    for (size_t i = 0; i < attributes.size(); i++) {
        cout << "Cut points for " << get<0>(attributes[i]) << endl;
        cout << "--------------------------" << setprecision(3) << endl;
        test.fit(X[i], y);
        for (auto item : test.getCutPoints()) {
            cout << item << endl;
        }
    }
    return 0;
 }
--- a/src/cppmdlp/tests/.gitignore
+++ b/src/cppmdlp/tests/.gitignore
@@ -1,2 +0,0 @@
 build
 build/*
--- a/src/cppmdlp/tests/CMakeLists.txt
+++ b/src/cppmdlp/tests/CMakeLists.txt
@@ -1,32 +0,0 @@
 cmake_minimum_required(VERSION 3.14)
 project(FImdlp)
 # GoogleTest needs C++14 or newer
 set(CMAKE_CXX_STANDARD 14)
 include(FetchContent)
 include_directories(${GTEST_INCLUDE_DIRS})
 # Download a pinned GoogleTest snapshot at configure time
 FetchContent_Declare(
  googletest
  URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip
 )
 # For Windows: do not let GoogleTest override the parent project's
 # compiler/linker settings
 set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
 FetchContent_MakeAvailable(googletest)
 enable_testing()
 include(GoogleTest)
 add_executable(Metrics_unittest ../Metrics.cpp Metrics_unittest.cpp)
 add_executable(FImdlp_unittest ../CPPFImdlp.cpp ../Metrics.cpp FImdlp_unittest.cpp)
 # Every test binary is linked against gtest_main, built and linked with
 # coverage instrumentation and registered with CTest
 foreach(unit_test Metrics_unittest FImdlp_unittest)
    target_link_libraries(${unit_test} GTest::gtest_main)
    target_compile_options(${unit_test} PRIVATE --coverage)
    target_link_options(${unit_test} PRIVATE --coverage)
    gtest_discover_tests(${unit_test})
 endforeach()
--- a/src/cppmdlp/tests/FImdlp_unittest.cpp
+++ b/src/cppmdlp/tests/FImdlp_unittest.cpp
@@ -1,141 +0,0 @@
 #include "gtest/gtest.h"
 #include "../Metrics.h"
 #include "../CPPFImdlp.h"
 #include <iostream>
 namespace mdlp {
    class TestFImdlp : public CPPFImdlp, public testing::Test {
    public:
        precision_t precision = 0.000001;
        TestFImdlp() : CPPFImdlp(false) {}
        void SetUp() {
            //    5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0]
            //(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2)
            X = {5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9};
            y = {1, 1, 1, 1, 1, 2, 2, 2, 2, 2};
            proposal = false;
            fit(X, y);
        }
        void setProposal(bool value) {
            proposal = value;
        }
        // void initIndices()
        // {
        //     indices = indices_t();
        // }
        void checkSortedVector() {
            indices_t testSortedIndices = sortIndices(X);
            precision_t prev = X[testSortedIndices[0]];
            for (auto i = 0; i < X.size(); ++i) {
                EXPECT_EQ(testSortedIndices[i], indices[i]);
                EXPECT_LE(prev, X[testSortedIndices[i]]);
                prev = X[testSortedIndices[i]];
            }
        }
        void checkCutPoints(cutPoints_t &expected) {
            int expectedSize = expected.size();
            EXPECT_EQ(cutPoints.size(), expectedSize);
            for (auto i = 0; i < cutPoints.size(); i++) {
                EXPECT_NEAR(cutPoints[i], expected[i], precision);
            }
        }
        template<typename T, typename A>
        void checkVectors(std::vector<T, A> const &expected, std::vector<T, A> const &computed) {
            EXPECT_EQ(expected.size(), computed.size());
            ASSERT_EQ(expected.size(), computed.size());
            for (auto i = 0; i < expected.size(); i++) {
                EXPECT_NEAR(expected[i], computed[i],precision);
            }
        }
    };
    TEST_F(TestFImdlp, FitErrorEmptyDataset) {
        X = samples_t();
        y = labels_t();
        EXPECT_THROW(fit(X, y), std::invalid_argument);
    }
    TEST_F(TestFImdlp, FitErrorDifferentSize) {
        X = {1, 2, 3};
        y = {1, 2};
        EXPECT_THROW(fit(X, y), std::invalid_argument);
    }
    TEST_F(TestFImdlp, SortIndices) {
        X = {5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9};
        indices = {4, 3, 6, 8, 2, 1, 5, 0, 9, 7};
        checkSortedVector();
        X = {5.77, 5.88, 5.99};
        indices = {0, 1, 2};
        checkSortedVector();
        X = {5.33, 5.22, 5.11};
        indices = {2, 1, 0};
        checkSortedVector();
    }
    TEST_F(TestFImdlp, TestDataset) {
        proposal = false;
        fit(X, y);
        computeCutPointsOriginal(0, 10);
        cutPoints_t expected = {5.6499996185302734};
        vector<precision_t> computed = getCutPoints();
        computed = getCutPoints();
        int expectedSize = expected.size();
        EXPECT_EQ(computed.size(), expected.size());
        for (auto i = 0; i < expectedSize; i++) {
            EXPECT_NEAR(computed[i], expected[i], precision);
        }
    }
    TEST_F(TestFImdlp, ComputeCutPointsOriginal) {
        cutPoints_t expected = {5.65};
        proposal = false;
        computeCutPointsOriginal(0, 10);
        checkCutPoints(expected);
    }
    TEST_F(TestFImdlp, ComputeCutPointsOriginalGCase) {
        cutPoints_t expected;
        proposal = false;
        expected = {2};
        samples_t X_ = {0, 1, 2, 2};
        labels_t y_ = {1, 1, 1, 2};
        fit(X_, y_);
        checkCutPoints(expected);
    }
    TEST_F(TestFImdlp, ComputeCutPointsProposal) {
        proposal = true;
        cutPoints_t expected;
        expected = {};
        fit(X, y);
        computeCutPointsProposal();
        checkCutPoints(expected);
    }
    TEST_F(TestFImdlp, ComputeCutPointsProposalGCase) {
        cutPoints_t expected;
        expected = {1.5};
        proposal = true;
        samples_t X_ = {0, 1, 2, 2};
        labels_t y_ = {1, 1, 1, 2};
        fit(X_, y_);
        checkCutPoints(expected);
    }
    TEST_F(TestFImdlp, GetCutPoints) {
        samples_t computed, expected = {5.65};
        proposal = false;
        computeCutPointsOriginal(0, 10);
        computed = getCutPoints();
        for (auto item: cutPoints)
            cout << setprecision(6) << item << endl;
        checkVectors(expected, computed);
    }
 }
--- a/src/cppmdlp/tests/Metrics_unittest.cpp
+++ b/src/cppmdlp/tests/Metrics_unittest.cpp
@@ -1,43 +0,0 @@
 #include "gtest/gtest.h"
 #include "../Metrics.h"
 namespace mdlp {
    class TestMetrics: public Metrics, public testing::Test {
    public:
        labels_t y;
        samples_t X;
        indices_t indices;
        precision_t precision = 0.000001;
        TestMetrics(): Metrics(y, indices) {}
        void SetUp()
        {
            y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
            indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
            setData(y, indices);
        }
    };
    TEST_F(TestMetrics, NumClasses)
    {
        y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
        EXPECT_EQ(1, computeNumClasses(4, 8));
        EXPECT_EQ(2, computeNumClasses(0, 10));
        EXPECT_EQ(2, computeNumClasses(8, 10));
    }
    TEST_F(TestMetrics, Entropy)
    {
        EXPECT_EQ(1, entropy(0, 10));
        EXPECT_EQ(0, entropy(0, 5));
        y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
        setData(y, indices);
        ASSERT_NEAR(0.468996, entropy(0, 10), precision);
    }
    TEST_F(TestMetrics, InformationGain)
    {
        ASSERT_NEAR(1, informationGain(0, 5, 10), precision);
        y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
        setData(y, indices);
        ASSERT_NEAR(0.108032, informationGain(0, 5, 10), precision);
    }
 }
--- a/src/cppmdlp/tests/cover
+++ b/src/cppmdlp/tests/cover
@@ -1,4 +0,0 @@
 # Builds an HTML coverage report from the coverage data found under the
 # current directory and opens it.
 # Make sure the output directory exists; lcov cannot create its output
 # file inside a missing directory.
 mkdir -p lcoverage
 rm -fr lcoverage/*
 lcov --capture --directory ./ --output-file lcoverage/main_coverage.info
 genhtml lcoverage/main_coverage.info --output-directory lcoverage
 open lcoverage/index.html
--- a/src/cppmdlp/tests/datasets/iris.arff
+++ b/src/cppmdlp/tests/datasets/iris.arff
@@ -1,225 +0,0 @@
 % 1. Title: Iris Plants Database
 % 
 % 2. Sources:
 %      (a) Creator: R.A. Fisher
 %      (b) Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
 %      (c) Date: July, 1988
 % 
 % 3. Past Usage:
 %    - Publications: too many to mention!!!  Here are a few.
 %    1. Fisher,R.A. "The use of multiple measurements in taxonomic problems"
 %       Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions
 %       to Mathematical Statistics" (John Wiley, NY, 1950).
 %    2. Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.
 %       (Q327.D83) John Wiley & Sons.  ISBN 0-471-22361-1.  See page 218.
 %    3. Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
 %       Structure and Classification Rule for Recognition in Partially Exposed
 %       Environments".  IEEE Transactions on Pattern Analysis and Machine
 %       Intelligence, Vol. PAMI-2, No. 1, 67-71.
 %       -- Results:
 %          -- very low misclassification rates (0% for the setosa class)
 %    4. Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule".  IEEE 
 %       Transactions on Information Theory, May 1972, 431-433.
 %       -- Results:
 %          -- very low misclassification rates again
 %    5. See also: 1988 MLC Proceedings, 54-64.  Cheeseman et al's AUTOCLASS II
 %       conceptual clustering system finds 3 classes in the data.
 % 
 % 4. Relevant Information:
 %    --- This is perhaps the best known database to be found in the pattern
 %        recognition literature.  Fisher's paper is a classic in the field
 %        and is referenced frequently to this day.  (See Duda & Hart, for
 %        example.)  The data set contains 3 classes of 50 instances each,
 %        where each class refers to a type of iris plant.  One class is
 %        linearly separable from the other 2; the latter are NOT linearly
 %        separable from each other.
 %    --- Predicted attribute: class of iris plant.
 %    --- This is an exceedingly simple domain.
 % 
 % 5. Number of Instances: 150 (50 in each of three classes)
 % 
 % 6. Number of Attributes: 4 numeric, predictive attributes and the class
 % 
 % 7. Attribute Information:
 %    1. sepal length in cm
 %    2. sepal width in cm
 %    3. petal length in cm
 %    4. petal width in cm
 %    5. class: 
 %       -- Iris Setosa
 %       -- Iris Versicolour
 %       -- Iris Virginica
 % 
 % 8. Missing Attribute Values: None
 % 
 % Summary Statistics:
 %  	           Min  Max   Mean    SD   Class Correlation
 %    sepal length: 4.3  7.9   5.84  0.83    0.7826   
 %     sepal width: 2.0  4.4   3.05  0.43   -0.4194
 %    petal length: 1.0  6.9   3.76  1.76    0.9490  (high!)
 %     petal width: 0.1  2.5   1.20  0.76    0.9565  (high!)
 % 
 % 9. Class Distribution: 33.3% for each of 3 classes.
@RELATION iris
@ATTRIBUTE sepallength	REAL
@ATTRIBUTE sepalwidth 	REAL
@ATTRIBUTE petallength 	REAL
@ATTRIBUTE petalwidth	REAL
@ATTRIBUTE class 	{Iris-setosa,Iris-versicolor,Iris-virginica}
@DATA
 5.1,3.5,1.4,0.2,Iris-setosa
 4.9,3.0,1.4,0.2,Iris-setosa
 4.7,3.2,1.3,0.2,Iris-setosa
 4.6,3.1,1.5,0.2,Iris-setosa
 5.0,3.6,1.4,0.2,Iris-setosa
 5.4,3.9,1.7,0.4,Iris-setosa
 4.6,3.4,1.4,0.3,Iris-setosa
 5.0,3.4,1.5,0.2,Iris-setosa
 4.4,2.9,1.4,0.2,Iris-setosa
 4.9,3.1,1.5,0.1,Iris-setosa
 5.4,3.7,1.5,0.2,Iris-setosa
 4.8,3.4,1.6,0.2,Iris-setosa
 4.8,3.0,1.4,0.1,Iris-setosa
 4.3,3.0,1.1,0.1,Iris-setosa
 5.8,4.0,1.2,0.2,Iris-setosa
 5.7,4.4,1.5,0.4,Iris-setosa
 5.4,3.9,1.3,0.4,Iris-setosa
 5.1,3.5,1.4,0.3,Iris-setosa
 5.7,3.8,1.7,0.3,Iris-setosa
 5.1,3.8,1.5,0.3,Iris-setosa
 5.4,3.4,1.7,0.2,Iris-setosa
 5.1,3.7,1.5,0.4,Iris-setosa
 4.6,3.6,1.0,0.2,Iris-setosa
 5.1,3.3,1.7,0.5,Iris-setosa
 4.8,3.4,1.9,0.2,Iris-setosa
 5.0,3.0,1.6,0.2,Iris-setosa
 5.0,3.4,1.6,0.4,Iris-setosa
 5.2,3.5,1.5,0.2,Iris-setosa
 5.2,3.4,1.4,0.2,Iris-setosa
 4.7,3.2,1.6,0.2,Iris-setosa
 4.8,3.1,1.6,0.2,Iris-setosa
 5.4,3.4,1.5,0.4,Iris-setosa
 5.2,4.1,1.5,0.1,Iris-setosa
 5.5,4.2,1.4,0.2,Iris-setosa
 4.9,3.1,1.5,0.1,Iris-setosa
 5.0,3.2,1.2,0.2,Iris-setosa
 5.5,3.5,1.3,0.2,Iris-setosa
 4.9,3.1,1.5,0.1,Iris-setosa
 4.4,3.0,1.3,0.2,Iris-setosa
 5.1,3.4,1.5,0.2,Iris-setosa
 5.0,3.5,1.3,0.3,Iris-setosa
 4.5,2.3,1.3,0.3,Iris-setosa
 4.4,3.2,1.3,0.2,Iris-setosa
 5.0,3.5,1.6,0.6,Iris-setosa
 5.1,3.8,1.9,0.4,Iris-setosa
 4.8,3.0,1.4,0.3,Iris-setosa
 5.1,3.8,1.6,0.2,Iris-setosa
 4.6,3.2,1.4,0.2,Iris-setosa
 5.3,3.7,1.5,0.2,Iris-setosa
 5.0,3.3,1.4,0.2,Iris-setosa
 7.0,3.2,4.7,1.4,Iris-versicolor
 6.4,3.2,4.5,1.5,Iris-versicolor
 6.9,3.1,4.9,1.5,Iris-versicolor
 5.5,2.3,4.0,1.3,Iris-versicolor
 6.5,2.8,4.6,1.5,Iris-versicolor
 5.7,2.8,4.5,1.3,Iris-versicolor
 6.3,3.3,4.7,1.6,Iris-versicolor
 4.9,2.4,3.3,1.0,Iris-versicolor
 6.6,2.9,4.6,1.3,Iris-versicolor
 5.2,2.7,3.9,1.4,Iris-versicolor
 5.0,2.0,3.5,1.0,Iris-versicolor
 5.9,3.0,4.2,1.5,Iris-versicolor
 6.0,2.2,4.0,1.0,Iris-versicolor
 6.1,2.9,4.7,1.4,Iris-versicolor
 5.6,2.9,3.6,1.3,Iris-versicolor
 6.7,3.1,4.4,1.4,Iris-versicolor
 5.6,3.0,4.5,1.5,Iris-versicolor
 5.8,2.7,4.1,1.0,Iris-versicolor
 6.2,2.2,4.5,1.5,Iris-versicolor
 5.6,2.5,3.9,1.1,Iris-versicolor
 5.9,3.2,4.8,1.8,Iris-versicolor
 6.1,2.8,4.0,1.3,Iris-versicolor
 6.3,2.5,4.9,1.5,Iris-versicolor
 6.1,2.8,4.7,1.2,Iris-versicolor
 6.4,2.9,4.3,1.3,Iris-versicolor
 6.6,3.0,4.4,1.4,Iris-versicolor
 6.8,2.8,4.8,1.4,Iris-versicolor
 6.7,3.0,5.0,1.7,Iris-versicolor
 6.0,2.9,4.5,1.5,Iris-versicolor
 5.7,2.6,3.5,1.0,Iris-versicolor
 5.5,2.4,3.8,1.1,Iris-versicolor
 5.5,2.4,3.7,1.0,Iris-versicolor
 5.8,2.7,3.9,1.2,Iris-versicolor
 6.0,2.7,5.1,1.6,Iris-versicolor
 5.4,3.0,4.5,1.5,Iris-versicolor
 6.0,3.4,4.5,1.6,Iris-versicolor
 6.7,3.1,4.7,1.5,Iris-versicolor
 6.3,2.3,4.4,1.3,Iris-versicolor
 5.6,3.0,4.1,1.3,Iris-versicolor
 5.5,2.5,4.0,1.3,Iris-versicolor
 5.5,2.6,4.4,1.2,Iris-versicolor
 6.1,3.0,4.6,1.4,Iris-versicolor
 5.8,2.6,4.0,1.2,Iris-versicolor
 5.0,2.3,3.3,1.0,Iris-versicolor
 5.6,2.7,4.2,1.3,Iris-versicolor
 5.7,3.0,4.2,1.2,Iris-versicolor
 5.7,2.9,4.2,1.3,Iris-versicolor
 6.2,2.9,4.3,1.3,Iris-versicolor
 5.1,2.5,3.0,1.1,Iris-versicolor
 5.7,2.8,4.1,1.3,Iris-versicolor
 6.3,3.3,6.0,2.5,Iris-virginica
 5.8,2.7,5.1,1.9,Iris-virginica
 7.1,3.0,5.9,2.1,Iris-virginica
 6.3,2.9,5.6,1.8,Iris-virginica
 6.5,3.0,5.8,2.2,Iris-virginica
 7.6,3.0,6.6,2.1,Iris-virginica
 4.9,2.5,4.5,1.7,Iris-virginica
 7.3,2.9,6.3,1.8,Iris-virginica
 6.7,2.5,5.8,1.8,Iris-virginica
 7.2,3.6,6.1,2.5,Iris-virginica
 6.5,3.2,5.1,2.0,Iris-virginica
 6.4,2.7,5.3,1.9,Iris-virginica
 6.8,3.0,5.5,2.1,Iris-virginica
 5.7,2.5,5.0,2.0,Iris-virginica
 5.8,2.8,5.1,2.4,Iris-virginica
 6.4,3.2,5.3,2.3,Iris-virginica
 6.5,3.0,5.5,1.8,Iris-virginica
 7.7,3.8,6.7,2.2,Iris-virginica
 7.7,2.6,6.9,2.3,Iris-virginica
 6.0,2.2,5.0,1.5,Iris-virginica
 6.9,3.2,5.7,2.3,Iris-virginica
 5.6,2.8,4.9,2.0,Iris-virginica
 7.7,2.8,6.7,2.0,Iris-virginica
 6.3,2.7,4.9,1.8,Iris-virginica
 6.7,3.3,5.7,2.1,Iris-virginica
 7.2,3.2,6.0,1.8,Iris-virginica
 6.2,2.8,4.8,1.8,Iris-virginica
 6.1,3.0,4.9,1.8,Iris-virginica
 6.4,2.8,5.6,2.1,Iris-virginica
 7.2,3.0,5.8,1.6,Iris-virginica
 7.4,2.8,6.1,1.9,Iris-virginica
 7.9,3.8,6.4,2.0,Iris-virginica
 6.4,2.8,5.6,2.2,Iris-virginica
 6.3,2.8,5.1,1.5,Iris-virginica
 6.1,2.6,5.6,1.4,Iris-virginica
 7.7,3.0,6.1,2.3,Iris-virginica
 6.3,3.4,5.6,2.4,Iris-virginica
 6.4,3.1,5.5,1.8,Iris-virginica
 6.0,3.0,4.8,1.8,Iris-virginica
 6.9,3.1,5.4,2.1,Iris-virginica
 6.7,3.1,5.6,2.4,Iris-virginica
 6.9,3.1,5.1,2.3,Iris-virginica
 5.8,2.7,5.1,1.9,Iris-virginica
 6.8,3.2,5.9,2.3,Iris-virginica
 6.7,3.3,5.7,2.5,Iris-virginica
 6.7,3.0,5.2,2.3,Iris-virginica
 6.3,2.5,5.0,1.9,Iris-virginica
 6.5,3.0,5.2,2.0,Iris-virginica
 6.2,3.4,5.4,2.3,Iris-virginica
 5.9,3.0,5.1,1.8,Iris-virginica
 %
 %
 %
--- a/src/cppmdlp/tests/datasets/kdd_JapaneseVowels.arff
+++ b/src/cppmdlp/tests/datasets/kdd_JapaneseVowels.arff
--- a/src/cppmdlp/tests/datasets/letter.arff
+++ b/src/cppmdlp/tests/datasets/letter.arff
--- a/src/cppmdlp/tests/datasets/mfeat-factors.arff
+++ b/src/cppmdlp/tests/datasets/mfeat-factors.arff
--- a/src/cppmdlp/tests/test
+++ b/src/cppmdlp/tests/test
@@ -1,12 +0,0 @@
 # Configure, build and run the unit tests; stop with status 1 as soon as
 # the configure or the build step fails.
 if ! cmake -S . -B build -Wno-dev; then
   echo "Error in creating build commands."
   exit 1
 fi
 if ! cmake --build build; then
   echo "Error in build command."
   exit 1
 fi
 cd build
 ctest --output-on-failure
--- a/src/cppmdlp/typesFImdlp.h
+++ b/src/cppmdlp/typesFImdlp.h
@@ -1,16 +0,0 @@
 #ifndef TYPES_H
 #define TYPES_H
 #include <map>
 // <tuple> is needed by the cache key types below; do not rely on another
 // header pulling it in.
 #include <tuple>
 #include <vector>
 // NOTE(review): left in place because sources that include this header may
 // rely on unqualified std names.
 using namespace std;
 namespace mdlp {
    // Shared type aliases of the discretizer.
    typedef float precision_t;                  // numeric type of samples and cut points
    typedef vector<precision_t> samples_t;      // values of one feature
    typedef vector<int> labels_t;               // integer class labels
    typedef vector<size_t> indices_t;           // positions into samples/labels
    typedef vector<precision_t> cutPoints_t;    // computed cut points
    // Caches keyed by integer tuples.
    // NOTE(review): the keys are presumably interval bounds (start, end) and
    // (start, cut, end) - confirm against Metrics.
    typedef map<tuple<int, int>, precision_t> cacheEnt_t;
    typedef map<tuple<int, int, int>, precision_t> cacheIg_t;
 }
 #endif
--- a/src/fimdlp/Factorize.cpp
+++ b/src/fimdlp/Factorize.cpp
@@ -0,0 +1,18 @@
 #include "Factorize.h"
 namespace utils {
    // Encodes every label as an integer code. Codes start at 0 and are
    // handed out in order of first appearance; equal labels share a code.
    // Returns one code per input label, in input order.
    std::vector<int> cppFactorize(const std::vector<std::string>& labels_t)
    {
        std::vector<int> yy;
        yy.reserve(labels_t.size());
        std::map<std::string, int> labelMap;
        for (const std::string& label : labels_t) {
            // emplace() inserts only when the label is new; the map size
            // before the insertion is exactly the next unused code.
            auto inserted = labelMap.emplace(label, static_cast<int>(labelMap.size()));
            yy.push_back(inserted.first->second);
        }
        return yy;
    }
 }
--- a/src/fimdlp/Factorize.h
+++ b/src/fimdlp/Factorize.h
@@ -0,0 +1,10 @@
 // Declaration of the label factorization helper.
 #ifndef FACTORIZE_H
 #define FACTORIZE_H
 #include <vector>
 #include <map>
 #include <string>
 namespace utils {
    // Makes unqualified std names (vector, string, map) visible inside
    // `utils`; the definition in Factorize.cpp is written with them.
    using namespace std;
    // Returns one integer code per input label, in input order. Codes start
    // at 0 and are assigned in order of first appearance.
    vector<int> cppFactorize(const vector<string>&);
 }
 #endif
--- a/src/fimdlp/init.py
+++ b/src/fimdlp/init.py
@@ -1,8 +1,4 @@
 from ._version import __version__
 def version():
    return __version__
 all = ["FImdlp", "__version__"]
--- a/src/fimdlp/_version.py
+++ b/src/fimdlp/_version.py
@@ -1 +1 @@
-__version__ = "0.9.1"
+__version__ = "0.9.3"
--- a/src/fimdlp/cfimdlp.pyx
+++ b/src/fimdlp/cfimdlp.pyx
@@ -1,20 +1,20 @@
 # distutils: language = c++
 # cython: language_level = 3
 from libcpp.vector cimport vector
-from libcpp cimport bool
+from libcpp.string cimport string
 cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp":
    ctypedef float precision_t
    cdef cppclass CPPFImdlp:
-        CPPFImdlp(bool) except + 
+        CPPFImdlp() except + 
        CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
        vector[precision_t] getCutPoints()
-        
+        string version()
 cdef class CFImdlp:
    cdef CPPFImdlp *thisptr
-    def __cinit__(self, proposal):
+    def __cinit__(self):
-        self.thisptr = new CPPFImdlp(proposal)
+        self.thisptr = new CPPFImdlp()
    def __dealloc__(self):
        del self.thisptr
    def fit(self, X, y):
@@ -22,4 +22,12 @@ cdef class CFImdlp:
        return self
    def get_cut_points(self):
        return self.thisptr.getCutPoints()
    def get_version(self):
        return self.thisptr.version()
    def __reduce__(self):
        return (CFImdlp, ())
 cdef extern from "Factorize.h" namespace "utils":
    vector[int] cppFactorize(vector[string] &input_vector)
 def factorize(input_vector):
    return cppFactorize(input_vector)
--- a/src/fimdlp/mdlp.py
+++ b/src/fimdlp/mdlp.py
@@ -1,15 +1,17 @@
 import numpy as np
-from .cppfimdlp import CFImdlp
+from .cppfimdlp import CFImdlp, factorize
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils.multiclass import unique_labels
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 from joblib import Parallel, delayed
 from ._version import __version__
 # from ._version import __version__
 class FImdlp(TransformerMixin, BaseEstimator):
-    def __init__(self, n_jobs=-1, proposal=False):
+    def __init__(self, n_jobs=-1):
        self.n_jobs = n_jobs
        self.proposal = proposal
    """Fayyad - Irani MDLP discretization algorithm based implementation.
@@ -22,27 +24,26 @@ class FImdlp(TransformerMixin, BaseEstimator):
    Attributes
    ----------
-    n_features_ : int
+    n_features_in_ : int
        The number of features of the data passed to :meth:`fit`.
    discretizer_ : list
        The list of discretizers, one for each feature.
    cut_points_ : list
        The list of cut points for each feature.
-    X_ : array
+    X_ : array, shape (n_samples, n_features)
-        the samples used to fit, shape (n_samples, n_features)
+        the samples used to fit
-    y_ : array
+    y_ : array, shape(n_samples,)
-        the labels used to fit, shape (n_samples,)
+        the labels used to fit
    features_ : list
        the list of features to be discretized
    """
-    def _check_params_fit(self, X, y, expected_args, kwargs):
+    def _more_tags(self):
-        """Check the common parameters passed to fit"""
+        return {"preserves_dtype": [np.int32], "requires_y": True}
    def _check_args(self, X, y, expected_args, kwargs):
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        self.n_classes_ = self.classes_.shape[0]
        # Default values
        self.features_ = [i for i in range(X.shape[1])]
        for key, value in kwargs.items():
@@ -63,15 +64,24 @@ class FImdlp(TransformerMixin, BaseEstimator):
            raise ValueError("Feature index out of range")
        return X, y
    def _update_params(self, X, y):
        """Record what was seen during fit: the class labels, how many
        there are and the number of input features."""
        labels = unique_labels(y)
        self.classes_ = labels
        self.n_classes_ = labels.shape[0]
        self.n_features_in_ = X.shape[1]
    @staticmethod
    def get_version():
        """Return the package version followed, in parentheses, by the
        version reported by the compiled discretizer."""
        package_version = __version__
        compiled_version = CFImdlp().get_version().decode()
        return f"{package_version}({compiled_version})"
    def fit(self, X, y, **kwargs):
        """A reference implementation of a fitting function for a transformer.
        Parameters
        ----------
-        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+        X : array, shape (n_samples, n_features)
            The training input samples.
-        y : None
+        y : array, shape (n_samples,)
-            There is no need of a target in a transformer, yet the pipeline API
+            the labels used to fit
            requires this parameter.
        features : list, default=[i for i in range(n_features)]
            The list of features to be discretized.
        Returns
@@ -79,24 +89,30 @@ class FImdlp(TransformerMixin, BaseEstimator):
        self : object
            Returns self.
        """
-        X, y = self._check_params_fit(
+        X, y = self._check_args(
            X, y, expected_args=["features"], kwargs=kwargs
        )
-        self.n_features_ = X.shape[1]
+        self._update_params(X, y)
        self.X_ = X
        self.y_ = y
-        self.discretizer_ = [None] * self.n_features_
+        self.discretizer_ = [None] * self.n_features_in_
-        self.cut_points_ = [None] * self.n_features_
+        self.cut_points_ = [None] * self.n_features_in_
        Parallel(n_jobs=self.n_jobs, prefer="threads")(
            delayed(self._fit_discretizer)(feature)
-            for feature in range(self.n_features_)
+            for feature in range(self.n_features_in_)
        )
        return self
    def _fit_discretizer(self, feature):
-        self.discretizer_[feature] = CFImdlp(proposal=self.proposal)
+        if feature in self.features_:
            self.discretizer_[feature] = CFImdlp()
            self.discretizer_[feature].fit(self.X_[:, feature], self.y_)
-        self.cut_points_[feature] = self.discretizer_[feature].get_cut_points()
+            self.cut_points_[feature] = self.discretizer_[
                feature
            ].get_cut_points()
        else:
            self.discretizer_[feature] = None
            self.cut_points_[feature] = []
    def _discretize_feature(self, feature, X, result):
        if feature in self.features_:
@@ -108,7 +124,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
        """Discretize X values.
        Parameters
        ----------
-        X : {array-like}, shape (n_samples, n_features)
+        X : array, shape (n_samples, n_features)
            The input samples.
        Returns
        -------
@@ -116,22 +132,41 @@ class FImdlp(TransformerMixin, BaseEstimator):
            The array containing the discretized values of ``X``.
        """
        # Check is fit had been called
-        check_is_fitted(self, "n_features_")
+        check_is_fitted(self, "n_features_in_")
        # Input validation
        X = check_array(X)
        # Check that the input is of the same shape as the one passed
        # during fit.
-        if X.shape[1] != self.n_features_:
+        if X.shape[1] != self.n_features_in_:
            raise ValueError(
                "Shape of input is different from what was seen in `fit`"
            )
        if len(self.features_) == self.n_features_in_:
            result = np.zeros_like(X, dtype=np.int32) - 1
        else:
            result = np.zeros_like(X) - 1
        Parallel(n_jobs=self.n_jobs, prefer="threads")(
            delayed(self._discretize_feature)(feature, X[:, feature], result)
-            for feature in range(self.n_features_)
+            for feature in range(self.n_features_in_)
        )
        return result
    @staticmethod
    def factorize(yy):
        """Factorize the input labels

        Equal labels receive the same integer code; codes start at 0 and
        are assigned in order of first appearance.

        Parameters
        ----------
        yy : array, shape (n_samples,)
            Labels to be factorized, MUST be bytes, i.e. b"0", b"1", ...

        Returns
        -------
        array, shape (n_samples,)
            Factorized labels, one integer code per input label
        """
        # `factorize` here is the module-level function imported from the
        # compiled extension, not this static method.
        return factorize(yy)
    def get_cut_points(self):
        """Get the cut points for each feature.
        Returns
@@ -140,6 +175,70 @@ class FImdlp(TransformerMixin, BaseEstimator):
            The list of cut points for each feature.
        """
        result = []
-        for feature in range(self.n_features_):
+        for feature in range(self.n_features_in_):
            result.append(self.cut_points_[feature])
        return result
    def get_states_feature(self, feature):
        """List the discrete states a feature can take.

        Parameters
        ----------
        feature : int
            index of the feature to query

        Returns
        -------
        list or None
            ``[0, 1, ..., n_cut_points]`` for a discretized feature,
            ``None`` when the feature is not in ``features_``
        """
        if feature not in self.features_:
            return None
        n_states = len(self.cut_points_[feature]) + 1
        return list(range(n_states))
    def join_fit(self, features, target, data):
        """Join the selected features with the labels and fit the discretizer
        of the target variable

        join - fit - transform

        Parameters
        ----------
        features : list
            index of the features to join with the labels
        target : int
            index of the target variable to discretize
        data : array, shape (n_samples, n_features)
            dataset that contains the features to join

        Returns
        -------
        result : np.array
            The target variable newly discretized

        Raises
        ------
        ValueError
            if the number of features or any feature/target index is out of
            range, if the target is one of the features to join, or if the
            target was not discretized in fit
        """
        check_is_fitted(self, "n_features_in_")
        if len(features) < 1 or len(features) > self.n_features_in_:
            raise ValueError(
                "Number of features must be in range [1, "
                f"{self.n_features_in_}]"
            )
        for feature in features:
            if feature < 0 or feature >= self.n_features_in_:
                raise ValueError(
                    f"Feature {feature} not in range [0, "
                    f"{self.n_features_in_})"
                )
        if target < 0 or target >= self.n_features_in_:
            raise ValueError(
                f"Target {target} not in range [0, {self.n_features_in_})"
            )
        if target in features:
            raise ValueError("Target cannot in features to join")
        # fit() stores None as the discretizer of every feature that was
        # left out of `features_`; fail clearly instead of crashing on None.
        if self.discretizer_[target] is None:
            raise ValueError(f"Target {target} was not discretized in fit")
        # New label per sample: the original label followed by the values of
        # the joined features, encoded as bytes for the factorizer.
        y_join = [
            f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode()
            for item_y, items_x in zip(self.y_, data[:, features])
        ]
        self.y_join_ = y_join
        self.discretizer_[target].fit(self.X_[:, target], factorize(y_join))
        self.cut_points_[target] = self.discretizer_[target].get_cut_points()
        # return the discretized target variable with the new cut points
        return np.searchsorted(self.cut_points_[target], self.X_[:, target])
--- a/src/fimdlp/tests/FImdlp_test.py
+++ b/src/fimdlp/tests/FImdlp_test.py
@@ -1,72 +1,46 @@
 import unittest
 import sklearn
 from sklearn.datasets import load_iris
 import numpy as np
 from sklearn.datasets import load_iris
 from sklearn.utils.estimator_checks import check_estimator
 from ..cppfimdlp import CFImdlp, factorize
 from ..mdlp import FImdlp
-from .. import version
+from .. import __version__
-from .._version import __version__
+
 # from .._version import __version__
 class FImdlpTest(unittest.TestCase):
    def test_version(self):
-        self.assertEqual(version(), __version__)
+        clf = FImdlp()
        self.assertEqual(
            clf.get_version(),
            f"{__version__}({CFImdlp().get_version().decode()})",
        )
    def test_init(self):
        clf = FImdlp()
        self.assertEqual(-1, clf.n_jobs)
-        self.assertFalse(clf.proposal)
+        clf = FImdlp(n_jobs=7)
        clf = FImdlp(proposal=True, n_jobs=7)
        self.assertTrue(clf.proposal)
        self.assertEqual(7, clf.n_jobs)
-    def test_fit_proposal(self):
+    def test_fit_definitive(self):
-        clf = FImdlp(proposal=True)
+        clf = FImdlp()
        clf.fit([[1, 2], [3, 4]], [1, 2])
        self.assertEqual(clf.n_features_, 2)
        self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
        self.assertListEqual(clf.y_.tolist(), [1, 2])
        self.assertListEqual([[], []], clf.get_cut_points())
        X, y = load_iris(return_X_y=True)
        clf.fit(X, y)
-        self.assertEqual(clf.n_features_, 4)
+        self.assertEqual(clf.n_features_in_, 4)
        self.assertTrue(np.array_equal(X, clf.X_))
        self.assertTrue(np.array_equal(y, clf.y_))
        expected = [
-            [
+            [5.449999809265137, 5.75],
-                4.900000095367432,
+            [2.75, 2.8499999046325684, 2.95, 3.05, 3.3499999046325684],
-                5.0,
+            [2.45, 4.75, 5.050000190734863],
-                5.099999904632568,
+            [0.8, 1.75],
                5.400000095367432,
                5.699999809265137,
            ],
            [2.6999998092651367, 2.9000000953674316, 3.1999998092651367],
            [2.3499999046325684, 4.5, 4.800000190734863],
            [0.75, 1.399999976158142, 1.5, 1.7000000476837158],
        ]
-        self.assertListEqual(expected, clf.get_cut_points())
+        computed = clf.get_cut_points()
-        self.assertListEqual([0, 1, 2, 3], clf.features_)
+        for item_computed, item_expected in zip(computed, expected):
-        clf.fit(X, y, features=[0, 2, 3])
+            for x_, y_ in zip(item_computed, item_expected):
-        self.assertListEqual([0, 2, 3], clf.features_)
+                self.assertAlmostEqual(x_, y_)
    def test_fit_original(self):
        """Fit with ``proposal=False`` on a toy dataset and on iris."""
        discretizer = FImdlp(proposal=False)
        # Toy dataset: the fitted state is stored and no cut point is found
        discretizer.fit([[1, 2], [3, 4]], [1, 2])
        self.assertEqual(discretizer.n_features_, 2)
        self.assertListEqual(discretizer.X_.tolist(), [[1, 2], [3, 4]])
        self.assertListEqual(discretizer.y_.tolist(), [1, 2])
        self.assertListEqual([[], []], discretizer.get_cut_points())
        # Iris dataset: compare the cut points of every feature
        X, y = load_iris(return_X_y=True)
        discretizer.fit(X, y)
        self.assertEqual(discretizer.n_features_, 4)
        self.assertTrue(np.array_equal(X, discretizer.X_))
        self.assertTrue(np.array_equal(y, discretizer.y_))
        expected = [
            [5.5, 5.800000190734863],
            [3.0999999046325684],
            [2.450000047683716, 4.800000190734863, 5.099999904632568],
            [0.800000011920929, 1.7000000476837158],
        ]
        self.assertListEqual(expected, discretizer.get_cut_points())
        # All the features are used unless a subset is requested
        self.assertListEqual([0, 1, 2, 3], discretizer.features_)
        discretizer.fit(X, y, features=[0, 2, 3])
        self.assertListEqual([0, 2, 3], discretizer.features_)
@@ -87,67 +61,169 @@ class FImdlpTest(unittest.TestCase):
            clf.fit([[1, 2], [3, 4]], [1, 2], features=[0, 2])
    def test_fit_features(self):
        """Fit only a subset of features and check the others stay intact."""
        discretizer = FImdlp(n_jobs=-1)
        # Two samples don't provide enough information to split
        discretizer.fit([[1, -2], [3, 4]], [1, 2], features=[0])
        self.assertListEqual(discretizer.get_cut_points(), [[], []])
        # With three samples the selected feature gets one cut point
        discretizer.fit([[1, -2], [3, 4], [5, 6]], [1, 2, 2], features=[0])
        self.assertListEqual(discretizer.get_cut_points(), [[2], []])
        transformed = discretizer.transform([[1, -2], [3, 4]])
        self.assertListEqual(transformed.tolist(), [[0, -2], [1, 4]])
        # Iris: discretize features 1 and 3, columns 0 and 2 must not change
        X, y = load_iris(return_X_y=True)
        untouched = X[:, [0, 2]].copy()
        discretizer.fit(X, y, features=[1, 3])
        X_computed = discretizer.transform(X)
        self.assertListEqual(
            untouched[:, 0].tolist(), X_computed[:, 0].tolist()
        )
        self.assertListEqual(
            untouched[:, 1].tolist(), X_computed[:, 2].tolist()
        )
        self.assertEqual(X_computed.dtype, np.float64)
    def test_transform(self):
        clf = FImdlp()
-        clf.fit([[1, 2], [3, 4]], [1, 2], features=[0])
+        clf.fit([[1, 2], [3, 4], [5, 6]], [1, 2, 2])
        res = clf.transform([[1, 2], [3, 4]])
        self.assertListEqual(res.tolist(), [[0, 2], [0, 4]])
    def test_transform_original(self):
        clf = FImdlp(proposal=False)
        clf.fit([[1, 2], [3, 4]], [1, 2])
        self.assertEqual(
-            clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [0, 0]]
+            clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [1, 1]]
        )
        X, y = load_iris(return_X_y=True)
        clf.fit(X, y)
-        self.assertEqual(clf.n_features_, 4)
+        self.assertEqual(clf.n_features_in_, 4)
        self.assertTrue(np.array_equal(X, clf.X_))
        self.assertTrue(np.array_equal(y, clf.y_))
        X_transformed = clf.transform(X)
        self.assertListEqual(
-            clf.transform(X).tolist(), clf.fit(X, y).transform(X).tolist()
+            X_transformed.tolist(), clf.fit(X, y).transform(X).tolist()
        )
        self.assertEqual(X_transformed.dtype, np.int32)
        expected = [
-            [0, 0, 1, 1],
+            [1, 0, 1, 1],
            [2, 3, 1, 1],
            [2, 0, 1, 1],
            [1, 0, 1, 1],
            [0, 0, 1, 1],
            [1, 0, 1, 1],
-            [1, 0, 1, 1],
+            [1, 3, 1, 1],
-            [1, 0, 1, 1],
+            [1, 2, 1, 1],
        ]
        self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
        with self.assertRaises(ValueError):
            clf.transform([[1, 2, 3], [4, 5, 6]])
        with self.assertRaises(sklearn.exceptions.NotFittedError):
-            clf = FImdlp(proposal=False)
+            clf = FImdlp()
            clf.transform([[1, 2], [3, 4]])
-    def test_transform_proposal(self):
+    def test_cppfactorize(self):
-        clf = FImdlp(proposal=True)
+        source = [
-        clf.fit([[1, 2], [3, 4]], [1, 2])
+            b"f0",
-        self.assertEqual(
+            b"f1",
-            clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [0, 0]]
+            b"f2",
            b"f3",
            b"f4",
            b"f5",
            b"f6",
            b"f1",
            b"f1",
            b"f7",
            b"f8",
        ]
        expected = [0, 1, 2, 3, 4, 5, 6, 1, 1, 7, 8]
        computed = factorize(source)
        self.assertListEqual(expected, computed)
    def test_join_fit(self):
        """Join features 0 and 2 with the labels and refit feature 1."""
        labels = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"])
        samples = np.array(
            [
                [0, 1, 2, 3, 4],
                [0, 1, 2, 3, 4],
                [1, 2, 3, 4, 5],
                [2, 3, 4, 5, 6],
                [3, 4, 5, 6, 7],
            ]
        )
        discretizer = FImdlp()
        discretizer.fit(samples, factorize(labels))
        # Discretized values of feature 1 after the join
        expected = [0, 0, 1, 2, 2]
        computed = discretizer.join_fit([0, 2], 1, samples)
        self.assertListEqual(computed.tolist(), expected)
        # Joined labels: factorized label followed by features 0 and 2
        expected_y = [b"002", b"002", b"113", b"224", b"335"]
        self.assertListEqual(expected_y, discretizer.y_join_)
    def test_join_fit_error(self):
        """Check every error condition reported by ``join_fit``."""
        labels = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"])
        samples = np.array(
            [
                [0, 1, 2, 3, 4],
                [0, 1, 2, 3, 4],
                [1, 2, 3, 4, 5],
                [2, 3, 4, 5, 6],
                [3, 4, 5, 6, 7],
            ]
        )
        fitted = FImdlp()
        fitted.fit(samples, factorize(labels))
        # Empty list of features to join
        with self.assertRaises(ValueError) as raised:
            fitted.join_fit([], 1, samples)
        self.assertEqual(
            str(raised.exception),
            "Number of features must be in range [1, 5]",
        )
        # Discretizer that has not been fitted
        with self.assertRaises(ValueError) as raised:
            FImdlp().join_fit([0, 4], 1, samples)
        self.assertTrue(
            str(raised.exception).startswith(
                "This FImdlp instance is not fitted yet."
            )
        )
        # Feature index out of range
        with self.assertRaises(ValueError) as raised:
            fitted.join_fit([0, 5], 1, samples)
        self.assertEqual(
            str(raised.exception),
            "Feature 5 not in range [0, 5)",
        )
        # Target index out of range
        with self.assertRaises(ValueError) as raised:
            fitted.join_fit([0, 2], 5, samples)
        self.assertEqual(
            str(raised.exception),
            "Target 5 not in range [0, 5)",
        )
        # Target included in the features to join
        with self.assertRaises(ValueError) as raised:
            fitted.join_fit([0, 2], 2, samples)
        self.assertEqual(
            str(raised.exception),
            "Target cannot in features to join",
        )
    def test_factorize(self):
        """Codes are assigned in order of first appearance of each label."""
        # Labels given as a numpy array
        labels = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"])
        codes = FImdlp().factorize(labels)
        self.assertListEqual([0, 0, 1, 2, 3], codes)
        # Labels given as a plain list, in a different order
        labels = [b"f4", b"f0", b"f0", b"f2", b"f3"]
        codes = FImdlp().factorize(labels)
        self.assertListEqual([0, 1, 1, 2, 3], codes)
    @staticmethod
    def test_sklearn_transformer():
        """Run every scikit-learn estimator check against ``FImdlp``."""
        # With generate_only=True the checks are yielded instead of run;
        # each item is an (estimator, check) pair per scikit-learn docs
        checks = check_estimator(FImdlp(), generate_only=True)
        for estimator, check in checks:
            check(estimator)
    def test_states_feature(self):
        clf = FImdlp()
        X, y = load_iris(return_X_y=True)
        clf.fit(X, y)
-        self.assertEqual(clf.n_features_, 4)
+        expected = []
-        self.assertTrue(np.array_equal(X, clf.X_))
+        for i in [3, 6, 4, 3]:
-        self.assertTrue(np.array_equal(y, clf.y_))
+            expected.append(list(range(i)))
        for feature in range(X.shape[1]):
            self.assertListEqual(
-            clf.transform(X).tolist(), clf.fit(X, y).transform(X).tolist()
+                expected[feature], clf.get_states_feature(feature)
            )
-        expected = [
+
-            [4, 0, 1, 1],
+    def test_states_no_feature(self):
-            [5, 2, 2, 2],
+        clf = FImdlp()
-            [5, 0, 1, 1],
+        X, y = load_iris(return_X_y=True)
-            [1, 0, 1, 1],
+        clf.fit(X, y)
-            [4, 1, 1, 1],
+        self.assertIsNone(clf.get_states_feature(4))
            [5, 2, 1, 1],
            [5, 1, 1, 1],
        ]
        self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
        with self.assertRaises(ValueError):
            clf.transform([[1, 2, 3], [4, 5, 6]])
        with self.assertRaises(sklearn.exceptions.NotFittedError):
            clf = FImdlp(proposal=True)
            clf.transform([[1, 2], [3, 4]])
--- a/test1.xlsx
+++ b/test1.xlsx
Author	SHA1	Message	Date
Ricardo Montañana	f65efe3dfd	Update the c++ sources with new version	2023-02-24 11:04:06 +01:00
Ricardo Montañana	e9d19d41da	Add changed submodule	2023-02-22 11:56:39 +01:00
Ricardo Montañana	6450ccb9bd	Add changed submodule	2023-02-22 11:34:27 +01:00
Ricardo Montañana	5d2f32bb0e	Add needed header file to MANIFEST	2023-02-22 11:33:26 +01:00
Ricardo Montañana Gómez	8249e55b0c	Merge pull request #6 from Doctorado-ML/joinfeatures - Add a join_fit feature that can update a fitted discretizer. Making it possible to discretize a variable by taking into account the label and a list of other features of the dataset. Used in local discretization with bayesian estimators. - Add factorize method to be able to simulate the pandas factorize method. - Remove the algorithm hyperparameter as it is no longer needed - Add get_states_feature method to obtain a list of states of any feature based on the number of cut points computed while fitting the discretizer	2023-02-22 10:44:43 +01:00
Ricardo Montañana	40871f128d	Add 1.1.0 version of mdlp	2023-02-22 10:15:33 +01:00
Ricardo Montañana	718c9d0e63	make static methods factorize and test_sklrn_trans	2023-02-20 20:12:36 +01:00
Ricardo Montañana	e0b7cae9a0	Remove algorithm hyperparameter in discretizer	2023-02-20 18:26:51 +01:00
Ricardo Montañana	31d79a77fa	Add get_states_feature method	2023-02-13 17:34:50 +01:00
Ricardo Montañana	2d495293bb	Add range_features method	2023-02-13 16:15:50 +01:00
Ricardo Montañana	9899781640	Complete join_fit and remove MultiDiscretizer	2023-02-05 00:30:03 +01:00
Ricardo Montañana	f20496203e	refactor Multidiscretizer to use one per column	2023-02-04 19:23:15 +01:00
Ricardo Montañana	cf09d92ccc	add MultiDiscretizer	2023-02-04 17:45:36 +01:00
Ricardo Montañana	1186e4ad53	chore: 🔖 Upgrade version number to 0.9.3	2023-01-28 19:15:26 +01:00
Ricardo Montañana	7913f5151e	Add version command to Makefile	2023-01-28 19:14:32 +01:00
Ricardo Montañana	050b923631	feat: ⚡ Add factorize method to transformer	2023-01-28 10:35:07 +01:00
Ricardo Montañana	29fc88cecc	test: ⚡ Add scikit learn compatibility check_estimator test	2023-01-26 23:20:51 +01:00
Ricardo Montañana	16b31ec293	test: ✅ Complete join_transform test	2023-01-26 11:17:10 +01:00
Ricardo Montañana	ca7d158ac8	feat: ⚗️ Add join_transform method and cpp factorize	2023-01-26 10:47:27 +01:00
Ricardo Montañana Gómez	34cd54f77e	feat: ♻️ Add Classic algorithm as number 2 to compare performance	2023-01-13 11:47:01 +01:00
Ricardo Montañana	70bf03155c	Add scikit-learn as requirement	2022-12-23 14:07:36 +01:00
Ricardo Montañana	77b571af71	Update README to include link to pypi	2022-12-22 19:41:55 +01:00
Ricardo Montañana	ff7a91a7ec	build: 🚀	2022-12-22 19:39:05 +01:00
Ricardo Montañana	621c19d00c	style: 🎨 Remove unused variable in c++ module	2022-12-22 11:02:16 +01:00
Ricardo Montañana Gómez	790da5cc60	Merge pull request #5 from Doctorado-ML/fix_sdist fix: 🐛 Fix a bug when pip install tries to build the package of F…	2022-12-22 10:29:46 +01:00
Ricardo Montañana	2775698063	test: ⚡	2022-12-21 19:05:24 +01:00
Ricardo Montañana	9db16d9d3c	feat: ✨ Add version method to cppfimdlp	2022-12-20 01:11:39 +01:00
Ricardo Montañana	edd464311f	fix: 🐛 Fix Tests and sample mistake	2022-12-15 12:18:10 +01:00
Ricardo Montañana	fe32ed4b2a	Update algorithm type to compute cut points	2022-12-15 12:12:44 +01:00
Ricardo Montañana	1d95311a7d	fix: 🐛 Fix a bug when pip install tries to build the package of File not Found #4	2022-12-14 12:23:07 +01:00
Ricardo Montañana	d8066ea274	Update branch name from master to main in CI	2022-12-13 18:46:15 +01:00
Ricardo Montañana	a2c1b07525	Update Badges and README	2022-12-13 18:40:53 +01:00
Ricardo Montañana	05c12561ac	Add submodule to ci	2022-12-13 18:18:12 +01:00
Ricardo Montañana	8f4bdd262a	Update ci	2022-12-13 18:09:26 +01:00
Ricardo Montañana	0740d1f515	Update submodule command	2022-12-13 17:57:44 +01:00
Ricardo Montañana	eb7f3dc092	Command to update submodule and update it	2022-12-13 17:56:12 +01:00
Ricardo Montañana	cfc18adf06	Fix c++ sample	2022-12-13 17:18:38 +01:00
Ricardo Montañana	3ae0d67884	Fix tests because stable_sort in c++	2022-12-13 17:16:23 +01:00
Ricardo Montañana	0ca507c692	Add submodule	2022-12-13 17:05:11 +01:00
Ricardo Montañana Gómez	70b3af94cc	Merge pull request #3 from Doctorado-ML/ci Ci	2022-12-13 17:01:08 +01:00
Ricardo Montañana	9d66bd6fd0	Remove some testing files	2022-12-13 16:59:14 +01:00
Ricardo Montañana	9039139a32	Remove submodule to fix it	2022-12-13 16:56:15 +01:00
Ricardo Montañana	a5dc2d7162	Remove submodule to fix it	2022-12-13 16:54:37 +01:00
Ricardo Montañana	67726bf219	Added submodule fimdlp/cppmdlp	2022-12-13 15:57:52 +01:00
Ricardo Montañana	2cb15cadbc	Added submodule fimdlp/cppmdlp	2022-12-13 14:12:03 +01:00
Ricardo Montañana	630ea1dfdb	Samplecpp	2022-12-13 14:02:35 +01:00
Ricardo Montañana	74d420dbce	Added scikit-learn to dependencies	2022-12-13 13:43:39 +01:00
Ricardo Montañana	707432cc28	Fix parameter in test	2022-12-13 13:42:11 +01:00
Ricardo Montañana	6bddb3ac43	second try	2022-12-13 13:39:54 +01:00
Ricardo Montañana	d29a5c6caf	Update CI	2022-12-13 13:37:49 +01:00
Ricardo Montañana	e5b09f1610	Update CI codeql	2022-12-13 13:31:20 +01:00
Ricardo Montañana Gómez	75c5a095c5	Merge pull request #2 from Doctorado-ML/submodule Submodule	2022-12-13 11:55:20 +01:00
Ricardo Montañana Gómez	4d1ce5d743	Create codeql.yml	2022-12-13 11:52:51 +01:00
`@@ -3,4 +3,4 @@ project(main)`

	`set(CMAKE_CXX_STANDARD 14)`	`set(CMAKE_CXX_STANDARD 14)`

	`add_executable(sample sample.cpp ArffFiles.cpp ../src/fimdlp/cppmdlp/Metrics.cpp ../src/fimdlp/cppmdlp/CPPFImdlp.cpp)`	`add_executable(sample sample.cpp ../src/cppmdlp/tests/ArffFiles.cpp ../src/cppmdlp/Metrics.cpp ../src/cppmdlp/CPPFImdlp.cpp)`
		`@@ -1,2 +0,0 @@`
			`# mdlp`
			`Discretization algorithm based on the paper by Fayyad & Irani Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning`