Refactor samples and fix Metrics tests

2025-08-16 16:05:52 +00:00 · 2022-12-10 14:32:28 +01:00
parent 418db2bb99
commit 3d48073574
22 changed files with 301 additions and 258 deletions
--- a/README.md
+++ b/README.md
@@ -6,5 +6,7 @@ Fayyad - Irani MDLP discretization algorithm

 ```bash
 python setup.py build_ext --inplace
-python sample.py
+python samples/sample.py iris --original
+python samples/sample.py iris --proposal
+python samples/sample.py -h # for more options
 ```
--- a/fimdlp/CPPFImdlp.cpp
+++ b/fimdlp/CPPFImdlp.cpp
@@ -1,21 +1,17 @@
 #include <numeric>
-#include <iostream>
 #include <algorithm>
 #include <set>
 #include "CPPFImdlp.h"
 #include "Metrics.h"

 namespace mdlp {
-    CPPFImdlp::CPPFImdlp(): proposal(true), debug(false), indices(indices_t()), y(labels()), metrics(Metrics(y, indices))
-    {
-    }
-    CPPFImdlp::CPPFImdlp(bool proposal, bool debug): proposal(proposal), debug(debug), indices(indices_t()), y(labels()), metrics(Metrics(y, indices))
+    CPPFImdlp::CPPFImdlp(bool proposal):proposal(proposal), indices(indices_t()), y(labels_t()), metrics(Metrics(y, indices))
    {
    }
    CPPFImdlp::~CPPFImdlp()
        = default;

-    CPPFImdlp& CPPFImdlp::fit(samples& X_, labels& y_)
+    CPPFImdlp& CPPFImdlp::fit(samples_t& X_, labels_t& y_)
    {
        X = X_;
        y = y_;
@@ -28,8 +24,10 @@ namespace mdlp {
        }
        indices = sortIndices(X_);
        metrics.setData(y, indices);
-        //computeCutPoints(0, X.size());
-        computeCutPointsProposal();
+        if (proposal)
+            computeCutPointsProposal();
+        else
+            computeCutPoints(0, X.size());
        return *this;
    }
    void CPPFImdlp::computeCutPoints(size_t start, size_t end)
@@ -53,7 +51,6 @@ namespace mdlp {
    }
    void CPPFImdlp::computeCutPointsOriginal(size_t start, size_t end)
    {
-        size_t idx;
        precision_t cut;
        if (end - start < 2)
            return;
@@ -76,14 +73,9 @@ namespace mdlp {
        yCur = yPrev = y[indices[0]];
        numElements = indices.size() - 1;
        idx = start = 0;
-        bool firstCutPoint = true;
-        if (debug)
-            printf("*idx=%lu -> (-1, -1) Prev(%3.1f, %d) Elementos: %lu\n", idx, xCur, yCur, numElements);
        while (idx < numElements) {
            xPivot = xCur;
            yPivot = yCur;
-            if (debug)
-                printf("<idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
            // Read the same values and check class changes
            do {
                idx++;
@@ -92,17 +84,12 @@ namespace mdlp {
                if (yCur != yPivot && xCur == xPivot) {
                    yPivot = -1;
                }
-                if (debug)
-                    printf(">idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
            }
            while (idx < numElements && xCur == xPivot);
            // Check if the class changed and there are more than 1 element
            if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur) && mdlp(start, idx, indices.size())) {
                start = idx;
                cutPoint = (xPrev + xCur) / 2;
-                if (debug) {
-                    printf("Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = %3.1g \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint);
-                }
                cutPoints.push_back(cutPoint);
            }
            yPrev = yPivot;
@@ -160,7 +147,7 @@ namespace mdlp {
        return output;
    }
    // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
-    indices_t CPPFImdlp::sortIndices(samples& X_)
+    indices_t CPPFImdlp::sortIndices(samples_t& X_)
    {
        indices_t idx(X_.size());
        iota(idx.begin(), idx.end(), 0);
--- a/fimdlp/CPPFImdlp.h
+++ b/fimdlp/CPPFImdlp.h
@@ -6,15 +6,14 @@
 namespace mdlp {
    class CPPFImdlp {
    protected:
-        bool proposal; // proposed algorithm or original algorithm
-        bool debug;
+        bool proposal;
        indices_t indices; // sorted indices to use with X and y
-        samples X;
-        labels y;
+        samples_t X;
+        labels_t y;
        Metrics metrics;
        cutPoints_t cutPoints;

-        static indices_t sortIndices(samples&);
+        static indices_t sortIndices(samples_t&);
        void computeCutPoints(size_t, size_t);
        long int getCandidate(size_t, size_t);
        bool mdlp(size_t, size_t, size_t);
@@ -25,11 +24,10 @@ namespace mdlp {
        void computeCutPointsProposal();

    public:
-        CPPFImdlp();
-        CPPFImdlp(bool, bool debug = false);
+        CPPFImdlp(bool);
        ~CPPFImdlp();
-        CPPFImdlp& fit(samples&, labels&);
-        samples getCutPoints();
+        CPPFImdlp& fit(samples_t&, labels_t&);
+        samples_t getCutPoints();
    };
 }
 #endif
--- a/fimdlp/Metrics.cpp
+++ b/fimdlp/Metrics.cpp
@@ -1,8 +1,9 @@
 #include "Metrics.h"
 #include <set>
+#include <cmath>
 using namespace std;
 namespace mdlp {
-    Metrics::Metrics(labels& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t())
+    Metrics::Metrics(labels_t& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t())
    {
    }
    int Metrics::computeNumClasses(size_t start, size_t end)
@@ -13,7 +14,7 @@ namespace mdlp {
        }
        return nClasses.size();
    }
-    void Metrics::setData(labels& y_, indices_t& indices_)
+    void Metrics::setData(labels_t& y_, indices_t& indices_)
    {
        indices = indices_;
        y = y_;
@@ -25,7 +26,7 @@ namespace mdlp {
    {
        precision_t p, ventropy = 0;
        int nElements = 0;
-        labels counts(numClasses + 1, 0);
+        labels_t counts(numClasses + 1, 0);
        if (end - start < 2)
            return 0;
        if (entropyCache.find(make_tuple(start, end)) != entropyCache.end()) {
--- a/fimdlp/Metrics.h
+++ b/fimdlp/Metrics.h
@@ -1,18 +1,17 @@
 #ifndef CCMETRICS_H
 #define CCMETRICS_H
 #include "typesFImdlp.h"
-#include <cmath>
 namespace mdlp {
    class Metrics {
    protected:
-        labels& y;
+        labels_t& y;
        indices_t& indices;
        int numClasses;
        cacheEnt_t entropyCache;
        cacheIg_t igCache;
    public:
-        Metrics(labels&, indices_t&);
-        void setData(labels&, indices_t&);
+        Metrics(labels_t&, indices_t&);
+        void setData(labels_t&, indices_t&);
        int computeNumClasses(size_t, size_t);
        precision_t entropy(size_t, size_t);
        precision_t informationGain(size_t, size_t, size_t);
--- a/fimdlp/_version.py
+++ b/fimdlp/_version.py
@@ -1 +1 @@
-__version__ = '0.1.1'
+__version__ = "0.9.1"
--- a/fimdlp/cfimdlp.pyx
+++ b/fimdlp/cfimdlp.pyx
@@ -6,24 +6,15 @@ from libcpp cimport bool
 cdef extern from "CPPFImdlp.h" namespace "mdlp":
    ctypedef float precision_t
    cdef cppclass CPPFImdlp:
-        CPPFImdlp() except + 
-        CPPFImdlp(bool, bool) except + 
+        CPPFImdlp(bool) except + 
        CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
        vector[precision_t] getCutPoints()
        

-class PcutPoint_t:
-    def __init__(self, start, end, fromValue, toValue):
-        self.start = start
-        self.end = end
-        self.fromValue = fromValue
-        self.toValue = toValue
-
 cdef class CFImdlp:
    cdef CPPFImdlp *thisptr
-    def __cinit__(self, debug=False, proposal=True):
-        # Proposal or original algorithm
-        self.thisptr = new CPPFImdlp(proposal, debug)
+    def __cinit__(self, proposal):
+        self.thisptr = new CPPFImdlp(proposal)
    def __dealloc__(self):
        del self.thisptr
    def fit(self, X, y):
--- a/fimdlp/cppfimdlp.cpython-310-darwin.so
+++ b/fimdlp/cppfimdlp.cpython-310-darwin.so
--- a/fimdlp/mdlp.py
+++ b/fimdlp/mdlp.py
@@ -3,33 +3,35 @@ from .cppfimdlp import CFImdlp
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils.multiclass import unique_labels
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
+from joblib import Parallel, delayed


 class FImdlp(TransformerMixin, BaseEstimator):
-    def __init__(self, proposal=True):
-        self.proposal = proposal  # proposed algorithm or original algorithm
+    def __init__(self, n_jobs=-1, proposal=False):
+        self.n_jobs = n_jobs
+        self.proposal = proposal

-    """Fayyad - Irani MDLP discretization algorithm.
+    """Transformer based on the Fayyad - Irani MDLP discretization algorithm.

    Parameters
    ----------
-    demo_param : str, default='demo'
-        A parameter used for demonstation of how to pass and store paramters.
+    n_jobs : int, default=-1
+        The number of jobs to run in parallel. :meth:`fit` and
+        :meth:`transform` are parallelized over the features. ``-1`` means
+        using all available cores.

    Attributes
    ----------
    n_features_ : int
        The number of features of the data passed to :meth:`fit`.
    discretizer_ : list
-        The list of discretizers for each feature.
+        The list of discretizers, one for each feature.
    cut_points_ : list
        The list of cut points for each feature.
    X_ : array 
        the samples used to fit, shape (n_samples, n_features)
    y_ : array 
        the labels used to fit, shape (n_samples,)
-    discretized_X_ : 
-        array of the discretized samples passed to fit(n_samples, n_features)
    features_ : list
        the list of features to be discretized
    """
@@ -70,6 +72,8 @@ class FImdlp(TransformerMixin, BaseEstimator):
        y : None
            There is no need of a target in a transformer, yet the pipeline API
            requires this parameter.
+        features : list, default=[i for i in range(n_features)]
+            The list of features to be discretized.
        Returns
        -------
        self : object
@@ -83,36 +87,22 @@ class FImdlp(TransformerMixin, BaseEstimator):
        self.y_ = y
        self.discretizer_ = [None] * self.n_features_
        self.cut_points_ = [None] * self.n_features_
-        # Can do it in parallel
-        for feature in self.features_:
-            self.discretizer_[feature] = CFImdlp(
-                proposal=self.proposal, debug=False
-            )
-            self.discretizer_[feature].fit(X[:, feature], y)
-            self.cut_points_[feature] = self.discretizer_[
-                feature
-            ].get_cut_points()
+        Parallel(n_jobs=self.n_jobs, prefer="threads")(
+            delayed(self._fit_discretizer)(feature)
+            for feature in range(self.n_features_)
+        )
        return self

-    def get_fitted(self):
-        """Return the discretized X computed during fit.
+    def _fit_discretizer(self, feature):
+        self.discretizer_[feature] = CFImdlp(proposal=self.proposal)
+        self.discretizer_[feature].fit(self.X_[:, feature], self.y_)
+        self.cut_points_[feature] = self.discretizer_[feature].get_cut_points()

-        Returns
-        -------
-        X_transformed : array, shape (n_samples, n_features)
-            discretized X computed during fit.
-        """
-        # Check is fit had been called
-        check_is_fitted(self, "n_features_")
-        result = np.zeros_like(self.X_, dtype=np.int32) - 1
-        for feature in range(self.n_features_):
-            if feature in self.features_:
-                result[:, feature] = self.discretizer_[
-                    feature
-                ].get_discretized_values()
-            else:
-                result[:, feature] = self.X_[:, feature]
-        return result
+    def _discretize_feature(self, feature, X, result):
+        if feature in self.features_:
+            result[:, feature] = np.searchsorted(self.cut_points_[feature], X)
+        else:
+            result[:, feature] = X

    def transform(self, X):
        """Discretize X values.
@@ -127,28 +117,28 @@ class FImdlp(TransformerMixin, BaseEstimator):
        """
        # Check is fit had been called
        check_is_fitted(self, "n_features_")
-
        # Input validation
        X = check_array(X)
-
        # Check that the input is of the same shape as the one passed
        # during fit.
-        # if X.shape[1] != self.n_features_:
-        #     raise ValueError(
-        #         "Shape of input is different from what was seen in `fit`"
-        #     )
+        if X.shape[1] != self.n_features_:
+            raise ValueError(
+                "Shape of input is different from what was seen in `fit`"
+            )
        result = np.zeros_like(X, dtype=np.int32) - 1
-        # Can do it in parallel
-        for feature in range(self.n_features_):
-            if feature in self.features_:
-                result[:, feature] = np.searchsorted(
-                    self.cut_points_[feature], X[:, feature]
-                )
-            else:
-                result[:, feature] = X[:, feature]
+        Parallel(n_jobs=self.n_jobs, prefer="threads")(
+            delayed(self._discretize_feature)(feature, X[:, feature], result)
+            for feature in range(self.n_features_)
+        )
        return result

    def get_cut_points(self):
+        """Get the cut points for each feature.
+        Returns
+        -------
+        result : list
+            The list of cut points for each feature.
+        """
        result = []
        for feature in range(self.n_features_):
            result.append(self.cut_points_[feature])
--- a/fimdlp/testcpp/FImdlp_unittest.cc
+++ b/fimdlp/testcpp/FImdlp_unittest.cc
@@ -1,74 +1,63 @@
-//#include "gtest/gtest.h"
-//#include "../Metrics.h"
-//#include "../CPPFImdlp.h"
-//namespace mdlp {
-//    class TestFImdlp : public CPPFImdlp, public testing::Test {
-//    public:
-//        TestFImdlp() : CPPFImdlp(true, true) {}
-//        void SetUp()
-//        {
-//            //    5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0]
-//            //(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2)
-//            X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
-//            y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
-//            fit(X, y);
-//        }
-//        void setProposal(bool value)
-//        {
-//            proposal = value;
-//        }
-//        void initCutPoints()
-//        {
-//            setCutPoints(cutPoints_t());
-//        }
-//        void initIndices()
-//        {
-//            indices = indices_t();
-//        }
-//        void initDiscretized()
-//        {
-//            xDiscretized = labels();
-//        }
-//        void checkSortedVector(samples& X_, indices_t indices_)
-//        {
-//            X = X_;
-//            indices = indices_;
-//            indices_t testSortedIndices = sortIndices(X);
-//            precision_t prev = X[testSortedIndices[0]];
-//            for (auto i = 0; i < X.size(); ++i) {
-//                EXPECT_EQ(testSortedIndices[i], indices[i]);
-//                EXPECT_LE(prev, X[testSortedIndices[i]]);
-//                prev = X[testSortedIndices[i]];
-//            }
-//        }
-//        void checkCutPoints(cutPoints_t& expected)
-//        {
-//            int expectedSize = expected.size();
-//            EXPECT_EQ(cutPoints.size(), expectedSize);
-//            for (auto i = 0; i < expectedSize; i++) {
-//                EXPECT_EQ(cutPoints[i].start, expected[i].start);
-//                EXPECT_EQ(cutPoints[i].end, expected[i].end);
-//                EXPECT_EQ(cutPoints[i].classNumber, expected[i].classNumber);
-//                EXPECT_NEAR(cutPoints[i].fromValue, expected[i].fromValue, precision);
-//                EXPECT_NEAR(cutPoints[i].toValue, expected[i].toValue, precision);
-//            }
-//        }
-//        template<typename T, typename A>
-//        void checkVectors(std::vector<T, A> const& expected, std::vector<T, A> const& computed)
-//        {
-//            EXPECT_EQ(expected.size(), computed.size());
-//            for (auto i = 0; i < expected.size(); i++) {
-//                EXPECT_EQ(expected[i], computed[i]);
-//            }
-//        }
-//
-//    };
-//    TEST_F(TestFImdlp, FitErrorEmptyDataset)
-//    {
-//        X = samples();
-//        y = labels();
-//        EXPECT_THROW(fit(X, y), std::invalid_argument);
-//    }
+#include "gtest/gtest.h"
+#include "../Metrics.h"
+#include "../CPPFImdlp.h"
+namespace mdlp {
+    class TestFImdlp: public CPPFImdlp, public testing::Test {
+    public:
+        TestFImdlp(): CPPFImdlp(false) {}
+        void SetUp()
+        {
+            // X sorted: [5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0]
+            // (x, y) pairs in that order: (5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2)
+            X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
+            y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
+            fit(X, y);
+        }
+        void setProposal(bool value)
+        {
+            proposal = value;
+        }
+        void initIndices()
+        {
+            indices = indices_t();
+        }
+        void checkSortedVector(samples_t& X_, indices_t indices_)
+        {
+            X = X_;
+            indices = indices_;
+            indices_t testSortedIndices = sortIndices(X);
+            precision_t prev = X[testSortedIndices[0]];
+            for (auto i = 0; i < X.size(); ++i) {
+                EXPECT_EQ(testSortedIndices[i], indices[i]);
+                EXPECT_LE(prev, X[testSortedIndices[i]]);
+                prev = X[testSortedIndices[i]];
+            }
+        }
+        void checkCutPoints(cutPoints_t& expected)
+        {
+            int expectedSize = expected.size();
+            EXPECT_EQ(cutPoints.size(), expectedSize);
+            for (auto i = 0; i < expectedSize; i++) {
+                EXPECT_EQ(cutPoints[i], expected[i]);
+            }
+        }
+        template<typename T, typename A>
+        void checkVectors(std::vector<T, A> const& expected, std::vector<T, A> const& computed)
+        {
+            EXPECT_EQ(expected.size(), computed.size());
+            for (auto i = 0; i < expected.size(); i++) {
+                EXPECT_EQ(expected[i], computed[i]);
+            }
+        }
+    };
+    TEST_F(TestFImdlp, FitErrorEmptyDataset)
+    {
+        X = samples_t();
+        y = labels_t();
+        EXPECT_THROW(fit(X, y), std::invalid_argument);
+    }
+}
+//    
 //    TEST_F(TestFImdlp, FitErrorDifferentSize)
 //    {
 //        X = { 1, 2, 3 };
@@ -143,7 +132,7 @@
 //    }
 //    TEST_F(TestFImdlp, DiscretizedValues)
 //    {
-//        labels computed, expected = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+//        labels_t computed, expected = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
 //        computed = getDiscretizedValues();
 //        checkVectors(expected, computed);
 //    }
@@ -157,7 +146,7 @@
 //    TEST_F(TestFImdlp, Constructor)
 //    {
 //        samples X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
-//        labels y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
+//        labels_t y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
 //        setProposal(false);
 //        fit(X, y);
 //        computeCutPointsOriginal();
--- a/fimdlp/testcpp/Metrics_unittest.cc
+++ b/fimdlp/testcpp/Metrics_unittest.cc
@@ -1,31 +1,43 @@
 #include "gtest/gtest.h"
 #include "../Metrics.h"

+
 namespace mdlp {
-    precision_t precision = 0.000001;
-    TEST(MetricTest, NumClasses)
+    class TestMetrics: public Metrics, public testing::Test {
+    public:
+        labels_t y;
+        samples_t X;
+        indices_t indices;
+        precision_t precision = 0.000001;
+
+        TestMetrics(): Metrics(y, indices) {}
+        void SetUp()
+        {
+            y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
+            indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+            setData(y, indices);
+        }
+    };
+    TEST_F(TestMetrics, NumClasses)
    {
-        labels y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
-        indices_t indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
-        EXPECT_EQ(1, Metrics::numClasses(y, indices, 4, 8));
-        EXPECT_EQ(2, Metrics::numClasses(y, indices, 0, 10));
-        EXPECT_EQ(2, Metrics::numClasses(y, indices, 8, 10));
+        y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
+        EXPECT_EQ(1, computeNumClasses(4, 8));
+        EXPECT_EQ(2, computeNumClasses(0, 10));
+        EXPECT_EQ(2, computeNumClasses(8, 10));
    }
-    TEST(MetricTest, Entropy)
+    TEST_F(TestMetrics, Entropy)
    {
-        labels y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
-        indices_t indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
-        EXPECT_EQ(1, Metrics::entropy(y, indices, 0, 10, 2));
-        EXPECT_EQ(0, Metrics::entropy(y, indices, 0, 5, 1));
-        labels yz = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
-        ASSERT_NEAR(0.468996, Metrics::entropy(yz, indices, 0, 10, 2), precision);
+        EXPECT_EQ(1, entropy(0, 10));
+        EXPECT_EQ(0, entropy(0, 5));
+        y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
+        setData(y, indices);
+        ASSERT_NEAR(0.468996, entropy(0, 10), precision);
    }
-    TEST(MetricTest, InformationGain)
+    TEST_F(TestMetrics, InformationGain)
    {
-        labels y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
-        indices_t indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
-        labels yz = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
-        ASSERT_NEAR(1, Metrics::informationGain(y, indices, 0, 10, 5, 2), precision);
-        ASSERT_NEAR(0.108032, Metrics::informationGain(yz, indices, 0, 10, 5, 2), precision);
+        ASSERT_NEAR(1, informationGain(0, 5, 10), precision);
+        y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
+        setData(y, indices);
+        ASSERT_NEAR(0.108032, informationGain(0, 5, 10), precision);
    }
-}
+}
--- a/fimdlp/testcpp/main
+++ b/fimdlp/testcpp/main
--- a/fimdlp/testcpp/xx/ArffFiles.cpp
+++ b/fimdlp/testcpp/xx/ArffFiles.cpp
@@ -101,13 +101,13 @@ string ArffFiles::trim(const string& source)
    s.erase(s.find_last_not_of(" \n\r\t") + 1);
    return s;
 }
-vector<int> ArffFiles::factorize(const vector<string>& labels)
+vector<int> ArffFiles::factorize(const vector<string>& labels_t)
 {
    vector<int> yy;
-    yy.reserve(labels.size());
+    yy.reserve(labels_t.size());
    map<string, int> labelMap;
    int i = 0;
-    for (string label : labels) {
+    for (string label : labels_t) {
        if (labelMap.find(label) == labelMap.end()) {
            labelMap[label] = i++;
        }
--- a/fimdlp/testcpp/xx/ArffFiles.h
+++ b/fimdlp/testcpp/xx/ArffFiles.h
@@ -23,6 +23,6 @@ public:
    vector<vector<float>>& getX();
    vector<int>& getY();
    vector<tuple<string, string>> getAttributes();
-    vector<int> factorize(const vector<string>& labels);
+    vector<int> factorize(const vector<string>& labels_t);
 };
 #endif
--- a/fimdlp/tests/FImdlp_test.py
+++ b/fimdlp/tests/FImdlp_test.py
@@ -8,12 +8,14 @@ from ..mdlp import FImdlp
 class FImdlpTest(unittest.TestCase):
    def test_init(self):
        clf = FImdlp()
-        self.assertTrue(clf.proposal)
-        clf = FImdlp(proposal=False)
+        self.assertEqual(-1, clf.n_jobs)
        self.assertFalse(clf.proposal)
+        clf = FImdlp(proposal=True, n_jobs=7)
+        self.assertTrue(clf.proposal)
+        self.assertEqual(7, clf.n_jobs)

-    def test_fit(self):
-        clf = FImdlp()
+    def test_fit_proposal(self):
+        clf = FImdlp(proposal=True)
        clf.fit([[1, 2], [3, 4]], [1, 2])
        self.assertEqual(clf.n_features_, 2)
        self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
@@ -25,10 +27,39 @@ class FImdlpTest(unittest.TestCase):
        self.assertTrue(np.array_equal(X, clf.X_))
        self.assertTrue(np.array_equal(y, clf.y_))
        expected = [
-            [4.900000095367432, 5.0, 5.099999904632568, 5.400000095367432],
-            [2.6999998092651367, 2.9000000953674316],
-            [2.3499999046325684, 4.5],
-            [0.75, 1.399999976158142, 1.5],
+            [
+                4.900000095367432,
+                5.0,
+                5.099999904632568,
+                5.400000095367432,
+                5.699999809265137,
+            ],
+            [2.6999998092651367, 2.9000000953674316, 3.1999998092651367],
+            [2.3499999046325684, 4.5, 4.800000190734863],
+            [0.75, 1.399999976158142, 1.5, 1.7000000476837158],
+        ]
+        self.assertListEqual(expected, clf.get_cut_points())
+        self.assertListEqual([0, 1, 2, 3], clf.features_)
+        clf.fit(X, y, features=[0, 2, 3])
+        self.assertListEqual([0, 2, 3], clf.features_)
+
+    def test_fit_original(self):
+        clf = FImdlp(proposal=False)
+        clf.fit([[1, 2], [3, 4]], [1, 2])
+        self.assertEqual(clf.n_features_, 2)
+        self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
+        self.assertListEqual(clf.y_.tolist(), [1, 2])
+        self.assertListEqual([[], []], clf.get_cut_points())
+        X, y = load_iris(return_X_y=True)
+        clf.fit(X, y)
+        self.assertEqual(clf.n_features_, 4)
+        self.assertTrue(np.array_equal(X, clf.X_))
+        self.assertTrue(np.array_equal(y, clf.y_))
+        expected = [
+            [5.5, 5.800000190734863],
+            [3.0999999046325684],
+            [2.450000047683716, 4.800000190734863, 5.099999904632568],
+            [0.800000011920929, 1.7000000476837158],
        ]
        self.assertListEqual(expected, clf.get_cut_points())
        self.assertListEqual([0, 1, 2, 3], clf.features_)
@@ -44,8 +75,38 @@ class FImdlpTest(unittest.TestCase):
        with self.assertRaises(ValueError):
            clf.fit([[1, 2], [3, 4]], [1, 2], unexpected="class_name")

-    def test_transform(self):
-        clf = FImdlp()
+    def test_transform_original(self):
+        clf = FImdlp(proposal=False)
+        clf.fit([[1, 2], [3, 4]], [1, 2])
+        self.assertEqual(
+            clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [0, 0]]
+        )
+        X, y = load_iris(return_X_y=True)
+        clf.fit(X, y)
+        self.assertEqual(clf.n_features_, 4)
+        self.assertTrue(np.array_equal(X, clf.X_))
+        self.assertTrue(np.array_equal(y, clf.y_))
+        self.assertListEqual(
+            clf.transform(X).tolist(), clf.fit(X, y).transform(X).tolist()
+        )
+        expected = [
+            [0, 0, 1, 1],
+            [2, 0, 1, 1],
+            [1, 0, 1, 1],
+            [0, 0, 1, 1],
+            [1, 0, 1, 1],
+            [1, 0, 1, 1],
+            [1, 0, 1, 1],
+        ]
+        self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
+        with self.assertRaises(ValueError):
+            clf.transform([[1, 2, 3], [4, 5, 6]])
+        with self.assertRaises(sklearn.exceptions.NotFittedError):
+            clf = FImdlp(proposal=False)
+            clf.transform([[1, 2], [3, 4]])
+
+    def test_transform_proposal(self):
+        clf = FImdlp(proposal=True)
        clf.fit([[1, 2], [3, 4]], [1, 2])
        self.assertEqual(
            clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [0, 0]]
@@ -60,16 +121,16 @@ class FImdlpTest(unittest.TestCase):
        )
        expected = [
            [4, 0, 1, 1],
-            [4, 2, 2, 2],
-            [4, 0, 1, 1],
+            [5, 2, 2, 2],
+            [5, 0, 1, 1],
            [1, 0, 1, 1],
            [4, 1, 1, 1],
-            [4, 2, 1, 1],
-            [4, 1, 1, 1],
+            [5, 2, 1, 1],
+            [5, 1, 1, 1],
        ]
        self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
        with self.assertRaises(ValueError):
            clf.transform([[1, 2, 3], [4, 5, 6]])
        with self.assertRaises(sklearn.exceptions.NotFittedError):
-            clf = FImdlp()
+            clf = FImdlp(proposal=True)
            clf.transform([[1, 2], [3, 4]])
--- a/fimdlp/typesFImdlp.h
+++ b/fimdlp/typesFImdlp.h
@@ -6,8 +6,8 @@
 using namespace std;
 namespace mdlp {
    typedef float precision_t;
-    typedef vector<precision_t> samples;
-    typedef vector<int> labels;
+    typedef vector<precision_t> samples_t;
+    typedef vector<int> labels_t;
    typedef vector<size_t> indices_t;
    typedef vector<precision_t> cutPoints_t;
    typedef map<tuple<int, int>, precision_t> cacheEnt_t;
--- a/sample.py
+++ b/sample.py
@@ -1,37 +0,0 @@
-from fimdlp.mdlp import FImdlp
-from fimdlp.cppfimdlp import CFImdlp
-from sklearn.ensemble import RandomForestClassifier
-import time
-
-from scipy.io import arff
-import pandas as pd
-
-path = "fimdlp/testcpp/datasets/"
-# class_name = "speaker"
-# file_name = "kdd_JapaneseVowels.arff"
-class_name = "class"
-# file_name = "mfeat-factors.arff"
-file_name = "letter.arff"
-data = arff.loadarff(path + file_name)
-df = pd.DataFrame(data[0])
-df.dropna(axis=0, how="any", inplace=True)
-dataset = df
-X = df.drop(class_name, axis=1)
-features = X.columns
-class_name = class_name
-y, _ = pd.factorize(df[class_name])
-X = X.to_numpy()
-
-test = FImdlp()
-now = time.time()
-# test.fit(X, y, features=[i for i in (range(3, 14))])
-test.fit(X, y)
-fit_time = time.time()
-print("Fitting: ", fit_time - now)
-now = time.time()
-Xt = test.transform(X)
-print("Transforming: ", time.time() - now)
-print(test.get_cut_points())
-
-clf = RandomForestClassifier(random_state=0)
-print(clf.fit(Xt, y).score(Xt, y))
--- a/fimdlp/testcpp/ArffFiles.cpp
+++ b/fimdlp/testcpp/ArffFiles.cpp
@@ -101,13 +101,13 @@ string ArffFiles::trim(const string& source)
    s.erase(s.find_last_not_of(" \n\r\t") + 1);
    return s;
 }
-vector<int> ArffFiles::factorize(const vector<string>& labels)
+vector<int> ArffFiles::factorize(const vector<string>& labels_t)
 {
    vector<int> yy;
-    yy.reserve(labels.size());
+    yy.reserve(labels_t.size());
    map<string, int> labelMap;
    int i = 0;
-    for (string label : labels) {
+    for (string label : labels_t) {
        if (labelMap.find(label) == labelMap.end()) {
            labelMap[label] = i++;
        }
--- a/fimdlp/testcpp/ArffFiles.h
+++ b/fimdlp/testcpp/ArffFiles.h
@@ -23,6 +23,6 @@ public:
    vector<vector<float>>& getX();
    vector<int>& getY();
    vector<tuple<string, string>> getAttributes();
-    vector<int> factorize(const vector<string>& labels);
+    vector<int> factorize(const vector<string>& labels_t);
 };
 #endif
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -0,0 +1,6 @@
+cmake_minimum_required(VERSION 3.24)
+project(main)
+
+set(CMAKE_CXX_STANDARD 17)
+
+add_executable(sample sample.cpp ArffFiles.cpp ../fimdlp/Metrics.cpp ../fimdlp/CPPFImdlp.cpp)
--- a/fimdlp/testcpp/main.cpp
+++ b/fimdlp/testcpp/main.cpp
@@ -2,7 +2,7 @@
 #include <iostream>
 #include <vector>
 #include <iomanip>
-#include "../CPPFImdlp.h"
+#include "../fimdlp/CPPFImdlp.h"

 using namespace std;

@@ -10,7 +10,7 @@ int main(int argc, char** argv)
 {
    ArffFiles file;
    vector<string> lines;
-    string path = "/Users/rmontanana/Code/FImdlp/fimdlp/testcpp/datasets/";
+    string path = "../fimdlp/testcpp/datasets/";
    map<string, bool > datasets = {
        {"mfeat-factors", true},
        {"iris", true},
@@ -41,7 +41,7 @@ int main(int argc, char** argv)
        }
        cout << y[i] << endl;
    }
-    mdlp::CPPFImdlp test = mdlp::CPPFImdlp();
+    mdlp::CPPFImdlp test = mdlp::CPPFImdlp(false);
    for (auto i = 0; i < attributes.size(); i++) {
        cout << "Cut points for " << get<0>(attributes[i]) << endl;
        cout << "--------------------------" << setprecision(3) << endl;
--- a/samples/sample.py
+++ b/samples/sample.py
@@ -0,0 +1,44 @@
+import time
+import argparse
+import os
+from scipy.io import arff
+import pandas as pd
+from sklearn.ensemble import RandomForestClassifier
+from fimdlp.mdlp import FImdlp
+
+datasets = {
+    "mfeat-factors": True,
+    "iris": True,
+    "letter": True,
+    "kdd_JapaneseVowels": False,
+}
+
+ap = argparse.ArgumentParser()
+ap.add_argument("--proposal", action="store_true")
+ap.add_argument("--original", dest="proposal", action="store_false")
+ap.add_argument("dataset", type=str, choices=datasets.keys())
+args = ap.parse_args()
+relative = "" if os.path.isdir("fimdlp") else ".."
+file_name = os.path.join(
+    relative, "fimdlp", "testcpp", "datasets", args.dataset
+)
+data = arff.loadarff(file_name + ".arff")
+df = pd.DataFrame(data[0])
+class_column = -1 if datasets[args.dataset] else 0
+class_name = df.columns.to_list()[class_column]
+X = df.drop(class_name, axis=1)
+y, _ = pd.factorize(df[class_name])
+X = X.to_numpy()
+test = FImdlp(proposal=args.proposal)
+now = time.time()
+test.fit(X, y)
+fit_time = time.time()
+print("Fitting: ", fit_time - now)
+now = time.time()
+Xt = test.transform(X)
+print("Transforming: ", time.time() - now)
+print(test.get_cut_points())
+clf = RandomForestClassifier(random_state=0)
+print(
+    "Random Forest score with discretized data: ", clf.fit(Xt, y).score(Xt, y)
+)