Refactor samples and fix Metrics tests

2025-08-18 00:45:52 +00:00 · 2022-12-10 14:32:28 +01:00
parent 418db2bb99
commit 3d48073574
22 changed files with 301 additions and 258 deletions
--- a/README.md
+++ b/README.md
@@ -6,5 +6,7 @@ Fayyad - Irani MDLP discretization algorithm
 ```bash
 python setup.py build_ext --inplace
-python sample.py
+python samples/sample.py iris --original 
 python samples/sample.py iris --proposal
 python samples/sample.py -h # for more options
 ```
--- a/fimdlp/CPPFImdlp.cpp
+++ b/fimdlp/CPPFImdlp.cpp
@@ -1,21 +1,17 @@
 #include <numeric>
 #include <iostream>
 #include <algorithm>
 #include <set>
 #include "CPPFImdlp.h"
 #include "Metrics.h"
 namespace mdlp {
-    CPPFImdlp::CPPFImdlp(): proposal(true), debug(false), indices(indices_t()), y(labels()), metrics(Metrics(y, indices))
+    CPPFImdlp::CPPFImdlp(bool proposal):proposal(proposal), indices(indices_t()), y(labels_t()), metrics(Metrics(y, indices))
    {
    }
    CPPFImdlp::CPPFImdlp(bool proposal, bool debug): proposal(proposal), debug(debug), indices(indices_t()), y(labels()), metrics(Metrics(y, indices))
    {
    }
    CPPFImdlp::~CPPFImdlp()
        = default;
-    CPPFImdlp& CPPFImdlp::fit(samples& X_, labels& y_)
+    CPPFImdlp& CPPFImdlp::fit(samples_t& X_, labels_t& y_)
    {
        X = X_;
        y = y_;
@@ -28,8 +24,10 @@ namespace mdlp {
        }
        indices = sortIndices(X_);
        metrics.setData(y, indices);
-        //computeCutPoints(0, X.size());
+        if (proposal)
            computeCutPointsProposal();
        else
            computeCutPoints(0, X.size());
        return *this;
    }
    void CPPFImdlp::computeCutPoints(size_t start, size_t end)
@@ -53,7 +51,6 @@ namespace mdlp {
    }
    void CPPFImdlp::computeCutPointsOriginal(size_t start, size_t end)
    {
        size_t idx;
        precision_t cut;
        if (end - start < 2)
            return;
@@ -76,14 +73,9 @@ namespace mdlp {
        yCur = yPrev = y[indices[0]];
        numElements = indices.size() - 1;
        idx = start = 0;
        bool firstCutPoint = true;
        if (debug)
            printf("*idx=%lu -> (-1, -1) Prev(%3.1f, %d) Elementos: %lu\n", idx, xCur, yCur, numElements);
        while (idx < numElements) {
            xPivot = xCur;
            yPivot = yCur;
            if (debug)
                printf("<idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
            // Read the same values and check class changes
            do {
                idx++;
@@ -92,17 +84,12 @@ namespace mdlp {
                if (yCur != yPivot && xCur == xPivot) {
                    yPivot = -1;
                }
                if (debug)
                    printf(">idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
            }
            while (idx < numElements && xCur == xPivot);
            // Check if the class changed and there is more than 1 element
            if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur) && mdlp(start, idx, indices.size())) {
                start = idx;
                cutPoint = (xPrev + xCur) / 2;
                if (debug) {
                    printf("Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = %3.1g \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint);
                }
                cutPoints.push_back(cutPoint);
            }
            yPrev = yPivot;
@@ -160,7 +147,7 @@ namespace mdlp {
        return output;
    }
    // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
-    indices_t CPPFImdlp::sortIndices(samples& X_)
+    indices_t CPPFImdlp::sortIndices(samples_t& X_)
    {
        indices_t idx(X_.size());
        iota(idx.begin(), idx.end(), 0);
--- a/fimdlp/CPPFImdlp.h
+++ b/fimdlp/CPPFImdlp.h
@@ -6,15 +6,14 @@
 namespace mdlp {
    class CPPFImdlp {
    protected:
-        bool proposal; // proposed algorithm or original algorithm
+        bool proposal;
        bool debug;
        indices_t indices; // sorted indices to use with X and y
-        samples X;
+        samples_t X;
-        labels y;
+        labels_t y;
        Metrics metrics;
        cutPoints_t cutPoints;
-        static indices_t sortIndices(samples&);
+        static indices_t sortIndices(samples_t&);
        void computeCutPoints(size_t, size_t);
        long int getCandidate(size_t, size_t);
        bool mdlp(size_t, size_t, size_t);
@@ -25,11 +24,10 @@ namespace mdlp {
        void computeCutPointsProposal();
    public:
-        CPPFImdlp();
+        CPPFImdlp(bool);
        CPPFImdlp(bool, bool debug = false);
        ~CPPFImdlp();
-        CPPFImdlp& fit(samples&, labels&);
+        CPPFImdlp& fit(samples_t&, labels_t&);
-        samples getCutPoints();
+        samples_t getCutPoints();
    };
 }
 #endif
--- a/fimdlp/Metrics.cpp
+++ b/fimdlp/Metrics.cpp
@@ -1,8 +1,9 @@
 #include "Metrics.h"
 #include <set>
 #include <cmath>
 using namespace std;
 namespace mdlp {
-    Metrics::Metrics(labels& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t())
+    Metrics::Metrics(labels_t& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t())
    {
    }
    int Metrics::computeNumClasses(size_t start, size_t end)
@@ -13,7 +14,7 @@ namespace mdlp {
        }
        return nClasses.size();
    }
-    void Metrics::setData(labels& y_, indices_t& indices_)
+    void Metrics::setData(labels_t& y_, indices_t& indices_)
    {
        indices = indices_;
        y = y_;
@@ -25,7 +26,7 @@ namespace mdlp {
    {
        precision_t p, ventropy = 0;
        int nElements = 0;
-        labels counts(numClasses + 1, 0);
+        labels_t counts(numClasses + 1, 0);
        if (end - start < 2)
            return 0;
        if (entropyCache.find(make_tuple(start, end)) != entropyCache.end()) {
--- a/fimdlp/Metrics.h
+++ b/fimdlp/Metrics.h
@@ -1,18 +1,17 @@
 #ifndef CCMETRICS_H
 #define CCMETRICS_H
 #include "typesFImdlp.h"
 #include <cmath>
 namespace mdlp {
    class Metrics {
    protected:
-        labels& y;
+        labels_t& y;
        indices_t& indices;
        int numClasses;
        cacheEnt_t entropyCache;
        cacheIg_t igCache;
    public:
-        Metrics(labels&, indices_t&);
+        Metrics(labels_t&, indices_t&);
-        void setData(labels&, indices_t&);
+        void setData(labels_t&, indices_t&);
        int computeNumClasses(size_t, size_t);
        precision_t entropy(size_t, size_t);
        precision_t informationGain(size_t, size_t, size_t);
--- a/fimdlp/_version.py
+++ b/fimdlp/_version.py
@@ -1 +1 @@
-__version__ = '0.1.1'
+__version__ = "0.9.1"
--- a/fimdlp/cfimdlp.pyx
+++ b/fimdlp/cfimdlp.pyx
@@ -6,24 +6,15 @@ from libcpp cimport bool
 cdef extern from "CPPFImdlp.h" namespace "mdlp":
    ctypedef float precision_t
    cdef cppclass CPPFImdlp:
-        CPPFImdlp() except + 
+        CPPFImdlp(bool) except + 
        CPPFImdlp(bool, bool) except + 
        CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
        vector[precision_t] getCutPoints()
 class PcutPoint_t:
    def __init__(self, start, end, fromValue, toValue):
        self.start = start
        self.end = end
        self.fromValue = fromValue
        self.toValue = toValue
 cdef class CFImdlp:
    cdef CPPFImdlp *thisptr
-    def __cinit__(self, debug=False, proposal=True):
+    def __cinit__(self, proposal):
-        # Proposal or original algorithm
+        self.thisptr = new CPPFImdlp(proposal)
        self.thisptr = new CPPFImdlp(proposal, debug)
    def __dealloc__(self):
        del self.thisptr
    def fit(self, X, y):
--- a/fimdlp/cppfimdlp.cpython-310-darwin.so
+++ b/fimdlp/cppfimdlp.cpython-310-darwin.so
--- a/fimdlp/mdlp.py
+++ b/fimdlp/mdlp.py
@@ -3,33 +3,35 @@ from .cppfimdlp import CFImdlp
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils.multiclass import unique_labels
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 from joblib import Parallel, delayed
 class FImdlp(TransformerMixin, BaseEstimator):
-    def __init__(self, proposal=True):
+    def __init__(self, n_jobs=-1, proposal=False):
-        self.proposal = proposal  # proposed algorithm or original algorithm
+        self.n_jobs = n_jobs
        self.proposal = proposal
-    """Fayyad - Irani MDLP discretization algorithm.
+    """Fayyad - Irani MDLP discretization algorithm based implementation.
    Parameters
    ----------
-    demo_param : str, default='demo'
+    n_jobs : int, default=-1
-        A parameter used for demonstation of how to pass and store paramters.
+        The number of jobs to run in parallel. :meth:`fit` and 
        :meth:`transform` are parallelized over the features. ``-1`` means
        using all cores available.
    Attributes
    ----------
    n_features_ : int
        The number of features of the data passed to :meth:`fit`.
    discretizer_ : list
-        The list of discretizers for each feature.
+        The list of discretizers, one for each feature.
    cut_points_ : list
        The list of cut points for each feature.
    X_ : array 
        the samples used to fit, shape (n_samples, n_features)
    y_ : array 
        the labels used to fit, shape (n_samples,)
    discretized_X_ : array
        the discretized samples passed to fit, shape (n_samples, n_features)
    features_ : list
        the list of features to be discretized
    """
@@ -70,6 +72,8 @@ class FImdlp(TransformerMixin, BaseEstimator):
        y : None
            There is no need for a target in a transformer, yet the pipeline API
            requires this parameter.
        features : list, default=[i for i in range(n_features)]
            The list of features to be discretized.
        Returns
        -------
        self : object
@@ -83,36 +87,22 @@ class FImdlp(TransformerMixin, BaseEstimator):
        self.y_ = y
        self.discretizer_ = [None] * self.n_features_
        self.cut_points_ = [None] * self.n_features_
-        # Can do it in parallel
+        Parallel(n_jobs=self.n_jobs, prefer="threads")(
-        for feature in self.features_:
+            delayed(self._fit_discretizer)(feature)
-            self.discretizer_[feature] = CFImdlp(
+            for feature in range(self.n_features_)
                proposal=self.proposal, debug=False
        )
            self.discretizer_[feature].fit(X[:, feature], y)
            self.cut_points_[feature] = self.discretizer_[
                feature
            ].get_cut_points()
        return self
-    def get_fitted(self):
+    def _fit_discretizer(self, feature):
-        """Return the discretized X computed during fit.
+        self.discretizer_[feature] = CFImdlp(proposal=self.proposal)
        self.discretizer_[feature].fit(self.X_[:, feature], self.y_)
        self.cut_points_[feature] = self.discretizer_[feature].get_cut_points()
-        Returns
+    def _discretize_feature(self, feature, X, result):
        -------
        X_transformed : array, shape (n_samples, n_features)
            discretized X computed during fit.
        """
        # Check if fit has been called
        check_is_fitted(self, "n_features_")
        result = np.zeros_like(self.X_, dtype=np.int32) - 1
        for feature in range(self.n_features_):
        if feature in self.features_:
-                result[:, feature] = self.discretizer_[
+            result[:, feature] = np.searchsorted(self.cut_points_[feature], X)
                    feature
                ].get_discretized_values()
        else:
-                result[:, feature] = self.X_[:, feature]
+            result[:, feature] = X
        return result
    def transform(self, X):
        """Discretize X values.
@@ -127,28 +117,28 @@ class FImdlp(TransformerMixin, BaseEstimator):
        """
        # Check if fit has been called
        check_is_fitted(self, "n_features_")
        # Input validation
        X = check_array(X)
        # Check that the input is of the same shape as the one passed
        # during fit.
-        # if X.shape[1] != self.n_features_:
+        if X.shape[1] != self.n_features_:
-        #     raise ValueError(
+            raise ValueError(
-        #         "Shape of input is different from what was seen in `fit`"
+                "Shape of input is different from what was seen in `fit`"
-        #     )
+            )
-        result = np.zeros_like(X, dtype=np.int32) - 1
+        result = np.zeros_like(X, dtype=np.int32) - 1
-        # Can do it in parallel
+        Parallel(n_jobs=self.n_jobs, prefer="threads")(
-        for feature in range(self.n_features_):
+            delayed(self._discretize_feature)(feature, X[:, feature], result)
-            if feature in self.features_:
+            for feature in range(self.n_features_)
                result[:, feature] = np.searchsorted(
                    self.cut_points_[feature], X[:, feature]
        )
            else:
                result[:, feature] = X[:, feature]
        return result
    def get_cut_points(self):
        """Get the cut points for each feature.
        Returns
        -------
        result: list
            The list of cut points for each feature.
        """
        result = []
        for feature in range(self.n_features_):
            result.append(self.cut_points_[feature])
--- a/fimdlp/testcpp/FImdlp_unittest.cc
+++ b/fimdlp/testcpp/FImdlp_unittest.cc
@@ -1,74 +1,63 @@
-//#include "gtest/gtest.h"
+#include "gtest/gtest.h"
-//#include "../Metrics.h"
+#include "../Metrics.h"
-//#include "../CPPFImdlp.h"
+#include "../CPPFImdlp.h"
-//namespace mdlp {
+namespace mdlp {
-//    class TestFImdlp : public CPPFImdlp, public testing::Test {
+    class TestFImdlp: public CPPFImdlp, public testing::Test {
-//    public:
+    public:
-//        TestFImdlp() : CPPFImdlp(true, true) {}
+        TestFImdlp(): CPPFImdlp(false) {}
-//        void SetUp()
+        void SetUp()
-//        {
+        {
-//            //    5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0]
+            //    5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0]
-//            //(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2)
+            //(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2)
-//            X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
+            X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
-//            y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
+            y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
-//            fit(X, y);
+            fit(X, y);
-//        }
+        }
-//        void setProposal(bool value)
+        void setProposal(bool value)
-//        {
+        {
-//            proposal = value;
+            proposal = value;
-//        }
+        }
-//        void initCutPoints()
+        void initIndices()
-//        {
+        {
-//            setCutPoints(cutPoints_t());
+            indices = indices_t();
-//        }
+        }
-//        void initIndices()
+        void checkSortedVector(samples_t& X_, indices_t indices_)
-//        {
+        {
-//            indices = indices_t();
+            X = X_;
-//        }
+            indices = indices_;
-//        void initDiscretized()
+            indices_t testSortedIndices = sortIndices(X);
-//        {
+            precision_t prev = X[testSortedIndices[0]];
-//            xDiscretized = labels();
+            for (auto i = 0; i < X.size(); ++i) {
-//        }
+                EXPECT_EQ(testSortedIndices[i], indices[i]);
-//        void checkSortedVector(samples& X_, indices_t indices_)
+                EXPECT_LE(prev, X[testSortedIndices[i]]);
-//        {
+                prev = X[testSortedIndices[i]];
-//            X = X_;
+            }
-//            indices = indices_;
+        }
-//            indices_t testSortedIndices = sortIndices(X);
+        void checkCutPoints(cutPoints_t& expected)
-//            precision_t prev = X[testSortedIndices[0]];
+        {
-//            for (auto i = 0; i < X.size(); ++i) {
+            int expectedSize = expected.size();
-//                EXPECT_EQ(testSortedIndices[i], indices[i]);
+            EXPECT_EQ(cutPoints.size(), expectedSize);
-//                EXPECT_LE(prev, X[testSortedIndices[i]]);
+            for (auto i = 0; i < expectedSize; i++) {
-//                prev = X[testSortedIndices[i]];
+                EXPECT_EQ(cutPoints[i], expected[i]);
-//            }
+            }
-//        }
+        }
-//        void checkCutPoints(cutPoints_t& expected)
+        template<typename T, typename A>
-//        {
+        void checkVectors(std::vector<T, A> const& expected, std::vector<T, A> const& computed)
-//            int expectedSize = expected.size();
+        {
-//            EXPECT_EQ(cutPoints.size(), expectedSize);
+            EXPECT_EQ(expected.size(), computed.size());
-//            for (auto i = 0; i < expectedSize; i++) {
+            for (auto i = 0; i < expected.size(); i++) {
-//                EXPECT_EQ(cutPoints[i].start, expected[i].start);
+                EXPECT_EQ(expected[i], computed[i]);
-//                EXPECT_EQ(cutPoints[i].end, expected[i].end);
+            }
-//                EXPECT_EQ(cutPoints[i].classNumber, expected[i].classNumber);
+        }
-//                EXPECT_NEAR(cutPoints[i].fromValue, expected[i].fromValue, precision);
+    };
-//                EXPECT_NEAR(cutPoints[i].toValue, expected[i].toValue, precision);
+    TEST_F(TestFImdlp, FitErrorEmptyDataset)
-//            }
+    {
-//        }
+        X = samples_t();
-//        template<typename T, typename A>
+        y = labels_t();
-//        void checkVectors(std::vector<T, A> const& expected, std::vector<T, A> const& computed)
+        EXPECT_THROW(fit(X, y), std::invalid_argument);
-//        {
+    }
-//            EXPECT_EQ(expected.size(), computed.size());
+}
 //            for (auto i = 0; i < expected.size(); i++) {
 //                EXPECT_EQ(expected[i], computed[i]);
 //            }
 //        }
 //    
 //    };
 //    TEST_F(TestFImdlp, FitErrorEmptyDataset)
 //    {
 //        X = samples();
 //        y = labels();
 //        EXPECT_THROW(fit(X, y), std::invalid_argument);
 //    }
 //    TEST_F(TestFImdlp, FitErrorDifferentSize)
 //    {
 //        X = { 1, 2, 3 };
@@ -143,7 +132,7 @@
 //    }
 //    TEST_F(TestFImdlp, DiscretizedValues)
 //    {
-//        labels computed, expected = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+//        labels_t computed, expected = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
 //        computed = getDiscretizedValues();
 //        checkVectors(expected, computed);
 //    }
@@ -157,7 +146,7 @@
 //    TEST_F(TestFImdlp, Constructor)
 //    {
 //        samples X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
-//        labels y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
+//        labels_t y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
 //        setProposal(false);
 //        fit(X, y);
 //        computeCutPointsOriginal();
--- a/fimdlp/testcpp/Metrics_unittest.cc
+++ b/fimdlp/testcpp/Metrics_unittest.cc
@@ -1,31 +1,43 @@
 #include "gtest/gtest.h"
 #include "../Metrics.h"
 namespace mdlp {
    class TestMetrics: public Metrics, public testing::Test {
    public:
        labels_t y;
        samples_t X;
        indices_t indices;
        precision_t precision = 0.000001;
-    TEST(MetricTest, NumClasses)
+
        TestMetrics(): Metrics(y, indices) {}
        void SetUp()
        {
-        labels y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
+            y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
-        indices_t indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+            indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
-        EXPECT_EQ(1, Metrics::numClasses(y, indices, 4, 8));
+            setData(y, indices);
        EXPECT_EQ(2, Metrics::numClasses(y, indices, 0, 10));
        EXPECT_EQ(2, Metrics::numClasses(y, indices, 8, 10));
        }
-    TEST(MetricTest, Entropy)
+    };
    TEST_F(TestMetrics, NumClasses)
    {
-        labels y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
+        y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
-        indices_t indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+        EXPECT_EQ(1, computeNumClasses(4, 8));
-        EXPECT_EQ(1, Metrics::entropy(y, indices, 0, 10, 2));
+        EXPECT_EQ(2, computeNumClasses(0, 10));
-        EXPECT_EQ(0, Metrics::entropy(y, indices, 0, 5, 1));
+        EXPECT_EQ(2, computeNumClasses(8, 10));
        labels yz = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
        ASSERT_NEAR(0.468996, Metrics::entropy(yz, indices, 0, 10, 2), precision);
    }
-    TEST(MetricTest, InformationGain)
+    TEST_F(TestMetrics, Entropy)
    {
-        labels y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
+        EXPECT_EQ(1, entropy(0, 10));
-        indices_t indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+        EXPECT_EQ(0, entropy(0, 5));
-        labels yz = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
+        y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
-        ASSERT_NEAR(1, Metrics::informationGain(y, indices, 0, 10, 5, 2), precision);
+        setData(y, indices);
-        ASSERT_NEAR(0.108032, Metrics::informationGain(yz, indices, 0, 10, 5, 2), precision);
+        ASSERT_NEAR(0.468996, entropy(0, 10), precision);
    }
    TEST_F(TestMetrics, InformationGain)
    {
        ASSERT_NEAR(1, informationGain(0, 5, 10), precision);
        y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
        setData(y, indices);
        ASSERT_NEAR(0.108032, informationGain(0, 5, 10), precision);
    }
 }
--- a/fimdlp/testcpp/main
+++ b/fimdlp/testcpp/main
--- a/fimdlp/testcpp/xx/ArffFiles.cpp
+++ b/fimdlp/testcpp/xx/ArffFiles.cpp
@@ -101,13 +101,13 @@ string ArffFiles::trim(const string& source)
    s.erase(s.find_last_not_of(" \n\r\t") + 1);
    return s;
 }
-vector<int> ArffFiles::factorize(const vector<string>& labels)
+vector<int> ArffFiles::factorize(const vector<string>& labels_t)
 {
    vector<int> yy;
-    yy.reserve(labels.size());
+    yy.reserve(labels_t.size());
    map<string, int> labelMap;
    int i = 0;
-    for (string label : labels) {
+    for (string label : labels_t) {
        if (labelMap.find(label) == labelMap.end()) {
            labelMap[label] = i++;
        }
--- a/fimdlp/testcpp/xx/ArffFiles.h
+++ b/fimdlp/testcpp/xx/ArffFiles.h
@@ -23,6 +23,6 @@ public:
    vector<vector<float>>& getX();
    vector<int>& getY();
    vector<tuple<string, string>> getAttributes();
-    vector<int> factorize(const vector<string>& labels);
+    vector<int> factorize(const vector<string>& labels_t);
 };
 #endif
--- a/fimdlp/tests/FImdlp_test.py
+++ b/fimdlp/tests/FImdlp_test.py
@@ -8,12 +8,14 @@ from ..mdlp import FImdlp
 class FImdlpTest(unittest.TestCase):
    def test_init(self):
        clf = FImdlp()
-        self.assertTrue(clf.proposal)
+        self.assertEqual(-1, clf.n_jobs)
        clf = FImdlp(proposal=False)
        self.assertFalse(clf.proposal)
        clf = FImdlp(proposal=True, n_jobs=7)
        self.assertTrue(clf.proposal)
        self.assertEqual(7, clf.n_jobs)
-    def test_fit(self):
+    def test_fit_proposal(self):
-        clf = FImdlp()
+        clf = FImdlp(proposal=True)
        clf.fit([[1, 2], [3, 4]], [1, 2])
        self.assertEqual(clf.n_features_, 2)
        self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
@@ -25,10 +27,39 @@ class FImdlpTest(unittest.TestCase):
        self.assertTrue(np.array_equal(X, clf.X_))
        self.assertTrue(np.array_equal(y, clf.y_))
        expected = [
-            [4.900000095367432, 5.0, 5.099999904632568, 5.400000095367432],
+            [
-            [2.6999998092651367, 2.9000000953674316],
+                4.900000095367432,
-            [2.3499999046325684, 4.5],
+                5.0,
-            [0.75, 1.399999976158142, 1.5],
+                5.099999904632568,
                5.400000095367432,
                5.699999809265137,
            ],
            [2.6999998092651367, 2.9000000953674316, 3.1999998092651367],
            [2.3499999046325684, 4.5, 4.800000190734863],
            [0.75, 1.399999976158142, 1.5, 1.7000000476837158],
        ]
        self.assertListEqual(expected, clf.get_cut_points())
        self.assertListEqual([0, 1, 2, 3], clf.features_)
        clf.fit(X, y, features=[0, 2, 3])
        self.assertListEqual([0, 2, 3], clf.features_)
    def test_fit_original(self):
        clf = FImdlp(proposal=False)
        clf.fit([[1, 2], [3, 4]], [1, 2])
        self.assertEqual(clf.n_features_, 2)
        self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
        self.assertListEqual(clf.y_.tolist(), [1, 2])
        self.assertListEqual([[], []], clf.get_cut_points())
        X, y = load_iris(return_X_y=True)
        clf.fit(X, y)
        self.assertEqual(clf.n_features_, 4)
        self.assertTrue(np.array_equal(X, clf.X_))
        self.assertTrue(np.array_equal(y, clf.y_))
        expected = [
            [5.5, 5.800000190734863],
            [3.0999999046325684],
            [2.450000047683716, 4.800000190734863, 5.099999904632568],
            [0.800000011920929, 1.7000000476837158],
        ]
        self.assertListEqual(expected, clf.get_cut_points())
        self.assertListEqual([0, 1, 2, 3], clf.features_)
@@ -44,8 +75,38 @@ class FImdlpTest(unittest.TestCase):
        with self.assertRaises(ValueError):
            clf.fit([[1, 2], [3, 4]], [1, 2], unexpected="class_name")
-    def test_transform(self):
+    def test_transform_original(self):
-        clf = FImdlp()
+        clf = FImdlp(proposal=False)
        clf.fit([[1, 2], [3, 4]], [1, 2])
        self.assertEqual(
            clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [0, 0]]
        )
        X, y = load_iris(return_X_y=True)
        clf.fit(X, y)
        self.assertEqual(clf.n_features_, 4)
        self.assertTrue(np.array_equal(X, clf.X_))
        self.assertTrue(np.array_equal(y, clf.y_))
        self.assertListEqual(
            clf.transform(X).tolist(), clf.fit(X, y).transform(X).tolist()
        )
        expected = [
            [0, 0, 1, 1],
            [2, 0, 1, 1],
            [1, 0, 1, 1],
            [0, 0, 1, 1],
            [1, 0, 1, 1],
            [1, 0, 1, 1],
            [1, 0, 1, 1],
        ]
        self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
        with self.assertRaises(ValueError):
            clf.transform([[1, 2, 3], [4, 5, 6]])
        with self.assertRaises(sklearn.exceptions.NotFittedError):
            clf = FImdlp(proposal=False)
            clf.transform([[1, 2], [3, 4]])
    def test_transform_proposal(self):
        clf = FImdlp(proposal=True)
        clf.fit([[1, 2], [3, 4]], [1, 2])
        self.assertEqual(
            clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [0, 0]]
@@ -60,16 +121,16 @@ class FImdlpTest(unittest.TestCase):
        )
        expected = [
            [4, 0, 1, 1],
-            [4, 2, 2, 2],
+            [5, 2, 2, 2],
-            [4, 0, 1, 1],
+            [5, 0, 1, 1],
            [1, 0, 1, 1],
            [4, 1, 1, 1],
-            [4, 2, 1, 1],
+            [5, 2, 1, 1],
-            [4, 1, 1, 1],
+            [5, 1, 1, 1],
        ]
        self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
        with self.assertRaises(ValueError):
            clf.transform([[1, 2, 3], [4, 5, 6]])
        with self.assertRaises(sklearn.exceptions.NotFittedError):
-            clf = FImdlp()
+            clf = FImdlp(proposal=True)
            clf.transform([[1, 2], [3, 4]])
--- a/fimdlp/typesFImdlp.h
+++ b/fimdlp/typesFImdlp.h
@@ -6,8 +6,8 @@
 using namespace std;
 namespace mdlp {
    typedef float precision_t;
-    typedef vector<precision_t> samples;
+    typedef vector<precision_t> samples_t;
-    typedef vector<int> labels;
+    typedef vector<int> labels_t;
    typedef vector<size_t> indices_t;
    typedef vector<precision_t> cutPoints_t;
    typedef map<tuple<int, int>, precision_t> cacheEnt_t;
--- a/sample.py
+++ b/sample.py
@@ -1,37 +0,0 @@
 from fimdlp.mdlp import FImdlp
 from fimdlp.cppfimdlp import CFImdlp
 from sklearn.ensemble import RandomForestClassifier
 import time
 from scipy.io import arff
 import pandas as pd
 path = "fimdlp/testcpp/datasets/"
 # class_name = "speaker"
 # file_name = "kdd_JapaneseVowels.arff"
 class_name = "class"
 # file_name = "mfeat-factors.arff"
 file_name = "letter.arff"
 data = arff.loadarff(path + file_name)
 df = pd.DataFrame(data[0])
 df.dropna(axis=0, how="any", inplace=True)
 dataset = df
 X = df.drop(class_name, axis=1)
 features = X.columns
 class_name = class_name
 y, _ = pd.factorize(df[class_name])
 X = X.to_numpy()
 test = FImdlp()
 now = time.time()
 # test.fit(X, y, features=[i for i in (range(3, 14))])
 test.fit(X, y)
 fit_time = time.time()
 print("Fitting: ", fit_time - now)
 now = time.time()
 Xt = test.transform(X)
 print("Transforming: ", time.time() - now)
 print(test.get_cut_points())
 clf = RandomForestClassifier(random_state=0)
 print(clf.fit(Xt, y).score(Xt, y))
--- a/fimdlp/testcpp/ArffFiles.cpp
+++ b/fimdlp/testcpp/ArffFiles.cpp
@@ -101,13 +101,13 @@ string ArffFiles::trim(const string& source)
    s.erase(s.find_last_not_of(" \n\r\t") + 1);
    return s;
 }
-vector<int> ArffFiles::factorize(const vector<string>& labels)
+vector<int> ArffFiles::factorize(const vector<string>& labels_t)
 {
    vector<int> yy;
-    yy.reserve(labels.size());
+    yy.reserve(labels_t.size());
    map<string, int> labelMap;
    int i = 0;
-    for (string label : labels) {
+    for (string label : labels_t) {
        if (labelMap.find(label) == labelMap.end()) {
            labelMap[label] = i++;
        }
--- a/fimdlp/testcpp/ArffFiles.h
+++ b/fimdlp/testcpp/ArffFiles.h
@@ -23,6 +23,6 @@ public:
    vector<vector<float>>& getX();
    vector<int>& getY();
    vector<tuple<string, string>> getAttributes();
-    vector<int> factorize(const vector<string>& labels);
+    vector<int> factorize(const vector<string>& labels_t);
 };
 #endif
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -0,0 +1,6 @@
 # Build configuration for the C++ sample program.
 cmake_minimum_required(VERSION 3.24)
 project(main)
 # Request the C++17 standard for all targets declared below.
 set(CMAKE_CXX_STANDARD 17)
 # The `sample` executable is built from the local sample sources plus the
 # Metrics and CPPFImdlp implementation files from the fimdlp folder.
 add_executable(sample sample.cpp ArffFiles.cpp ../fimdlp/Metrics.cpp ../fimdlp/CPPFImdlp.cpp)
--- a/fimdlp/testcpp/main.cpp
+++ b/fimdlp/testcpp/main.cpp
@@ -2,7 +2,7 @@
 #include <iostream>
 #include <vector>
 #include <iomanip>
-#include "../CPPFImdlp.h"
+#include "../fimdlp/CPPFImdlp.h"
 using namespace std;
@@ -10,7 +10,7 @@ int main(int argc, char** argv)
 {
    ArffFiles file;
    vector<string> lines;
-    string path = "/Users/rmontanana/Code/FImdlp/fimdlp/testcpp/datasets/";
+    string path = "../fimdlp/testcpp/datasets/";
    map<string, bool > datasets = {
        {"mfeat-factors", true},
        {"iris", true},
@@ -41,7 +41,7 @@ int main(int argc, char** argv)
        }
        cout << y[i] << endl;
    }
-    mdlp::CPPFImdlp test = mdlp::CPPFImdlp();
+    mdlp::CPPFImdlp test = mdlp::CPPFImdlp(false);
    for (auto i = 0; i < attributes.size(); i++) {
        cout << "Cut points for " << get<0>(attributes[i]) << endl;
        cout << "--------------------------" << setprecision(3) << endl;
--- a/samples/sample.py
+++ b/samples/sample.py
@@ -0,0 +1,44 @@
 import time
 import argparse
 import os
 from scipy.io import arff
 import pandas as pd
 from sklearn.ensemble import RandomForestClassifier
 from fimdlp.mdlp import FImdlp
 # Sample script: discretize one of the bundled ARFF datasets with FImdlp,
 # report fit/transform timings and cut points, then score a random forest
 # on the discretized data.
 #
 # Selectable datasets. The flag tells where the class column is:
 # True -> last column, False -> first column (see class_column below).
 datasets = {
    "mfeat-factors": True,
    "iris": True,
    "letter": True,
    "kdd_JapaneseVowels": False,
 }
 # Command line: --proposal stores True and --original stores False in the
 # same destination (args.proposal); the dataset name is a required positional.
 ap = argparse.ArgumentParser()
 ap.add_argument("--proposal", action="store_true")
 ap.add_argument("--original", dest="proposal", action="store_false")
 ap.add_argument("dataset", type=str, choices=datasets.keys())
 args = ap.parse_args()
 # Locate the datasets folder: paths are taken relative to the current
 # directory when it contains "fimdlp", otherwise relative to its parent.
 relative = "" if os.path.isdir("fimdlp") else ".."
 file_name = os.path.join(
    relative, "fimdlp", "testcpp", "datasets", args.dataset
 )
 # Load the ARFF file; data[0] holds the records, wrapped here in a DataFrame.
 data = arff.loadarff(file_name + ".arff")
 df = pd.DataFrame(data[0])
 # Pick the class column by position according to the datasets table.
 class_column = -1 if datasets[args.dataset] else 0
 class_name = df.columns.to_list()[class_column]
 # Features are every column except the class; labels are encoded as
 # integer codes by pd.factorize.
 X = df.drop(class_name, axis=1)
 y, _ = pd.factorize(df[class_name])
 X = X.to_numpy()
 # Fit the discretizer with the selected algorithm variant and time it.
 test = FImdlp(proposal=args.proposal)
 now = time.time()
 test.fit(X, y)
 fit_time = time.time()
 print("Fitting: ", fit_time - now)
 # Time the discretization of the same samples.
 now = time.time()
 Xt = test.transform(X)
 print("Transforming: ", time.time() - now)
 print(test.get_cut_points())
 # The forest is trained and scored on the same discretized data, so the
 # printed value is a training-set score.
 clf = RandomForestClassifier(random_state=0)
 print(
    "Random Forest score with discretized data: ", clf.fit(Xt, y).score(Xt, y)
 )