diff --git a/README.md b/README.md index e249e1c..906273f 100644 --- a/README.md +++ b/README.md @@ -6,5 +6,7 @@ Fayyad - Irani MDLP discretization algorithm ```bash python setup.py build_ext --inplace -python sample.py +python samples/sample.py iris --original +python samples/sample.py iris --proposal +python samples/sample.py -h # for more options ``` diff --git a/fimdlp/CPPFImdlp.cpp b/fimdlp/CPPFImdlp.cpp index 56f8ea2..f078ecb 100644 --- a/fimdlp/CPPFImdlp.cpp +++ b/fimdlp/CPPFImdlp.cpp @@ -1,21 +1,17 @@ #include -#include #include #include #include "CPPFImdlp.h" #include "Metrics.h" namespace mdlp { - CPPFImdlp::CPPFImdlp(): proposal(true), debug(false), indices(indices_t()), y(labels()), metrics(Metrics(y, indices)) - { - } - CPPFImdlp::CPPFImdlp(bool proposal, bool debug): proposal(proposal), debug(debug), indices(indices_t()), y(labels()), metrics(Metrics(y, indices)) + CPPFImdlp::CPPFImdlp(bool proposal):proposal(proposal), indices(indices_t()), y(labels_t()), metrics(Metrics(y, indices)) { } CPPFImdlp::~CPPFImdlp() = default; - CPPFImdlp& CPPFImdlp::fit(samples& X_, labels& y_) + CPPFImdlp& CPPFImdlp::fit(samples_t& X_, labels_t& y_) { X = X_; y = y_; @@ -28,8 +24,10 @@ namespace mdlp { } indices = sortIndices(X_); metrics.setData(y, indices); - //computeCutPoints(0, X.size()); - computeCutPointsProposal(); + if (proposal) + computeCutPointsProposal(); + else + computeCutPoints(0, X.size()); return *this; } void CPPFImdlp::computeCutPoints(size_t start, size_t end) @@ -53,7 +51,6 @@ namespace mdlp { } void CPPFImdlp::computeCutPointsOriginal(size_t start, size_t end) { - size_t idx; precision_t cut; if (end - start < 2) return; @@ -76,14 +73,9 @@ namespace mdlp { yCur = yPrev = y[indices[0]]; numElements = indices.size() - 1; idx = start = 0; - bool firstCutPoint = true; - if (debug) - printf("*idx=%lu -> (-1, -1) Prev(%3.1f, %d) Elementos: %lu\n", idx, xCur, yCur, numElements); while (idx < numElements) { xPivot = xCur; yPivot = yCur; - if (debug) - printf(" Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur); // Read the same values and check class changes do { idx++; @@ -92,17 +84,12 @@ namespace mdlp { if (yCur != yPivot && xCur == xPivot) { yPivot = -1; } - if (debug) - printf(">idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur); } while (idx < numElements && xCur == xPivot); // Check if the class changed and there are more than 1 element if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur) && mdlp(start, idx, indices.size())) { start = idx; cutPoint = (xPrev + xCur) / 2; - if (debug) { - printf("Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = %3.1g \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint); - } cutPoints.push_back(cutPoint); } yPrev = yPivot; @@ -160,7 +147,7 @@ namespace mdlp { return output; } // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes - indices_t CPPFImdlp::sortIndices(samples& X_) + indices_t CPPFImdlp::sortIndices(samples_t& X_) { indices_t idx(X_.size()); iota(idx.begin(), idx.end(), 0); diff --git a/fimdlp/CPPFImdlp.h b/fimdlp/CPPFImdlp.h index bc4285f..cab886a 100644 --- a/fimdlp/CPPFImdlp.h +++ b/fimdlp/CPPFImdlp.h @@ -6,15 +6,14 @@ namespace mdlp { class CPPFImdlp { protected: - bool proposal; // proposed algorithm or original algorithm - bool debug; + bool proposal; indices_t indices; // sorted indices to use with X and y - samples X; - labels y; + samples_t X; + labels_t y; Metrics metrics; cutPoints_t cutPoints; - static indices_t sortIndices(samples&); + static indices_t sortIndices(samples_t&); void computeCutPoints(size_t, size_t); long int getCandidate(size_t, size_t); bool mdlp(size_t, size_t, size_t); @@ -25,11 +24,10 @@ namespace mdlp { void computeCutPointsProposal(); public: - CPPFImdlp(); - CPPFImdlp(bool, bool debug = false); + CPPFImdlp(bool); ~CPPFImdlp(); - CPPFImdlp& fit(samples&, labels&); - samples getCutPoints(); + CPPFImdlp& fit(samples_t&, labels_t&); + samples_t getCutPoints(); }; } #endif \ No newline at end of file diff --git a/fimdlp/Metrics.cpp b/fimdlp/Metrics.cpp index f2e54f5..1275b00 100644 --- a/fimdlp/Metrics.cpp +++ b/fimdlp/Metrics.cpp @@ -1,8 +1,9 @@ #include "Metrics.h" #include +#include using namespace std; namespace mdlp { - Metrics::Metrics(labels& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t()) + Metrics::Metrics(labels_t& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t()) { } int Metrics::computeNumClasses(size_t start, size_t end) @@ -13,7 +14,7 @@ namespace mdlp { } return nClasses.size(); } - void Metrics::setData(labels& y_, indices_t& indices_) + void Metrics::setData(labels_t& y_, indices_t& indices_) { indices = indices_; y = y_; @@ -25,7 +26,7 @@ namespace mdlp { { precision_t p, ventropy = 0; int nElements = 0; - labels counts(numClasses + 1, 0); + labels_t counts(numClasses + 1, 0); if (end - start < 2) return 0; if (entropyCache.find(make_tuple(start, end)) != entropyCache.end()) { diff --git a/fimdlp/Metrics.h b/fimdlp/Metrics.h index 79bc286..a371502 100644 --- a/fimdlp/Metrics.h +++ b/fimdlp/Metrics.h @@ -1,18 +1,17 @@ #ifndef CCMETRICS_H #define CCMETRICS_H #include "typesFImdlp.h" -#include namespace mdlp { class Metrics { protected: - labels& y; + labels_t& y; indices_t& indices; int numClasses; cacheEnt_t entropyCache; cacheIg_t igCache; public: - Metrics(labels&, indices_t&); - void setData(labels&, indices_t&); + Metrics(labels_t&, indices_t&); + void setData(labels_t&, indices_t&); int computeNumClasses(size_t, size_t); precision_t entropy(size_t, size_t); precision_t informationGain(size_t, size_t, size_t); diff --git a/fimdlp/_version.py b/fimdlp/_version.py index c12f34c..d69d16e 100644 --- a/fimdlp/_version.py +++ b/fimdlp/_version.py @@ -1 +1 @@ -__version__ = '0.1.1' \ No newline at end of file +__version__ = "0.9.1" diff --git a/fimdlp/cfimdlp.pyx b/fimdlp/cfimdlp.pyx index db87af1..3ffea79 100644 --- a/fimdlp/cfimdlp.pyx +++ b/fimdlp/cfimdlp.pyx @@ -6,24 +6,15 @@ from libcpp cimport bool cdef extern from "CPPFImdlp.h" namespace "mdlp": ctypedef float precision_t cdef cppclass CPPFImdlp: - CPPFImdlp() except + - CPPFImdlp(bool, bool) except + + CPPFImdlp(bool) except + CPPFImdlp& fit(vector[precision_t]&, vector[int]&) vector[precision_t] getCutPoints() -class PcutPoint_t: - def __init__(self, start, end, fromValue, toValue): - self.start = start - self.end = end - self.fromValue = fromValue - self.toValue = toValue - cdef class CFImdlp: cdef CPPFImdlp *thisptr - def __cinit__(self, debug=False, proposal=True): - # Proposal or original algorithm - self.thisptr = new CPPFImdlp(proposal, debug) + def __cinit__(self, proposal): + self.thisptr = new CPPFImdlp(proposal) def __dealloc__(self): del self.thisptr def fit(self, X, y): diff --git a/fimdlp/cppfimdlp.cpython-310-darwin.so b/fimdlp/cppfimdlp.cpython-310-darwin.so index d5b5e7b..b61ccc0 100755 Binary files a/fimdlp/cppfimdlp.cpython-310-darwin.so and b/fimdlp/cppfimdlp.cpython-310-darwin.so differ diff --git a/fimdlp/mdlp.py b/fimdlp/mdlp.py index 50e5ca7..ab82dc6 100644 --- a/fimdlp/mdlp.py +++ b/fimdlp/mdlp.py @@ -3,33 +3,35 @@ from .cppfimdlp import CFImdlp from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.multiclass import unique_labels from sklearn.utils.validation import check_X_y, check_array, check_is_fitted +from joblib import Parallel, delayed class FImdlp(TransformerMixin, BaseEstimator): - def __init__(self, proposal=True): - self.proposal = proposal # proposed algorithm or original algorithm + def __init__(self, n_jobs=-1, proposal=False): + self.n_jobs = n_jobs + self.proposal = proposal - """Fayyad - Irani MDLP discretization algorithm. + """Fayyad - Irani MDLP discretization algorithm based implementation. Parameters ---------- - demo_param : str, default='demo' - A parameter used for demonstation of how to pass and store paramters. + n_jobs : int, default=-1 + The number of jobs to run in parallel. :meth:`fit` and + :meth:`transform`, are parallelized over the features. ``-1`` means + using all cores available. Attributes ---------- n_features_ : int The number of features of the data passed to :meth:`fit`. discretizer_ : list - The list of discretizers for each feature. + The list of discretizers, one for each feature. cut_points_ : list The list of cut points for each feature. X_ : array the samples used to fit, shape (n_samples, n_features) y_ : array the labels used to fit, shape (n_samples,) - discretized_X_ : - array of the discretized samples passed to fit(n_samples, n_features) features_ : list the list of features to be discretized """ @@ -70,6 +72,8 @@ class FImdlp(TransformerMixin, BaseEstimator): y : None There is no need of a target in a transformer, yet the pipeline API requires this parameter. + features : list, default=[i for i in range(n_features)] + The list of features to be discretized. Returns ------- self : object @@ -83,36 +87,22 @@ class FImdlp(TransformerMixin, BaseEstimator): self.y_ = y self.discretizer_ = [None] * self.n_features_ self.cut_points_ = [None] * self.n_features_ - # Can do it in parallel - for feature in self.features_: - self.discretizer_[feature] = CFImdlp( - proposal=self.proposal, debug=False - ) - self.discretizer_[feature].fit(X[:, feature], y) - self.cut_points_[feature] = self.discretizer_[ - feature - ].get_cut_points() + Parallel(n_jobs=self.n_jobs, prefer="threads")( + delayed(self._fit_discretizer)(feature) + for feature in range(self.n_features_) + ) return self - def get_fitted(self): - """Return the discretized X computed during fit. + def _fit_discretizer(self, feature): + self.discretizer_[feature] = CFImdlp(proposal=self.proposal) + self.discretizer_[feature].fit(self.X_[:, feature], self.y_) + self.cut_points_[feature] = self.discretizer_[feature].get_cut_points() - Returns - ------- - X_transformed : array, shape (n_samples, n_features) - discretized X computed during fit. - """ - # Check is fit had been called - check_is_fitted(self, "n_features_") - result = np.zeros_like(self.X_, dtype=np.int32) - 1 - for feature in range(self.n_features_): - if feature in self.features_: - result[:, feature] = self.discretizer_[ - feature - ].get_discretized_values() - else: - result[:, feature] = self.X_[:, feature] - return result + def _discretize_feature(self, feature, X, result): + if feature in self.features_: + result[:, feature] = np.searchsorted(self.cut_points_[feature], X) + else: + result[:, feature] = X def transform(self, X): """Discretize X values. @@ -127,28 +117,28 @@ class FImdlp(TransformerMixin, BaseEstimator): """ # Check is fit had been called check_is_fitted(self, "n_features_") - # Input validation X = check_array(X) - # Check that the input is of the same shape as the one passed # during fit. - # if X.shape[1] != self.n_features_: - # raise ValueError( - # "Shape of input is different from what was seen in `fit`" - # ) + if X.shape[1] != self.n_features_: + raise ValueError( + "Shape of input is different from what was seen in `fit`" + ) result = np.zeros_like(X, dtype=np.int32) - 1 - # Can do it in parallel - for feature in range(self.n_features_): - if feature in self.features_: - result[:, feature] = np.searchsorted( - self.cut_points_[feature], X[:, feature] - ) - else: - result[:, feature] = X[:, feature] + Parallel(n_jobs=self.n_jobs, prefer="threads")( + delayed(self._discretize_feature)(feature, X[:, feature], result) + for feature in range(self.n_features_) + ) return result def get_cut_points(self): + """Get the cut points for each feature. + Returns + ------- + result: list + The list of cut points for each feature. + """ result = [] for feature in range(self.n_features_): result.append(self.cut_points_[feature]) diff --git a/fimdlp/testcpp/FImdlp_unittest.cc b/fimdlp/testcpp/FImdlp_unittest.cc index 173bd41..1382132 100644 --- a/fimdlp/testcpp/FImdlp_unittest.cc +++ b/fimdlp/testcpp/FImdlp_unittest.cc @@ -1,74 +1,63 @@ -//#include "gtest/gtest.h" -//#include "../Metrics.h" -//#include "../CPPFImdlp.h" -//namespace mdlp { -// class TestFImdlp : public CPPFImdlp, public testing::Test { -// public: -// TestFImdlp() : CPPFImdlp(true, true) {} -// void SetUp() -// { -// // 5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0] -// //(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2) -// X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; -// y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; -// fit(X, y); -// } -// void setProposal(bool value) -// { -// proposal = value; -// } -// void initCutPoints() -// { -// setCutPoints(cutPoints_t()); -// } -// void initIndices() -// { -// indices = indices_t(); -// } -// void initDiscretized() -// { -// xDiscretized = labels(); -// } -// void checkSortedVector(samples& X_, indices_t indices_) -// { -// X = X_; -// indices = indices_; -// indices_t testSortedIndices = sortIndices(X); -// precision_t prev = X[testSortedIndices[0]]; -// for (auto i = 0; i < X.size(); ++i) { -// EXPECT_EQ(testSortedIndices[i], indices[i]); -// EXPECT_LE(prev, X[testSortedIndices[i]]); -// prev = X[testSortedIndices[i]]; -// } -// } -// void checkCutPoints(cutPoints_t& expected) -// { -// int expectedSize = expected.size(); -// EXPECT_EQ(cutPoints.size(), expectedSize); -// for (auto i = 0; i < expectedSize; i++) { -// EXPECT_EQ(cutPoints[i].start, expected[i].start); -// EXPECT_EQ(cutPoints[i].end, expected[i].end); -// EXPECT_EQ(cutPoints[i].classNumber, expected[i].classNumber); -// EXPECT_NEAR(cutPoints[i].fromValue, expected[i].fromValue, precision); -// EXPECT_NEAR(cutPoints[i].toValue, expected[i].toValue, precision); -// } -// } -// template -// void checkVectors(std::vector const& expected, std::vector const& computed) -// { -// EXPECT_EQ(expected.size(), computed.size()); -// for (auto i = 0; i < expected.size(); i++) { -// EXPECT_EQ(expected[i], computed[i]); -// } -// } -// -// }; -// TEST_F(TestFImdlp, FitErrorEmptyDataset) -// { -// X = samples(); -// y = labels(); -// EXPECT_THROW(fit(X, y), std::invalid_argument); -// } +#include "gtest/gtest.h" +#include "../Metrics.h" +#include "../CPPFImdlp.h" +namespace mdlp { + class TestFImdlp: public CPPFImdlp, public testing::Test { + public: + TestFImdlp(): CPPFImdlp(false) {} + void SetUp() + { + // 5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0] + //(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2) + X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; + y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; + fit(X, y); + } + void setProposal(bool value) + { + proposal = value; + } + void initIndices() + { + indices = indices_t(); + } + void checkSortedVector(samples_t& X_, indices_t indices_) + { + X = X_; + indices = indices_; + indices_t testSortedIndices = sortIndices(X); + precision_t prev = X[testSortedIndices[0]]; + for (auto i = 0; i < X.size(); ++i) { + EXPECT_EQ(testSortedIndices[i], indices[i]); + EXPECT_LE(prev, X[testSortedIndices[i]]); + prev = X[testSortedIndices[i]]; + } + } + void checkCutPoints(cutPoints_t& expected) + { + int expectedSize = expected.size(); + EXPECT_EQ(cutPoints.size(), expectedSize); + for (auto i = 0; i < expectedSize; i++) { + EXPECT_EQ(cutPoints[i], expected[i]); + } + } + template + void checkVectors(std::vector const& expected, std::vector const& computed) + { + EXPECT_EQ(expected.size(), computed.size()); + for (auto i = 0; i < expected.size(); i++) { + EXPECT_EQ(expected[i], computed[i]); + } + } + }; + TEST_F(TestFImdlp, FitErrorEmptyDataset) + { + X = samples_t(); + y = labels_t(); + EXPECT_THROW(fit(X, y), std::invalid_argument); + } +} +// // TEST_F(TestFImdlp, FitErrorDifferentSize) // { // X = { 1, 2, 3 }; @@ -143,7 +132,7 @@ // } // TEST_F(TestFImdlp, DiscretizedValues) // { -// labels computed, expected = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; +// labels_t computed, expected = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; // computed = getDiscretizedValues(); // checkVectors(expected, computed); // } @@ -157,7 +146,7 @@ // TEST_F(TestFImdlp, Constructor) // { // samples X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; -// labels y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; +// labels_t y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; // setProposal(false); // fit(X, y); // computeCutPointsOriginal(); diff --git a/fimdlp/testcpp/Metrics_unittest.cc b/fimdlp/testcpp/Metrics_unittest.cc index 0bea1c1..c6e3e56 100644 --- a/fimdlp/testcpp/Metrics_unittest.cc +++ b/fimdlp/testcpp/Metrics_unittest.cc @@ -1,31 +1,43 @@ #include "gtest/gtest.h" #include "../Metrics.h" + namespace mdlp { - precision_t precision = 0.000001; - TEST(MetricTest, NumClasses) + class TestMetrics: public Metrics, public testing::Test { + public: + labels_t y; + samples_t X; + indices_t indices; + precision_t precision = 0.000001; + + TestMetrics(): Metrics(y, indices) {} + void SetUp() + { + y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; + indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + setData(y, indices); + } + }; + TEST_F(TestMetrics, NumClasses) { - labels y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 }; - indices_t indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; - EXPECT_EQ(1, Metrics::numClasses(y, indices, 4, 8)); - EXPECT_EQ(2, Metrics::numClasses(y, indices, 0, 10)); - EXPECT_EQ(2, Metrics::numClasses(y, indices, 8, 10)); + y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 }; + EXPECT_EQ(1, computeNumClasses(4, 8)); + EXPECT_EQ(2, computeNumClasses(0, 10)); + EXPECT_EQ(2, computeNumClasses(8, 10)); } - TEST(MetricTest, Entropy) + TEST_F(TestMetrics, Entropy) { - labels y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; - indices_t indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; - EXPECT_EQ(1, Metrics::entropy(y, indices, 0, 10, 2)); - EXPECT_EQ(0, Metrics::entropy(y, indices, 0, 5, 1)); - labels yz = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 }; - ASSERT_NEAR(0.468996, Metrics::entropy(yz, indices, 0, 10, 2), precision); + EXPECT_EQ(1, entropy(0, 10)); + EXPECT_EQ(0, entropy(0, 5)); + y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 }; + setData(y, indices); + ASSERT_NEAR(0.468996, entropy(0, 10), precision); } - TEST(MetricTest, InformationGain) + TEST_F(TestMetrics, InformationGain) { - labels y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; - indices_t indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; - labels yz = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 }; - ASSERT_NEAR(1, Metrics::informationGain(y, indices, 0, 10, 5, 2), precision); - ASSERT_NEAR(0.108032, Metrics::informationGain(yz, indices, 0, 10, 5, 2), precision); + ASSERT_NEAR(1, informationGain(0, 5, 10), precision); + y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 }; + setData(y, indices); + ASSERT_NEAR(0.108032, informationGain(0, 5, 10), precision); } -} \ No newline at end of file +} diff --git a/fimdlp/testcpp/main b/fimdlp/testcpp/main deleted file mode 100755 index a1bfdfb..0000000 Binary files a/fimdlp/testcpp/main and /dev/null differ diff --git a/fimdlp/testcpp/xx/ArffFiles.cpp b/fimdlp/testcpp/xx/ArffFiles.cpp index b8a8928..9baf861 100644 --- a/fimdlp/testcpp/xx/ArffFiles.cpp +++ b/fimdlp/testcpp/xx/ArffFiles.cpp @@ -101,13 +101,13 @@ string ArffFiles::trim(const string& source) s.erase(s.find_last_not_of(" \n\r\t") + 1); return s; } -vector ArffFiles::factorize(const vector& labels) +vector ArffFiles::factorize(const vector& labels_t) { vector yy; - yy.reserve(labels.size()); + yy.reserve(labels_t.size()); map labelMap; int i = 0; - for (string label : labels) { + for (string label : labels_t) { if (labelMap.find(label) == labelMap.end()) { labelMap[label] = i++; } diff --git a/fimdlp/testcpp/xx/ArffFiles.h b/fimdlp/testcpp/xx/ArffFiles.h index 317ebb5..6986d3b 100644 --- a/fimdlp/testcpp/xx/ArffFiles.h +++ b/fimdlp/testcpp/xx/ArffFiles.h @@ -23,6 +23,6 @@ public: vector>& getX(); vector& getY(); vector> getAttributes(); - vector factorize(const vector& labels); + vector factorize(const vector& labels_t); }; #endif \ No newline at end of file diff --git a/fimdlp/tests/FImdlp_test.py b/fimdlp/tests/FImdlp_test.py index 6d136ee..9e681e6 100644 --- a/fimdlp/tests/FImdlp_test.py +++ b/fimdlp/tests/FImdlp_test.py @@ -8,12 +8,14 @@ from ..mdlp import FImdlp class FImdlpTest(unittest.TestCase): def test_init(self): clf = FImdlp() - self.assertTrue(clf.proposal) - clf = FImdlp(proposal=False) + self.assertEqual(-1, clf.n_jobs) self.assertFalse(clf.proposal) + clf = FImdlp(proposal=True, n_jobs=7) + self.assertTrue(clf.proposal) + self.assertEqual(7, clf.n_jobs) - def test_fit(self): - clf = FImdlp() + def test_fit_proposal(self): + clf = FImdlp(proposal=True) clf.fit([[1, 2], [3, 4]], [1, 2]) self.assertEqual(clf.n_features_, 2) self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]]) @@ -25,10 +27,39 @@ class FImdlpTest(unittest.TestCase): self.assertTrue(np.array_equal(X, clf.X_)) self.assertTrue(np.array_equal(y, clf.y_)) expected = [ - [4.900000095367432, 5.0, 5.099999904632568, 5.400000095367432], - [2.6999998092651367, 2.9000000953674316], - [2.3499999046325684, 4.5], - [0.75, 1.399999976158142, 1.5], + [ + 4.900000095367432, + 5.0, + 5.099999904632568, + 5.400000095367432, + 5.699999809265137, + ], + [2.6999998092651367, 2.9000000953674316, 3.1999998092651367], + [2.3499999046325684, 4.5, 4.800000190734863], + [0.75, 1.399999976158142, 1.5, 1.7000000476837158], + ] + self.assertListEqual(expected, clf.get_cut_points()) + self.assertListEqual([0, 1, 2, 3], clf.features_) + clf.fit(X, y, features=[0, 2, 3]) + self.assertListEqual([0, 2, 3], clf.features_) + + def test_fit_original(self): + clf = FImdlp(proposal=False) + clf.fit([[1, 2], [3, 4]], [1, 2]) + self.assertEqual(clf.n_features_, 2) + self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]]) + self.assertListEqual(clf.y_.tolist(), [1, 2]) + self.assertListEqual([[], []], clf.get_cut_points()) + X, y = load_iris(return_X_y=True) + clf.fit(X, y) + self.assertEqual(clf.n_features_, 4) + self.assertTrue(np.array_equal(X, clf.X_)) + self.assertTrue(np.array_equal(y, clf.y_)) + expected = [ + [5.5, 5.800000190734863], + [3.0999999046325684], + [2.450000047683716, 4.800000190734863, 5.099999904632568], + [0.800000011920929, 1.7000000476837158], ] self.assertListEqual(expected, clf.get_cut_points()) self.assertListEqual([0, 1, 2, 3], clf.features_) @@ -44,8 +75,38 @@ class FImdlpTest(unittest.TestCase): with self.assertRaises(ValueError): clf.fit([[1, 2], [3, 4]], [1, 2], unexpected="class_name") - def test_transform(self): - clf = FImdlp() + def test_transform_original(self): + clf = FImdlp(proposal=False) + clf.fit([[1, 2], [3, 4]], [1, 2]) + self.assertEqual( + clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [0, 0]] + ) + X, y = load_iris(return_X_y=True) + clf.fit(X, y) + self.assertEqual(clf.n_features_, 4) + self.assertTrue(np.array_equal(X, clf.X_)) + self.assertTrue(np.array_equal(y, clf.y_)) + self.assertListEqual( + clf.transform(X).tolist(), clf.fit(X, y).transform(X).tolist() + ) + expected = [ + [0, 0, 1, 1], + [2, 0, 1, 1], + [1, 0, 1, 1], + [0, 0, 1, 1], + [1, 0, 1, 1], + [1, 0, 1, 1], + [1, 0, 1, 1], + ] + self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected)) + with self.assertRaises(ValueError): + clf.transform([[1, 2, 3], [4, 5, 6]]) + with self.assertRaises(sklearn.exceptions.NotFittedError): + clf = FImdlp(proposal=False) + clf.transform([[1, 2], [3, 4]]) + + def test_transform_proposal(self): + clf = FImdlp(proposal=True) clf.fit([[1, 2], [3, 4]], [1, 2]) self.assertEqual( clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [0, 0]] @@ -60,16 +121,16 @@ class FImdlpTest(unittest.TestCase): ) expected = [ [4, 0, 1, 1], - [4, 2, 2, 2], - [4, 0, 1, 1], + [5, 2, 2, 2], + [5, 0, 1, 1], [1, 0, 1, 1], [4, 1, 1, 1], - [4, 2, 1, 1], - [4, 1, 1, 1], + [5, 2, 1, 1], + [5, 1, 1, 1], ] self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected)) with self.assertRaises(ValueError): clf.transform([[1, 2, 3], [4, 5, 6]]) with self.assertRaises(sklearn.exceptions.NotFittedError): - clf = FImdlp() + clf = FImdlp(proposal=True) clf.transform([[1, 2], [3, 4]]) diff --git a/fimdlp/typesFImdlp.h b/fimdlp/typesFImdlp.h index b94b943..43e5842 100644 --- a/fimdlp/typesFImdlp.h +++ b/fimdlp/typesFImdlp.h @@ -6,8 +6,8 @@ using namespace std; namespace mdlp { typedef float precision_t; - typedef vector samples; - typedef vector labels; + typedef vector samples_t; + typedef vector labels_t; typedef vector indices_t; typedef vector cutPoints_t; typedef map, precision_t> cacheEnt_t; diff --git a/sample.py b/sample.py deleted file mode 100644 index 413df30..0000000 --- a/sample.py +++ /dev/null @@ -1,37 +0,0 @@ -from fimdlp.mdlp import FImdlp -from fimdlp.cppfimdlp import CFImdlp -from sklearn.ensemble import RandomForestClassifier -import time - -from scipy.io import arff -import pandas as pd - -path = "fimdlp/testcpp/datasets/" -# class_name = "speaker" -# file_name = "kdd_JapaneseVowels.arff" -class_name = "class" -# file_name = "mfeat-factors.arff" -file_name = "letter.arff" -data = arff.loadarff(path + file_name) -df = pd.DataFrame(data[0]) -df.dropna(axis=0, how="any", inplace=True) -dataset = df -X = df.drop(class_name, axis=1) -features = X.columns -class_name = class_name -y, _ = pd.factorize(df[class_name]) -X = X.to_numpy() - -test = FImdlp() -now = time.time() -# test.fit(X, y, features=[i for i in (range(3, 14))]) -test.fit(X, y) -fit_time = time.time() -print("Fitting: ", fit_time - now) -now = time.time() -Xt = test.transform(X) -print("Transforming: ", time.time() - now) -print(test.get_cut_points()) - -clf = RandomForestClassifier(random_state=0) -print(clf.fit(Xt, y).score(Xt, y)) diff --git a/fimdlp/testcpp/ArffFiles.cpp b/samples/ArffFiles.cpp similarity index 95% rename from fimdlp/testcpp/ArffFiles.cpp rename to samples/ArffFiles.cpp index b8a8928..9baf861 100644 --- a/fimdlp/testcpp/ArffFiles.cpp +++ b/samples/ArffFiles.cpp @@ -101,13 +101,13 @@ string ArffFiles::trim(const string& source) s.erase(s.find_last_not_of(" \n\r\t") + 1); return s; } -vector ArffFiles::factorize(const vector& labels) +vector ArffFiles::factorize(const vector& labels_t) { vector yy; - yy.reserve(labels.size()); + yy.reserve(labels_t.size()); map labelMap; int i = 0; - for (string label : labels) { + for (string label : labels_t) { if (labelMap.find(label) == labelMap.end()) { labelMap[label] = i++; } diff --git a/fimdlp/testcpp/ArffFiles.h b/samples/ArffFiles.h similarity index 91% rename from fimdlp/testcpp/ArffFiles.h rename to samples/ArffFiles.h index 317ebb5..6986d3b 100644 --- a/fimdlp/testcpp/ArffFiles.h +++ b/samples/ArffFiles.h @@ -23,6 +23,6 @@ public: vector>& getX(); vector& getY(); vector> getAttributes(); - vector factorize(const vector& labels); + vector factorize(const vector& labels_t); }; #endif \ No newline at end of file diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt new file mode 100644 index 0000000..df85077 --- /dev/null +++ b/samples/CMakeLists.txt @@ -0,0 +1,6 @@ +cmake_minimum_required(VERSION 3.24) +project(main) + +set(CMAKE_CXX_STANDARD 17) + +add_executable(sample sample.cpp ArffFiles.cpp ../fimdlp/Metrics.cpp ../fimdlp/CPPFImdlp.cpp) diff --git a/fimdlp/testcpp/main.cpp b/samples/sample.cpp similarity index 91% rename from fimdlp/testcpp/main.cpp rename to samples/sample.cpp index 201b930..1a9e407 100644 --- a/fimdlp/testcpp/main.cpp +++ b/samples/sample.cpp @@ -2,7 +2,7 @@ #include #include #include -#include "../CPPFImdlp.h" +#include "../fimdlp/CPPFImdlp.h" using namespace std; @@ -10,7 +10,7 @@ int main(int argc, char** argv) { ArffFiles file; vector lines; - string path = "/Users/rmontanana/Code/FImdlp/fimdlp/testcpp/datasets/"; + string path = "../fimdlp/testcpp/datasets/"; map datasets = { {"mfeat-factors", true}, {"iris", true}, @@ -41,7 +41,7 @@ int main(int argc, char** argv) } cout << y[i] << endl; } - mdlp::CPPFImdlp test = mdlp::CPPFImdlp(); + mdlp::CPPFImdlp test = mdlp::CPPFImdlp(false); for (auto i = 0; i < attributes.size(); i++) { cout << "Cut points for " << get<0>(attributes[i]) << endl; cout << "--------------------------" << setprecision(3) << endl; diff --git a/samples/sample.py b/samples/sample.py new file mode 100644 index 0000000..6fcf065 --- /dev/null +++ b/samples/sample.py @@ -0,0 +1,44 @@ +import time +import argparse +import os +from scipy.io import arff +import pandas as pd +from sklearn.ensemble import RandomForestClassifier +from fimdlp.mdlp import FImdlp + +datasets = { + "mfeat-factors": True, + "iris": True, + "letter": True, + "kdd_JapaneseVowels": False, +} + +ap = argparse.ArgumentParser() +ap.add_argument("--proposal", action="store_true") +ap.add_argument("--original", dest="proposal", action="store_false") +ap.add_argument("dataset", type=str, choices=datasets.keys()) +args = ap.parse_args() +relative = "" if os.path.isdir("fimdlp") else ".." +file_name = os.path.join( + relative, "fimdlp", "testcpp", "datasets", args.dataset +) +data = arff.loadarff(file_name + ".arff") +df = pd.DataFrame(data[0]) +class_column = -1 if datasets[args.dataset] else 0 +class_name = df.columns.to_list()[class_column] +X = df.drop(class_name, axis=1) +y, _ = pd.factorize(df[class_name]) +X = X.to_numpy() +test = FImdlp(proposal=args.proposal) +now = time.time() +test.fit(X, y) +fit_time = time.time() +print("Fitting: ", fit_time - now) +now = time.time() +Xt = test.transform(X) +print("Transforming: ", time.time() - now) +print(test.get_cut_points()) +clf = RandomForestClassifier(random_state=0) +print( + "Random Forest score with discretized data: ", clf.fit(Xt, y).score(Xt, y) +)