diff --git a/cppfimdlp.cpython-310-darwin.so b/cppfimdlp.cpython-310-darwin.so new file mode 100755 index 0000000..d246baf Binary files /dev/null and b/cppfimdlp.cpython-310-darwin.so differ diff --git a/fimdlp/CPPFImdlp.cpp b/fimdlp/CPPFImdlp.cpp index a28d4c4..d203542 100644 --- a/fimdlp/CPPFImdlp.cpp +++ b/fimdlp/CPPFImdlp.cpp @@ -12,21 +12,24 @@ namespace mdlp { return os; } - CPPFImdlp::CPPFImdlp() : proposed(true), precision(6), debug(false) + CPPFImdlp::CPPFImdlp() : proposal(true), precision(6), debug(false) { divider = pow(10, precision); numClasses = 0; } - CPPFImdlp::CPPFImdlp(bool proposed, int precision, bool debug) : proposed(proposed), precision(precision), debug(debug) + CPPFImdlp::CPPFImdlp(bool proposal, int precision, bool debug) : proposal(proposal), precision(precision), debug(debug) { divider = pow(10, precision); numClasses = 0; } CPPFImdlp::~CPPFImdlp() = default; - std::vector CPPFImdlp::getCutPoints() + samples CPPFImdlp::getCutPoints() { - return cutPoints; + samples output(cutPoints.size()); + std::transform(cutPoints.begin(), cutPoints.end(), output.begin(), + [](cutPoint_t cut) { return cut.toValue; }); + return output; } labels CPPFImdlp::getDiscretizedValues() { @@ -48,28 +51,19 @@ namespace mdlp { this->xDiscretized = labels(X.size(), -1); this->numClasses = Metrics::numClasses(y, indices, 0, X.size()); - if (proposed) { - computeCutPointsProposed(); + if (proposal) { + computeCutPointsProposal(); } else { computeCutPointsOriginal(); } filterCutPoints(); - applyCutPoints(); - return *this; - } - labels& CPPFImdlp::transform(samples& X_) - { - indices_t indices_transform = sortIndices(X_); - applyCutPoints(); - return xDiscretized; - } - void CPPFImdlp::applyCutPoints() - { + // Apply cut points to the input vector for (auto cut : cutPoints) { for (size_t i = cut.start; i < cut.end; i++) { xDiscretized[indices[i]] = cut.classNumber; } } + return *this; } bool CPPFImdlp::evaluateCutPoint(cutPoint_t rest, cutPoint_t candidate) { @@ -142,7 +136,7 @@ namespace mdlp { } cutPoints = filtered; } - void CPPFImdlp::computeCutPointsProposed() + void CPPFImdlp::computeCutPointsProposal() { cutPoints_t cutPts; cutPoint_t cutPoint; @@ -206,7 +200,7 @@ namespace mdlp { if (debug) { std::cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, numElements + 1, numClasses) << std::endl; for (auto cutPt : cutPts) - std::cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << " :Proposed: Cut point: " << cutPt; + std::cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << " :Proposal: Cut point: " << cutPt; } cutPoints = cutPts; } diff --git a/fimdlp/CPPFImdlp.h b/fimdlp/CPPFImdlp.h index 5f29fb8..8965092 100644 --- a/fimdlp/CPPFImdlp.h +++ b/fimdlp/CPPFImdlp.h @@ -5,7 +5,7 @@ namespace mdlp { class CPPFImdlp { protected: - bool proposed; // proposed algorithm or original algorithm + bool proposal; // proposed algorithm or original algorithm int precision; bool debug; float divider; @@ -19,21 +19,20 @@ namespace mdlp { void setCutPoints(cutPoints_t); static indices_t sortIndices(samples&); void computeCutPointsOriginal(); - void computeCutPointsProposed(); + void computeCutPointsProposal(); bool evaluateCutPoint(cutPoint_t, cutPoint_t); void filterCutPoints(); - void applyCutPoints(); public: CPPFImdlp(); CPPFImdlp(bool, int, bool debug = false); ~CPPFImdlp(); - cutPoints_t getCutPoints(); + samples getCutPoints(); indices_t getIndices(); labels getDiscretizedValues(); void debugPoints(samples&, labels&); CPPFImdlp& fit(samples&, labels&); - labels& transform(samples&); + labels transform(samples&); }; } #endif \ No newline at end of file diff --git a/fimdlp/cfimdlp.pyx b/fimdlp/cfimdlp.pyx index 5093f96..fa3148d 100644 --- a/fimdlp/cfimdlp.pyx +++ b/fimdlp/cfimdlp.pyx @@ -12,9 +12,8 @@ cdef extern from "CPPFImdlp.h" namespace "mdlp": CPPFImdlp() except + CPPFImdlp(bool, int, bool) except + CPPFImdlp& fit(vector[float]&, vector[int]&) - vector[int] transform(vector[float]&) vector[int] getDiscretizedValues() - vector[CutPointBody] getCutPoints() + vector[float] getCutPoints() class PcutPoint_t: @@ -26,16 +25,14 @@ class PcutPoint_t: cdef class CFImdlp: cdef CPPFImdlp *thisptr - def __cinit__(self, precision=6, debug=False, proposed=True): - # Proposed or original algorithm - self.thisptr = new CPPFImdlp(proposed, precision, debug) + def __cinit__(self, precision=6, debug=False, proposal=True): + # Proposal or original algorithm + self.thisptr = new CPPFImdlp(proposal, precision, debug) def __dealloc__(self): del self.thisptr def fit(self, X, y): self.thisptr.fit(X, y) return self - def transform(self, X): - return self.thisptr.transform(X) def get_discretized_values(self): return self.thisptr.getDiscretizedValues() def get_cut_points(self): diff --git a/fimdlp/cppfimdlp.cpython-310-darwin.so b/fimdlp/cppfimdlp.cpython-310-darwin.so index 623dd44..a367999 100755 Binary files a/fimdlp/cppfimdlp.cpython-310-darwin.so and b/fimdlp/cppfimdlp.cpython-310-darwin.so differ diff --git a/fimdlp/mdlp.py b/fimdlp/mdlp.py index 090766a..a0603b1 100644 --- a/fimdlp/mdlp.py +++ b/fimdlp/mdlp.py @@ -6,6 +6,9 @@ from sklearn.utils.validation import check_X_y, check_array, check_is_fitted class FImdlp(TransformerMixin, BaseEstimator): + def __init__(self, proposal=True): + self.proposal = proposal # proposed algorithm or original algorithm + """Fayyad - Irani MDLP discretization algorithm. Parameters @@ -57,11 +60,18 @@ class FImdlp(TransformerMixin, BaseEstimator): X, y = self._check_params_fit( X, y, expected_args=["class_name", "features"], kwargs=kwargs ) - self.n_features_ = X.shape[1] self.X_ = X self.y_ = y - self.discretizer_ = CFImdlp(debug=True, proposed=False) + self.discretizer_ = [None] * self.n_features_ + self.cut_points_ = [None] * self.n_features_ + # Can do it in parallel + for feature in range(self.n_features_): + self.discretizer_[feature] = CFImdlp(proposal=self.proposal) + self.discretizer_[feature].fit(X[:, feature], y) + self.cut_points_[feature] = self.discretizer_[ + feature + ].get_cut_points() return self def transform(self, X): @@ -91,6 +101,15 @@ class FImdlp(TransformerMixin, BaseEstimator): raise ValueError( "Shape of input is different from what was seen in `fit`" ) + result = np.zeros_like(X, dtype=np.int32) - 1 + # Can do it in parallel + for feature in range(self.n_features_): + result[:, feature] = np.searchsorted( + self.cut_points_[feature], X[:, feature] + ) + return result + + def test(self): print("Calculating cut points in python for first feature") yz = self.y_.copy() xz = X[:, 0].copy() @@ -102,7 +121,7 @@ class FImdlp(TransformerMixin, BaseEstimator): print(f"Cut point: ({xz[i-1]}, {xz[i]}) ({yz[i-1]}, {yz[i]})") cuts.append((xz[i] + xz[i - 1]) / 2) print("Cuts calculados en python: ", cuts) - print("Cuts calculados en C++") + print("-- Cuts calculados en C++ --") print("Cut points for each feature in Iris dataset:") for i in range(0, 1): # datax = self.X_[np.argsort(self.X_[:, i]), i] @@ -123,12 +142,6 @@ class FImdlp(TransformerMixin, BaseEstimator): print(X_translated) print("*******************************") print("Disretized values:") - print(self.discretizer_.transform(datax)) + print(self.discretizer_.get_discretized_values()) print("*******************************") - print("indices:", np.argsort(X[:, 0])) - # Xcutpoints = self.discretizer_.cut_points_ant(datax, y_) - # print( - # f"Ant ({len(Xcutpoints)}):{self.features_[i]:20s}: " - # f"{Xcutpoints}" - # ) return X diff --git a/fimdlp/testcpp/FImdlp_unittest.cc b/fimdlp/testcpp/FImdlp_unittest.cc index d14a49d..b3d504c 100644 --- a/fimdlp/testcpp/FImdlp_unittest.cc +++ b/fimdlp/testcpp/FImdlp_unittest.cc @@ -37,6 +37,19 @@ namespace mdlp { prev = X[testSortedIndices[i]]; } } + void checkCutPoints(cutPoints_t& expected) + { + int expectedSize = expected.size(); + EXPECT_EQ(cutPoints.size(), expectedSize); + for (auto i = 0; i < expectedSize; i++) { + EXPECT_EQ(cutPoints[i].start, expected[i].start); + EXPECT_EQ(cutPoints[i].end, expected[i].end); + EXPECT_EQ(cutPoints[i].classNumber, expected[i].classNumber); + EXPECT_NEAR(cutPoints[i].fromValue, expected[i].fromValue, precision); + EXPECT_NEAR(cutPoints[i].toValue, expected[i].toValue, precision); + } + } + }; TEST_F(TestFImdlp, SortIndices) { @@ -60,22 +73,13 @@ namespace mdlp { TEST_F(TestFImdlp, ComputeCutPointsOriginal) { cutPoints_t computed, expected; - int expectedSize = 3; expected = { { 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 }, { 6, 10, -1, 5.45, 3.4028234663852886e+38 } }; setCutPoints(cutPoints_t()); computeCutPointsOriginal(); - computed = getCutPoints(); - EXPECT_EQ(computed.size(), expectedSize); - for (auto i = 0; i < expectedSize; i++) { - EXPECT_EQ(computed[i].start, expected[i].start); - EXPECT_EQ(computed[i].end, expected[i].end); - EXPECT_EQ(computed[i].classNumber, expected[i].classNumber); - EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision); - EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision); - } + checkCutPoints(expected); } TEST_F(TestFImdlp, ComputeCutPointsOriginalGCase) { @@ -83,22 +87,13 @@ namespace mdlp { expected = { { 0, 4, -1, -3.4028234663852886e+38, 3.4028234663852886e+38 }, }; - int expectedSize = 1; X = { 0, 1, 2, 2 }; y = { 1, 1, 1, 2 }; fit(X, y); computeCutPointsOriginal(); - computed = getCutPoints(); - EXPECT_EQ(computed.size(), expectedSize); - for (auto i = 0; i < expectedSize; i++) { - EXPECT_EQ(computed[i].start, expected[i].start); - EXPECT_EQ(computed[i].end, expected[i].end); - EXPECT_EQ(computed[i].classNumber, expected[i].classNumber); - EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision); - EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision); - } + checkCutPoints(expected); } - TEST_F(TestFImdlp, ComputeCutPointsProposed) + TEST_F(TestFImdlp, ComputeCutPointsProposal) { cutPoints_t computed, expected; expected = { @@ -106,57 +101,20 @@ namespace mdlp { { 6, 9, -1, 5.4, 5.85 }, { 9, 10, -1, 5.85, 3.4028234663852886e+38 } }; - int expectedSize = 4; - computeCutPointsProposed(); - computed = getCutPoints(); - EXPECT_EQ(computed.size(), expectedSize); - for (auto i = 0; i < expectedSize; i++) { - EXPECT_EQ(computed[i].start, expected[i].start); - EXPECT_EQ(computed[i].end, expected[i].end); - EXPECT_EQ(computed[i].classNumber, expected[i].classNumber); - EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision); - EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision); - } + computeCutPointsProposal(); + checkCutPoints(expected); } - TEST_F(TestFImdlp, ComputeCutPointsProposedGCase) + TEST_F(TestFImdlp, ComputeCutPointsProposalGCase) { cutPoints_t computed, expected; expected = { { 0, 3, -1, -3.4028234663852886e+38, 1.5 }, { 3, 4, -1, 1.5, 3.4028234663852886e+38 } }; - int expectedSize = 2; X = { 0, 1, 2, 2 }; y = { 1, 1, 1, 2 }; fit(X, y); - computeCutPointsProposed(); - computed = getCutPoints(); - EXPECT_EQ(computed.size(), expectedSize); - for (auto i = 0; i < expectedSize; i++) { - EXPECT_EQ(computed[i].start, expected[i].start); - EXPECT_EQ(computed[i].end, expected[i].end); - EXPECT_EQ(computed[i].classNumber, expected[i].classNumber); - EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision); - EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision); - } - } - TEST_F(TestFImdlp, ApplyCutPoints) - { - cutPoints_t expected = { - { 0, 4, 17, -3.4028234663852886e+38, 5.1 }, { 4, 6, 31, 5.1, 5.4 }, - { 6, 8, 59, 5.4, 5.85 }, - { 8, 10, 41, 5.85, 3.4028234663852886e+38 } - }; - setCutPoints(expected); - applyCutPoints(); - labels expected_x = getDiscretizedValues(); - indices_t indices_x = getIndices(); - for (auto i = 0; i < 5; i++) { - std::cout << "cutPoint[" << i << "].start = " << expected[i].start << std::endl; - for (auto j = expected[i].start; j < expected[i].end; j++) { - std::cout << expected_x[j] << expected[i].classNumber << std::endl; - EXPECT_EQ(expected_x[indices_x[j]], expected[i].classNumber); - } - } + computeCutPointsProposal(); + checkCutPoints(expected); } } \ No newline at end of file diff --git a/sample.py b/sample.py index 36d4699..c988b65 100644 --- a/sample.py +++ b/sample.py @@ -65,11 +65,11 @@ features = data.feature_names # test.fit(X, y, features=features) # test.transform(X) # test.get_cut_points() -for proposed in [True, False]: +for proposal in [True, False]: X = data.data y = data.target - print("*** Proposed: ", proposed) - test = CFImdlp(debug=True, proposed=proposed) + print("*** Proposal: ", proposal) + test = CFImdlp(debug=True, proposal=proposal) test.fit(X[:, 0], y) result = test.get_cut_points() for item in result: @@ -118,7 +118,7 @@ for proposed in [True, False]: # X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9] # y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2] # indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7] -# clf = CFImdlp(debug=True, proposed=False) +# clf = CFImdlp(debug=True, proposal=False) # clf.fit(X, y) # print(clf.get_cut_points()) # y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]