diff --git a/fimdlp/CPPFImdlp.cpp b/fimdlp/CPPFImdlp.cpp index acc4d67..a28d4c4 100644 --- a/fimdlp/CPPFImdlp.cpp +++ b/fimdlp/CPPFImdlp.cpp @@ -174,7 +174,10 @@ namespace mdlp { printf(">idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur); } while (idx < numElements && xCur == xPivot); - if (yPivot == -1 || yPrev != yCur) { + // Check if the class changed and there are more than 1 element + if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur)) { + // Must we add the entropy criteria here? + // if (totalEntropy - (entropyLeft + entropyRight) < 0) { Accept cut point } cutPoint.start = start; cutPoint.end = idx; start = idx; @@ -200,9 +203,11 @@ namespace mdlp { printf("Final Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = (%3.1g, %3.1g] \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint.fromValue, cutPoint.toValue); cutPts.push_back(cutPoint); } - if (debug) + if (debug) { + std::cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, numElements + 1, numClasses) << std::endl; for (auto cutPt : cutPts) - std::cout << "Proposed: Cut point: " << cutPt; + std::cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << " :Proposed: Cut point: " << cutPt; + } cutPoints = cutPts; } void CPPFImdlp::computeCutPointsOriginal() @@ -218,8 +223,11 @@ namespace mdlp { yPrev = y[idx]; for (index = 0; index < size_t(indices.size()) - 1; index++) { idx = indices[index]; - // Definition 2 Cut points are always on boundaries - if (y[idx] != yPrev && xPrev < X[idx]) { + // Definition 2 Cut points are always on class boundaries && + // there are more than 1 items in the interval + if (y[idx] != yPrev && xPrev < X[idx] && idxPrev != index - 1) { + // Must we add the entropy criteria here? + // if (totalEntropy - (entropyLeft + entropyRight) < 0) { Accept cut point } if (first) { first = false; cutPoint.fromValue = std::numeric_limits::lowest(); @@ -245,9 +253,11 @@ namespace mdlp { } else cutPts.back().toValue = std::numeric_limits::max(); cutPts.back().end = X.size(); - if (debug) + if (debug) { + std::cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, indices.size(), numClasses) << std::endl; for (auto cutPt : cutPts) - std::cout << "Original: Cut point: " << cutPt; + std::cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << ": Original: Cut point: " << cutPt; + } cutPoints = cutPts; } // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes diff --git a/fimdlp/CPPFImdlp.h b/fimdlp/CPPFImdlp.h index 97d7c35..5f29fb8 100644 --- a/fimdlp/CPPFImdlp.h +++ b/fimdlp/CPPFImdlp.h @@ -4,7 +4,7 @@ #include namespace mdlp { class CPPFImdlp { - private: + protected: bool proposed; // proposed algorithm or original algorithm int precision; bool debug; @@ -16,7 +16,6 @@ namespace mdlp { int numClasses; cutPoints_t cutPoints; - protected: void setCutPoints(cutPoints_t); static indices_t sortIndices(samples&); void computeCutPointsOriginal(); diff --git a/fimdlp/testcpp/FImdlp_unittest.cc b/fimdlp/testcpp/FImdlp_unittest.cc index 67744d7..d14a49d 100644 --- a/fimdlp/testcpp/FImdlp_unittest.cc +++ b/fimdlp/testcpp/FImdlp_unittest.cc @@ -2,15 +2,9 @@ #include "../Metrics.h" #include "../CPPFImdlp.h" namespace mdlp { - class TestMetrics : public CPPFImdlp, public testing::Test { + class TestFImdlp : public CPPFImdlp, public testing::Test { public: - TestMetrics() : CPPFImdlp(true, 6, true) {} - indices_t indices; // sorted indices to use with X and y - samples X; - labels y; - samples xDiscretized; - int numClasses; - float precision_test = 0.000001; + TestFImdlp() : CPPFImdlp(true, 6, true) {} void SetUp() { // 5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0] @@ -19,7 +13,19 @@ namespace mdlp { y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; fit(X, y); } - void check_sorted_vector(samples& X_, indices_t indices_) + void initCutPoints() + { + setCutPoints(cutPoints_t()); + } + void initIndices() + { + indices = indices_t(); + } + void initDiscretized() + { + xDiscretized = labels(); + } + void checkSortedVector(samples& X_, indices_t indices_) { X = X_; indices = indices_; @@ -32,113 +38,109 @@ namespace mdlp { } } }; - // - TEST_F(TestMetrics, SortIndices) + TEST_F(TestFImdlp, SortIndices) { X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; - indices_t indices = { 4, 3, 6, 8, 2, 1, 5, 0, 9, 7 }; - check_sorted_vector(X, indices); + indices = { 4, 3, 6, 8, 2, 1, 5, 0, 9, 7 }; + checkSortedVector(X, indices); X = { 5.77, 5.88, 5.99 }; indices = { 0, 1, 2 }; - check_sorted_vector(X, indices); + checkSortedVector(X, indices); X = { 5.33, 5.22, 5.11 }; indices = { 2, 1, 0 }; - check_sorted_vector(X, indices); + checkSortedVector(X, indices); } - TEST_F(TestMetrics, EvaluateCutPoint) + TEST_F(TestFImdlp, EvaluateCutPoint) { cutPoint_t rest, candidate; - rest.start = 0; - rest.end = 10; - rest.classNumber = -1; - rest.fromValue = -1; - rest.toValue = 1000; - candidate.start = 0; - candidate.end = 4; - candidate.fromValue = -1; - candidate.toValue = 5.15; - candidate.classNumber = -1; + rest = { 0, 10, -1, -1, 1000 }; + candidate = { 0, 4, -1, -1, 5.15 }; EXPECT_FALSE(evaluateCutPoint(rest, candidate)); } - TEST_F(TestMetrics, ComputeCutPointsOriginal) + TEST_F(TestFImdlp, ComputeCutPointsOriginal) { cutPoints_t computed, expected; + int expectedSize = 3; expected = { { 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 }, - { 6, 7, -1, 5.45, 5.65 }, { 7, 10, -1, 5.65, 3.4028234663852886e+38 } + { 6, 10, -1, 5.45, 3.4028234663852886e+38 } }; + setCutPoints(cutPoints_t()); computeCutPointsOriginal(); computed = getCutPoints(); - EXPECT_EQ(computed.size(), 4); - for (auto i = 0; i < 4; i++) { + EXPECT_EQ(computed.size(), expectedSize); + for (auto i = 0; i < expectedSize; i++) { EXPECT_EQ(computed[i].start, expected[i].start); EXPECT_EQ(computed[i].end, expected[i].end); EXPECT_EQ(computed[i].classNumber, expected[i].classNumber); - EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision_test); - EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision_test); + EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision); + EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision); } } - TEST_F(TestMetrics, ComputeCutPointsOriginalGCase) + TEST_F(TestFImdlp, ComputeCutPointsOriginalGCase) { cutPoints_t computed, expected; expected = { { 0, 4, -1, -3.4028234663852886e+38, 3.4028234663852886e+38 }, }; + int expectedSize = 1; X = { 0, 1, 2, 2 }; y = { 1, 1, 1, 2 }; fit(X, y); computeCutPointsOriginal(); computed = getCutPoints(); - EXPECT_EQ(computed.size(), 1); - for (auto i = 0; i < 1; i++) { + EXPECT_EQ(computed.size(), expectedSize); + for (auto i = 0; i < expectedSize; i++) { EXPECT_EQ(computed[i].start, expected[i].start); EXPECT_EQ(computed[i].end, expected[i].end); EXPECT_EQ(computed[i].classNumber, expected[i].classNumber); - EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision_test); - EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision_test); + EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision); + EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision); } } - TEST_F(TestMetrics, ComputeCutPointsProposed) + TEST_F(TestFImdlp, ComputeCutPointsProposed) { cutPoints_t computed, expected; expected = { - { 0, 4, -1, -3.4028234663852886e+38, 5.1 }, { 4, 5, -1, 5.1, 5.2 }, - { 5, 6, -1, 5.2, 5.4 }, { 6, 9, -1, 5.4, 5.85 }, + { 0, 4, -1, -3.4028234663852886e+38, 5.1 }, { 4, 6, -1, 5.1, 5.4 }, + { 6, 9, -1, 5.4, 5.85 }, { 9, 10, -1, 5.85, 3.4028234663852886e+38 } }; + int expectedSize = 4; computeCutPointsProposed(); computed = getCutPoints(); - EXPECT_EQ(computed.size(), 5); - for (auto i = 0; i < 5; i++) { + EXPECT_EQ(computed.size(), expectedSize); + for (auto i = 0; i < expectedSize; i++) { EXPECT_EQ(computed[i].start, expected[i].start); EXPECT_EQ(computed[i].end, expected[i].end); EXPECT_EQ(computed[i].classNumber, expected[i].classNumber); - EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision_test); - EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision_test); + EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision); + EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision); } } - TEST_F(TestMetrics, ComputeCutPointsProposedGCase) + TEST_F(TestFImdlp, ComputeCutPointsProposedGCase) { cutPoints_t computed, expected; expected = { { 0, 3, -1, -3.4028234663852886e+38, 1.5 }, { 3, 4, -1, 1.5, 3.4028234663852886e+38 } }; + int expectedSize = 2; X = { 0, 1, 2, 2 }; y = { 1, 1, 1, 2 }; fit(X, y); computeCutPointsProposed(); computed = getCutPoints(); - EXPECT_EQ(computed.size(), 2); - for (auto i = 0; i < 1; i++) { + EXPECT_EQ(computed.size(), expectedSize); + for (auto i = 0; i < expectedSize; i++) { EXPECT_EQ(computed[i].start, expected[i].start); EXPECT_EQ(computed[i].end, expected[i].end); EXPECT_EQ(computed[i].classNumber, expected[i].classNumber); - EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision_test); - EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision_test); + EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision); + EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision); } } - TEST_F(TestMetrics, ApplyCutPoints) + TEST_F(TestFImdlp, ApplyCutPoints) { cutPoints_t expected = { { 0, 4, 17, -3.4028234663852886e+38, 5.1 }, { 4, 6, 31, 5.1, 5.4 }, diff --git a/sample.py b/sample.py index ffb8f01..36d4699 100644 --- a/sample.py +++ b/sample.py @@ -69,13 +69,14 @@ for proposed in [True, False]: X = data.data y = data.target print("*** Proposed: ", proposed) - test = CFImdlp(debug=False, proposed=proposed) + test = CFImdlp(debug=True, proposed=proposed) test.fit(X[:, 0], y) result = test.get_cut_points() for item in result: print( - f"Class={item['classNumber']} - ({item['start']:3d}, {item['end']:3d})" - f" -> ({item['fromValue']:3.1f}, {item['toValue']:3.1f}]" + f"Class={item['classNumber']} - ({item['start']:3d}, " + f"{item['end']:3d}) -> ({item['fromValue']:3.1f}, " + f"{item['toValue']:3.1f}]" ) print(test.get_discretized_values()) print("+" * 40) @@ -114,11 +115,14 @@ for proposed in [True, False]: # # k = test.cut_points_ant(X[:, 0], y) # # print(k) # # test.debug_points(X[:, 0], y) -X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9] -indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7] +# X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9] +# y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2] +# indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7] +# clf = CFImdlp(debug=True, proposed=False) +# clf.fit(X, y) +# print(clf.get_cut_points()) # y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2] # # To check # indices2 = np.argsort(X) - Xs = np.array(X)[indices2] - ys = np.array(y)[indices2] - +# Xs = np.array(X)[indices2] +# ys = np.array(y)[indices2]