Merge remote-tracking branch 'origin/main' into main

This commit is contained in:
2022-12-04 10:56:23 +01:00
4 changed files with 82 additions and 67 deletions

View File

@@ -174,7 +174,10 @@ namespace mdlp {
printf(">idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
}
while (idx < numElements && xCur == xPivot);
if (yPivot == -1 || yPrev != yCur) {
// Check that the class changed and the interval holds more than one element
if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur)) {
// Should the entropy criterion also be applied here?
// if (totalEntropy - (entropyLeft + entropyRight) < 0) { Accept cut point }
cutPoint.start = start;
cutPoint.end = idx;
start = idx;
@@ -200,9 +203,11 @@ namespace mdlp {
printf("Final Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = (%3.1g, %3.1g] \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint.fromValue, cutPoint.toValue);
cutPts.push_back(cutPoint);
}
if (debug)
if (debug) {
std::cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, numElements + 1, numClasses) << std::endl;
for (auto cutPt : cutPts)
std::cout << "Proposed: Cut point: " << cutPt;
std::cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << " :Proposed: Cut point: " << cutPt;
}
cutPoints = cutPts;
}
void CPPFImdlp::computeCutPointsOriginal()
@@ -218,8 +223,11 @@ namespace mdlp {
yPrev = y[idx];
for (index = 0; index < size_t(indices.size()) - 1; index++) {
idx = indices[index];
// Definition 2 Cut points are always on boundaries
if (y[idx] != yPrev && xPrev < X[idx]) {
// Definition 2: cut points are always on class boundaries, and
// the interval must contain more than one item
if (y[idx] != yPrev && xPrev < X[idx] && idxPrev != index - 1) {
// Should the entropy criterion also be applied here?
// if (totalEntropy - (entropyLeft + entropyRight) < 0) { Accept cut point }
if (first) {
first = false;
cutPoint.fromValue = std::numeric_limits<float>::lowest();
@@ -245,9 +253,11 @@ namespace mdlp {
} else
cutPts.back().toValue = std::numeric_limits<float>::max();
cutPts.back().end = X.size();
if (debug)
if (debug) {
std::cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, indices.size(), numClasses) << std::endl;
for (auto cutPt : cutPts)
std::cout << "Original: Cut point: " << cutPt;
std::cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << ": Original: Cut point: " << cutPt;
}
cutPoints = cutPts;
}
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes

View File

@@ -4,7 +4,7 @@
#include <utility>
namespace mdlp {
class CPPFImdlp {
private:
protected:
bool proposed; // proposed algorithm or original algorithm
int precision;
bool debug;
@@ -16,7 +16,6 @@ namespace mdlp {
int numClasses;
cutPoints_t cutPoints;
protected:
void setCutPoints(cutPoints_t);
static indices_t sortIndices(samples&);
void computeCutPointsOriginal();

View File

@@ -2,15 +2,9 @@
#include "../Metrics.h"
#include "../CPPFImdlp.h"
namespace mdlp {
class TestMetrics : public CPPFImdlp, public testing::Test {
class TestFImdlp : public CPPFImdlp, public testing::Test {
public:
TestMetrics() : CPPFImdlp(true, 6, true) {}
indices_t indices; // sorted indices to use with X and y
samples X;
labels y;
samples xDiscretized;
int numClasses;
float precision_test = 0.000001;
TestFImdlp() : CPPFImdlp(true, 6, true) {}
void SetUp()
{
// 5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0]
@@ -19,7 +13,19 @@ namespace mdlp {
y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
fit(X, y);
}
void check_sorted_vector(samples& X_, indices_t indices_)
void initCutPoints()
{
setCutPoints(cutPoints_t());
}
void initIndices()
{
indices = indices_t();
}
void initDiscretized()
{
xDiscretized = labels();
}
void checkSortedVector(samples& X_, indices_t indices_)
{
X = X_;
indices = indices_;
@@ -32,113 +38,109 @@ namespace mdlp {
}
}
};
//
// SortIndices: passes three sample vectors, each paired with the index list
// that reads it in ascending order, to the fixture's sorted-vector checker:
//   1. ten unsorted values containing a three-way tie at 5.1,
//   2. three values already ascending (indices 0, 1, 2),
//   3. three values descending (indices 2, 1, 0).
// NOTE(review): this span shows the previous and the renamed form of several
// lines together (TestMetrics/TestFImdlp, check_sorted_vector/checkSortedVector);
// presumably only the TestFImdlp / checkSortedVector lines are current — confirm.
TEST_F(TestMetrics, SortIndices)
TEST_F(TestFImdlp, SortIndices)
{
X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
indices_t indices = { 4, 3, 6, 8, 2, 1, 5, 0, 9, 7 };
check_sorted_vector(X, indices);
indices = { 4, 3, 6, 8, 2, 1, 5, 0, 9, 7 };
checkSortedVector(X, indices);
// Already-ascending input: the identity permutation.
X = { 5.77, 5.88, 5.99 };
indices = { 0, 1, 2 };
check_sorted_vector(X, indices);
checkSortedVector(X, indices);
// Descending input: the reversed permutation.
X = { 5.33, 5.22, 5.11 };
indices = { 2, 1, 0 };
check_sorted_vector(X, indices);
checkSortedVector(X, indices);
}
// EvaluateCutPoint: builds an enclosing interval `rest` (start 0, end 10,
// values -1 .. 1000) and a `candidate` interval (start 0, end 4, values
// -1 .. 5.15), both with classNumber -1, and expects evaluateCutPoint to
// return false for that pair.
// NOTE(review): the span contains two equivalent initialisations — member-wise
// assignment and brace lists. The brace lists assume cutPoint_t declares its
// fields in the order start, end, classNumber, fromValue, toValue — confirm
// against the struct definition.
TEST_F(TestMetrics, EvaluateCutPoint)
TEST_F(TestFImdlp, EvaluateCutPoint)
{
cutPoint_t rest, candidate;
rest.start = 0;
rest.end = 10;
rest.classNumber = -1;
rest.fromValue = -1;
rest.toValue = 1000;
candidate.start = 0;
candidate.end = 4;
candidate.fromValue = -1;
candidate.toValue = 5.15;
candidate.classNumber = -1;
rest = { 0, 10, -1, -1, 1000 };
candidate = { 0, 4, -1, -1, 5.15 };
EXPECT_FALSE(evaluateCutPoint(rest, candidate));
}
// ComputeCutPointsOriginal: runs computeCutPointsOriginal() on the data fitted
// by the fixture and compares every resulting interval (start, end,
// classNumber, fromValue, toValue) with the `expected` list; the first
// interval is open below (lowest float) and the last open above (max float).
// NOTE(review): this span mixes two revisions of the test — a 4-interval
// expectation with literal bounds and a 3-interval one driven by expectedSize.
TEST_F(TestMetrics, ComputeCutPointsOriginal)
TEST_F(TestFImdlp, ComputeCutPointsOriginal)
{
cutPoints_t computed, expected;
int expectedSize = 3;
expected = {
{ 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 },
{ 6, 7, -1, 5.45, 5.65 }, { 7, 10, -1, 5.65, 3.4028234663852886e+38 }
{ 6, 10, -1, 5.45, 3.4028234663852886e+38 }
};
setCutPoints(cutPoints_t());
computeCutPointsOriginal();
computed = getCutPoints();
EXPECT_EQ(computed.size(), 4);
for (auto i = 0; i < 4; i++) {
// NOTE(review): EXPECT_EQ is non-fatal, so if computed holds fewer than
// expectedSize entries the loop below still runs and indexes past its end;
// ASSERT_EQ would stop the test first.
EXPECT_EQ(computed.size(), expectedSize);
for (auto i = 0; i < expectedSize; i++) {
EXPECT_EQ(computed[i].start, expected[i].start);
EXPECT_EQ(computed[i].end, expected[i].end);
EXPECT_EQ(computed[i].classNumber, expected[i].classNumber);
EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision_test);
EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision_test);
// NOTE(review): `precision` is not declared in this test; presumably it
// resolves to the inherited `int precision` of CPPFImdlp (the fixture passes
// 6 to the base constructor), which would make the tolerance 6 instead of
// the 0.000001 that precision_test provided — confirm.
EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision);
EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision);
}
}
// ComputeCutPointsOriginalGCase: refits the fixture with X = {0, 1, 2, 2},
// y = {1, 1, 1, 2} — the only class change falls between two samples sharing
// the value 2 — and expects computeCutPointsOriginal() to yield one interval
// covering positions 0..4, bounded by the lowest and max float values.
// NOTE(review): this span mixes two revisions of the size check and loop
// bound (literal 1 versus expectedSize).
TEST_F(TestMetrics, ComputeCutPointsOriginalGCase)
TEST_F(TestFImdlp, ComputeCutPointsOriginalGCase)
{
cutPoints_t computed, expected;
expected = {
{ 0, 4, -1, -3.4028234663852886e+38, 3.4028234663852886e+38 },
};
int expectedSize = 1;
X = { 0, 1, 2, 2 };
y = { 1, 1, 1, 2 };
fit(X, y);
computeCutPointsOriginal();
computed = getCutPoints();
EXPECT_EQ(computed.size(), 1);
for (auto i = 0; i < 1; i++) {
// NOTE(review): EXPECT_EQ is non-fatal, so an empty `computed` would still
// be indexed by the loop below; ASSERT_EQ would stop the test first.
EXPECT_EQ(computed.size(), expectedSize);
for (auto i = 0; i < expectedSize; i++) {
EXPECT_EQ(computed[i].start, expected[i].start);
EXPECT_EQ(computed[i].end, expected[i].end);
EXPECT_EQ(computed[i].classNumber, expected[i].classNumber);
EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision_test);
EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision_test);
// NOTE(review): `precision` is not declared in this test; presumably it
// resolves to the inherited `int precision` of CPPFImdlp (the fixture passes
// 6 to the base constructor), giving a tolerance of 6 rather than the
// 0.000001 of precision_test — confirm.
EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision);
EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision);
}
}
// ComputeCutPointsProposed: runs computeCutPointsProposed() on the data fitted
// by the fixture and compares every resulting interval (start, end,
// classNumber, fromValue, toValue) with the `expected` list.
// NOTE(review): this span mixes two revisions of the expectation — five
// intervals (with single-element intervals 4..5 and 5..6) and four intervals
// (those two merged into 4..6), matching expectedSize = 4.
TEST_F(TestMetrics, ComputeCutPointsProposed)
TEST_F(TestFImdlp, ComputeCutPointsProposed)
{
cutPoints_t computed, expected;
expected = {
{ 0, 4, -1, -3.4028234663852886e+38, 5.1 }, { 4, 5, -1, 5.1, 5.2 },
{ 5, 6, -1, 5.2, 5.4 }, { 6, 9, -1, 5.4, 5.85 },
{ 0, 4, -1, -3.4028234663852886e+38, 5.1 }, { 4, 6, -1, 5.1, 5.4 },
{ 6, 9, -1, 5.4, 5.85 },
{ 9, 10, -1, 5.85, 3.4028234663852886e+38 }
};
int expectedSize = 4;
computeCutPointsProposed();
computed = getCutPoints();
EXPECT_EQ(computed.size(), 5);
for (auto i = 0; i < 5; i++) {
// NOTE(review): EXPECT_EQ is non-fatal, so if computed holds fewer than
// expectedSize entries the loop below still indexes past its end;
// ASSERT_EQ would stop the test first.
EXPECT_EQ(computed.size(), expectedSize);
for (auto i = 0; i < expectedSize; i++) {
EXPECT_EQ(computed[i].start, expected[i].start);
EXPECT_EQ(computed[i].end, expected[i].end);
EXPECT_EQ(computed[i].classNumber, expected[i].classNumber);
EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision_test);
EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision_test);
// NOTE(review): `precision` is not declared in this test; presumably it
// resolves to the inherited `int precision` of CPPFImdlp (the fixture passes
// 6 to the base constructor), giving a tolerance of 6 rather than the
// 0.000001 of precision_test — confirm.
EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision);
EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision);
}
}
// ComputeCutPointsProposedGCase: refits the fixture with X = {0, 1, 2, 2},
// y = {1, 1, 1, 2} and expects computeCutPointsProposed() to yield two
// intervals: positions 0..3 up to value 1.5, and positions 3..4 from 1.5
// upwards.
// NOTE(review): this span mixes two revisions of the size check and loop
// bound; the `i < 1` form compares only the first of the two intervals,
// whereas `i < expectedSize` compares both.
TEST_F(TestMetrics, ComputeCutPointsProposedGCase)
TEST_F(TestFImdlp, ComputeCutPointsProposedGCase)
{
cutPoints_t computed, expected;
expected = {
{ 0, 3, -1, -3.4028234663852886e+38, 1.5 },
{ 3, 4, -1, 1.5, 3.4028234663852886e+38 }
};
int expectedSize = 2;
X = { 0, 1, 2, 2 };
y = { 1, 1, 1, 2 };
fit(X, y);
computeCutPointsProposed();
computed = getCutPoints();
EXPECT_EQ(computed.size(), 2);
for (auto i = 0; i < 1; i++) {
// NOTE(review): EXPECT_EQ is non-fatal, so if computed holds fewer than
// expectedSize entries the loop below still indexes past its end;
// ASSERT_EQ would stop the test first.
EXPECT_EQ(computed.size(), expectedSize);
for (auto i = 0; i < expectedSize; i++) {
EXPECT_EQ(computed[i].start, expected[i].start);
EXPECT_EQ(computed[i].end, expected[i].end);
EXPECT_EQ(computed[i].classNumber, expected[i].classNumber);
EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision_test);
EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision_test);
// NOTE(review): `precision` is not declared in this test; presumably it
// resolves to the inherited `int precision` of CPPFImdlp (the fixture passes
// 6 to the base constructor), giving a tolerance of 6 rather than the
// 0.000001 of precision_test — confirm.
EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision);
EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision);
}
}
TEST_F(TestMetrics, ApplyCutPoints)
TEST_F(TestFImdlp, ApplyCutPoints)
{
cutPoints_t expected = {
{ 0, 4, 17, -3.4028234663852886e+38, 5.1 }, { 4, 6, 31, 5.1, 5.4 },

View File

@@ -69,13 +69,14 @@ for proposed in [True, False]:
X = data.data
y = data.target
print("*** Proposed: ", proposed)
test = CFImdlp(debug=False, proposed=proposed)
test = CFImdlp(debug=True, proposed=proposed)
test.fit(X[:, 0], y)
result = test.get_cut_points()
for item in result:
print(
f"Class={item['classNumber']} - ({item['start']:3d}, {item['end']:3d})"
f" -> ({item['fromValue']:3.1f}, {item['toValue']:3.1f}]"
f"Class={item['classNumber']} - ({item['start']:3d}, "
f"{item['end']:3d}) -> ({item['fromValue']:3.1f}, "
f"{item['toValue']:3.1f}]"
)
print(test.get_discretized_values())
print("+" * 40)
@@ -114,11 +115,14 @@ for proposed in [True, False]:
# # k = test.cut_points_ant(X[:, 0], y)
# # print(k)
# # test.debug_points(X[:, 0], y)
X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9]
indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7]
# X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9]
# y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
# indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7]
# clf = CFImdlp(debug=True, proposed=False)
# clf.fit(X, y)
# print(clf.get_cut_points())
# y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
# # To check
# indices2 = np.argsort(X)
Xs = np.array(X)[indices2]
ys = np.array(y)[indices2]
# Xs = np.array(X)[indices2]
# ys = np.array(y)[indices2]