mirror of
https://github.com/rmontanana/mdlp.git
synced 2025-08-18 00:45:57 +00:00
Add max_depth and min_length as hyperparams
This commit is contained in:
@@ -8,7 +8,11 @@
|
|||||||
|
|
||||||
namespace mdlp {
|
namespace mdlp {
|
||||||
|
|
||||||
CPPFImdlp::CPPFImdlp(): indices(indices_t()), X(samples_t()), y(labels_t()),
|
CPPFImdlp::CPPFImdlp():depth(0), max_depth(numeric_limits<int>::max()), min_length(3), indices(indices_t()), X(samples_t()), y(labels_t()),
|
||||||
|
metrics(Metrics(y, indices))
|
||||||
|
{
|
||||||
|
}
|
||||||
|
CPPFImdlp::CPPFImdlp(int min_length_, int max_depth_): depth(0), max_depth(max_depth_), min_length(min_length_), indices(indices_t()), X(samples_t()), y(labels_t()),
|
||||||
metrics(Metrics(y, indices))
|
metrics(Metrics(y, indices))
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
@@ -25,9 +29,15 @@ namespace mdlp {
|
|||||||
if (X.empty() || y.empty()) {
|
if (X.empty() || y.empty()) {
|
||||||
throw invalid_argument("X and y must have at least one element");
|
throw invalid_argument("X and y must have at least one element");
|
||||||
}
|
}
|
||||||
|
if (min_length < 3) {
|
||||||
|
throw invalid_argument("min_length must be greater than 2");
|
||||||
|
}
|
||||||
|
if (max_depth < 1) {
|
||||||
|
throw invalid_argument("max_depth must be greater than 0");
|
||||||
|
}
|
||||||
indices = sortIndices(X_, y_);
|
indices = sortIndices(X_, y_);
|
||||||
metrics.setData(y, indices);
|
metrics.setData(y, indices);
|
||||||
computeCutPoints(0, X.size());
|
computeCutPoints(0, X.size(), 1);
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -60,12 +70,14 @@ namespace mdlp {
|
|||||||
return { (actual + previous) / 2, cut };
|
return { (actual + previous) / 2, cut };
|
||||||
}
|
}
|
||||||
|
|
||||||
void CPPFImdlp::computeCutPoints(size_t start, size_t end)
|
void CPPFImdlp::computeCutPoints(size_t start, size_t end, int depth_)
|
||||||
{
|
{
|
||||||
size_t cut;
|
size_t cut;
|
||||||
pair<precision_t, size_t> result;
|
pair<precision_t, size_t> result;
|
||||||
if (end - start < 3)
|
// Check if the interval length and the depth are Ok
|
||||||
|
if (end - start < min_length || depth_ > max_depth)
|
||||||
return;
|
return;
|
||||||
|
depth = depth_ > depth ? depth_ : depth;
|
||||||
cut = getCandidate(start, end);
|
cut = getCandidate(start, end);
|
||||||
if (cut == numeric_limits<size_t>::max())
|
if (cut == numeric_limits<size_t>::max())
|
||||||
return;
|
return;
|
||||||
@@ -73,8 +85,8 @@ namespace mdlp {
|
|||||||
result = valueCutPoint(start, cut, end);
|
result = valueCutPoint(start, cut, end);
|
||||||
cut = result.second;
|
cut = result.second;
|
||||||
cutPoints.push_back(result.first);
|
cutPoints.push_back(result.first);
|
||||||
computeCutPoints(start, cut);
|
computeCutPoints(start, cut, depth_ + 1);
|
||||||
computeCutPoints(cut, end);
|
computeCutPoints(cut, end, depth_ + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -158,4 +170,8 @@ namespace mdlp {
|
|||||||
sort(output.begin(), output.end());
|
sort(output.begin(), output.end());
|
||||||
return output;
|
return output;
|
||||||
}
|
}
|
||||||
|
int CPPFImdlp::get_depth()
|
||||||
|
{
|
||||||
|
return depth;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@@ -10,19 +10,23 @@ namespace mdlp {
|
|||||||
indices_t indices;
|
indices_t indices;
|
||||||
samples_t X;
|
samples_t X;
|
||||||
labels_t y;
|
labels_t y;
|
||||||
|
int depth, max_depth;
|
||||||
|
size_t min_length;
|
||||||
Metrics metrics;
|
Metrics metrics;
|
||||||
cutPoints_t cutPoints;
|
cutPoints_t cutPoints;
|
||||||
|
|
||||||
static indices_t sortIndices(samples_t&, labels_t&);
|
static indices_t sortIndices(samples_t&, labels_t&);
|
||||||
void computeCutPoints(size_t, size_t);
|
void computeCutPoints(size_t, size_t, int);
|
||||||
bool mdlp(size_t, size_t, size_t);
|
bool mdlp(size_t, size_t, size_t);
|
||||||
size_t getCandidate(size_t, size_t);
|
size_t getCandidate(size_t, size_t);
|
||||||
pair<precision_t, size_t> valueCutPoint(size_t, size_t, size_t);
|
pair<precision_t, size_t> valueCutPoint(size_t, size_t, size_t);
|
||||||
public:
|
public:
|
||||||
CPPFImdlp();
|
CPPFImdlp();
|
||||||
|
CPPFImdlp(int, int);
|
||||||
~CPPFImdlp();
|
~CPPFImdlp();
|
||||||
CPPFImdlp& fit(samples_t&, labels_t&);
|
CPPFImdlp& fit(samples_t&, labels_t&);
|
||||||
samples_t getCutPoints();
|
cutPoints_t getCutPoints();
|
||||||
|
int get_depth();
|
||||||
inline string version() { return "1.1.1"; };
|
inline string version() { return "1.1.1"; };
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@@ -8,6 +8,7 @@ namespace mdlp {
|
|||||||
class TestFImdlp: public CPPFImdlp, public testing::Test {
|
class TestFImdlp: public CPPFImdlp, public testing::Test {
|
||||||
public:
|
public:
|
||||||
precision_t precision = 0.000001;
|
precision_t precision = 0.000001;
|
||||||
|
//precision_t precision = 0.000000000001;
|
||||||
TestFImdlp(): CPPFImdlp() {}
|
TestFImdlp(): CPPFImdlp() {}
|
||||||
void SetUp()
|
void SetUp()
|
||||||
{
|
{
|
||||||
@@ -25,18 +26,16 @@ namespace mdlp {
|
|||||||
prev = X[testSortedIndices[i]];
|
prev = X[testSortedIndices[i]];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
void checkCutPoints(cutPoints_t& expected)
|
void checkCutPoints(cutPoints_t& computed, cutPoints_t& expected)
|
||||||
{
|
{
|
||||||
int expectedSize = expected.size();
|
EXPECT_EQ(computed.size(), expected.size());
|
||||||
EXPECT_EQ(cutPoints.size(), expectedSize);
|
for (unsigned long i = 0; i < computed.size(); i++) {
|
||||||
for (unsigned long i = 0; i < cutPoints.size(); i++) {
|
EXPECT_NEAR(computed[i], expected[i], precision);
|
||||||
EXPECT_NEAR(cutPoints[i], expected[i], precision);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<typename T, typename A>
|
template<typename T, typename A>
|
||||||
void checkVectors(std::vector<T, A> const& expected, std::vector<T, A> const& computed)
|
void checkVectors(std::vector<T, A> const& expected, std::vector<T, A> const& computed)
|
||||||
{
|
{
|
||||||
EXPECT_EQ(expected.size(), computed.size());
|
|
||||||
ASSERT_EQ(expected.size(), computed.size());
|
ASSERT_EQ(expected.size(), computed.size());
|
||||||
for (auto i = 0; i < expected.size(); i++) {
|
for (auto i = 0; i < expected.size(); i++) {
|
||||||
EXPECT_NEAR(expected[i], computed[i], precision);
|
EXPECT_NEAR(expected[i], computed[i], precision);
|
||||||
@@ -55,6 +54,20 @@ namespace mdlp {
|
|||||||
EXPECT_EQ(result.second, limit);
|
EXPECT_EQ(result.second, limit);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
void test_dataset(CPPFImdlp& test, string filename, vector<cutPoints_t>& expected, int depths[])
|
||||||
|
{
|
||||||
|
ArffFiles file;
|
||||||
|
file.load("../datasets/" + filename, true);
|
||||||
|
vector<samples_t>& X = file.getX();
|
||||||
|
labels_t& y = file.getY();
|
||||||
|
auto attributes = file.getAttributes();
|
||||||
|
for (auto feature = 0; feature < attributes.size(); feature++) {
|
||||||
|
test.fit(X[feature], y);
|
||||||
|
EXPECT_EQ(test.get_depth(), depths[feature]);
|
||||||
|
auto computed = test.getCutPoints();
|
||||||
|
checkCutPoints(computed, expected[feature]);
|
||||||
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
TEST_F(TestFImdlp, FitErrorEmptyDataset)
|
TEST_F(TestFImdlp, FitErrorEmptyDataset)
|
||||||
{
|
{
|
||||||
@@ -68,6 +81,15 @@ namespace mdlp {
|
|||||||
y = { 1, 2 };
|
y = { 1, 2 };
|
||||||
EXPECT_THROW(fit(X, y), std::invalid_argument);
|
EXPECT_THROW(fit(X, y), std::invalid_argument);
|
||||||
}
|
}
|
||||||
|
TEST_F(TestFImdlp, FitErrorMinLengtMaxDepth)
|
||||||
|
{
|
||||||
|
auto testLength = CPPFImdlp(2, 10);
|
||||||
|
auto testDepth = CPPFImdlp(3, 0);
|
||||||
|
X = { 1, 2, 3 };
|
||||||
|
y = { 1, 2, 3 };
|
||||||
|
EXPECT_THROW(testLength.fit(X, y), invalid_argument);
|
||||||
|
EXPECT_THROW(testDepth.fit(X, y), invalid_argument);
|
||||||
|
}
|
||||||
TEST_F(TestFImdlp, SortIndices)
|
TEST_F(TestFImdlp, SortIndices)
|
||||||
{
|
{
|
||||||
X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
|
X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
|
||||||
@@ -114,7 +136,7 @@ namespace mdlp {
|
|||||||
TEST_F(TestFImdlp, TestArtificialDataset)
|
TEST_F(TestFImdlp, TestArtificialDataset)
|
||||||
{
|
{
|
||||||
fit(X, y);
|
fit(X, y);
|
||||||
computeCutPoints(0, 20);
|
computeCutPoints(0, 20, 1);
|
||||||
cutPoints_t expected = { 5.05 };
|
cutPoints_t expected = { 5.05 };
|
||||||
vector<precision_t> computed = getCutPoints();
|
vector<precision_t> computed = getCutPoints();
|
||||||
computed = getCutPoints();
|
computed = getCutPoints();
|
||||||
@@ -126,28 +148,15 @@ namespace mdlp {
|
|||||||
}
|
}
|
||||||
TEST_F(TestFImdlp, TestIris)
|
TEST_F(TestFImdlp, TestIris)
|
||||||
{
|
{
|
||||||
ArffFiles file;
|
|
||||||
string path = "../datasets/";
|
|
||||||
|
|
||||||
file.load(path + "iris.arff", true);
|
|
||||||
int items = file.getSize();
|
|
||||||
vector<samples_t>& X = file.getX();
|
|
||||||
vector<cutPoints_t> expected = {
|
vector<cutPoints_t> expected = {
|
||||||
{ 5.4499998092651367, 5.75 },
|
{ 5.45, 5.75 },
|
||||||
{ 2.75, 2.85, 2.95, 3.05, 3.35 },
|
{ 2.75, 2.85, 2.95, 3.05, 3.35 },
|
||||||
{ 2.4500000476837158, 4.75, 5.0500001907348633 },
|
{ 2.45, 4.75, 5.05 },
|
||||||
{ 0.80000001192092896, 1.75 }
|
{ 0.8, 1.75 }
|
||||||
};
|
};
|
||||||
labels_t& y = file.getY();
|
int depths[] = { 3, 5, 5, 5 };
|
||||||
auto attributes = file.getAttributes();
|
auto test = CPPFImdlp();
|
||||||
for (auto feature = 0; feature < attributes.size(); feature++) {
|
test_dataset(test, "iris.arff", expected, depths);
|
||||||
fit(X[feature], y);
|
|
||||||
vector<precision_t> computed = getCutPoints();
|
|
||||||
EXPECT_EQ(computed.size(), expected[feature].size());
|
|
||||||
for (auto i = 0; i < computed.size(); i++) {
|
|
||||||
EXPECT_NEAR(computed[i], expected[feature][i], precision);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
TEST_F(TestFImdlp, ComputeCutPointsGCase)
|
TEST_F(TestFImdlp, ComputeCutPointsGCase)
|
||||||
{
|
{
|
||||||
@@ -156,7 +165,8 @@ namespace mdlp {
|
|||||||
samples_t X_ = { 0, 1, 2, 2, 2 };
|
samples_t X_ = { 0, 1, 2, 2, 2 };
|
||||||
labels_t y_ = { 1, 1, 1, 2, 2 };
|
labels_t y_ = { 1, 1, 1, 2, 2 };
|
||||||
fit(X_, y_);
|
fit(X_, y_);
|
||||||
checkCutPoints(expected);
|
auto computed = getCutPoints();
|
||||||
|
checkCutPoints(computed, expected);
|
||||||
}
|
}
|
||||||
TEST_F(TestFImdlp, ValueCutPoint)
|
TEST_F(TestFImdlp, ValueCutPoint)
|
||||||
{
|
{
|
||||||
@@ -178,4 +188,43 @@ namespace mdlp {
|
|||||||
samples_t X4c = { 3.1, 3.2, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7 };
|
samples_t X4c = { 3.1, 3.2, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7 };
|
||||||
test_result(X4c, 4, 6.9 / 2, 2, "4c");
|
test_result(X4c, 4, 6.9 / 2, 2, "4c");
|
||||||
}
|
}
|
||||||
|
TEST_F(TestFImdlp, MaxDepth)
|
||||||
|
{
|
||||||
|
// Set max_depth to 1 (min_length stays at its default of 3)
|
||||||
|
auto test = CPPFImdlp(3, 1);
|
||||||
|
vector<cutPoints_t> expected = {
|
||||||
|
{ 5.45 },
|
||||||
|
{ 3.35 },
|
||||||
|
{ 2.45 },
|
||||||
|
{0.8 }
|
||||||
|
};
|
||||||
|
int depths[] = { 1, 1, 1, 1 };
|
||||||
|
test_dataset(test, "iris.arff", expected, depths);
|
||||||
|
}
|
||||||
|
TEST_F(TestFImdlp, MinLength)
|
||||||
|
{
|
||||||
|
// Set min_length to 75
|
||||||
|
auto test = CPPFImdlp(75, 100);
|
||||||
|
vector<cutPoints_t> expected = {
|
||||||
|
{ 5.45, 5.75 },
|
||||||
|
{ 2.85, 3.35 },
|
||||||
|
{ 2.45, 4.75 },
|
||||||
|
{ 0.8, 1.75 }
|
||||||
|
};
|
||||||
|
int depths[] = { 3, 3, 3, 3 };
|
||||||
|
test_dataset(test, "iris.arff", expected, depths);
|
||||||
|
}
|
||||||
|
TEST_F(TestFImdlp, MinLengthMaxDepth)
|
||||||
|
{
|
||||||
|
// Set min_length to 75 and max_depth to 2
|
||||||
|
auto test = CPPFImdlp(75, 2);
|
||||||
|
vector<cutPoints_t> expected = {
|
||||||
|
{ 5.45, 5.75 },
|
||||||
|
{ 2.85, 3.35 },
|
||||||
|
{ 2.45, 4.75 },
|
||||||
|
{ 0.8, 1.75 }
|
||||||
|
};
|
||||||
|
int depths[] = { 2, 2, 2, 2 };
|
||||||
|
test_dataset(test, "iris.arff", expected, depths);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@@ -9,4 +9,4 @@ if test $? -ne 0; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
cd build
|
cd build
|
||||||
ctest --output-on-failure
|
ctest --output-on-failure|grep -v profiling
|
||||||
|
Reference in New Issue
Block a user