mirror of
https://github.com/rmontanana/mdlp.git
synced 2025-08-16 07:55:58 +00:00
Refactor Algorithm
This commit is contained in:
163
CPPFImdlp.cpp
163
CPPFImdlp.cpp
@@ -4,15 +4,12 @@
|
||||
#include <cmath>
|
||||
#include "CPPFImdlp.h"
|
||||
#include "Metrics.h"
|
||||
// TODO: remove this include
|
||||
#include <iostream>
|
||||
namespace mdlp {
|
||||
/**
 * Builds a discretizer with empty data.
 *
 * @param algorithm  value stored in the `algorithm` member; fit() switches on
 *                   it to choose which cut-point routine to run.
 *
 * `metrics` is constructed from the (still empty) `y` and `indices` members.
 */
CPPFImdlp::CPPFImdlp(int algorithm)
    : algorithm(algorithm),
      indices(),
      X(),
      y(),
      metrics(y, indices)
{
}
|
||||
// Destructor: defaulted, so the compiler-generated member-wise cleanup is used.
CPPFImdlp::~CPPFImdlp()
|
||||
= default;
|
||||
|
||||
CPPFImdlp& CPPFImdlp::fit(samples_t& X_, labels_t& y_)
|
||||
{
|
||||
X = X_;
|
||||
@@ -24,22 +21,21 @@ namespace mdlp {
|
||||
if (X.size() == 0 || y.size() == 0) {
|
||||
throw invalid_argument("X and y must have at least one element");
|
||||
}
|
||||
indices = sortIndices2(X_, y_);
|
||||
indices = sortIndices(X_, y_);
|
||||
metrics.setData(y, indices);
|
||||
switch (algorithm) {
|
||||
case 0:
|
||||
computeCutPoints(0, X.size());
|
||||
break;
|
||||
case 1:
|
||||
computeCutPointsProposal(0, X.size());
|
||||
break;
|
||||
case 2:
|
||||
computeCutPointsAlternative(0, X.size());
|
||||
break;
|
||||
default:
|
||||
throw invalid_argument("algorithm must be 0 or 1");
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
precision_t CPPFImdlp::value_cut_point(size_t start, size_t idx)
|
||||
precision_t CPPFImdlp::halfWayValueCutPoint(size_t start, size_t idx)
|
||||
{
|
||||
size_t idxPrev = idx - 1;
|
||||
precision_t previous = X[indices[idxPrev]], actual = X[indices[idx]];
|
||||
@@ -49,7 +45,7 @@ namespace mdlp {
|
||||
}
|
||||
return (previous + actual) / 2;
|
||||
}
|
||||
tuple<precision_t, size_t> CPPFImdlp::value_proposal_cut_point(size_t start, size_t cut, size_t end)
|
||||
tuple<precision_t, size_t> CPPFImdlp::completeValueCutPoint(size_t start, size_t cut, size_t end)
|
||||
{
|
||||
size_t idxPrev = cut - 1;
|
||||
precision_t previous, next, actual;
|
||||
@@ -66,63 +62,23 @@ namespace mdlp {
|
||||
cut--;
|
||||
return make_tuple((previous + actual) / 2, cut);
|
||||
}
|
||||
// void CPPFImdlp::computeCutPoints(size_t start, size_t end)
|
||||
// {
|
||||
// size_t cut;
|
||||
// if (end - start < 2)
|
||||
// return;
|
||||
// cut = getCandidate(start, end);
|
||||
// if (cut == numeric_limits<size_t>::max() || !mdlp(start, cut, end)) {
|
||||
// // cut == max means that there is no candidate in the interval
|
||||
// // No boundary found, so we add both ends of the interval as cutpoints
|
||||
// // because they were selected by the algorithm before
|
||||
// if (start != 0)
|
||||
// cutPoints.push_back((X[indices[start]] + X[indices[start - 1]]) / 2);
|
||||
// if (end != X.size())
|
||||
// cutPoints.push_back((X[indices[end]] + X[indices[end - 1]]) / 2);
|
||||
// //cout << "!!!Alg: " << algorithm << " Cut: " << cut << " Start: " << start << " End: " << end << endl;
|
||||
// return;
|
||||
// }
|
||||
// // cout << "*Alg: " << algorithm << " Cut: " << cut << " Start: " << start << " End: " << end << endl;
|
||||
// computeCutPoints(start, cut);
|
||||
// computeCutPoints(cut, end);
|
||||
// }
|
||||
// void CPPFImdlp::computeCutPointsAlternative(size_t start, size_t end)
|
||||
// {
|
||||
// size_t cut;
|
||||
// if (end - start < 2)
|
||||
// return;
|
||||
// cut = getCandidate(start, end);
|
||||
// if (cut == numeric_limits<size_t>::max() || !mdlp(start, cut, end)) {
|
||||
// // cut == max means that there is no candidate in the interval
|
||||
// // No boundary found, so we add both ends of the interval as cutpoints
|
||||
// // because they were selected by the algorithm before
|
||||
// if (start != 0)
|
||||
// cutPoints.push_back(value_cut_point(0, start));
|
||||
// if (end != X.size())
|
||||
// cutPoints.push_back(value_cut_point(start, end));
|
||||
// //cout << "!!!Alg: " << algorithm << " Cut: " << cut << " Start: " << start << " End: " << end << endl;
|
||||
// return;
|
||||
// }
|
||||
// // cout << "*Alg: " << algorithm << " Cut: " << cut << " Start: " << start << " End: " << end << endl;
|
||||
// computeCutPointsAlternative(start, cut);
|
||||
// computeCutPointsAlternative(cut, end);
|
||||
// }
|
||||
void CPPFImdlp::computeCutPoints(size_t start, size_t end)
|
||||
{
|
||||
size_t cut;
|
||||
tuple<precision_t, size_t> result;
|
||||
if (end - start < 2)
|
||||
return;
|
||||
cut = getCandidate(start, end);
|
||||
if (cut == numeric_limits<size_t>::max())
|
||||
return;
|
||||
if (mdlp(start, cut, end)) {
|
||||
cutPoints.push_back(value_cut_point(start, cut));
|
||||
//cout << "+Alg: " << algorithm << " Cut: " << cut << " Start: " << start << " End: " << end << endl;
|
||||
}
|
||||
result = completeValueCutPoint(start, cut, end);
|
||||
cut = get<1>(result);
|
||||
cutPoints.push_back(get<0>(result));
|
||||
computeCutPoints(start, cut);
|
||||
computeCutPoints(cut, end);
|
||||
}
|
||||
}
|
||||
void CPPFImdlp::computeCutPointsAlternative(size_t start, size_t end)
|
||||
{
|
||||
size_t cut;
|
||||
@@ -132,67 +88,11 @@ namespace mdlp {
|
||||
if (cut == numeric_limits<size_t>::max())
|
||||
return;
|
||||
if (mdlp(start, cut, end)) {
|
||||
cutPoints.push_back(value_cut_point(start, cut));
|
||||
//cout << "+Alg: " << algorithm << " Cut: " << cut << " Start: " << start << " End: " << end << endl;
|
||||
cutPoints.push_back(halfWayValueCutPoint(start, cut));
|
||||
computeCutPointsAlternative(start, cut);
|
||||
computeCutPointsAlternative(cut, end);
|
||||
}
|
||||
}
|
||||
// void CPPFImdlp::computeCutPointsAlternative(size_t start, size_t end)
|
||||
// {
|
||||
// size_t cut;
|
||||
// if (end - start < 2)
|
||||
// return;
|
||||
// cut = getCandidateWeka(start, end);
|
||||
// if (cut == numeric_limits<size_t>::max())
|
||||
// return;
|
||||
// if (mdlp(start, cut, end)) {
|
||||
// cutPoints.push_back(value_cut_point(start, cut));
|
||||
// //cout << "+Alg: " << algorithm << " Cut: " << cut << " Start: " << start << " End: " << end << endl;
|
||||
// }
|
||||
// computeCutPointsAlternative(start, cut);
|
||||
// computeCutPointsAlternative(cut, end);
|
||||
// }
|
||||
void CPPFImdlp::computeCutPointsProposal(size_t start, size_t end)
|
||||
{
|
||||
size_t cut;
|
||||
tuple<precision_t, size_t> result;
|
||||
if (end - start < 2)
|
||||
return;
|
||||
cut = getCandidate(start, end);
|
||||
if (cut == numeric_limits<size_t>::max())
|
||||
return;
|
||||
if (mdlp(start, cut, end)) {
|
||||
//cout << "+Alg: " << algorithm << " Cut: " << cut << " Start: " << start << " End: " << end << endl;
|
||||
result = value_proposal_cut_point(start, cut, end);
|
||||
cut = get<1>(result);
|
||||
cutPoints.push_back(get<0>(result));
|
||||
//cout << "*Alg: " << algorithm << " Cut: " << cut << " Start: " << start << " End: " << end << endl;
|
||||
computeCutPointsProposal(start, cut);
|
||||
computeCutPointsProposal(cut, end);
|
||||
}
|
||||
|
||||
}
|
||||
size_t CPPFImdlp::getCandidateWeka(size_t start, size_t end)
|
||||
{
|
||||
/* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which
|
||||
E(A, TA; S) is minimal amogst all the candidate cut points. */
|
||||
size_t candidate = numeric_limits<size_t>::max(), elements = end - start;
|
||||
precision_t entropy_left, entropy_right, minEntropy;
|
||||
minEntropy = metrics.entropy(start, end);
|
||||
for (auto idx = start + 1; idx < end; idx++) {
|
||||
// Cutpoints are always on boundaries (definition 2)
|
||||
if (X[indices[idx - 1]] < X[indices[idx]]) {
|
||||
entropy_left = precision_t(idx - start) / elements * metrics.entropy(start, idx);
|
||||
entropy_right = precision_t(end - idx) / elements * metrics.entropy(idx, end);
|
||||
if (entropy_left + entropy_right < minEntropy) {
|
||||
minEntropy = entropy_left + entropy_right;
|
||||
candidate = idx;
|
||||
}
|
||||
}
|
||||
}
|
||||
return candidate;
|
||||
}
|
||||
size_t CPPFImdlp::getCandidate(size_t start, size_t end)
|
||||
{
|
||||
/* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which
|
||||
@@ -229,11 +129,25 @@ namespace mdlp {
|
||||
ent1 = metrics.entropy(start, cut);
|
||||
ent2 = metrics.entropy(cut, end);
|
||||
ig = metrics.informationGain(start, cut, end);
|
||||
delta = log(pow(3, precision_t(k)) - 2) -
|
||||
delta = log2(pow(3, precision_t(k)) - 2) -
|
||||
(precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2);
|
||||
precision_t term = 1 / N * (log(N - 1) + delta);
|
||||
precision_t term = 1 / N * (log2(N - 1) + delta);
|
||||
return ig > term;
|
||||
}
|
||||
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
|
||||
/**
 * Argsort of the samples: returns the positions of X_ ordered by ascending
 * sample value, using the label in y_ as tie-breaker for equal values.
 *
 * A single stable_sort pass produces the final order; repeating the same sort
 * with the same comparator cannot change it, so no outer loop is needed.
 *
 * @param X_  sample values
 * @param y_  labels, indexed with the same positions as X_
 * @return positions into X_ / y_ in sorted order
 */
indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_)
{
    indices_t idx(X_.size());
    iota(idx.begin(), idx.end(), 0);
    stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2)
        {
            // Equal sample values are ordered by their label.
            if (X_[i1] == X_[i2])
                return y_[i1] < y_[i2];
            return X_[i1] < X_[i2];
        });
    return idx;
}
|
||||
cutPoints_t CPPFImdlp::getCutPoints()
|
||||
{
|
||||
// Remove duplicates and sort
|
||||
@@ -246,27 +160,4 @@ namespace mdlp {
|
||||
sort(output.begin(), output.end());
|
||||
return output;
|
||||
}
|
||||
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
|
||||
/**
 * Argsort of the samples: returns the positions of X_ ordered by ascending
 * sample value. Equal values keep their original relative order because the
 * sort is stable.
 *
 * One stable_sort pass is sufficient; re-running the identical sort cannot
 * change the result, so no outer loop is needed.
 *
 * @param X_  sample values
 * @return positions into X_ in sorted order
 */
indices_t CPPFImdlp::sortIndices(samples_t& X_)
{
    indices_t idx(X_.size());
    iota(idx.begin(), idx.end(), 0);
    stable_sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2)
        { return X_[i1] < X_[i2]; });
    return idx;
}
|
||||
/**
 * Argsort of the samples with label tie-breaking: returns the positions of X_
 * ordered by ascending sample value; positions holding equal values are
 * ordered by their label in y_.
 *
 * One stable_sort pass is sufficient; re-running the identical sort cannot
 * change the result, so no outer loop is needed.
 *
 * @param X_  sample values
 * @param y_  labels, indexed with the same positions as X_
 * @return positions into X_ / y_ in sorted order
 */
indices_t CPPFImdlp::sortIndices2(samples_t& X_, labels_t& y_)
{
    indices_t idx(X_.size());
    iota(idx.begin(), idx.end(), 0);
    stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2)
        {
            if (X_[i1] == X_[i2])
                return y_[i1] < y_[i2];
            return X_[i1] < X_[i2];
        });
    return idx;
}
|
||||
}
|
||||
|
18
CPPFImdlp.h
18
CPPFImdlp.h
@@ -9,29 +9,25 @@ namespace mdlp {
|
||||
class CPPFImdlp {
|
||||
protected:
|
||||
int algorithm;
|
||||
indices_t indices; // sorted indices to use with X and y
|
||||
indices_t indices;
|
||||
samples_t X;
|
||||
labels_t y;
|
||||
Metrics metrics;
|
||||
cutPoints_t cutPoints;
|
||||
|
||||
static indices_t sortIndices(samples_t&);
|
||||
static indices_t sortIndices2(samples_t&, labels_t&);
|
||||
static indices_t sortIndices(samples_t&, labels_t&);
|
||||
void computeCutPoints(size_t, size_t);
|
||||
void computeCutPointsAlternative(size_t, size_t);
|
||||
bool mdlp(size_t, size_t, size_t);
|
||||
size_t getCandidate(size_t, size_t);
|
||||
size_t getCandidateWeka(size_t, size_t);
|
||||
void computeCutPointsAlternative(size_t, size_t);
|
||||
void computeCutPointsProposal(size_t, size_t);
|
||||
precision_t value_cut_point(size_t, size_t);
|
||||
tuple<precision_t, size_t> value_proposal_cut_point(size_t, size_t, size_t);
|
||||
|
||||
precision_t halfWayValueCutPoint(size_t, size_t);
|
||||
tuple<precision_t, size_t> completeValueCutPoint(size_t, size_t, size_t);
|
||||
public:
|
||||
CPPFImdlp(int);
|
||||
CPPFImdlp(int algorithm = 0);
|
||||
~CPPFImdlp();
|
||||
CPPFImdlp& fit(samples_t&, labels_t&);
|
||||
samples_t getCutPoints();
|
||||
inline string version() { return "0.8.1"; };
|
||||
inline string version() { return "0.9.7"; };
|
||||
};
|
||||
}
|
||||
#endif
|
@@ -39,7 +39,7 @@ namespace mdlp {
|
||||
for (auto count : counts) {
|
||||
if (count > 0) {
|
||||
p = (precision_t)count / nElements;
|
||||
ventropy -= p * log(p);
|
||||
ventropy -= p * log2(p);
|
||||
}
|
||||
}
|
||||
entropyCache[make_tuple(start, end)] = ventropy;
|
||||
|
@@ -1,2 +1,9 @@
|
||||
# mdlp
|
||||
Discretization algorithm based on the paper by Fayyad & Irani Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning
|
||||
Discretization algorithm based on the paper by Fayyad & Irani [Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning](https://www.ijcai.org/Proceedings/93-2/Papers/022.pdf)
|
||||
|
||||
The implementation tries to mitigate the problem of different label values with the same value of the variable:
|
||||
|
||||
- Sorts the values of the variable using the label values as a tie-breaker
|
||||
- Once a valid candidate for the split is found, it checks whether the previous value is the same as the current one, and if so tries to use the previous one, or the next one if the former is not possible.
|
||||
|
||||
The algorithm returns the cut points for the variable.
|
152
feature0
152
feature0
@@ -1,152 +0,0 @@
|
||||
+++++++++++++++++++++++
|
||||
( 0, 13) -> (4.3, 0)
|
||||
( 1, 8) -> (4.4, 0)
|
||||
( 2, 38) -> (4.4, 0)
|
||||
( 3, 42) -> (4.4, 0)
|
||||
( 4, 41) -> (4.5, 0)
|
||||
( 5, 3) -> (4.6, 0)
|
||||
( 6, 6) -> (4.6, 0)
|
||||
( 7, 22) -> (4.6, 0)
|
||||
( 8, 47) -> (4.6, 0)
|
||||
( 9, 2) -> (4.7, 0)
|
||||
( 10, 29) -> (4.7, 0)
|
||||
( 11, 11) -> (4.8, 0)
|
||||
( 12, 12) -> (4.8, 0)
|
||||
( 13, 24) -> (4.8, 0)
|
||||
( 14, 30) -> (4.8, 0)
|
||||
( 15, 45) -> (4.8, 0)
|
||||
( 16, 1) -> (4.9, 0)
|
||||
( 17, 9) -> (4.9, 0)
|
||||
( 18, 34) -> (4.9, 0)
|
||||
( 19, 37) -> (4.9, 0)
|
||||
( 20, 57) -> (4.9, 1) candidate Total Entropy: 0.633 E. left: 0.000 E. right: 0.855 = 0.539 (0, 54) No
|
||||
( 21, 106) -> (4.9, 2)
|
||||
( 22, 4) -> (5.0, 0)
|
||||
( 23, 7) -> (5.0, 0)
|
||||
( 24, 25) -> (5.0, 0)
|
||||
( 25, 26) -> (5.0, 0)
|
||||
( 26, 35) -> (5.0, 0)
|
||||
( 27, 40) -> (5.0, 0)
|
||||
( 28, 43) -> (5.0, 0)
|
||||
( 29, 49) -> (5.0, 0)
|
||||
( 30, 60) -> (5.0, 1)
|
||||
( 31, 93) -> (5.0, 1)
|
||||
( 32, 0) -> (5.1, 0)
|
||||
( 33, 17) -> (5.1, 0)
|
||||
( 34, 19) -> (5.1, 0)
|
||||
( 35, 21) -> (5.1, 0)
|
||||
( 36, 23) -> (5.1, 0)
|
||||
( 37, 39) -> (5.1, 0)
|
||||
( 38, 44) -> (5.1, 0)
|
||||
( 39, 46) -> (5.1, 0)
|
||||
( 40, 98) -> (5.1, 1)
|
||||
( 41, 27) -> (5.2, 0)
|
||||
( 42, 28) -> (5.2, 0)
|
||||
( 43, 32) -> (5.2, 0)
|
||||
( 44, 59) -> (5.2, 1)
|
||||
( 45, 48) -> (5.3, 0)
|
||||
( 46, 5) -> (5.4, 0)
|
||||
( 47, 10) -> (5.4, 0)
|
||||
( 48, 16) -> (5.4, 0)
|
||||
( 49, 20) -> (5.4, 0)
|
||||
( 50, 31) -> (5.4, 0)
|
||||
( 51, 84) -> (5.4, 1)
|
||||
( 52, 33) -> (5.5, 0)
|
||||
( 53, 36) -> (5.5, 0)
|
||||
( 54, 53) -> (5.5, 1) 1st cut Total Entropy: 1.585 E. left: 0.633 E. right: 1.167 = 0.975 (0, 150) Sí => 5.450
|
||||
( 55, 80) -> (5.5, 1)
|
||||
( 56, 81) -> (5.5, 1)
|
||||
( 57, 89) -> (5.5, 1)
|
||||
( 58, 90) -> (5.5, 1)
|
||||
( 59, 64) -> (5.6, 1)
|
||||
( 60, 66) -> (5.6, 1)
|
||||
( 61, 69) -> (5.6, 1)
|
||||
( 62, 88) -> (5.6, 1)
|
||||
( 63, 94) -> (5.6, 1)
|
||||
( 64, 121) -> (5.6, 2) Candidate Total Entropy: 1.167 E. left: 0.966 E. right: 0.939 = 0.946 (54, 77) No
|
||||
( 65, 15) -> (5.7, 0)
|
||||
( 66, 18) -> (5.7, 0)
|
||||
( 67, 55) -> (5.7, 1)
|
||||
( 68, 79) -> (5.7, 1)
|
||||
( 69, 95) -> (5.7, 1)
|
||||
( 70, 96) -> (5.7, 1)
|
||||
( 71, 99) -> (5.7, 1)
|
||||
( 72, 113) -> (5.7, 2)
|
||||
( 73, 14) -> (5.8, 0)
|
||||
( 74, 67) -> (5.8, 1)
|
||||
( 75, 82) -> (5.8, 1)
|
||||
( 76, 92) -> (5.8, 1)
|
||||
( 77, 101) -> (5.8, 2) 2nd cut Total Entropy: 1.167 E. left: 0.966 E. right: 0.939 = 0.946 (54, 150) Sí => 5.750
|
||||
( 78, 114) -> (5.8, 2)
|
||||
( 79, 142) -> (5.8, 2)
|
||||
( 80, 61) -> (5.9, 1)
|
||||
( 81, 70) -> (5.9, 1)
|
||||
( 82, 149) -> (5.9, 2)
|
||||
( 83, 62) -> (6.0, 1)
|
||||
( 84, 78) -> (6.0, 1)
|
||||
( 85, 83) -> (6.0, 1)
|
||||
( 86, 85) -> (6.0, 1)
|
||||
( 87, 119) -> (6.0, 2)
|
||||
( 88, 138) -> (6.0, 2)
|
||||
( 89, 63) -> (6.1, 1)
|
||||
( 90, 71) -> (6.1, 1)
|
||||
( 91, 73) -> (6.1, 1)
|
||||
( 92, 91) -> (6.1, 1)
|
||||
( 93, 127) -> (6.1, 2)
|
||||
( 94, 134) -> (6.1, 2)
|
||||
( 95, 68) -> (6.2, 1)
|
||||
( 96, 97) -> (6.2, 1)
|
||||
( 97, 126) -> (6.2, 2)
|
||||
( 98, 148) -> (6.2, 2)
|
||||
( 99, 56) -> (6.3, 1)
|
||||
(100, 72) -> (6.3, 1)
|
||||
(101, 87) -> (6.3, 1)
|
||||
(102, 100) -> (6.3, 2)
|
||||
(103, 103) -> (6.3, 2)
|
||||
(104, 123) -> (6.3, 2)
|
||||
(105, 133) -> (6.3, 2)
|
||||
(106, 136) -> (6.3, 2)
|
||||
(107, 146) -> (6.3, 2)
|
||||
(108, 51) -> (6.4, 1)
|
||||
(109, 74) -> (6.4, 1)
|
||||
(110, 111) -> (6.4, 2)
|
||||
(111, 115) -> (6.4, 2)
|
||||
(112, 128) -> (6.4, 2)
|
||||
(113, 132) -> (6.4, 2)
|
||||
(114, 137) -> (6.4, 2)
|
||||
(115, 54) -> (6.5, 1)
|
||||
(116, 104) -> (6.5, 2)
|
||||
(117, 110) -> (6.5, 2)
|
||||
(118, 116) -> (6.5, 2)
|
||||
(119, 147) -> (6.5, 2)
|
||||
(120, 58) -> (6.6, 1)
|
||||
(121, 75) -> (6.6, 1)
|
||||
(122, 65) -> (6.7, 1)
|
||||
(123, 77) -> (6.7, 1)
|
||||
(124, 86) -> (6.7, 1)
|
||||
(125, 108) -> (6.7, 2)
|
||||
(126, 124) -> (6.7, 2)
|
||||
(127, 140) -> (6.7, 2)
|
||||
(128, 144) -> (6.7, 2)
|
||||
(129, 145) -> (6.7, 2)
|
||||
(130, 76) -> (6.8, 1)
|
||||
(131, 112) -> (6.8, 2)
|
||||
(132, 143) -> (6.8, 2)
|
||||
(133, 52) -> (6.9, 1)
|
||||
(134, 120) -> (6.9, 2)
|
||||
(135, 139) -> (6.9, 2)
|
||||
(136, 141) -> (6.9, 2)
|
||||
(137, 50) -> (7.0, 1)
|
||||
(138, 102) -> (7.1, 2) candidate Total Entropy: 0.939 E. left: 0.984 E. right: 0.000 = 0.822 (77, 150) No
|
||||
(139, 109) -> (7.2, 2)
|
||||
(140, 125) -> (7.2, 2)
|
||||
(141, 129) -> (7.2, 2)
|
||||
(142, 107) -> (7.3, 2)
|
||||
(143, 130) -> (7.4, 2)
|
||||
(144, 105) -> (7.6, 2)
|
||||
(145, 117) -> (7.7, 2)
|
||||
(146, 118) -> (7.7, 2)
|
||||
(147, 122) -> (7.7, 2)
|
||||
(148, 135) -> (7.7, 2)
|
||||
(149, 131) -> (7.9, 2)
|
||||
+++++++++++++++++++++++
|
@@ -1,5 +1,4 @@
|
||||
#include "ArffFiles.h"
|
||||
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <map>
|
||||
|
@@ -1,95 +0,0 @@
|
||||
#include "ArffFiles.h"
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <iomanip>
|
||||
#include "../CPPFImdlp.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace mdlp;
|
||||
|
||||
// Computes a cut value around position `cut` inside [start, end).
// Returns a tuple holding (midpoint of `previous` and `actual`, adjusted cut position).
// `y` is not read in this function, and `next` is assigned but never read.
tuple<precision_t, size_t> getCutPoint(samples_t& X, labels_t& y, size_t start, size_t cut, size_t end)
|
||||
{
|
||||
size_t idxPrev = cut - 1;
|
||||
precision_t previous, next, actual;
|
||||
previous = X[idxPrev];
|
||||
next = actual = X[cut];
|
||||
// definition 2 of the paper => X[t-1] < X[t]
// Walk backwards (never below `start`) while the previous value equals the value at the cut.
// The comparison uses idxPrev before the post-decrement takes effect.
|
||||
while (idxPrev-- > start && actual == previous) {
|
||||
previous = X[idxPrev];
|
||||
}
|
||||
// get the last equal value of X in the interval
// Empty-bodied loop: `cut` is post-incremented on every evaluation of the condition.
|
||||
while (actual == X[cut++] && cut < end);
|
||||
// No differing previous value was found: use the value at the advanced position, if still inside the interval.
|
||||
if (previous == actual && cut < end)
|
||||
actual = X[cut];
|
||||
// Step back one position after the post-increment in the loop above.
|
||||
cut--;
|
||||
return make_tuple((previous + actual) / 2, cut);
|
||||
}
|
||||
|
||||
/**
 * Prints the interval [start, end) and, recursively, the cut points found in it.
 *
 * Scans forward from start + 1 for the first position whose label differs from
 * the label just before it. If such a position exists before `end`, the cut
 * returned by getCutPoint() is printed and both resulting sub-intervals are
 * processed recursively.
 *
 * @param X      sample values
 * @param y      labels, indexed with the same positions as X
 * @param start  first position of the interval
 * @param end    one past the last position of the interval
 */
void show_points(samples_t& X, labels_t& y, size_t start, size_t end)
{
    cout << "Interval: " << start << " - " << end << endl;
    if (start >= end) {
        return;
    }
    size_t cut = start + 1;
    // Check the bound before indexing: y[cut] must not be read when cut == end,
    // which is out of range whenever end equals y.size().
    while (cut < end && y[cut - 1] == y[cut])
        cut++;
    if (cut != end) {
        tuple<precision_t, size_t> cutPoint = getCutPoint(X, y, start, cut, end);
        cout << cut << ": " << fixed << setprecision(1) << X[cut] << " " << y[cut] << endl;
        cout << "Cut point: " << get<0>(cutPoint) << " at " << get<1>(cutPoint) << endl;
        show_points(X, y, start, get<1>(cutPoint));
        show_points(X, y, get<1>(cutPoint), end);
    }
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
ArffFiles file;
|
||||
vector<string> lines;
|
||||
string path = "../tests/";
|
||||
map<string, bool > datasets = {
|
||||
{"01", true},
|
||||
{"02", true},
|
||||
{"03", true},
|
||||
{"04", true}
|
||||
};
|
||||
if (argc != 2 || datasets.find(argv[1]) == datasets.end()) {
|
||||
cout << "Usage: " << argv[0] << " {01, 02, 03, 04}" << endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
file.load(path + argv[1] + ".arff", datasets[argv[1]]);
|
||||
auto attributes = file.getAttributes();
|
||||
int items = file.getSize();
|
||||
cout << "Number of lines: " << items << endl;
|
||||
cout << "Attributes: " << endl;
|
||||
for (auto attribute : attributes) {
|
||||
cout << "Name: " << get<0>(attribute) << " Type: " << get<1>(attribute) << endl;
|
||||
}
|
||||
cout << "Class name: " << file.getClassName() << endl;
|
||||
cout << "Class type: " << file.getClassType() << endl;
|
||||
cout << "Data: " << endl;
|
||||
vector<samples_t>& X = file.getX();
|
||||
labels_t& y = file.getY();
|
||||
for (int i = 0; i < y.size(); i++) {
|
||||
for (auto feature : X) {
|
||||
cout << i << ": " << fixed << setprecision(1) << feature[i] << " ";
|
||||
}
|
||||
cout << y[i] << endl;
|
||||
}
|
||||
mdlp::CPPFImdlp test = mdlp::CPPFImdlp(0);
|
||||
for (auto i = 0; i < attributes.size(); i++) {
|
||||
cout << "Cut points for " << get<0>(attributes[i]) << endl;
|
||||
cout << "--------------------------" << setprecision(3) << endl;
|
||||
test.fit(X[i], y);
|
||||
for (auto item : test.getCutPoints()) {
|
||||
cout << item << endl;
|
||||
}
|
||||
}
|
||||
cout << "Function test" << endl;
|
||||
show_points(X[0], y, 0, items);
|
||||
return 0;
|
||||
}
|
@@ -1,35 +0,0 @@
|
||||
% .
|
||||
|
||||
@RELATION 01
|
||||
|
||||
@ATTRIBUTE X REAL
|
||||
@ATTRIBUTE class {0,1,2}
|
||||
|
||||
@DATA
|
||||
1, 0
|
||||
1, 0
|
||||
1, 0
|
||||
1, 0
|
||||
1, 0
|
||||
1, 0
|
||||
1, 0
|
||||
2, 0
|
||||
2, 0
|
||||
2, 0
|
||||
2, 1
|
||||
2, 2
|
||||
2, 2
|
||||
2, 2
|
||||
2, 2
|
||||
3, 0
|
||||
3, 0
|
||||
3, 0
|
||||
3, 0
|
||||
3, 0
|
||||
3, 1
|
||||
3, 1
|
||||
3, 1
|
||||
3, 2
|
||||
3, 2
|
||||
4, 0
|
||||
4, 1
|
@@ -1,25 +0,0 @@
|
||||
% .
|
||||
|
||||
@RELATION 01
|
||||
|
||||
@ATTRIBUTE X REAL
|
||||
@ATTRIBUTE class {0,1,2}
|
||||
|
||||
@DATA
|
||||
2, 0
|
||||
3, 0
|
||||
3, 0
|
||||
3, 0
|
||||
3, 0
|
||||
3, 0
|
||||
3, 1
|
||||
3, 1
|
||||
3, 1
|
||||
3, 2
|
||||
3, 2
|
||||
4, 0
|
||||
4, 1
|
||||
4, 1
|
||||
4, 1
|
||||
4, 1
|
||||
4, 1
|
@@ -1,24 +0,0 @@
|
||||
% .
|
||||
|
||||
@RELATION 01
|
||||
|
||||
@ATTRIBUTE X REAL
|
||||
@ATTRIBUTE class {0,1,2}
|
||||
|
||||
@DATA
|
||||
3, 0
|
||||
3, 0
|
||||
3, 0
|
||||
3, 0
|
||||
3, 0
|
||||
3, 1
|
||||
3, 1
|
||||
3, 1
|
||||
3, 2
|
||||
3, 2
|
||||
4, 0
|
||||
4, 1
|
||||
4, 1
|
||||
4, 1
|
||||
4, 1
|
||||
4, 1
|
@@ -4,31 +4,26 @@
|
||||
#include <iostream>
|
||||
|
||||
namespace mdlp {
|
||||
class TestFImdlp : public CPPFImdlp, public testing::Test {
|
||||
class TestFImdlp: public CPPFImdlp, public testing::Test {
|
||||
public:
|
||||
precision_t precision = 0.000001;
|
||||
|
||||
TestFImdlp() : CPPFImdlp(false) {}
|
||||
|
||||
void SetUp() {
|
||||
TestFImdlp(): CPPFImdlp() {}
|
||||
void SetUp()
|
||||
{
|
||||
// 5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0]
|
||||
//(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2)
|
||||
X = {5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9};
|
||||
y = {1, 1, 1, 1, 1, 2, 2, 2, 2, 2};
|
||||
X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
|
||||
y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
|
||||
algorithm = false;
|
||||
fit(X, y);
|
||||
}
|
||||
|
||||
void setalgorithm(bool value) {
|
||||
void setalgorithm(bool value)
|
||||
{
|
||||
algorithm = value;
|
||||
}
|
||||
|
||||
// void initIndices()
|
||||
// {
|
||||
// indices = indices_t();
|
||||
// }
|
||||
void checkSortedVector() {
|
||||
indices_t testSortedIndices = sortIndices(X);
|
||||
void checkSortedVector()
|
||||
{
|
||||
indices_t testSortedIndices = sortIndices(X, y);
|
||||
precision_t prev = X[testSortedIndices[0]];
|
||||
for (auto i = 0; i < X.size(); ++i) {
|
||||
EXPECT_EQ(testSortedIndices[i], indices[i]);
|
||||
@@ -36,54 +31,55 @@ namespace mdlp {
|
||||
prev = X[testSortedIndices[i]];
|
||||
}
|
||||
}
|
||||
|
||||
void checkCutPoints(cutPoints_t &expected) {
|
||||
void checkCutPoints(cutPoints_t& expected)
|
||||
{
|
||||
int expectedSize = expected.size();
|
||||
EXPECT_EQ(cutPoints.size(), expectedSize);
|
||||
for (auto i = 0; i < cutPoints.size(); i++) {
|
||||
EXPECT_NEAR(cutPoints[i], expected[i], precision);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T, typename A>
|
||||
void checkVectors(std::vector<T, A> const &expected, std::vector<T, A> const &computed) {
|
||||
void checkVectors(std::vector<T, A> const& expected, std::vector<T, A> const& computed)
|
||||
{
|
||||
EXPECT_EQ(expected.size(), computed.size());
|
||||
ASSERT_EQ(expected.size(), computed.size());
|
||||
for (auto i = 0; i < expected.size(); i++) {
|
||||
EXPECT_NEAR(expected[i], computed[i],precision);
|
||||
EXPECT_NEAR(expected[i], computed[i], precision);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
TEST_F(TestFImdlp, FitErrorEmptyDataset) {
|
||||
TEST_F(TestFImdlp, FitErrorEmptyDataset)
|
||||
{
|
||||
X = samples_t();
|
||||
y = labels_t();
|
||||
EXPECT_THROW(fit(X, y), std::invalid_argument);
|
||||
}
|
||||
|
||||
TEST_F(TestFImdlp, FitErrorDifferentSize) {
|
||||
X = {1, 2, 3};
|
||||
y = {1, 2};
|
||||
TEST_F(TestFImdlp, FitErrorDifferentSize)
|
||||
{
|
||||
X = { 1, 2, 3 };
|
||||
y = { 1, 2 };
|
||||
EXPECT_THROW(fit(X, y), std::invalid_argument);
|
||||
}
|
||||
|
||||
TEST_F(TestFImdlp, SortIndices) {
|
||||
X = {5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9};
|
||||
indices = {4, 3, 6, 8, 2, 1, 5, 0, 9, 7};
|
||||
TEST_F(TestFImdlp, SortIndices)
|
||||
{
|
||||
X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
|
||||
indices = { 4, 3, 6, 8, 2, 1, 5, 0, 9, 7 };
|
||||
checkSortedVector();
|
||||
X = {5.77, 5.88, 5.99};
|
||||
indices = {0, 1, 2};
|
||||
X = { 5.77, 5.88, 5.99 };
|
||||
indices = { 0, 1, 2 };
|
||||
checkSortedVector();
|
||||
X = {5.33, 5.22, 5.11};
|
||||
indices = {2, 1, 0};
|
||||
X = { 5.33, 5.22, 5.11 };
|
||||
indices = { 2, 1, 0 };
|
||||
checkSortedVector();
|
||||
}
|
||||
|
||||
TEST_F(TestFImdlp, TestDataset) {
|
||||
algorithm = false;
|
||||
TEST_F(TestFImdlp, TestDataset)
|
||||
{
|
||||
algorithm = 0;
|
||||
fit(X, y);
|
||||
computeCutPointsOriginal(0, 10);
|
||||
cutPoints_t expected = {5.6499996185302734};
|
||||
computeCutPoints(0, 10);
|
||||
cutPoints_t expected = { 5.6499996185302734 };
|
||||
vector<precision_t> computed = getCutPoints();
|
||||
computed = getCutPoints();
|
||||
int expectedSize = expected.size();
|
||||
@@ -92,49 +88,49 @@ namespace mdlp {
|
||||
EXPECT_NEAR(computed[i], expected[i], precision);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(TestFImdlp, ComputeCutPointsOriginal) {
|
||||
cutPoints_t expected = {5.65};
|
||||
TEST_F(TestFImdlp, ComputeCutPoints)
|
||||
{
|
||||
cutPoints_t expected = { 5.65 };
|
||||
algorithm = false;
|
||||
computeCutPointsOriginal(0, 10);
|
||||
computeCutPoints(0, 10);
|
||||
checkCutPoints(expected);
|
||||
}
|
||||
|
||||
TEST_F(TestFImdlp, ComputeCutPointsOriginalGCase) {
|
||||
TEST_F(TestFImdlp, ComputeCutPointsGCase)
|
||||
{
|
||||
cutPoints_t expected;
|
||||
algorithm = false;
|
||||
expected = {2};
|
||||
samples_t X_ = {0, 1, 2, 2};
|
||||
labels_t y_ = {1, 1, 1, 2};
|
||||
expected = { 2 };
|
||||
samples_t X_ = { 0, 1, 2, 2 };
|
||||
labels_t y_ = { 1, 1, 1, 2 };
|
||||
fit(X_, y_);
|
||||
checkCutPoints(expected);
|
||||
}
|
||||
|
||||
TEST_F(TestFImdlp, ComputeCutPointsalgorithm) {
|
||||
TEST_F(TestFImdlp, ComputeCutPointsalAlternative)
|
||||
{
|
||||
algorithm = true;
|
||||
cutPoints_t expected;
|
||||
expected = {};
|
||||
fit(X, y);
|
||||
computeCutPointsalgorithm();
|
||||
computeCutPointsAlternative(0, 10);
|
||||
checkCutPoints(expected);
|
||||
}
|
||||
|
||||
TEST_F(TestFImdlp, ComputeCutPointsalgorithmGCase) {
|
||||
TEST_F(TestFImdlp, ComputeCutPointsAlternativeGCase)
|
||||
{
|
||||
cutPoints_t expected;
|
||||
expected = {1.5};
|
||||
expected = { 1.5 };
|
||||
algorithm = true;
|
||||
samples_t X_ = {0, 1, 2, 2};
|
||||
labels_t y_ = {1, 1, 1, 2};
|
||||
samples_t X_ = { 0, 1, 2, 2 };
|
||||
labels_t y_ = { 1, 1, 1, 2 };
|
||||
fit(X_, y_);
|
||||
checkCutPoints(expected);
|
||||
}
|
||||
|
||||
TEST_F(TestFImdlp, GetCutPoints) {
|
||||
samples_t computed, expected = {5.65};
|
||||
TEST_F(TestFImdlp, GetCutPoints)
|
||||
{
|
||||
samples_t computed, expected = { 5.65 };
|
||||
algorithm = false;
|
||||
computeCutPointsOriginal(0, 10);
|
||||
computeCutPoints(0, 10);
|
||||
computed = getCutPoints();
|
||||
for (auto item: cutPoints)
|
||||
for (auto item : cutPoints)
|
||||
cout << setprecision(6) << item << endl;
|
||||
checkVectors(expected, computed);
|
||||
}
|
||||
|
Reference in New Issue
Block a user