Remove alternative and Classic

Refactor ValueCutPoint
Refactor sameValues in getCandidate
2023-02-20 18:23:05 +01:00
parent 04c1772019
commit dec1295933
14 changed files with 198 additions and 203 deletions
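Note: after this change the discretizer no longer takes an algorithm selector; construction, fitting and retrieval of cut points reduce to the calls below. A minimal usage sketch based on the headers and the sample program in this diff (the data values are illustrative only, not part of the commit):

#include <iostream>
#include "CPPFImdlp.h"

int main()
{
    // Illustrative attribute values and their class labels.
    mdlp::samples_t X = { 4.7, 4.8, 4.9, 5.0, 5.1, 5.6, 5.9, 6.0 };
    mdlp::labels_t y = { 1, 1, 1, 1, 2, 2, 2, 2 };

    mdlp::CPPFImdlp discretizer;   // no algorithm argument anymore
    discretizer.fit(X, y);         // sorts indices and computes the cut points
    // getCutPoints() returns the sorted, de-duplicated cut points
    for (auto cut : discretizer.getCutPoints())
        std::cout << cut << std::endl;
    return 0;
}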

.github/workflows/build.yml (new file, +26)

@@ -0,0 +1,26 @@
name: Build
on:
push:
branches:
- main
jobs:
build:
name: Build
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis
- uses: sonarsource/sonarqube-scan-action@master
env:
SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
SONAR_HOST_URL: ${{ secrets.SONAR_HOST_URL }}
# If you wish to fail your job when the Quality Gate is red, uncomment the
# following lines. This would typically be used to fail a deployment.
# - uses: sonarsource/sonarqube-quality-gate-action@master
# timeout-minutes: 5
# env:
# SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}

.gitignore (+1)

@@ -35,3 +35,4 @@
 .idea
 cmake-*
 **/CMakeFiles
+sonar-project.properties

.vscode/launch.json (new file, +22)

@@ -0,0 +1,22 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "(lldb) Launch",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/sample/build/sample",
"args": [
"test"
],
"stopAtEntry": false,
"cwd": "${workspaceRoot}/sample/build/",
"environment": [],
"externalConsole": false,
"MIMode": "lldb"
}
]
}

.vscode/settings.json (new file, +5)

@@ -0,0 +1,5 @@
{
"sonarlint.connectedMode.project": {
"projectKey": "rmontanana_mdlp_AYZkjILJHyjW-meBaElG"
}
}

CMakeLists.txt

@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.20)
 project(mdlp)
-set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD 11)
 add_library(mdlp CPPFImdlp.cpp Metrics.cpp)

CPPFImdlp.cpp

@@ -6,13 +6,15 @@
 #include "Metrics.h"
 namespace mdlp {
-    CPPFImdlp::CPPFImdlp(int algorithm) : algorithm(algorithm), indices(indices_t()), X(samples_t()), y(labels_t()),
-        metrics(Metrics(y, indices)) {
-    }
+    CPPFImdlp::CPPFImdlp(): indices(indices_t()), X(samples_t()), y(labels_t()),
+        metrics(Metrics(y, indices))
+    {
+    }
     CPPFImdlp::~CPPFImdlp() = default;
-    CPPFImdlp &CPPFImdlp::fit(samples_t &X_, labels_t &y_) {
+    CPPFImdlp& CPPFImdlp::fit(samples_t& X_, labels_t& y_)
+    {
         X = X_;
         y = y_;
         cutPoints.clear();
@@ -24,117 +26,75 @@ namespace mdlp {
         }
         indices = sortIndices(X_, y_);
         metrics.setData(y, indices);
-        switch (algorithm) {
-            case 0:
-                computeCutPoints(0, X.size());
-                break;
-            case 1:
-                computeCutPointsAlternative(0, X.size());
-                break;
-            case 2:
-                indices = sortIndices1(X_);
-                metrics.setData(y, indices);
-                computeCutPointsClassic(0, X.size());
-                break;
-            default:
-                throw invalid_argument("algorithm must be 0, 1 or 2");
-        }
+        computeCutPoints(0, X.size());
         return *this;
     }
-    precision_t CPPFImdlp::halfWayValueCutPoint(size_t start, size_t idx) {
-        size_t idxPrev = idx - 1;
-        precision_t previous = X[indices[idxPrev]], actual = X[indices[idx]];
-        // definition 2 of the paper => X[t-1] < X[t]
-        while (idxPrev-- > start && actual == previous) {
-            previous = X[indices[idxPrev]];
-        }
-        return (previous + actual) / 2;
-    }
-    tuple<precision_t, size_t> CPPFImdlp::completeValueCutPoint(size_t start, size_t cut, size_t end) {
-        size_t idxPrev = cut - 1;
-        bool fforward = false;
-        precision_t previous, actual;
+    pair<precision_t, size_t> CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end)
+    {
+        size_t n, m, idxPrev = cut - 1 <= start ? cut - 1 : cut;
+        size_t idxNext = cut + 1 < end ? cut + 1 : cut;
+        bool backWall; // true if duplicates reach begining of the interval
+        precision_t previous, actual, next;
         previous = X[indices[idxPrev]];
         actual = X[indices[cut]];
+        next = X[indices[idxNext]];
         // definition 2 of the paper => X[t-1] < X[t]
-        while (idxPrev-- > start && actual == previous) {
-            previous = X[indices[idxPrev]];
+        // get the first equal value of X in the interval
+        while (idxPrev > start && actual == previous) {
+            previous = X[indices[--idxPrev]];
         }
+        backWall = idxPrev == start && actual == previous;
         // get the last equal value of X in the interval
-        while (actual == X[indices[cut]] && cut + 1 < end) {
-            cut++;
-            fforward = true;
+        while (idxNext < end - 1 && actual == next) {
+            next = X[indices[++idxNext]];
         }
-        if (fforward)
-            cut--;
-        // try to get the next value if it can't be found backwards
-        if (previous == actual && cut + 1 < end)
-            actual = X[indices[cut + 1]];
-        return make_tuple((previous + actual) / 2, cut);
+        // # of duplicates before cutpoint
+        n = cut - 1 - idxPrev;
+        // # of duplicates after cutpoint
+        m = idxNext - cut - 1;
+        // Decide which values to use
+        cut = cut + (backWall ? m + 1 : -n);
+        actual = X[indices[cut]];
+        return { (actual + previous) / 2, cut };
     }
-    void CPPFImdlp::computeCutPoints(size_t start, size_t end) {
+    void CPPFImdlp::computeCutPoints(size_t start, size_t end)
+    {
         size_t cut;
-        tuple<precision_t, size_t> result;
-        if (end - start < 2)
+        pair<precision_t, size_t> result;
+        if (end - start < 3)
             return;
         cut = getCandidate(start, end);
         if (cut == numeric_limits<size_t>::max())
             return;
         if (mdlp(start, cut, end)) {
-            result = completeValueCutPoint(start, cut, end);
-            cut = get<1>(result);
-            cutPoints.push_back(get<0>(result));
+            result = valueCutPoint(start, cut, end);
+            cut = result.second;
+            cutPoints.push_back(result.first);
             computeCutPoints(start, cut);
             computeCutPoints(cut, end);
         }
     }
-    void CPPFImdlp::computeCutPointsAlternative(size_t start, size_t end) {
-        size_t cut;
-        if (end - start < 2)
-            return;
-        cut = getCandidate(start, end);
-        if (cut == numeric_limits<size_t>::max())
-            return;
-        if (mdlp(start, cut, end)) {
-            cutPoints.push_back(halfWayValueCutPoint(start, cut));
-            computeCutPointsAlternative(start, cut);
-            computeCutPointsAlternative(cut, end);
-        }
-    }
-    void CPPFImdlp::computeCutPointsClassic(size_t start, size_t end) {
-        size_t cut;
-        cut = getCandidate(start, end);
-        if (cut == numeric_limits<size_t>::max() || !mdlp(start, cut, end)) {
-            // cut.value == -1 means that there is no candidate in the interval
-            // No boundary found, so we add both ends of the interval as cutpoints
-            // because they were selected by the algorithm before
-            if (start == end)
-                return;
-            if (start != 0)
-                cutPoints.push_back((X[indices[start]] + X[indices[start - 1]]) / 2);
-            if (end != X.size())
-                cutPoints.push_back((X[indices[end]] + X[indices[end - 1]]) / 2);
-            return;
-        }
-        computeCutPoints(start, cut);
-        computeCutPoints(cut, end);
-    }
-    size_t CPPFImdlp::getCandidate(size_t start, size_t end) {
+    size_t CPPFImdlp::getCandidate(size_t start, size_t end)
+    {
         /* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which
            E(A, TA; S) is minimal amongst all the candidate cut points. */
         size_t candidate = numeric_limits<size_t>::max(), elements = end - start;
-        bool same_values = true;
+        bool sameValues = true;
         precision_t entropy_left, entropy_right, minEntropy;
+        // Check if all the values of the variable in the interval are the same
+        for (size_t idx = start + 1; idx < end; idx++) {
+            if (X[indices[idx]] != X[indices[start]]) {
+                sameValues = false;
+                break;
+            }
+        }
+        if (sameValues)
+            return candidate;
         minEntropy = metrics.entropy(start, end);
-        for (auto idx = start + 1; idx < end; idx++) {
-            if (X[indices[idx]] != X[indices[idx - 1]])
-                same_values = false;
+        for (size_t idx = start + 1; idx < end; idx++) {
             // Cutpoints are always on boundaries (definition 2)
             if (y[indices[idx]] == y[indices[idx - 1]])
                 continue;
@@ -145,13 +105,11 @@ namespace mdlp {
                 candidate = idx;
             }
         }
-        // If all the values of the variable in the interval are the same, it doesn't consider the cut point
-        if (same_values)
-            candidate = numeric_limits<size_t>::max();
         return candidate;
     }
-    bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end) {
+    bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end)
+    {
         int k, k1, k2;
         precision_t ig, delta;
         precision_t ent, ent1, ent2;
@@ -167,37 +125,28 @@ namespace mdlp {
         ent2 = metrics.entropy(cut, end);
         ig = metrics.informationGain(start, cut, end);
         delta = log2(pow(3, precision_t(k)) - 2) -
             (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2);
         precision_t term = 1 / N * (log2(N - 1) + delta);
         return ig > term;
     }
     // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
-    indices_t CPPFImdlp::sortIndices(samples_t &X_, labels_t &y_) {
+    indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_)
+    {
         indices_t idx(X_.size());
         iota(idx.begin(), idx.end(), 0);
         for (size_t i = 0; i < X_.size(); i++)
             stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) {
                 if (X_[i1] == X_[i2])
                     return y_[i1] < y_[i2];
                 else
-                    return X_[i1] < X_[i2];
-                });
-        return idx;
-    }
-    // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
-    indices_t CPPFImdlp::sortIndices1(samples_t &X_) {
-        indices_t idx(X_.size());
-        iota(idx.begin(), idx.end(), 0);
-        for (size_t i = 0; i < X_.size(); i++)
-            stable_sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2) {
                     return X_[i1] < X_[i2];
                 });
         return idx;
     }
-    cutPoints_t CPPFImdlp::getCutPoints() {
+    cutPoints_t CPPFImdlp::getCutPoints()
+    {
         // Remove duplicates and sort
         cutPoints_t output(cutPoints.size());
         set<precision_t> s;
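A worked example of the refactored valueCutPoint, using test case 2a from the unit tests further down: with X = { 3.1, 3.2, 3.3, 3.4, 3.7, 3.7, 3.7, 3.8, 3.9, 4.0 } (already sorted) and a candidate cut at index 6, the backward scan over the run of 3.7 values stops at index 3 (value 3.4) and the forward scan stops at index 7, so n = 2, m = 0 and backWall is false; the cut therefore moves back to index 4 and the returned midpoint is (3.7 + 3.4) / 2 = 3.55, which matches the expected (7.1 / 2, 4) in the test.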

CPPFImdlp.h

@@ -3,12 +3,10 @@
 #include "typesFImdlp.h"
 #include "Metrics.h"
 #include <utility>
-#include <tuple>
 #include <string>
 namespace mdlp {
     class CPPFImdlp {
     protected:
-        int algorithm;
         indices_t indices;
         samples_t X;
         labels_t y;
@@ -16,20 +14,16 @@ namespace mdlp {
         cutPoints_t cutPoints;
         static indices_t sortIndices(samples_t&, labels_t&);
-        static indices_t sortIndices1(samples_t&);
         void computeCutPoints(size_t, size_t);
-        void computeCutPointsAlternative(size_t, size_t);
-        void computeCutPointsClassic(size_t, size_t);
         bool mdlp(size_t, size_t, size_t);
         size_t getCandidate(size_t, size_t);
-        precision_t halfWayValueCutPoint(size_t, size_t);
-        tuple<precision_t, size_t> completeValueCutPoint(size_t, size_t, size_t);
+        pair<precision_t, size_t> valueCutPoint(size_t, size_t, size_t);
     public:
-        CPPFImdlp(int algorithm = 0);
+        CPPFImdlp();
         ~CPPFImdlp();
         CPPFImdlp& fit(samples_t&, labels_t&);
         samples_t getCutPoints();
-        inline string version() { return "1.0.0"; };
+        inline string version() { return "1.1.0"; };
     };
 }
 #endif

Metrics.cpp

@@ -29,8 +29,8 @@ namespace mdlp {
         labels_t counts(numClasses + 1, 0);
         if (end - start < 2)
             return 0;
-        if (entropyCache.find(make_tuple(start, end)) != entropyCache.end()) {
-            return entropyCache[make_tuple(start, end)];
+        if (entropyCache.find({ start, end }) != entropyCache.end()) {
+            return entropyCache[{start, end}];
         }
         for (auto i = &indices[start]; i != &indices[end]; ++i) {
             counts[y[*i]]++;
@@ -42,7 +42,7 @@ namespace mdlp {
                 ventropy -= p * log2(p);
             }
         }
-        entropyCache[make_tuple(start, end)] = ventropy;
+        entropyCache[{start, end}] = ventropy;
         return ventropy;
     }
     precision_t Metrics::informationGain(size_t start, size_t cut, size_t end)

sample/CMakeLists.txt

@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.20)
 project(main)
-set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD 11)
 add_executable(sample sample.cpp ../tests/ArffFiles.cpp ../Metrics.cpp ../CPPFImdlp.cpp)

sample/sample.cpp

@@ -7,18 +7,20 @@
 using namespace std;
 using namespace mdlp;
 int main(int argc, char** argv)
 {
     ArffFiles file;
     vector<string> lines;
     string path = "../../tests/datasets/";
-    map<string, bool > datasets = {
+    map<string, bool> datasets = {
         {"mfeat-factors", true},
         {"iris", true},
         {"letter", true},
         {"glass", true},
         {"kdd_JapaneseVowels", false},
-        {"test", true}
+        {"mfeat-factors", true},
+        {"test", true}
     };
     if (argc != 2 || datasets.find(argv[1]) == datasets.end()) {
         cout << "Usage: " << argv[0] << " {mfeat-factors, glass, iris, letter, kdd_JapaneseVowels, test}" << endl;
@@ -44,9 +46,11 @@ int main(int argc, char** argv)
         }
         cout << y[i] << endl;
     }
-    mdlp::CPPFImdlp test = mdlp::CPPFImdlp(0);
+    mdlp::CPPFImdlp test = mdlp::CPPFImdlp();
     for (auto i = 0; i < attributes.size(); i++) {
+        auto min_max = minmax_element(X[i].begin(), X[i].end());
         cout << "Cut points for " << get<0>(attributes[i]) << endl;
+        cout << "Min: " << *min_max.first << " Max: " << *min_max.second << endl;
         cout << "--------------------------" << setprecision(3) << endl;
         test.fit(X[i], y);
         for (auto item : test.getCutPoints()) {

tests/ArffFiles.cpp

@@ -17,7 +17,7 @@ unsigned long int ArffFiles::getSize()
 {
     return lines.size();
 }
-vector<tuple<string, string>> ArffFiles::getAttributes()
+vector<pair<string, string>> ArffFiles::getAttributes()
 {
     return attributes;
 }
@@ -50,7 +50,7 @@ void ArffFiles::load(string fileName, bool classLast)
         if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
             stringstream ss(line);
             ss >> keyword >> attribute >> type;
-            attributes.push_back(make_tuple(attribute, type));
+            attributes.push_back({ attribute, type });
             continue;
         }
         if (line[0] == '@') {

tests/ArffFiles.h

@@ -2,12 +2,11 @@
 #define ARFFFILES_H
 #include <string>
 #include <vector>
-#include <tuple>
 using namespace std;
 class ArffFiles {
 private:
     vector<string> lines;
-    vector<tuple<string, string>> attributes;
+    vector<pair<string, string>> attributes;
     string className, classType;
     vector<vector<float>> X;
     vector<int> y;
@@ -22,7 +21,7 @@ public:
     string trim(const string&);
     vector<vector<float>>& getX();
     vector<int>& getY();
-    vector<tuple<string, string>> getAttributes();
+    vector<pair<string, string>> getAttributes();
     vector<int> factorize(const vector<string>& labels_t);
 };
 #endif

FImdlp unit tests

@@ -13,18 +13,13 @@ namespace mdlp {
         {
             X = { 4.7, 4.7, 4.7, 4.7, 4.8, 4.8, 4.8, 4.8, 4.9, 4.95, 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
             y = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
-            algorithm = false;
             fit(X, y);
         }
-        void setalgorithm(bool value)
-        {
-            algorithm = value;
-        }
         void checkSortedVector()
         {
             indices_t testSortedIndices = sortIndices(X, y);
             precision_t prev = X[testSortedIndices[0]];
-            for (auto i = 0; i < X.size(); ++i) {
+            for (unsigned long i = 0; i < X.size(); ++i) {
                 EXPECT_EQ(testSortedIndices[i], indices[i]);
                 EXPECT_LE(prev, X[testSortedIndices[i]]);
                 prev = X[testSortedIndices[i]];
@@ -34,7 +29,7 @@ namespace mdlp {
         {
             int expectedSize = expected.size();
             EXPECT_EQ(cutPoints.size(), expectedSize);
-            for (auto i = 0; i < cutPoints.size(); i++) {
+            for (unsigned long i = 0; i < cutPoints.size(); i++) {
                 EXPECT_NEAR(cutPoints[i], expected[i], precision);
             }
         }
@@ -47,6 +42,19 @@ namespace mdlp {
                 EXPECT_NEAR(expected[i], computed[i], precision);
             }
         }
+        bool test_result(samples_t& X_, size_t cut, float midPoint, size_t limit, string title)
+        {
+            pair<precision_t, size_t> result;
+            labels_t y_ = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+            X = X_;
+            y = y_;
+            indices = sortIndices(X, y);
+            cout << "* " << title << endl;
+            result = valueCutPoint(0, cut, 10);
+            EXPECT_NEAR(result.first, midPoint, precision);
+            EXPECT_EQ(result.second, limit);
+            return true;
+        }
     };
     TEST_F(TestFImdlp, FitErrorEmptyDataset)
     {
@@ -54,11 +62,6 @@ namespace mdlp {
         y = labels_t();
         EXPECT_THROW(fit(X, y), std::invalid_argument);
     }
-    TEST_F(TestFImdlp, FitErrorIncorrectAlgorithm)
-    {
-        algorithm = 2;
-        EXPECT_THROW(fit(X, y), std::invalid_argument);
-    }
     TEST_F(TestFImdlp, FitErrorDifferentSize)
     {
         X = { 1, 2, 3 };
@@ -83,31 +86,41 @@ namespace mdlp {
         y = { 2, 2, 1 };
         indices = { 1, 2, 0 };
     }
-    TEST_F(TestFImdlp, TestArtificialDatasetAlternative)
+    TEST_F(TestFImdlp, TestShortDatasets)
     {
-        algorithm = 1;
+        vector<precision_t> computed;
+        X = { 1 };
+        y = { 1 };
         fit(X, y);
-        computeCutPoints(0, 20);
-        cutPoints_t expected = { 5.0500001907348633 };
-        vector<precision_t> computed = getCutPoints();
         computed = getCutPoints();
-        int expectedSize = expected.size();
-        EXPECT_EQ(computed.size(), expected.size());
-        for (auto i = 0; i < computed.size(); i++) {
-            EXPECT_NEAR(computed[i], expected[i], precision);
-        }
+        EXPECT_EQ(computed.size(), 0);
+        X = { 1, 3 };
+        y = { 1, 2 };
+        fit(X, y);
+        computed = getCutPoints();
+        EXPECT_EQ(computed.size(), 0);
+        X = { 2, 4 };
+        y = { 1, 2 };
+        fit(X, y);
+        computed = getCutPoints();
+        EXPECT_EQ(computed.size(), 0);
+        X = { 1, 2, 3 };
+        y = { 1, 2, 2 };
+        fit(X, y);
+        computed = getCutPoints();
+        EXPECT_EQ(computed.size(), 1);
+        EXPECT_NEAR(computed[0], 1.5, precision);
     }
     TEST_F(TestFImdlp, TestArtificialDataset)
     {
-        algorithm = 0;
         fit(X, y);
         computeCutPoints(0, 20);
-        cutPoints_t expected = { 5.0500001907348633 };
+        cutPoints_t expected = { 5.05 };
         vector<precision_t> computed = getCutPoints();
         computed = getCutPoints();
         int expectedSize = expected.size();
         EXPECT_EQ(computed.size(), expected.size());
-        for (auto i = 0; i < computed.size(); i++) {
+        for (unsigned long i = 0; i < computed.size(); i++) {
             EXPECT_NEAR(computed[i], expected[i], precision);
         }
     }
@@ -116,44 +129,17 @@ namespace mdlp {
         ArffFiles file;
         string path = "../datasets/";
-        file.load(path + "iris.arff", true);
-        int items = file.getSize();
-        vector<samples_t>& X = file.getX();
-        vector<cutPoints_t> expected = {
-            { 5.4499998092651367, 6.25 },
-            { 2.8499999046325684, 3, 3.0499999523162842, 3.3499999046325684 },
-            { 2.4500000476837158, 4.75, 5.0500001907348633 },
-            { 0.80000001192092896, 1.4500000476837158, 1.75 }
-        };
-        labels_t& y = file.getY();
-        auto attributes = file.getAttributes();
-        algorithm = 0;
-        for (auto feature = 0; feature < attributes.size(); feature++) {
-            fit(X[feature], y);
-            vector<precision_t> computed = getCutPoints();
-            EXPECT_EQ(computed.size(), expected[feature].size());
-            for (auto i = 0; i < computed.size(); i++) {
-                EXPECT_NEAR(computed[i], expected[feature][i], precision);
-            }
-        }
-    }
-    TEST_F(TestFImdlp, TestIrisAlternative)
-    {
-        ArffFiles file;
-        string path = "../datasets/";
         file.load(path + "iris.arff", true);
         int items = file.getSize();
         vector<samples_t>& X = file.getX();
         vector<cutPoints_t> expected = {
             { 5.4499998092651367, 5.75 },
-            { 2.8499999046325684, 3.3499999046325684 },
-            { 2.4500000476837158, 4.75 },
+            { 2.75, 2.85, 2.95, 3.05, 3.35 },
+            { 2.4500000476837158, 4.75, 5.0500001907348633 },
             { 0.80000001192092896, 1.75 }
         };
         labels_t& y = file.getY();
         auto attributes = file.getAttributes();
-        algorithm = 1;
         for (auto feature = 0; feature < attributes.size(); feature++) {
             fit(X[feature], y);
             vector<precision_t> computed = getCutPoints();
@@ -166,21 +152,30 @@ namespace mdlp {
     TEST_F(TestFImdlp, ComputeCutPointsGCase)
     {
         cutPoints_t expected;
-        algorithm = 0;
         expected = { 1.5 };
-        samples_t X_ = { 0, 1, 2, 2 };
-        labels_t y_ = { 1, 1, 1, 2 };
+        samples_t X_ = { 0, 1, 2, 2, 2 };
+        labels_t y_ = { 1, 1, 1, 2, 2 };
         fit(X_, y_);
         checkCutPoints(expected);
     }
-    TEST_F(TestFImdlp, ComputeCutPointsAlternativeGCase)
+    TEST_F(TestFImdlp, CompleteValueCutPoint)
     {
-        cutPoints_t expected;
-        expected = { 1.5 };
-        algorithm = true;
-        samples_t X_ = { 0, 1, 2, 2 };
-        labels_t y_ = { 1, 1, 1, 2 };
-        fit(X_, y_);
-        checkCutPoints(expected);
+        // Case titles as stated in the doc
+        samples_t X1a{ 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0 };
+        test_result(X1a, 6, 7.3 / 2, 6, "1a");
+        samples_t X2a = { 3.1, 3.2, 3.3, 3.4, 3.7, 3.7, 3.7, 3.8, 3.9, 4.0 };
+        test_result(X2a, 6, 7.1 / 2, 4, "2a");
+        samples_t X2b = { 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.8, 3.9, 4.0 };
+        test_result(X2b, 6, 7.5 / 2, 7, "2b");
+        samples_t X3a = { 3.1, 3.2, 3.3, 3.4, 3.7, 3.7, 3.7, 3.8, 3.9, 4.0 };
+        test_result(X3a, 4, 7.1 / 2, 4, "3a");
+        samples_t X3b = { 3.1, 3.2, 3.3, 3.4, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7 };
+        test_result(X3b, 4, 7.1 / 2, 4, "3b");
+        samples_t X4a = { 3.1, 3.2, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.9, 4.0 };
+        test_result(X4a, 4, 6.9 / 2, 2, "4a");
+        samples_t X4b = { 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.8, 3.9, 4.0 };
+        test_result(X4b, 4, 7.5 / 2, 7, "4b");
+        samples_t X4c = { 3.1, 3.2, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7 };
+        test_result(X4c, 4, 6.9 / 2, 2, "4c");
     }
 }

typesFImdlp.h

@@ -11,7 +11,7 @@ namespace mdlp {
     typedef vector<int> labels_t;
     typedef vector<size_t> indices_t;
     typedef vector<precision_t> cutPoints_t;
-    typedef map<tuple<int, int>, precision_t> cacheEnt_t;
+    typedef map<pair<int, int>, precision_t> cacheEnt_t;
     typedef map<tuple<int, int, int>, precision_t> cacheIg_t;
 }
 #endif