Mirror of https://github.com/rmontanana/mdlp.git (synced 2025-08-15 15:35:55 +00:00)

Merge pull request #6 from rmontanana/max_cut_points_entropy: Max cut points entropy
CPPFImdlp.cpp

@@ -7,16 +7,18 @@

namespace mdlp {

CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed) : min_length(min_length_),
max_depth(max_depth_),
proposed_cuts(proposed) {
CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed): min_length(min_length_),
max_depth(max_depth_),
proposed_cuts(proposed)
{
}

CPPFImdlp::CPPFImdlp() = default;

CPPFImdlp::~CPPFImdlp() = default;

size_t CPPFImdlp::compute_max_num_cut_points() const {
size_t CPPFImdlp::compute_max_num_cut_points() const
{
// Set the actual maximum number of cut points as a number or as a percentage of the number of samples
if (proposed_cuts == 0) {
return numeric_limits<size_t>::max();
@@ -29,7 +31,8 @@ namespace mdlp {
return static_cast<size_t>(proposed_cuts);
}
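
The hunk above only shows the zero and integer branches of compute_max_num_cut_points(); the fractional branch is hidden by the diff. A minimal sketch of the mapping, inferred from the ProposedCuts test expectations further down, under stated assumptions:

    // Hedged sketch only, not the repository's code: the fractional branch is hidden
    // by the hunk above and is inferred from the ProposedCuts test expectations
    // (20 samples: 0.1 -> 2, 0.5 -> 10, 0.07 -> 1, 1.0 -> 1, 2.0 -> 2). Range checks
    // ("wrong proposed num_cuts value") are assumed to live in fit().
    #include <cstddef>
    #include <limits>

    std::size_t max_num_cut_points(float proposed_cuts, std::size_t n_samples)
    {
        if (proposed_cuts == 0)                      // 0 means no limit on cut points
            return std::numeric_limits<std::size_t>::max();
        if (proposed_cuts < 1)                       // a fraction of the number of samples
            return static_cast<std::size_t>(proposed_cuts * static_cast<float>(n_samples));
        return static_cast<std::size_t>(proposed_cuts);  // an absolute number of cut points
    }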

void CPPFImdlp::fit(samples_t &X_, labels_t &y_) {
void CPPFImdlp::fit(samples_t& X_, labels_t& y_)
{
X = X_;
y = y_;
num_cut_points = compute_max_num_cut_points();
@@ -50,9 +53,17 @@ namespace mdlp {
indices = sortIndices(X_, y_);
metrics.setData(y, indices);
computeCutPoints(0, X.size(), 1);
sort(cutPoints.begin(), cutPoints.end());
if (num_cut_points > 0) {
// Select the best (with lower entropy) cut points
while (cutPoints.size() > num_cut_points) {
resizeCutPoints();
}
}
}
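
For context, a hedged usage sketch of the discretizer with the new cut-point cap; include paths and the dataset location are assumptions based on the tests and the sample program, not part of this commit:

    #include <iostream>
    #include "CPPFImdlp.h"
    #include "ArffFiles.h"

    int main()
    {
        ArffFiles file;
        file.load("tests/datasets/iris.arff", true);   // class attribute in the last column
        auto& X = file.getX();                         // one samples_t per feature
        auto& y = file.getY();                         // factorized labels

        // min_length = 3, max_depth = 1000, keep at most 2 cut points per feature
        mdlp::CPPFImdlp discretizer(3, 1000, 2.0f);
        discretizer.fit(X[0], y);
        for (auto cut : discretizer.getCutPoints())
            std::cout << cut << " ";
        std::cout << std::endl;
        return 0;
    }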

pair<precision_t, size_t> CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end) {
pair<precision_t, size_t> CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end)
{
size_t n;
size_t m;
size_t idxPrev = cut - 1 >= start ? cut - 1 : cut;
@@ -81,14 +92,13 @@ namespace mdlp {
// Decide which values to use
cut = cut + (backWall ? m + 1 : -n);
actual = X[indices[cut]];
return {(actual + previous) / 2, cut};
return { (actual + previous) / 2, cut };
}
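
A worked instance of the returned midpoint, taken from case "1a" of the ValueCutPoint test further down (X1a = {3.1f, ..., 4.0f} with cut = 6): the boundary falls between 3.6 and 3.7, so the function returns

    (3.6 + 3.7) / 2 = 7.3 / 2 = 3.65

which is exactly the midPoint argument 7.3f / 2 that test_result checks for that case.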

void CPPFImdlp::computeCutPoints(size_t start, size_t end, int depth_) {
void CPPFImdlp::computeCutPoints(size_t start, size_t end, int depth_)
{
size_t cut;
pair<precision_t, size_t> result;
if (cutPoints.size() == num_cut_points)
return;
// Check if the interval length and the depth are Ok
if (end - start < min_length || depth_ > max_depth)
return;
@@ -105,7 +115,8 @@ namespace mdlp {
}
}

size_t CPPFImdlp::getCandidate(size_t start, size_t end) {
size_t CPPFImdlp::getCandidate(size_t start, size_t end)
{
/* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which
E(A, TA; S) is minimal amongst all the candidate cut points. */
size_t candidate = numeric_limits<size_t>::max();
@@ -138,7 +149,8 @@ namespace mdlp {
return candidate;
}
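
The E(A, TA; S) mentioned in the Definition 1 comment is Fayyad and Irani's class information entropy of the partition induced by a candidate cut T; in LaTeX form:

    E(A, T; S) = \frac{|S_1|}{|S|}\,\mathrm{Ent}(S_1) + \frac{|S_2|}{|S|}\,\mathrm{Ent}(S_2)

where S_1 and S_2 are the samples on either side of T and Ent is the entropy computed by Metrics::entropy; getCandidate() returns the cut that minimizes this quantity.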

bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end) {
bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end)
{
int k;
int k1;
int k2;
@@ -156,13 +168,14 @@ namespace mdlp {
ent2 = metrics.entropy(cut, end);
ig = metrics.informationGain(start, cut, end);
delta = static_cast<precision_t>(log2(pow(3, precision_t(k)) - 2) -
(precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2));
(precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2));
precision_t term = 1 / N * (log2(N - 1) + delta);
return ig > term;
}
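
Written out, the acceptance test implemented above is the standard MDLP criterion: a cut T splitting S into S_1 and S_2 is accepted only when

    \mathrm{Gain}(A,T;S) > \frac{\log_2(N-1) + \Delta(A,T;S)}{N},
    \qquad
    \Delta(A,T;S) = \log_2(3^k - 2) - \bigl[k\,\mathrm{Ent}(S) - k_1\,\mathrm{Ent}(S_1) - k_2\,\mathrm{Ent}(S_2)\bigr]

with N = |S| and k, k_1, k_2 the number of classes present in S, S_1 and S_2, which is exactly what ig > term evaluates.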

// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
indices_t CPPFImdlp::sortIndices(samples_t &X_, labels_t &y_) {
indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_)
{
indices_t idx(X_.size());
iota(idx.begin(), idx.end(), 0);
stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) {
@@ -170,16 +183,29 @@ namespace mdlp {
return y_[i1] < y_[i2];
else
return X_[i1] < X_[i2];
});
});
return idx;
}
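
A self-contained sketch of the same argsort idiom; the comparison line hidden by the hunk is assumed to test for equal feature values, which is consistent with the SortIndices test where ties on X are ordered by label:

    #include <algorithm>
    #include <numeric>
    #include <vector>
    #include <cstddef>

    // Returns the permutation that sorts X ascending, breaking ties by label y.
    std::vector<std::size_t> argsort(const std::vector<float>& X, const std::vector<int>& y)
    {
        std::vector<std::size_t> idx(X.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::stable_sort(idx.begin(), idx.end(), [&X, &y](std::size_t i1, std::size_t i2) {
            if (X[i1] == X[i2])          // assumed tie-break, as in the unit test
                return y[i1] < y[i2];
            return X[i1] < X[i2];
        });
        return idx;
    }
    // e.g. X = {5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9}, y = {1,1,1,1,1,2,2,2,2,2}
    // yields {4, 3, 6, 8, 2, 1, 5, 0, 9, 7}, matching the SortIndices expectation.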

cutPoints_t CPPFImdlp::getCutPoints() {
sort(cutPoints.begin(), cutPoints.end());
return cutPoints;
}

int CPPFImdlp::get_depth() const {
return depth;
void CPPFImdlp::resizeCutPoints()
{
//Compute entropy of each of the whole cutpoint set and discards the biggest value
precision_t maxEntropy = 0;
precision_t entropy;
size_t maxEntropyIdx = 0;
size_t begin = 0;
size_t end;
for (size_t idx = 0; idx < cutPoints.size(); idx++) {
end = begin;
while (X[indices[end]] < cutPoints[idx] && end < X.size())
end++;
entropy = metrics.entropy(begin, end);
if (entropy > maxEntropy) {
maxEntropy = entropy;
maxEntropyIdx = idx;
}
begin = end;
}
cutPoints.erase(cutPoints.begin() + static_cast<long>(maxEntropyIdx));
}
}
CPPFImdlp.h (17 changed lines)
@@ -24,29 +24,20 @@ namespace mdlp {
static indices_t sortIndices(samples_t&, labels_t&);

void computeCutPoints(size_t, size_t, int);

void resizeCutPoints();
bool mdlp(size_t, size_t, size_t);

size_t getCandidate(size_t, size_t);

size_t compute_max_num_cut_points() const;

pair<precision_t, size_t> valueCutPoint(size_t, size_t, size_t);

public:
CPPFImdlp();

CPPFImdlp(size_t, int, float);

~CPPFImdlp();

void fit(samples_t&, labels_t&);

cutPoints_t getCutPoints();

int get_depth() const;

static inline string version() { return "1.1.1"; };
inline cutPoints_t getCutPoints() const { return cutPoints; };
inline int get_depth() const { return depth; };
static inline string version() { return "1.1.2"; };
};
}
#endif
Metrics.cpp (27 changed lines)
@@ -4,11 +4,13 @@

using namespace std;
namespace mdlp {

Metrics::Metrics(labels_t &y_, indices_t &indices_) : y(y_), indices(indices_),
numClasses(computeNumClasses(0, indices.size())) {
Metrics::Metrics(labels_t& y_, indices_t& indices_): y(y_), indices(indices_),
numClasses(computeNumClasses(0, indices.size()))
{
}

int Metrics::computeNumClasses(size_t start, size_t end) {
int Metrics::computeNumClasses(size_t start, size_t end)
{
set<int> nClasses;
for (auto i = start; i < end; ++i) {
nClasses.insert(y[indices[i]]);
@@ -16,7 +18,8 @@ namespace mdlp {
return static_cast<int>(nClasses.size());
}

void Metrics::setData(const labels_t &y_, const indices_t &indices_) {
void Metrics::setData(const labels_t& y_, const indices_t& indices_)
{
indices = indices_;
y = y_;
numClasses = computeNumClasses(0, indices.size());
@@ -24,21 +27,22 @@ namespace mdlp {
igCache.clear();
}

precision_t Metrics::entropy(size_t start, size_t end) {
precision_t Metrics::entropy(size_t start, size_t end)
{
precision_t p;
precision_t ventropy = 0;
int nElements = 0;
labels_t counts(numClasses + 1, 0);
if (end - start < 2)
return 0;
if (entropyCache.find({start, end}) != entropyCache.end()) {
if (entropyCache.find({ start, end }) != entropyCache.end()) {
return entropyCache[{start, end}];
}
for (auto i = &indices[start]; i != &indices[end]; ++i) {
counts[y[*i]]++;
nElements++;
}
for (auto count: counts) {
for (auto count : counts) {
if (count > 0) {
p = static_cast<precision_t>(count) / static_cast<precision_t>(nElements);
ventropy -= p * log2(p);
@@ -48,7 +52,8 @@ namespace mdlp {
return ventropy;
}
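
The loop above computes the Shannon entropy of the class labels in the half-open range [start, end) of the sorted sample, cached under the (start, end) key:

    \mathrm{Ent}(S) = -\sum_{c} p_c \log_2 p_c, \qquad p_c = \frac{\#\{\text{samples of class } c \text{ in } S\}}{|S|}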

precision_t Metrics::informationGain(size_t start, size_t cut, size_t end) {
precision_t Metrics::informationGain(size_t start, size_t cut, size_t end)
{
precision_t iGain;
precision_t entropyInterval;
precision_t entropyLeft;
@@ -63,9 +68,9 @@ namespace mdlp {
entropyLeft = entropy(start, cut);
entropyRight = entropy(cut, end);
iGain = entropyInterval -
(static_cast<precision_t>(nElementsLeft) * entropyLeft +
static_cast<precision_t>(nElementsRight) * entropyRight) /
static_cast<precision_t>(nElements);
(static_cast<precision_t>(nElementsLeft) * entropyLeft +
static_cast<precision_t>(nElementsRight) * entropyRight) /
static_cast<precision_t>(nElements);
igCache[make_tuple(start, cut, end)] = iGain;
return iGain;
}
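
Equivalently, with S_1 = [start, cut) and S_2 = [cut, end):

    \mathrm{Gain}(A,T;S) = \mathrm{Ent}(S) - \frac{|S_1|\,\mathrm{Ent}(S_1) + |S_2|\,\mathrm{Ent}(S_2)}{|S|}

that is, Ent(S) minus the class information entropy E(A, T; S) used by getCandidate().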
Metrics.h (12 changed lines)
@@ -6,20 +6,16 @@
namespace mdlp {
class Metrics {
protected:
labels_t &y;
indices_t &indices;
labels_t& y;
indices_t& indices;
int numClasses;
cacheEnt_t entropyCache = cacheEnt_t();
cacheIg_t igCache = cacheIg_t();
public:
Metrics(labels_t &, indices_t &);

void setData(const labels_t &, const indices_t &);

Metrics(labels_t&, indices_t&);
void setData(const labels_t&, const indices_t&);
int computeNumClasses(size_t, size_t);

precision_t entropy(size_t, size_t);

precision_t informationGain(size_t, size_t, size_t);
};
}
@@ -1,4 +1,5 @@

set(CMAKE_CXX_STANDARD 11)

set(CMAKE_BUILD_TYPE Debug)

add_executable(sample sample.cpp ../tests/ArffFiles.cpp ../Metrics.cpp ../CPPFImdlp.cpp)
@@ -113,14 +113,18 @@ void process_file(const string &path, const string &file_name, bool class_last,
size_t total = 0;
for (auto i = 0; i < attributes.size(); i++) {
auto min_max = minmax_element(X[i].begin(), X[i].end());
cout << "Cut points for " << get<0>(attributes[i]) << endl;
cout << "Min: " << *min_max.first << " Max: " << *min_max.second << endl;
cout << "--------------------------" << setprecision(3) << endl;
cout << "Cut points for feature " << get<0>(attributes[i]) << ": [" << setprecision(3);
test.fit(X[i], y);
for (auto item: test.getCutPoints()) {
cout << item << endl;
auto cut_points = test.getCutPoints();
for (auto item: cut_points) {
cout << item;
if (item != cut_points.back())
cout << ", ";
}
total += test.getCutPoints().size();
cout << "]" << endl;
cout << "Min: " << *min_max.first << " Max: " << *min_max.second << endl;
cout << "--------------------------" << endl;
}
cout << "Total cut points ...: " << total << endl;
cout << "Total feature states: " << total + attributes.size() << endl;

@@ -7,35 +7,43 @@ using namespace std;

ArffFiles::ArffFiles() = default;

vector<string> ArffFiles::getLines() const {
vector<string> ArffFiles::getLines() const
{
return lines;
}

unsigned long int ArffFiles::getSize() const {
unsigned long int ArffFiles::getSize() const
{
return lines.size();
}

vector<pair<string, string>> ArffFiles::getAttributes() const {
vector<pair<string, string>> ArffFiles::getAttributes() const
{
return attributes;
}

string ArffFiles::getClassName() const {
string ArffFiles::getClassName() const
{
return className;
}

string ArffFiles::getClassType() const {
string ArffFiles::getClassType() const
{
return classType;
}

vector<mdlp::samples_t> &ArffFiles::getX() {
vector<mdlp::samples_t>& ArffFiles::getX()
{
return X;
}

vector<int> &ArffFiles::getY() {
vector<int>& ArffFiles::getY()
{
return y;
}

void ArffFiles::load(const string &fileName, bool classLast) {
void ArffFiles::load(const string& fileName, bool classLast)
{
ifstream file(fileName);
if (!file.is_open()) {
throw invalid_argument("Unable to open file");
@@ -79,7 +87,8 @@ void ArffFiles::load(const string &fileName, bool classLast) {

}

void ArffFiles::generateDataset(bool classLast) {
void ArffFiles::generateDataset(bool classLast)
{
X = vector<mdlp::samples_t>(attributes.size(), mdlp::samples_t(lines.size()));
auto yy = vector<string>(lines.size(), "");
int labelIndex = classLast ? static_cast<int>(attributes.size()) : 0;
@@ -99,19 +108,21 @@ void ArffFiles::generateDataset(bool classLast) {
y = factorize(yy);
}

string ArffFiles::trim(const string &source) {
string ArffFiles::trim(const string& source)
{
string s(source);
s.erase(0, s.find_first_not_of(" \n\r\t"));
s.erase(s.find_last_not_of(" \n\r\t") + 1);
return s;
}

vector<int> ArffFiles::factorize(const vector<string> &labels_t) {
vector<int> ArffFiles::factorize(const vector<string>& labels_t)
{
vector<int> yy;
yy.reserve(labels_t.size());
map<string, int> labelMap;
int i = 0;
for (const string &label: labels_t) {
for (const string& label : labels_t) {
if (labelMap.find(label) == labelMap.end()) {
labelMap[label] = i++;
}
@@ -20,26 +20,16 @@ private:

public:
ArffFiles();

void load(const string &, bool = true);

void load(const string&, bool = true);
vector<string> getLines() const;

unsigned long int getSize() const;

string getClassName() const;

string getClassType() const;

static string trim(const string &);

vector<mdlp::samples_t> &getX();

vector<int> &getY();

static string trim(const string&);
vector<mdlp::samples_t>& getX();
vector<int>& getY();
vector<pair<string, string>> getAttributes() const;

static vector<int> factorize(const vector<string> &labels_t);
static vector<int> factorize(const vector<string>& labels_t);
};

#endif
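
A short, hedged sketch of how this loader is driven (the dataset path is an assumption; only the accessors declared above are used):

    #include <iostream>
    #include "ArffFiles.h"

    int main()
    {
        ArffFiles file;
        file.load("datasets/iris.arff", true);   // true: class attribute is the last one
        std::cout << "class: " << file.getClassName()
                  << ", rows: " << file.getSize()
                  << ", attributes: " << file.getAttributes().size() << std::endl;
        // Labels are factorized in order of first appearance, so a class column such as
        // {"setosa", "versicolor", "setosa"} would come back from getY() as {0, 1, 0}.
        return 0;
    }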

@@ -15,23 +15,25 @@ throw; \
, etype)

namespace mdlp {
class TestFImdlp : public CPPFImdlp, public testing::Test {
class TestFImdlp: public CPPFImdlp, public testing::Test {
public:
precision_t precision = 0.000001f;

TestFImdlp() : CPPFImdlp() {}
TestFImdlp(): CPPFImdlp() {}

string data_path;

void SetUp() override {
X = {4.7f, 4.7f, 4.7f, 4.7f, 4.8f, 4.8f, 4.8f, 4.8f, 4.9f, 4.95f, 5.7f, 5.3f, 5.2f, 5.1f, 5.0f, 5.6f, 5.1f,
6.0f, 5.1f, 5.9f};
y = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2};
void SetUp() override
{
X = { 4.7f, 4.7f, 4.7f, 4.7f, 4.8f, 4.8f, 4.8f, 4.8f, 4.9f, 4.95f, 5.7f, 5.3f, 5.2f, 5.1f, 5.0f, 5.6f, 5.1f,
6.0f, 5.1f, 5.9f };
y = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
fit(X, y);
data_path = set_data_path();
}

static string set_data_path() {
static string set_data_path()
{
string path = "../datasets/";
ifstream file(path + "iris.arff");
if (file.is_open()) {
@@ -41,7 +43,8 @@ namespace mdlp {
return "../../tests/datasets/";
}

void checkSortedVector() {
void checkSortedVector()
{
indices_t testSortedIndices = sortIndices(X, y);
precision_t prev = X[testSortedIndices[0]];
for (unsigned long i = 0; i < X.size(); ++i) {
@@ -51,7 +54,8 @@ namespace mdlp {
}
}

void checkCutPoints(cutPoints_t &computed, cutPoints_t &expected) const {
void checkCutPoints(cutPoints_t& computed, cutPoints_t& expected) const
{
EXPECT_EQ(computed.size(), expected.size());
for (unsigned long i = 0; i < computed.size(); i++) {
cout << "(" << computed[i] << ", " << expected[i] << ") ";
@@ -59,9 +63,10 @@ namespace mdlp {
}
}

bool test_result(const samples_t &X_, size_t cut, float midPoint, size_t limit, const string &title) {
bool test_result(const samples_t& X_, size_t cut, float midPoint, size_t limit, const string& title)
{
pair<precision_t, size_t> result;
labels_t y_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
labels_t y_ = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
X = X_;
y = y_;
indices = sortIndices(X, y);
@@ -72,12 +77,13 @@ namespace mdlp {
return true;
}

void test_dataset(CPPFImdlp &test, const string &filename, vector<cutPoints_t> &expected,
vector<int> &depths) const {
void test_dataset(CPPFImdlp& test, const string& filename, vector<cutPoints_t>& expected,
vector<int>& depths) const
{
ArffFiles file;
file.load(data_path + filename + ".arff", true);
vector<samples_t> &X = file.getX();
labels_t &y = file.getY();
vector<samples_t>& X = file.getX();
labels_t& y = file.getY();
auto attributes = file.getAttributes();
for (auto feature = 0; feature < attributes.size(); feature++) {
test.fit(X[feature], y);
@@ -90,92 +96,100 @@ namespace mdlp {
}
};

TEST_F(TestFImdlp, FitErrorEmptyDataset) {
TEST_F(TestFImdlp, FitErrorEmptyDataset)
{
X = samples_t();
y = labels_t();
EXPECT_THROW_WITH_MESSAGE(fit(X, y), invalid_argument, "X and y must have at least one element");
}

TEST_F(TestFImdlp, FitErrorDifferentSize) {
X = {1, 2, 3};
y = {1, 2};
TEST_F(TestFImdlp, FitErrorDifferentSize)
{
X = { 1, 2, 3 };
y = { 1, 2 };
EXPECT_THROW_WITH_MESSAGE(fit(X, y), invalid_argument, "X and y must have the same size");
}

TEST_F(TestFImdlp, FitErrorMinLengtMaxDepth) {
TEST_F(TestFImdlp, FitErrorMinLengtMaxDepth)
{
auto testLength = CPPFImdlp(2, 10, 0);
auto testDepth = CPPFImdlp(3, 0, 0);
X = {1, 2, 3};
y = {1, 2, 3};
X = { 1, 2, 3 };
y = { 1, 2, 3 };
EXPECT_THROW_WITH_MESSAGE(testLength.fit(X, y), invalid_argument, "min_length must be greater than 2");
EXPECT_THROW_WITH_MESSAGE(testDepth.fit(X, y), invalid_argument, "max_depth must be greater than 0");
}

TEST_F(TestFImdlp, JoinFit) {
samples_t X_ = {1, 2, 2, 3, 4, 2, 3};
labels_t y_ = {0, 0, 1, 2, 3, 4, 5};
cutPoints_t expected = {1.5f, 2.5f};
TEST_F(TestFImdlp, JoinFit)
{
samples_t X_ = { 1, 2, 2, 3, 4, 2, 3 };
labels_t y_ = { 0, 0, 1, 2, 3, 4, 5 };
cutPoints_t expected = { 1.5f, 2.5f };
fit(X_, y_);
auto computed = getCutPoints();
EXPECT_EQ(computed.size(), expected.size());
checkCutPoints(computed, expected);
}

TEST_F(TestFImdlp, FitErrorMaxCutPoints) {
TEST_F(TestFImdlp, FitErrorMaxCutPoints)
{
auto testmin = CPPFImdlp(2, 10, -1);
auto testmax = CPPFImdlp(3, 0, 200);
X = {1, 2, 3};
y = {1, 2, 3};
X = { 1, 2, 3 };
y = { 1, 2, 3 };
EXPECT_THROW_WITH_MESSAGE(testmin.fit(X, y), invalid_argument, "wrong proposed num_cuts value");
EXPECT_THROW_WITH_MESSAGE(testmax.fit(X, y), invalid_argument, "wrong proposed num_cuts value");
}

TEST_F(TestFImdlp, SortIndices) {
X = {5.7f, 5.3f, 5.2f, 5.1f, 5.0f, 5.6f, 5.1f, 6.0f, 5.1f, 5.9f};
y = {1, 1, 1, 1, 1, 2, 2, 2, 2, 2};
indices = {4, 3, 6, 8, 2, 1, 5, 0, 9, 7};
TEST_F(TestFImdlp, SortIndices)
{
X = { 5.7f, 5.3f, 5.2f, 5.1f, 5.0f, 5.6f, 5.1f, 6.0f, 5.1f, 5.9f };
y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
indices = { 4, 3, 6, 8, 2, 1, 5, 0, 9, 7 };
checkSortedVector();
X = {5.77f, 5.88f, 5.99f};
y = {1, 2, 1};
indices = {0, 1, 2};
X = { 5.77f, 5.88f, 5.99f };
y = { 1, 2, 1 };
indices = { 0, 1, 2 };
checkSortedVector();
X = {5.33f, 5.22f, 5.11f};
y = {1, 2, 1};
indices = {2, 1, 0};
X = { 5.33f, 5.22f, 5.11f };
y = { 1, 2, 1 };
indices = { 2, 1, 0 };
checkSortedVector();
X = {5.33f, 5.22f, 5.33f};
y = {2, 2, 1};
indices = {1, 2, 0};
X = { 5.33f, 5.22f, 5.33f };
y = { 2, 2, 1 };
indices = { 1, 2, 0 };
}

TEST_F(TestFImdlp, TestShortDatasets) {
TEST_F(TestFImdlp, TestShortDatasets)
{
vector<precision_t> computed;
X = {1};
y = {1};
X = { 1 };
y = { 1 };
fit(X, y);
computed = getCutPoints();
EXPECT_EQ(computed.size(), 0);
X = {1, 3};
y = {1, 2};
X = { 1, 3 };
y = { 1, 2 };
fit(X, y);
computed = getCutPoints();
EXPECT_EQ(computed.size(), 0);
X = {2, 4};
y = {1, 2};
X = { 2, 4 };
y = { 1, 2 };
fit(X, y);
computed = getCutPoints();
EXPECT_EQ(computed.size(), 0);
X = {1, 2, 3};
y = {1, 2, 2};
X = { 1, 2, 3 };
y = { 1, 2, 2 };
fit(X, y);
computed = getCutPoints();
EXPECT_EQ(computed.size(), 1);
EXPECT_NEAR(computed[0], 1.5, precision);
}

TEST_F(TestFImdlp, TestArtificialDataset) {
TEST_F(TestFImdlp, TestArtificialDataset)
{
fit(X, y);
cutPoints_t expected = {5.05f};
cutPoints_t expected = { 5.05f };
vector<precision_t> computed = getCutPoints();
EXPECT_EQ(computed.size(), expected.size());
for (unsigned long i = 0; i < computed.size(); i++) {
@@ -183,49 +197,53 @@ namespace mdlp {
}
}

TEST_F(TestFImdlp, TestIris) {
TEST_F(TestFImdlp, TestIris)
{
vector<cutPoints_t> expected = {
{5.45f, 5.75f},
{2.75f, 2.85f, 2.95f, 3.05f, 3.35f},
{2.45f, 4.75f, 5.05f},
{0.8f, 1.75f}
};
vector<int> depths = {3, 5, 4, 3};
vector<int> depths = { 3, 5, 4, 3 };
auto test = CPPFImdlp();
test_dataset(test, "iris", expected, depths);
}

TEST_F(TestFImdlp, ComputeCutPointsGCase) {
TEST_F(TestFImdlp, ComputeCutPointsGCase)
{
cutPoints_t expected;
expected = {1.5};
samples_t X_ = {0, 1, 2, 2, 2};
labels_t y_ = {1, 1, 1, 2, 2};
expected = { 1.5 };
samples_t X_ = { 0, 1, 2, 2, 2 };
labels_t y_ = { 1, 1, 1, 2, 2 };
fit(X_, y_);
auto computed = getCutPoints();
checkCutPoints(computed, expected);
}

TEST_F(TestFImdlp, ValueCutPoint) {
TEST_F(TestFImdlp, ValueCutPoint)
{
// Case titles as stated in the doc
samples_t X1a{3.1f, 3.2f, 3.3f, 3.4f, 3.5f, 3.6f, 3.7f, 3.8f, 3.9f, 4.0f};
samples_t X1a{ 3.1f, 3.2f, 3.3f, 3.4f, 3.5f, 3.6f, 3.7f, 3.8f, 3.9f, 4.0f };
test_result(X1a, 6, 7.3f / 2, 6, "1a");
samples_t X2a = {3.1f, 3.2f, 3.3f, 3.4f, 3.7f, 3.7f, 3.7f, 3.8f, 3.9f, 4.0f};
samples_t X2a = { 3.1f, 3.2f, 3.3f, 3.4f, 3.7f, 3.7f, 3.7f, 3.8f, 3.9f, 4.0f };
test_result(X2a, 6, 7.1f / 2, 4, "2a");
samples_t X2b = {3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.8f, 3.9f, 4.0f};
samples_t X2b = { 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.8f, 3.9f, 4.0f };
test_result(X2b, 6, 7.5f / 2, 7, "2b");
samples_t X3a = {3.f, 3.2f, 3.3f, 3.4f, 3.7f, 3.7f, 3.7f, 3.8f, 3.9f, 4.0f};
samples_t X3a = { 3.f, 3.2f, 3.3f, 3.4f, 3.7f, 3.7f, 3.7f, 3.8f, 3.9f, 4.0f };
test_result(X3a, 4, 7.1f / 2, 4, "3a");
samples_t X3b = {3.1f, 3.2f, 3.3f, 3.4f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f};
samples_t X3b = { 3.1f, 3.2f, 3.3f, 3.4f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f };
test_result(X3b, 4, 7.1f / 2, 4, "3b");
samples_t X4a = {3.1f, 3.2f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.9f, 4.0f};
samples_t X4a = { 3.1f, 3.2f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.9f, 4.0f };
test_result(X4a, 4, 6.9f / 2, 2, "4a");
samples_t X4b = {3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.8f, 3.9f, 4.0f};
samples_t X4b = { 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.8f, 3.9f, 4.0f };
test_result(X4b, 4, 7.5f / 2, 7, "4b");
samples_t X4c = {3.1f, 3.2f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f};
samples_t X4c = { 3.1f, 3.2f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f, 3.7f };
test_result(X4c, 4, 6.9f / 2, 2, "4c");
}

TEST_F(TestFImdlp, MaxDepth) {
TEST_F(TestFImdlp, MaxDepth)
{
// Set max_depth to 1
auto test = CPPFImdlp(3, 1, 0);
vector<cutPoints_t> expected = {
@@ -234,11 +252,12 @@ namespace mdlp {
{2.45f},
{0.8f}
};
vector<int> depths = {1, 1, 1, 1};
vector<int> depths = { 1, 1, 1, 1 };
test_dataset(test, "iris", expected, depths);
}

TEST_F(TestFImdlp, MinLength) {
TEST_F(TestFImdlp, MinLength)
{
auto test = CPPFImdlp(75, 100, 0);
// Set min_length to 75
vector<cutPoints_t> expected = {
@@ -247,11 +266,12 @@ namespace mdlp {
{2.45f, 4.75f},
{0.8f, 1.75f}
};
vector<int> depths = {3, 2, 2, 2};
vector<int> depths = { 3, 2, 2, 2 };
test_dataset(test, "iris", expected, depths);
}

TEST_F(TestFImdlp, MinLengthMaxDepth) {
TEST_F(TestFImdlp, MinLengthMaxDepth)
{
// Set min_length to 75
auto test = CPPFImdlp(75, 2, 0);
vector<cutPoints_t> expected = {
@@ -260,24 +280,27 @@ namespace mdlp {
{2.45f, 4.75f},
{0.8f, 1.75f}
};
vector<int> depths = {2, 2, 2, 2};
vector<int> depths = { 2, 2, 2, 2 };
test_dataset(test, "iris", expected, depths);
}

TEST_F(TestFImdlp, MaxCutPointsInteger) {
TEST_F(TestFImdlp, MaxCutPointsInteger)
{
// Set min_length to 75
auto test = CPPFImdlp(75, 2, 1);
vector<cutPoints_t> expected = {
{5.45f},
{3.35f},
{2.85f},
{2.45f},
{0.8f}
};
vector<int> depths = {1, 1, 1, 1};
vector<int> depths = { 2, 2, 2, 2 };
test_dataset(test, "iris", expected, depths);

}

TEST_F(TestFImdlp, MaxCutPointsFloat) {
TEST_F(TestFImdlp, MaxCutPointsFloat)
{
// Set min_length to 75
auto test = CPPFImdlp(75, 2, 0.2f);
vector<cutPoints_t> expected = {
@@ -286,19 +309,20 @@ namespace mdlp {
{2.45f, 4.75f},
{0.8f, 1.75f}
};
vector<int> depths = {2, 2, 2, 2};
vector<int> depths = { 2, 2, 2, 2 };
test_dataset(test, "iris", expected, depths);
}

TEST_F(TestFImdlp, ProposedCuts) {
vector<pair<float, size_t>> proposed_list = {{0.1f, 2},
TEST_F(TestFImdlp, ProposedCuts)
{
vector<pair<float, size_t>> proposed_list = { {0.1f, 2},
{0.5f, 10},
{0.07f, 1},
{1.0f, 1},
{2.0f, 2}};
{2.0f, 2} };
size_t expected;
size_t computed;
for (auto proposed_item: proposed_list) {
for (auto proposed_item : proposed_list) {
tie(proposed_cuts, expected) = proposed_item;
computed = compute_max_num_cut_points();
ASSERT_EQ(expected, computed);
@@ -2,46 +2,51 @@
#include "../Metrics.h"

namespace mdlp {
class TestMetrics : public Metrics, public testing::Test {
class TestMetrics: public Metrics, public testing::Test {
public:
labels_t y_ = {1, 1, 1, 1, 1, 2, 2, 2, 2, 2};
indices_t indices_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
labels_t y_ = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
indices_t indices_ = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
precision_t precision = 0.000001f;

TestMetrics() : Metrics(y_, indices_) {};
TestMetrics(): Metrics(y_, indices_) {};

void SetUp() override {
void SetUp() override
{
setData(y_, indices_);
}
};

TEST_F(TestMetrics, NumClasses) {
y = {1, 1, 1, 1, 1, 1, 1, 1, 2, 1};
TEST_F(TestMetrics, NumClasses)
{
y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
EXPECT_EQ(1, computeNumClasses(4, 8));
EXPECT_EQ(2, computeNumClasses(0, 10));
EXPECT_EQ(2, computeNumClasses(8, 10));
}

TEST_F(TestMetrics, Entropy) {
TEST_F(TestMetrics, Entropy)
{
EXPECT_EQ(1, entropy(0, 10));
EXPECT_EQ(0, entropy(0, 5));
y = {1, 1, 1, 1, 1, 1, 1, 1, 2, 1};
y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
setData(y, indices);
ASSERT_NEAR(0.468996f, entropy(0, 10), precision);
}

TEST_F(TestMetrics, EntropyDouble) {
y = {0, 0, 1, 2, 3};
samples_t expected_entropies = {0.0, 0.0, 0.91829583, 1.5, 1.4575424759098898};
TEST_F(TestMetrics, EntropyDouble)
{
y = { 0, 0, 1, 2, 3 };
samples_t expected_entropies = { 0.0, 0.0, 0.91829583, 1.5, 1.4575424759098898 };
for (auto idx = 0; idx < y.size(); ++idx) {
ASSERT_NEAR(expected_entropies[idx], entropy(0, idx + 1), precision);
}
}
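
The third expected value can be checked by hand: the prefix {0, 0, 1} has class probabilities 2/3 and 1/3, so

    \mathrm{Ent} = -\tfrac{2}{3}\log_2\tfrac{2}{3} - \tfrac{1}{3}\log_2\tfrac{1}{3} \approx 0.918296

which matches expected_entropies[2].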

TEST_F(TestMetrics, InformationGain) {
TEST_F(TestMetrics, InformationGain)
{
ASSERT_NEAR(1, informationGain(0, 5, 10), precision);
ASSERT_NEAR(1, informationGain(0, 5, 10), precision); // For cache
y = {1, 1, 1, 1, 1, 1, 1, 1, 2, 1};
y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
setData(y, indices);
ASSERT_NEAR(0.108032f, informationGain(0, 5, 10), precision);
}
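
The first assertion follows directly from the information gain formula: the fixture labels are five 1s followed by five 2s, so Ent(S) = 1 and both halves are pure, giving

    \mathrm{Gain} = 1 - \frac{5 \cdot 0 + 5 \cdot 0}{10} = 1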