From cfade7a556b7a5c8b050737e7134c517d9ddca28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sun, 19 Mar 2023 19:13:37 +0100 Subject: [PATCH] Remove unneeded loop in sortIndices Add some static casts --- CMakeLists.txt | 4 ++ CPPFImdlp.cpp | 74 ++++++++++++++------------ CPPFImdlp.h | 20 +++++-- Metrics.cpp | 46 +++++++++------- Metrics.h | 18 ++++--- sample/sample.cpp | 26 ++++----- tests/ArffFiles.cpp | 124 +++++++++++++++++++++++-------------------- tests/ArffFiles.h | 39 ++++++++++---- tests/CMakeLists.txt | 5 +- 9 files changed, 210 insertions(+), 146 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a711c93..cabfc07 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,10 @@ cmake_minimum_required(VERSION 3.20) project(mdlp) +if (POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif () + set(CMAKE_CXX_STANDARD 11) add_library(mdlp CPPFImdlp.cpp Metrics.cpp sample/sample.cpp) diff --git a/CPPFImdlp.cpp b/CPPFImdlp.cpp index 0f73173..50cdc48 100644 --- a/CPPFImdlp.cpp +++ b/CPPFImdlp.cpp @@ -5,20 +5,22 @@ #include #include "CPPFImdlp.h" #include "Metrics.h" + namespace mdlp { - CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed): min_length(min_length_), - max_depth(max_depth_), proposed_cuts(proposed) - { + CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed) : min_length(min_length_), + max_depth(max_depth_), + proposed_cuts(proposed) { } + CPPFImdlp::CPPFImdlp() = default; + CPPFImdlp::~CPPFImdlp() = default; - size_t CPPFImdlp::compute_max_num_cut_points() const - { + size_t CPPFImdlp::compute_max_num_cut_points() const { // Set the actual maximum number of cut points as a number or as a percentage of the number of samples if (proposed_cuts == 0) { - return numeric_limits::max(); + return numeric_limits::max(); } if (proposed_cuts < 0 || proposed_cuts > static_cast(X.size())) { throw invalid_argument("wrong proposed num_cuts value"); @@ -28,8 +30,7 @@ namespace mdlp { return static_cast(proposed_cuts); } - void CPPFImdlp::fit(samples_t& X_, labels_t& y_) - { + void CPPFImdlp::fit(samples_t &X_, labels_t &y_) { X = X_; y = y_; num_cut_points = compute_max_num_cut_points(); @@ -52,12 +53,15 @@ namespace mdlp { computeCutPoints(0, X.size(), 1); } - pair CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end) - { - size_t n, m, idxPrev = cut - 1 >= start ? cut - 1 : cut; + pair CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end) { + size_t n; + size_t m; + size_t idxPrev = cut - 1 >= start ? cut - 1 : cut; size_t idxNext = cut + 1 < end ? cut + 1 : cut; bool backWall; // true if duplicates reach begining of the interval - precision_t previous, actual, next; + precision_t previous; + precision_t actual; + precision_t next; previous = X[indices[idxPrev]]; actual = X[indices[cut]]; next = X[indices[idxNext]]; @@ -78,11 +82,10 @@ namespace mdlp { // Decide which values to use cut = cut + (backWall ? m + 1 : -n); actual = X[indices[cut]]; - return { (actual + previous) / 2, cut }; + return {(actual + previous) / 2, cut}; } - void CPPFImdlp::computeCutPoints(size_t start, size_t end, int depth_) - { + void CPPFImdlp::computeCutPoints(size_t start, size_t end, int depth_) { size_t cut; pair result; if (cutPoints.size() == num_cut_points) @@ -103,13 +106,15 @@ namespace mdlp { } } - size_t CPPFImdlp::getCandidate(size_t start, size_t end) - { + size_t CPPFImdlp::getCandidate(size_t start, size_t end) { /* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which E(A, TA; S) is minimal amongst all the candidate cut points. */ - size_t candidate = numeric_limits::max(), elements = end - start; + size_t candidate = numeric_limits::max(); + size_t elements = end - start; bool sameValues = true; - precision_t entropy_left, entropy_right, minEntropy; + precision_t entropy_left; + precision_t entropy_right; + precision_t minEntropy; // Check if all the values of the variable in the interval are the same for (size_t idx = start + 1; idx < end; idx++) { if (X[indices[idx]] != X[indices[start]]) { @@ -134,11 +139,15 @@ namespace mdlp { return candidate; } - bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end) - { - int k, k1, k2; - precision_t ig, delta; - precision_t ent, ent1, ent2; + bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end) { + int k; + int k1; + int k2; + precision_t ig; + precision_t delta; + precision_t ent; + precision_t ent1; + precision_t ent2; auto N = precision_t(end - start); k = metrics.computeNumClasses(start, end); k1 = metrics.computeNumClasses(start, cut); @@ -148,33 +157,30 @@ namespace mdlp { ent2 = metrics.entropy(cut, end); ig = metrics.informationGain(start, cut, end); delta = static_cast(log2(pow(3, precision_t(k)) - 2) - - (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2)); + (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2)); precision_t term = 1 / N * (log2(N - 1) + delta); return ig > term; } // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes - indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_) - { + indices_t CPPFImdlp::sortIndices(samples_t &X_, labels_t &y_) { indices_t idx(X_.size()); iota(idx.begin(), idx.end(), 0); - for (size_t i = 0; i < X_.size(); i++) - stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) { + stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) { if (X_[i1] == X_[i2]) return y_[i1] < y_[i2]; else return X_[i1] < X_[i2]; - }); + }); return idx; } - cutPoints_t CPPFImdlp::getCutPoints() - { + cutPoints_t CPPFImdlp::getCutPoints() { sort(cutPoints.begin(), cutPoints.end()); return cutPoints; } - int CPPFImdlp::get_depth() const - { + + int CPPFImdlp::get_depth() const { return depth; } } diff --git a/CPPFImdlp.h b/CPPFImdlp.h index 574d8cb..0c0311e 100644 --- a/CPPFImdlp.h +++ b/CPPFImdlp.h @@ -1,9 +1,11 @@ #ifndef CPPFIMDLP_H #define CPPFIMDLP_H + #include "typesFImdlp.h" #include "Metrics.h" #include #include + namespace mdlp { class CPPFImdlp { protected: @@ -18,20 +20,32 @@ namespace mdlp { cutPoints_t cutPoints; size_t num_cut_points = numeric_limits::max(); - static indices_t sortIndices(samples_t&, labels_t&); + static indices_t sortIndices(samples_t &, labels_t &); + void computeCutPoints(size_t, size_t, int); + bool mdlp(size_t, size_t, size_t); + size_t getCandidate(size_t, size_t); + size_t compute_max_num_cut_points() const; + pair valueCutPoint(size_t, size_t, size_t); + public: CPPFImdlp(); + CPPFImdlp(size_t, int, float); + ~CPPFImdlp(); - void fit(samples_t&, labels_t&); + + void fit(samples_t &, labels_t &); + cutPoints_t getCutPoints(); + int get_depth() const; - inline string version() { return "1.1.1"; }; + + inline string version() const { return "1.1.1"; }; }; } #endif diff --git a/Metrics.cpp b/Metrics.cpp index 766e508..c77da01 100644 --- a/Metrics.cpp +++ b/Metrics.cpp @@ -1,63 +1,71 @@ #include "Metrics.h" #include #include + using namespace std; namespace mdlp { - Metrics::Metrics(labels_t& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t()) - { + Metrics::Metrics(labels_t &y_, indices_t &indices_) : y(y_), indices(indices_), + numClasses(computeNumClasses(0, indices.size())) { } - int Metrics::computeNumClasses(size_t start, size_t end) - { + + int Metrics::computeNumClasses(size_t start, size_t end) { set nClasses; for (auto i = start; i < end; ++i) { nClasses.insert(y[indices[i]]); } - return nClasses.size(); + return static_cast(nClasses.size()); } - void Metrics::setData(labels_t& y_, indices_t& indices_) - { + + void Metrics::setData(const labels_t &y_, const indices_t &indices_) { indices = indices_; y = y_; numClasses = computeNumClasses(0, indices.size()); entropyCache.clear(); igCache.clear(); } - precision_t Metrics::entropy(size_t start, size_t end) - { - precision_t p, ventropy = 0; + + precision_t Metrics::entropy(size_t start, size_t end) { + precision_t p; + precision_t ventropy = 0; int nElements = 0; labels_t counts(numClasses + 1, 0); if (end - start < 2) return 0; - if (entropyCache.find({ start, end }) != entropyCache.end()) { + if (entropyCache.find({start, end}) != entropyCache.end()) { return entropyCache[{start, end}]; } for (auto i = &indices[start]; i != &indices[end]; ++i) { counts[y[*i]]++; nElements++; } - for (auto count : counts) { + for (auto count: counts) { if (count > 0) { - p = (precision_t)count / nElements; + p = static_cast(count) / static_cast(nElements); ventropy -= p * log2(p); } } entropyCache[{start, end}] = ventropy; return ventropy; } - precision_t Metrics::informationGain(size_t start, size_t cut, size_t end) - { + + precision_t Metrics::informationGain(size_t start, size_t cut, size_t end) { precision_t iGain; - precision_t entropyInterval, entropyLeft, entropyRight; - int nElementsLeft = cut - start, nElementsRight = end - cut; - int nElements = end - start; + precision_t entropyInterval; + precision_t entropyLeft; + precision_t entropyRight; + size_t nElementsLeft = cut - start; + size_t nElementsRight = end - cut; + size_t nElements = end - start; if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) { return igCache[make_tuple(start, cut, end)]; } entropyInterval = entropy(start, end); entropyLeft = entropy(start, cut); entropyRight = entropy(cut, end); - iGain = entropyInterval - ((precision_t)nElementsLeft * entropyLeft + (precision_t)nElementsRight * entropyRight) / nElements; + iGain = entropyInterval - + (static_cast(nElementsLeft) * entropyLeft + + static_cast(nElementsRight) * entropyRight) / + static_cast(nElements); igCache[make_tuple(start, cut, end)] = iGain; return iGain; } diff --git a/Metrics.h b/Metrics.h index a371502..4046a6d 100644 --- a/Metrics.h +++ b/Metrics.h @@ -1,19 +1,25 @@ #ifndef CCMETRICS_H #define CCMETRICS_H + #include "typesFImdlp.h" + namespace mdlp { class Metrics { protected: - labels_t& y; - indices_t& indices; + labels_t &y; + indices_t &indices; int numClasses; - cacheEnt_t entropyCache; - cacheIg_t igCache; + cacheEnt_t entropyCache = cacheEnt_t(); + cacheIg_t igCache = cacheIg_t(); public: - Metrics(labels_t&, indices_t&); - void setData(labels_t&, indices_t&); + Metrics(labels_t &, indices_t &); + + void setData(const labels_t &, const indices_t &); + int computeNumClasses(size_t, size_t); + precision_t entropy(size_t, size_t); + precision_t informationGain(size_t, size_t, size_t); }; } diff --git a/sample/sample.cpp b/sample/sample.cpp index cd2bbde..49d5d91 100644 --- a/sample/sample.cpp +++ b/sample/sample.cpp @@ -38,17 +38,17 @@ tuple parse_arguments(int argc, char **argv) { int max_depth = numeric_limits::max(); int min_length = 3; float max_cutpoints = 0; - static struct option long_options[] = { + const option long_options[] = { {"help", no_argument, nullptr, 'h'}, {"file", required_argument, nullptr, 'f'}, {"path", required_argument, nullptr, 'p'}, {"max_depth", required_argument, nullptr, 'm'}, {"max_cutpoints", required_argument, nullptr, 'c'}, {"min_length", required_argument, nullptr, 'n'}, - {nullptr, 0, nullptr, 0} + {nullptr, no_argument, nullptr, 0} }; while (true) { - auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options, nullptr); + const auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options, nullptr); if (c == -1) break; switch (c) { @@ -56,16 +56,16 @@ tuple parse_arguments(int argc, char **argv) { usage(argv[0]); exit(0); case 'f': - file_name = optarg; + file_name = string(optarg); break; case 'm': - max_depth = (int) strtol(optarg, nullptr, 10); + max_depth = stoi(optarg); break; case 'n': - min_length = (int) strtol(optarg, nullptr, 10); + min_length = stoi(optarg); break; case 'c': - max_cutpoints = strtof(optarg, nullptr); + max_cutpoints = stof(optarg); break; case 'p': path = optarg; @@ -92,7 +92,7 @@ void process_file(const string &path, const string &file_name, bool class_last, file.load(path + file_name + ".arff", class_last); auto attributes = file.getAttributes(); - int items = file.getSize(); + auto items = file.getSize(); cout << "Number of lines: " << items << endl; cout << "Attributes: " << endl; for (auto attribute: attributes) { @@ -109,7 +109,7 @@ void process_file(const string &path, const string &file_name, bool class_last, } cout << y[i] << endl; } - mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints); + auto test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints); auto total = 0; for (auto i = 0; i < attributes.size(); i++) { auto min_max = minmax_element(X[i].begin(), X[i].end()); @@ -141,7 +141,7 @@ void process_all_files(const map &datasets, const string &path, in size_t timing = 0; int cut_points = 0; for (auto i = 0; i < attributes.size(); i++) { - mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints); + auto test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints); std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); test.fit(X[i], y); std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); @@ -163,8 +163,10 @@ int main(int argc, char **argv) { {"mfeat-factors", true}, {"test", true} }; - string file_name, path; - int max_depth, min_length; + string file_name; + string path; + int max_depth; + int min_length; float max_cutpoints; tie(file_name, path, max_depth, min_length, max_cutpoints) = parse_arguments(argc, argv); if (datasets.find(file_name) == datasets.end() && file_name != "all") { diff --git a/tests/ArffFiles.cpp b/tests/ArffFiles.cpp index a7286eb..405a57e 100644 --- a/tests/ArffFiles.cpp +++ b/tests/ArffFiles.cpp @@ -6,82 +6,88 @@ using namespace std; ArffFiles::ArffFiles() = default; -vector ArffFiles::getLines() -{ + +vector ArffFiles::getLines() const { return lines; } -unsigned long int ArffFiles::getSize() -{ + +unsigned long int ArffFiles::getSize() const { return lines.size(); } -vector> ArffFiles::getAttributes() -{ + +vector> ArffFiles::getAttributes() const { return attributes; } -string ArffFiles::getClassName() -{ + +string ArffFiles::getClassName() const { return className; } -string ArffFiles::getClassType() -{ + +string ArffFiles::getClassType() const { return classType; } -vector>& ArffFiles::getX() -{ + +vector> &ArffFiles::getX() { return X; } -vector& ArffFiles::getY() -{ + +vector &ArffFiles::getY() { return y; } -void ArffFiles::load(const string fileName, bool classLast) -{ + +void ArffFiles::load(const string &fileName, bool classLast) { ifstream file(fileName); - if (file.is_open()) { - string line, keyword, attribute, type, type_w; - while (getline(file, line)) { - if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { - continue; - } - if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) { - stringstream ss(line); - ss >> keyword >> attribute; - type = ""; - while(ss >> type_w) - type += type_w + " "; - attributes.emplace_back(attribute, type ); - continue; - } - if (line[0] == '@') { - continue; - } - lines.push_back(line); - } - file.close(); - if (attributes.empty()) - throw invalid_argument("No attributes found"); - if (classLast) { - className = get<0>(attributes.back()); - classType = get<1>(attributes.back()); - attributes.pop_back(); - } else { - className = get<0>(attributes.front()); - classType = get<1>(attributes.front()); - attributes.erase(attributes.begin()); - } - generateDataset(classLast); - } else + if (!file.is_open()) { throw invalid_argument("Unable to open file"); + } + string line; + string keyword; + string attribute; + string type; + string type_w; + while (getline(file, line)) { + if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { + continue; + } + if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) { + stringstream ss(line); + ss >> keyword >> attribute; + type = ""; + while (ss >> type_w) + type += type_w + " "; + attributes.emplace_back(attribute, type); + continue; + } + if (line[0] == '@') { + continue; + } + lines.push_back(line); + } + file.close(); + if (attributes.empty()) + throw invalid_argument("No attributes found"); + if (classLast) { + className = get<0>(attributes.back()); + classType = get<1>(attributes.back()); + attributes.pop_back(); + } else { + className = get<0>(attributes.front()); + classType = get<1>(attributes.front()); + attributes.erase(attributes.begin()); + } + generateDataset(classLast); + } -void ArffFiles::generateDataset(bool classLast) -{ + +void ArffFiles::generateDataset(bool classLast) { X = vector>(attributes.size(), vector(lines.size())); - vector yy = vector(lines.size(), ""); + auto yy = vector(lines.size(), ""); int labelIndex = classLast ? static_cast(attributes.size()) : 0; for (size_t i = 0; i < lines.size(); i++) { stringstream ss(lines[i]); string value; - int pos = 0, xIndex = 0; + int pos = 0; + int xIndex = 0; while (getline(ss, value, ',')) { if (pos++ == labelIndex) { yy[i] = value; @@ -92,20 +98,20 @@ void ArffFiles::generateDataset(bool classLast) } y = factorize(yy); } -string ArffFiles::trim(const string& source) -{ + +string ArffFiles::trim(const string &source) { string s(source); s.erase(0, s.find_first_not_of(" \n\r\t")); s.erase(s.find_last_not_of(" \n\r\t") + 1); return s; } -vector ArffFiles::factorize(const vector& labels_t) -{ + +vector ArffFiles::factorize(const vector &labels_t) { vector yy; yy.reserve(labels_t.size()); map labelMap; int i = 0; - for (const string &label : labels_t) { + for (const string &label: labels_t) { if (labelMap.find(label) == labelMap.end()) { labelMap[label] = i++; } diff --git a/tests/ArffFiles.h b/tests/ArffFiles.h index 8590d04..38531af 100644 --- a/tests/ArffFiles.h +++ b/tests/ArffFiles.h @@ -1,27 +1,44 @@ #ifndef ARFFFILES_H #define ARFFFILES_H + #include #include + using namespace std; + class ArffFiles { private: vector lines; vector> attributes; - string className, classType; + string className; + string classType; vector> X; vector y; + void generateDataset(bool); + public: ArffFiles(); - void load(string, bool = true); - vector getLines(); - unsigned long int getSize(); - string getClassName(); - string getClassType(); - static string trim(const string&); - vector>& getX(); - vector& getY(); - vector> getAttributes(); - static vector factorize(const vector& labels_t); + + void load(const string &, bool = true); + + vector getLines() const; + + unsigned long int getSize() const; + + string getClassName() const; + + string getClassType() const; + + static string trim(const string &); + + vector> &getX(); + + vector &getY(); + + vector> getAttributes() const; + + static vector factorize(const vector &labels_t); }; + #endif \ No newline at end of file diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 67cc084..d1c1e77 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -4,9 +4,10 @@ include(FetchContent) include_directories(${GTEST_INCLUDE_DIRS}) + FetchContent_Declare( - googletest - URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip + googletest + URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip ) # For Windows: Prevent overriding the parent project's compiler/linker settings set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)