mirror of
https://github.com/rmontanana/mdlp.git
synced 2025-08-15 15:35:55 +00:00
Remove unneeded loop in sortIndices
Add some static casts
This commit is contained in:
@@ -1,6 +1,10 @@
|
||||
cmake_minimum_required(VERSION 3.20)
|
||||
project(mdlp)
|
||||
|
||||
if (POLICY CMP0135)
|
||||
cmake_policy(SET CMP0135 NEW)
|
||||
endif ()
|
||||
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
|
||||
add_library(mdlp CPPFImdlp.cpp Metrics.cpp sample/sample.cpp)
|
||||
|
@@ -5,20 +5,22 @@
|
||||
#include <limits>
|
||||
#include "CPPFImdlp.h"
|
||||
#include "Metrics.h"
|
||||
|
||||
namespace mdlp {
|
||||
|
||||
CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed): min_length(min_length_),
|
||||
max_depth(max_depth_), proposed_cuts(proposed)
|
||||
{
|
||||
CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed) : min_length(min_length_),
|
||||
max_depth(max_depth_),
|
||||
proposed_cuts(proposed) {
|
||||
}
|
||||
|
||||
CPPFImdlp::CPPFImdlp() = default;
|
||||
|
||||
CPPFImdlp::~CPPFImdlp() = default;
|
||||
|
||||
size_t CPPFImdlp::compute_max_num_cut_points() const
|
||||
{
|
||||
size_t CPPFImdlp::compute_max_num_cut_points() const {
|
||||
// Set the actual maximum number of cut points as a number or as a percentage of the number of samples
|
||||
if (proposed_cuts == 0) {
|
||||
return numeric_limits<size_t>::max();
|
||||
return numeric_limits<size_t>::max();
|
||||
}
|
||||
if (proposed_cuts < 0 || proposed_cuts > static_cast<float>(X.size())) {
|
||||
throw invalid_argument("wrong proposed num_cuts value");
|
||||
@@ -28,8 +30,7 @@ namespace mdlp {
|
||||
return static_cast<size_t>(proposed_cuts);
|
||||
}
|
||||
|
||||
void CPPFImdlp::fit(samples_t& X_, labels_t& y_)
|
||||
{
|
||||
void CPPFImdlp::fit(samples_t &X_, labels_t &y_) {
|
||||
X = X_;
|
||||
y = y_;
|
||||
num_cut_points = compute_max_num_cut_points();
|
||||
@@ -52,12 +53,15 @@ namespace mdlp {
|
||||
computeCutPoints(0, X.size(), 1);
|
||||
}
|
||||
|
||||
pair<precision_t, size_t> CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end)
|
||||
{
|
||||
size_t n, m, idxPrev = cut - 1 >= start ? cut - 1 : cut;
|
||||
pair<precision_t, size_t> CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end) {
|
||||
size_t n;
|
||||
size_t m;
|
||||
size_t idxPrev = cut - 1 >= start ? cut - 1 : cut;
|
||||
size_t idxNext = cut + 1 < end ? cut + 1 : cut;
|
||||
bool backWall; // true if duplicates reach begining of the interval
|
||||
precision_t previous, actual, next;
|
||||
precision_t previous;
|
||||
precision_t actual;
|
||||
precision_t next;
|
||||
previous = X[indices[idxPrev]];
|
||||
actual = X[indices[cut]];
|
||||
next = X[indices[idxNext]];
|
||||
@@ -78,11 +82,10 @@ namespace mdlp {
|
||||
// Decide which values to use
|
||||
cut = cut + (backWall ? m + 1 : -n);
|
||||
actual = X[indices[cut]];
|
||||
return { (actual + previous) / 2, cut };
|
||||
return {(actual + previous) / 2, cut};
|
||||
}
|
||||
|
||||
void CPPFImdlp::computeCutPoints(size_t start, size_t end, int depth_)
|
||||
{
|
||||
void CPPFImdlp::computeCutPoints(size_t start, size_t end, int depth_) {
|
||||
size_t cut;
|
||||
pair<precision_t, size_t> result;
|
||||
if (cutPoints.size() == num_cut_points)
|
||||
@@ -103,13 +106,15 @@ namespace mdlp {
|
||||
}
|
||||
}
|
||||
|
||||
size_t CPPFImdlp::getCandidate(size_t start, size_t end)
|
||||
{
|
||||
size_t CPPFImdlp::getCandidate(size_t start, size_t end) {
|
||||
/* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which
|
||||
E(A, TA; S) is minimal amongst all the candidate cut points. */
|
||||
size_t candidate = numeric_limits<size_t>::max(), elements = end - start;
|
||||
size_t candidate = numeric_limits<size_t>::max();
|
||||
size_t elements = end - start;
|
||||
bool sameValues = true;
|
||||
precision_t entropy_left, entropy_right, minEntropy;
|
||||
precision_t entropy_left;
|
||||
precision_t entropy_right;
|
||||
precision_t minEntropy;
|
||||
// Check if all the values of the variable in the interval are the same
|
||||
for (size_t idx = start + 1; idx < end; idx++) {
|
||||
if (X[indices[idx]] != X[indices[start]]) {
|
||||
@@ -134,11 +139,15 @@ namespace mdlp {
|
||||
return candidate;
|
||||
}
|
||||
|
||||
bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end)
|
||||
{
|
||||
int k, k1, k2;
|
||||
precision_t ig, delta;
|
||||
precision_t ent, ent1, ent2;
|
||||
bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end) {
|
||||
int k;
|
||||
int k1;
|
||||
int k2;
|
||||
precision_t ig;
|
||||
precision_t delta;
|
||||
precision_t ent;
|
||||
precision_t ent1;
|
||||
precision_t ent2;
|
||||
auto N = precision_t(end - start);
|
||||
k = metrics.computeNumClasses(start, end);
|
||||
k1 = metrics.computeNumClasses(start, cut);
|
||||
@@ -148,33 +157,30 @@ namespace mdlp {
|
||||
ent2 = metrics.entropy(cut, end);
|
||||
ig = metrics.informationGain(start, cut, end);
|
||||
delta = static_cast<float>(log2(pow(3, precision_t(k)) - 2) -
|
||||
(precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2));
|
||||
(precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2));
|
||||
precision_t term = 1 / N * (log2(N - 1) + delta);
|
||||
return ig > term;
|
||||
}
|
||||
|
||||
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
|
||||
indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_)
|
||||
{
|
||||
indices_t CPPFImdlp::sortIndices(samples_t &X_, labels_t &y_) {
|
||||
indices_t idx(X_.size());
|
||||
iota(idx.begin(), idx.end(), 0);
|
||||
for (size_t i = 0; i < X_.size(); i++)
|
||||
stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) {
|
||||
stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) {
|
||||
if (X_[i1] == X_[i2])
|
||||
return y_[i1] < y_[i2];
|
||||
else
|
||||
return X_[i1] < X_[i2];
|
||||
});
|
||||
});
|
||||
return idx;
|
||||
}
|
||||
|
||||
cutPoints_t CPPFImdlp::getCutPoints()
|
||||
{
|
||||
cutPoints_t CPPFImdlp::getCutPoints() {
|
||||
sort(cutPoints.begin(), cutPoints.end());
|
||||
return cutPoints;
|
||||
}
|
||||
int CPPFImdlp::get_depth() const
|
||||
{
|
||||
|
||||
int CPPFImdlp::get_depth() const {
|
||||
return depth;
|
||||
}
|
||||
}
|
||||
|
20
CPPFImdlp.h
20
CPPFImdlp.h
@@ -1,9 +1,11 @@
|
||||
#ifndef CPPFIMDLP_H
|
||||
#define CPPFIMDLP_H
|
||||
|
||||
#include "typesFImdlp.h"
|
||||
#include "Metrics.h"
|
||||
#include <utility>
|
||||
#include <string>
|
||||
|
||||
namespace mdlp {
|
||||
class CPPFImdlp {
|
||||
protected:
|
||||
@@ -18,20 +20,32 @@ namespace mdlp {
|
||||
cutPoints_t cutPoints;
|
||||
size_t num_cut_points = numeric_limits<size_t>::max();
|
||||
|
||||
static indices_t sortIndices(samples_t&, labels_t&);
|
||||
static indices_t sortIndices(samples_t &, labels_t &);
|
||||
|
||||
void computeCutPoints(size_t, size_t, int);
|
||||
|
||||
bool mdlp(size_t, size_t, size_t);
|
||||
|
||||
size_t getCandidate(size_t, size_t);
|
||||
|
||||
size_t compute_max_num_cut_points() const;
|
||||
|
||||
pair<precision_t, size_t> valueCutPoint(size_t, size_t, size_t);
|
||||
|
||||
public:
|
||||
CPPFImdlp();
|
||||
|
||||
CPPFImdlp(size_t, int, float);
|
||||
|
||||
~CPPFImdlp();
|
||||
void fit(samples_t&, labels_t&);
|
||||
|
||||
void fit(samples_t &, labels_t &);
|
||||
|
||||
cutPoints_t getCutPoints();
|
||||
|
||||
int get_depth() const;
|
||||
inline string version() { return "1.1.1"; };
|
||||
|
||||
inline string version() const { return "1.1.1"; };
|
||||
};
|
||||
}
|
||||
#endif
|
||||
|
46
Metrics.cpp
46
Metrics.cpp
@@ -1,63 +1,71 @@
|
||||
#include "Metrics.h"
|
||||
#include <set>
|
||||
#include <cmath>
|
||||
|
||||
using namespace std;
|
||||
namespace mdlp {
|
||||
Metrics::Metrics(labels_t& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t())
|
||||
{
|
||||
Metrics::Metrics(labels_t &y_, indices_t &indices_) : y(y_), indices(indices_),
|
||||
numClasses(computeNumClasses(0, indices.size())) {
|
||||
}
|
||||
int Metrics::computeNumClasses(size_t start, size_t end)
|
||||
{
|
||||
|
||||
int Metrics::computeNumClasses(size_t start, size_t end) {
|
||||
set<int> nClasses;
|
||||
for (auto i = start; i < end; ++i) {
|
||||
nClasses.insert(y[indices[i]]);
|
||||
}
|
||||
return nClasses.size();
|
||||
return static_cast<int>(nClasses.size());
|
||||
}
|
||||
void Metrics::setData(labels_t& y_, indices_t& indices_)
|
||||
{
|
||||
|
||||
void Metrics::setData(const labels_t &y_, const indices_t &indices_) {
|
||||
indices = indices_;
|
||||
y = y_;
|
||||
numClasses = computeNumClasses(0, indices.size());
|
||||
entropyCache.clear();
|
||||
igCache.clear();
|
||||
}
|
||||
precision_t Metrics::entropy(size_t start, size_t end)
|
||||
{
|
||||
precision_t p, ventropy = 0;
|
||||
|
||||
precision_t Metrics::entropy(size_t start, size_t end) {
|
||||
precision_t p;
|
||||
precision_t ventropy = 0;
|
||||
int nElements = 0;
|
||||
labels_t counts(numClasses + 1, 0);
|
||||
if (end - start < 2)
|
||||
return 0;
|
||||
if (entropyCache.find({ start, end }) != entropyCache.end()) {
|
||||
if (entropyCache.find({start, end}) != entropyCache.end()) {
|
||||
return entropyCache[{start, end}];
|
||||
}
|
||||
for (auto i = &indices[start]; i != &indices[end]; ++i) {
|
||||
counts[y[*i]]++;
|
||||
nElements++;
|
||||
}
|
||||
for (auto count : counts) {
|
||||
for (auto count: counts) {
|
||||
if (count > 0) {
|
||||
p = (precision_t)count / nElements;
|
||||
p = static_cast<precision_t>(count) / static_cast<precision_t>(nElements);
|
||||
ventropy -= p * log2(p);
|
||||
}
|
||||
}
|
||||
entropyCache[{start, end}] = ventropy;
|
||||
return ventropy;
|
||||
}
|
||||
precision_t Metrics::informationGain(size_t start, size_t cut, size_t end)
|
||||
{
|
||||
|
||||
precision_t Metrics::informationGain(size_t start, size_t cut, size_t end) {
|
||||
precision_t iGain;
|
||||
precision_t entropyInterval, entropyLeft, entropyRight;
|
||||
int nElementsLeft = cut - start, nElementsRight = end - cut;
|
||||
int nElements = end - start;
|
||||
precision_t entropyInterval;
|
||||
precision_t entropyLeft;
|
||||
precision_t entropyRight;
|
||||
size_t nElementsLeft = cut - start;
|
||||
size_t nElementsRight = end - cut;
|
||||
size_t nElements = end - start;
|
||||
if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) {
|
||||
return igCache[make_tuple(start, cut, end)];
|
||||
}
|
||||
entropyInterval = entropy(start, end);
|
||||
entropyLeft = entropy(start, cut);
|
||||
entropyRight = entropy(cut, end);
|
||||
iGain = entropyInterval - ((precision_t)nElementsLeft * entropyLeft + (precision_t)nElementsRight * entropyRight) / nElements;
|
||||
iGain = entropyInterval -
|
||||
(static_cast<precision_t>(nElementsLeft) * entropyLeft +
|
||||
static_cast<precision_t>(nElementsRight) * entropyRight) /
|
||||
static_cast<precision_t>(nElements);
|
||||
igCache[make_tuple(start, cut, end)] = iGain;
|
||||
return iGain;
|
||||
}
|
||||
|
18
Metrics.h
18
Metrics.h
@@ -1,19 +1,25 @@
|
||||
#ifndef CCMETRICS_H
|
||||
#define CCMETRICS_H
|
||||
|
||||
#include "typesFImdlp.h"
|
||||
|
||||
namespace mdlp {
|
||||
class Metrics {
|
||||
protected:
|
||||
labels_t& y;
|
||||
indices_t& indices;
|
||||
labels_t &y;
|
||||
indices_t &indices;
|
||||
int numClasses;
|
||||
cacheEnt_t entropyCache;
|
||||
cacheIg_t igCache;
|
||||
cacheEnt_t entropyCache = cacheEnt_t();
|
||||
cacheIg_t igCache = cacheIg_t();
|
||||
public:
|
||||
Metrics(labels_t&, indices_t&);
|
||||
void setData(labels_t&, indices_t&);
|
||||
Metrics(labels_t &, indices_t &);
|
||||
|
||||
void setData(const labels_t &, const indices_t &);
|
||||
|
||||
int computeNumClasses(size_t, size_t);
|
||||
|
||||
precision_t entropy(size_t, size_t);
|
||||
|
||||
precision_t informationGain(size_t, size_t, size_t);
|
||||
};
|
||||
}
|
||||
|
@@ -38,17 +38,17 @@ tuple<string, string, int, int, float> parse_arguments(int argc, char **argv) {
|
||||
int max_depth = numeric_limits<int>::max();
|
||||
int min_length = 3;
|
||||
float max_cutpoints = 0;
|
||||
static struct option long_options[] = {
|
||||
const option long_options[] = {
|
||||
{"help", no_argument, nullptr, 'h'},
|
||||
{"file", required_argument, nullptr, 'f'},
|
||||
{"path", required_argument, nullptr, 'p'},
|
||||
{"max_depth", required_argument, nullptr, 'm'},
|
||||
{"max_cutpoints", required_argument, nullptr, 'c'},
|
||||
{"min_length", required_argument, nullptr, 'n'},
|
||||
{nullptr, 0, nullptr, 0}
|
||||
{nullptr, no_argument, nullptr, 0}
|
||||
};
|
||||
while (true) {
|
||||
auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options, nullptr);
|
||||
const auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options, nullptr);
|
||||
if (c == -1)
|
||||
break;
|
||||
switch (c) {
|
||||
@@ -56,16 +56,16 @@ tuple<string, string, int, int, float> parse_arguments(int argc, char **argv) {
|
||||
usage(argv[0]);
|
||||
exit(0);
|
||||
case 'f':
|
||||
file_name = optarg;
|
||||
file_name = string(optarg);
|
||||
break;
|
||||
case 'm':
|
||||
max_depth = (int) strtol(optarg, nullptr, 10);
|
||||
max_depth = stoi(optarg);
|
||||
break;
|
||||
case 'n':
|
||||
min_length = (int) strtol(optarg, nullptr, 10);
|
||||
min_length = stoi(optarg);
|
||||
break;
|
||||
case 'c':
|
||||
max_cutpoints = strtof(optarg, nullptr);
|
||||
max_cutpoints = stof(optarg);
|
||||
break;
|
||||
case 'p':
|
||||
path = optarg;
|
||||
@@ -92,7 +92,7 @@ void process_file(const string &path, const string &file_name, bool class_last,
|
||||
|
||||
file.load(path + file_name + ".arff", class_last);
|
||||
auto attributes = file.getAttributes();
|
||||
int items = file.getSize();
|
||||
auto items = file.getSize();
|
||||
cout << "Number of lines: " << items << endl;
|
||||
cout << "Attributes: " << endl;
|
||||
for (auto attribute: attributes) {
|
||||
@@ -109,7 +109,7 @@ void process_file(const string &path, const string &file_name, bool class_last,
|
||||
}
|
||||
cout << y[i] << endl;
|
||||
}
|
||||
mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints);
|
||||
auto test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints);
|
||||
auto total = 0;
|
||||
for (auto i = 0; i < attributes.size(); i++) {
|
||||
auto min_max = minmax_element(X[i].begin(), X[i].end());
|
||||
@@ -141,7 +141,7 @@ void process_all_files(const map<string, bool> &datasets, const string &path, in
|
||||
size_t timing = 0;
|
||||
int cut_points = 0;
|
||||
for (auto i = 0; i < attributes.size(); i++) {
|
||||
mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints);
|
||||
auto test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints);
|
||||
std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
|
||||
test.fit(X[i], y);
|
||||
std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
|
||||
@@ -163,8 +163,10 @@ int main(int argc, char **argv) {
|
||||
{"mfeat-factors", true},
|
||||
{"test", true}
|
||||
};
|
||||
string file_name, path;
|
||||
int max_depth, min_length;
|
||||
string file_name;
|
||||
string path;
|
||||
int max_depth;
|
||||
int min_length;
|
||||
float max_cutpoints;
|
||||
tie(file_name, path, max_depth, min_length, max_cutpoints) = parse_arguments(argc, argv);
|
||||
if (datasets.find(file_name) == datasets.end() && file_name != "all") {
|
||||
|
@@ -6,82 +6,88 @@
|
||||
using namespace std;
|
||||
|
||||
ArffFiles::ArffFiles() = default;
|
||||
vector<string> ArffFiles::getLines()
|
||||
{
|
||||
|
||||
vector<string> ArffFiles::getLines() const {
|
||||
return lines;
|
||||
}
|
||||
unsigned long int ArffFiles::getSize()
|
||||
{
|
||||
|
||||
unsigned long int ArffFiles::getSize() const {
|
||||
return lines.size();
|
||||
}
|
||||
vector<pair<string, string>> ArffFiles::getAttributes()
|
||||
{
|
||||
|
||||
vector<pair<string, string>> ArffFiles::getAttributes() const {
|
||||
return attributes;
|
||||
}
|
||||
string ArffFiles::getClassName()
|
||||
{
|
||||
|
||||
string ArffFiles::getClassName() const {
|
||||
return className;
|
||||
}
|
||||
string ArffFiles::getClassType()
|
||||
{
|
||||
|
||||
string ArffFiles::getClassType() const {
|
||||
return classType;
|
||||
}
|
||||
vector<vector<float>>& ArffFiles::getX()
|
||||
{
|
||||
|
||||
vector<vector<float>> &ArffFiles::getX() {
|
||||
return X;
|
||||
}
|
||||
vector<int>& ArffFiles::getY()
|
||||
{
|
||||
|
||||
vector<int> &ArffFiles::getY() {
|
||||
return y;
|
||||
}
|
||||
void ArffFiles::load(const string fileName, bool classLast)
|
||||
{
|
||||
|
||||
void ArffFiles::load(const string &fileName, bool classLast) {
|
||||
ifstream file(fileName);
|
||||
if (file.is_open()) {
|
||||
string line, keyword, attribute, type, type_w;
|
||||
while (getline(file, line)) {
|
||||
if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
|
||||
continue;
|
||||
}
|
||||
if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
|
||||
stringstream ss(line);
|
||||
ss >> keyword >> attribute;
|
||||
type = "";
|
||||
while(ss >> type_w)
|
||||
type += type_w + " ";
|
||||
attributes.emplace_back(attribute, type );
|
||||
continue;
|
||||
}
|
||||
if (line[0] == '@') {
|
||||
continue;
|
||||
}
|
||||
lines.push_back(line);
|
||||
}
|
||||
file.close();
|
||||
if (attributes.empty())
|
||||
throw invalid_argument("No attributes found");
|
||||
if (classLast) {
|
||||
className = get<0>(attributes.back());
|
||||
classType = get<1>(attributes.back());
|
||||
attributes.pop_back();
|
||||
} else {
|
||||
className = get<0>(attributes.front());
|
||||
classType = get<1>(attributes.front());
|
||||
attributes.erase(attributes.begin());
|
||||
}
|
||||
generateDataset(classLast);
|
||||
} else
|
||||
if (!file.is_open()) {
|
||||
throw invalid_argument("Unable to open file");
|
||||
}
|
||||
string line;
|
||||
string keyword;
|
||||
string attribute;
|
||||
string type;
|
||||
string type_w;
|
||||
while (getline(file, line)) {
|
||||
if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
|
||||
continue;
|
||||
}
|
||||
if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
|
||||
stringstream ss(line);
|
||||
ss >> keyword >> attribute;
|
||||
type = "";
|
||||
while (ss >> type_w)
|
||||
type += type_w + " ";
|
||||
attributes.emplace_back(attribute, type);
|
||||
continue;
|
||||
}
|
||||
if (line[0] == '@') {
|
||||
continue;
|
||||
}
|
||||
lines.push_back(line);
|
||||
}
|
||||
file.close();
|
||||
if (attributes.empty())
|
||||
throw invalid_argument("No attributes found");
|
||||
if (classLast) {
|
||||
className = get<0>(attributes.back());
|
||||
classType = get<1>(attributes.back());
|
||||
attributes.pop_back();
|
||||
} else {
|
||||
className = get<0>(attributes.front());
|
||||
classType = get<1>(attributes.front());
|
||||
attributes.erase(attributes.begin());
|
||||
}
|
||||
generateDataset(classLast);
|
||||
|
||||
}
|
||||
void ArffFiles::generateDataset(bool classLast)
|
||||
{
|
||||
|
||||
void ArffFiles::generateDataset(bool classLast) {
|
||||
X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
|
||||
vector<string> yy = vector<string>(lines.size(), "");
|
||||
auto yy = vector<string>(lines.size(), "");
|
||||
int labelIndex = classLast ? static_cast<int>(attributes.size()) : 0;
|
||||
for (size_t i = 0; i < lines.size(); i++) {
|
||||
stringstream ss(lines[i]);
|
||||
string value;
|
||||
int pos = 0, xIndex = 0;
|
||||
int pos = 0;
|
||||
int xIndex = 0;
|
||||
while (getline(ss, value, ',')) {
|
||||
if (pos++ == labelIndex) {
|
||||
yy[i] = value;
|
||||
@@ -92,20 +98,20 @@ void ArffFiles::generateDataset(bool classLast)
|
||||
}
|
||||
y = factorize(yy);
|
||||
}
|
||||
string ArffFiles::trim(const string& source)
|
||||
{
|
||||
|
||||
string ArffFiles::trim(const string &source) {
|
||||
string s(source);
|
||||
s.erase(0, s.find_first_not_of(" \n\r\t"));
|
||||
s.erase(s.find_last_not_of(" \n\r\t") + 1);
|
||||
return s;
|
||||
}
|
||||
vector<int> ArffFiles::factorize(const vector<string>& labels_t)
|
||||
{
|
||||
|
||||
vector<int> ArffFiles::factorize(const vector<string> &labels_t) {
|
||||
vector<int> yy;
|
||||
yy.reserve(labels_t.size());
|
||||
map<string, int> labelMap;
|
||||
int i = 0;
|
||||
for (const string &label : labels_t) {
|
||||
for (const string &label: labels_t) {
|
||||
if (labelMap.find(label) == labelMap.end()) {
|
||||
labelMap[label] = i++;
|
||||
}
|
||||
|
@@ -1,27 +1,44 @@
|
||||
#ifndef ARFFFILES_H
|
||||
#define ARFFFILES_H
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
|
||||
class ArffFiles {
|
||||
private:
|
||||
vector<string> lines;
|
||||
vector<pair<string, string>> attributes;
|
||||
string className, classType;
|
||||
string className;
|
||||
string classType;
|
||||
vector<vector<float>> X;
|
||||
vector<int> y;
|
||||
|
||||
void generateDataset(bool);
|
||||
|
||||
public:
|
||||
ArffFiles();
|
||||
void load(string, bool = true);
|
||||
vector<string> getLines();
|
||||
unsigned long int getSize();
|
||||
string getClassName();
|
||||
string getClassType();
|
||||
static string trim(const string&);
|
||||
vector<vector<float>>& getX();
|
||||
vector<int>& getY();
|
||||
vector<pair<string, string>> getAttributes();
|
||||
static vector<int> factorize(const vector<string>& labels_t);
|
||||
|
||||
void load(const string &, bool = true);
|
||||
|
||||
vector<string> getLines() const;
|
||||
|
||||
unsigned long int getSize() const;
|
||||
|
||||
string getClassName() const;
|
||||
|
||||
string getClassType() const;
|
||||
|
||||
static string trim(const string &);
|
||||
|
||||
vector<vector<float>> &getX();
|
||||
|
||||
vector<int> &getY();
|
||||
|
||||
vector<pair<string, string>> getAttributes() const;
|
||||
|
||||
static vector<int> factorize(const vector<string> &labels_t);
|
||||
};
|
||||
|
||||
#endif
|
@@ -4,9 +4,10 @@ include(FetchContent)
|
||||
|
||||
include_directories(${GTEST_INCLUDE_DIRS})
|
||||
|
||||
|
||||
FetchContent_Declare(
|
||||
googletest
|
||||
URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip
|
||||
googletest
|
||||
URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip
|
||||
)
|
||||
# For Windows: Prevent overriding the parent project's compiler/linker settings
|
||||
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
|
||||
|
Reference in New Issue
Block a user