Refactor project structure and add Arff load and test

This commit is contained in:
2022-12-09 16:35:58 +01:00
parent e4cf72d0fe
commit 65de064fa9
20 changed files with 783 additions and 253 deletions

View File

@@ -19,6 +19,7 @@ namespace mdlp {
{
X = X_;
y = y_;
cutPoints.clear();
if (X.size() != y.size()) {
throw invalid_argument("X and y must have the same size");
}

View File

@@ -1,6 +1,5 @@
#include "Metrics.h"
#include <set>
#include <iostream>
using namespace std;
namespace mdlp {
Metrics::Metrics(labels& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t())
@@ -19,6 +18,8 @@ namespace mdlp {
indices = indices_;
y = y_;
numClasses = computeNumClasses(0, indices.size());
entropyCache.clear();
igCache.clear();
}
precision_t Metrics::entropy(size_t start, size_t end)
{
@@ -50,7 +51,6 @@ namespace mdlp {
int nElementsLeft = cut - start, nElementsRight = end - cut;
int nElements = end - start;
if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) {
cout << "**********Cache IG hit for " << start << " " << end << endl;
return igCache[make_tuple(start, cut, end)];
}
entropyInterval = entropy(start, end);
@@ -61,14 +61,4 @@ namespace mdlp {
return iGain;
}
}
/*
cache_t entropyCache;
std::map<std::tuple<int, int>, double> c;
// Set the value at index (3, 5) to 7.8.
c[std::make_tuple(3, 5)] = 7.8;
// Print the value at index (3, 5).
std::cout << c[std::make_tuple(3, 5)] << std::endl;
*/
}

View File

@@ -0,0 +1,117 @@
#include "ArffFiles.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <stdexcept>
using namespace std;
ArffFiles::ArffFiles()
{
}
vector<string> ArffFiles::getLines()
{
return lines;
}
unsigned long int ArffFiles::getSize()
{
return lines.size();
}
vector<tuple<string, string>> ArffFiles::getAttributes()
{
return attributes;
}
string ArffFiles::getClassName()
{
return className;
}
string ArffFiles::getClassType()
{
return classType;
}
vector<vector<float>>& ArffFiles::getX()
{
return X;
}
vector<int>& ArffFiles::getY()
{
return y;
}
// Reads an ARFF file and builds the dataset exposed by getX() / getY().
//
// fileName:  path of the ARFF file to read.
// classLast: when true the last declared attribute is taken as the class,
//            otherwise the first declared attribute is.
//
// Throws invalid_argument if the file cannot be opened or declares no
// attributes. Exceptions raised while converting the data rows propagate
// to the caller.
void ArffFiles::load(string fileName, bool classLast)
{
    ifstream file(fileName);
    if (!file.is_open()) {
        throw invalid_argument("Unable to open file");
    }
    // Start from a clean state so the same object can load several files
    // without accumulating attributes and rows from a previous call.
    lines.clear();
    attributes.clear();
    X.clear();
    y.clear();
    className.clear();
    classType.clear();
    string rawLine;
    while (getline(file, rawLine)) {
        // Trimming removes the '\r' of CRLF files and turns whitespace-only
        // lines into empty ones so they are not mistaken for data rows.
        const string line = trim(rawLine);
        if (line.empty() || line[0] == '%') {
            continue;
        }
        if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
            // Fresh targets per line: a malformed declaration must not
            // inherit the name or type of the previous attribute.
            string keyword, attribute, type;
            stringstream ss(line);
            ss >> keyword >> attribute >> type;
            attributes.push_back(make_tuple(attribute, type));
            continue;
        }
        if (line[0] == '@') {
            // Other headers (@relation, @data, ...) carry nothing we store.
            continue;
        }
        lines.push_back(line);
    }
    file.close();
    if (attributes.empty()) {
        throw invalid_argument("No attributes found");
    }
    // Detach the class attribute from the list of features.
    if (classLast) {
        className = get<0>(attributes.back());
        classType = get<1>(attributes.back());
        attributes.pop_back();
    } else {
        className = get<0>(attributes.front());
        classType = get<1>(attributes.front());
        attributes.erase(attributes.begin());
    }
    generateDataset(classLast);
}
void ArffFiles::generateDataset(bool classLast)
{
X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
vector<string> yy = vector<string>(lines.size(), "");
int labelIndex = classLast ? attributes.size() : 0;
for (int i = 0; i < lines.size(); i++) {
stringstream ss(lines[i]);
string value;
int pos = 0, xIndex = 0;
while (getline(ss, value, ',')) {
if (pos++ == labelIndex) {
yy[i] = value;
} else {
X[xIndex++][i] = stof(value);
}
}
}
y = factorize(yy);
}
// Returns a copy of source without leading and trailing spaces, tabs,
// carriage returns and line feeds. A string made only of those characters
// yields an empty string.
string ArffFiles::trim(const string& source)
{
    const char* const blanks = " \n\r\t";
    const size_t first = source.find_first_not_of(blanks);
    if (first == string::npos) {
        return string();
    }
    const size_t last = source.find_last_not_of(blanks);
    return source.substr(first, last - first + 1);
}
// Encodes each label as an integer: distinct labels receive consecutive
// codes starting at 0, in order of first appearance. The result has one
// code per input label, in the same order.
vector<int> ArffFiles::factorize(const vector<string>& labels)
{
    vector<int> codes;
    codes.reserve(labels.size());
    map<string, int> codeByLabel;
    int nextCode = 0;
    for (const string& label : labels) {
        // emplace leaves the map untouched when the label is already known.
        const auto entry = codeByLabel.emplace(label, nextCode);
        if (entry.second) {
            ++nextCode;
        }
        codes.push_back(entry.first->second);
    }
    return codes;
}

View File

@@ -0,0 +1,28 @@
#ifndef ARFFFILES_H
#define ARFFFILES_H
#include <string>
#include <vector>
#include <tuple>
using namespace std;
// Loader for ARFF files: keeps the raw data lines, the attribute
// declarations and the numeric dataset built from them.
class ArffFiles {
public:
    ArffFiles();
    // Reads fileName; classLast selects the last (true) or the first (false)
    // declared attribute as the class.
    void load(string fileName, bool classLast = true);
    // Raw data lines kept by load().
    vector<string> getLines();
    // Number of raw data lines.
    unsigned long int getSize();
    // Name and declared type of the class attribute.
    string getClassName();
    string getClassType();
    // Copy of source without leading/trailing whitespace.
    string trim(const string& source);
    // Feature matrix and encoded labels, returned by reference.
    vector<vector<float>>& getX();
    vector<int>& getY();
    // (name, type) pairs of the non-class attributes.
    vector<tuple<string, string>> getAttributes();
    // Maps every label to an integer code.
    vector<int> factorize(const vector<string>& labels);
private:
    vector<string> lines;
    vector<tuple<string, string>> attributes;
    string className, classType;
    vector<vector<float>> X;
    vector<int> y;
    // Fills X and y from lines; the flag tells where the class value sits.
    void generateDataset(bool classLast);
};
#endif

View File

@@ -1,177 +1,177 @@
#include "gtest/gtest.h"
#include "../Metrics.h"
#include "../CPPFImdlp.h"
namespace mdlp {
class TestFImdlp : public CPPFImdlp, public testing::Test {
public:
TestFImdlp() : CPPFImdlp(true, 6, true) {}
void SetUp()
{
// 5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0]
//(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2)
X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
fit(X, y);
}
void setProposal(bool value)
{
proposal = value;
}
void initCutPoints()
{
setCutPoints(cutPoints_t());
}
void initIndices()
{
indices = indices_t();
}
void initDiscretized()
{
xDiscretized = labels();
}
void checkSortedVector(samples& X_, indices_t indices_)
{
X = X_;
indices = indices_;
indices_t testSortedIndices = sortIndices(X);
precision_t prev = X[testSortedIndices[0]];
for (auto i = 0; i < X.size(); ++i) {
EXPECT_EQ(testSortedIndices[i], indices[i]);
EXPECT_LE(prev, X[testSortedIndices[i]]);
prev = X[testSortedIndices[i]];
}
}
void checkCutPoints(cutPoints_t& expected)
{
int expectedSize = expected.size();
EXPECT_EQ(cutPoints.size(), expectedSize);
for (auto i = 0; i < expectedSize; i++) {
EXPECT_EQ(cutPoints[i].start, expected[i].start);
EXPECT_EQ(cutPoints[i].end, expected[i].end);
EXPECT_EQ(cutPoints[i].classNumber, expected[i].classNumber);
EXPECT_NEAR(cutPoints[i].fromValue, expected[i].fromValue, precision);
EXPECT_NEAR(cutPoints[i].toValue, expected[i].toValue, precision);
}
}
template<typename T, typename A>
void checkVectors(std::vector<T, A> const& expected, std::vector<T, A> const& computed)
{
EXPECT_EQ(expected.size(), computed.size());
for (auto i = 0; i < expected.size(); i++) {
EXPECT_EQ(expected[i], computed[i]);
}
}
};
// fit is expected to reject an empty dataset with std::invalid_argument.
TEST_F(TestFImdlp, FitErrorEmptyDataset)
{
    samples noSamples;
    labels noLabels;
    EXPECT_THROW(fit(noSamples, noLabels), std::invalid_argument);
}
// fit is expected to reject samples and labels of different lengths.
TEST_F(TestFImdlp, FitErrorDifferentSize)
{
    samples threeSamples = { 1, 2, 3 };
    labels twoLabels = { 1, 2 };
    EXPECT_THROW(fit(threeSamples, twoLabels), std::invalid_argument);
}
// sortIndices must return the expected permutation for unsorted,
// already sorted and reverse sorted inputs.
TEST_F(TestFImdlp, SortIndices)
{
    samples unsorted = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
    indices_t unsortedOrder = { 4, 3, 6, 8, 2, 1, 5, 0, 9, 7 };
    checkSortedVector(unsorted, unsortedOrder);

    samples ascending = { 5.77, 5.88, 5.99 };
    indices_t ascendingOrder = { 0, 1, 2 };
    checkSortedVector(ascending, ascendingOrder);

    samples descending = { 5.33, 5.22, 5.11 };
    indices_t descendingOrder = { 2, 1, 0 };
    checkSortedVector(descending, descendingOrder);
}
// The candidate cut point built here is expected to be rejected.
TEST_F(TestFImdlp, EvaluateCutPoint)
{
    cutPoint_t rest = { 0, 10, -1, -1, 1000 };
    cutPoint_t candidate = { 0, 4, -1, -1, 5.15 };
    EXPECT_FALSE(evaluateCutPoint(rest, candidate));
}
// Original algorithm on the fixture dataset: three intervals expected.
TEST_F(TestFImdlp, ComputeCutPointsOriginal)
{
    cutPoints_t expected = {
        { 0, 4, -1, -3.4028234663852886e+38, 5.15 },
        { 4, 6, -1, 5.15, 5.45 },
        { 6, 10, -1, 5.45, 3.4028234663852886e+38 }
    };
    // Discard the cut points left by SetUp() before computing again.
    setCutPoints(cutPoints_t());
    computeCutPointsOriginal();
    checkCutPoints(expected);
}
// Original algorithm on a four-sample dataset: one single interval expected.
TEST_F(TestFImdlp, ComputeCutPointsOriginalGCase)
{
    cutPoints_t expected = {
        { 0, 4, -1, -3.4028234663852886e+38, 3.4028234663852886e+38 },
    };
    samples data = { 0, 1, 2, 2 };
    labels target = { 1, 1, 1, 2 };
    fit(data, target);
    computeCutPointsOriginal();
    checkCutPoints(expected);
}
// Proposal algorithm on the fixture dataset: four intervals expected.
TEST_F(TestFImdlp, ComputeCutPointsProposal)
{
    cutPoints_t expected = {
        { 0, 4, -1, -3.4028234663852886e+38, 5.1 },
        { 4, 6, -1, 5.1, 5.4 },
        { 6, 9, -1, 5.4, 5.85 },
        { 9, 10, -1, 5.85, 3.4028234663852886e+38 }
    };
    computeCutPointsProposal();
    checkCutPoints(expected);
}
// Proposal algorithm on a four-sample dataset: two intervals expected.
TEST_F(TestFImdlp, ComputeCutPointsProposalGCase)
{
    cutPoints_t expected = {
        { 0, 3, -1, -3.4028234663852886e+38, 1.5 },
        { 3, 4, -1, 1.5, 3.4028234663852886e+38 }
    };
    samples data = { 0, 1, 2, 2 };
    labels target = { 1, 1, 1, 2 };
    fit(data, target);
    computeCutPointsProposal();
    checkCutPoints(expected);
}
// After SetUp() every sample is expected to be discretized to 0.
TEST_F(TestFImdlp, DiscretizedValues)
{
    const labels expected = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    const labels computed = getDiscretizedValues();
    checkVectors(expected, computed);
}
// getCutPoints must report the upper bounds computed by the original algorithm.
TEST_F(TestFImdlp, GetCutPoints)
{
    const samples expected = { 5.15, 5.45, 3.4028234663852886e+38 };
    computeCutPointsOriginal();
    const samples computed = getCutPoints();
    checkVectors(expected, computed);
}
// Fits with the original algorithm selected and checks that getCutPoints()
// returns the upper bound of every expected interval.
TEST_F(TestFImdlp, Constructor)
{
    // Distinct names: the fixture already has members called X and y.
    samples data = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
    labels target = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
    setProposal(false);
    fit(data, target);
    computeCutPointsOriginal();
    const cutPoints_t expected = {
        { 0, 4, -1, -3.4028234663852886e+38, 5.15 },
        { 4, 6, -1, 5.15, 5.45 },
        { 6, 10, -1, 5.45, 3.4028234663852886e+38 }
    };
    const vector<precision_t> computed = getCutPoints();
    // Fatal on mismatch: the loop indexes computed with expected's size.
    ASSERT_EQ(computed.size(), expected.size());
    for (size_t i = 0; i < expected.size(); i++) {
        EXPECT_NEAR(computed[i], expected[i].toValue, .00000001);
    }
}
}
//#include "gtest/gtest.h"
//#include "../Metrics.h"
//#include "../CPPFImdlp.h"
//namespace mdlp {
// class TestFImdlp : public CPPFImdlp, public testing::Test {
// public:
// TestFImdlp() : CPPFImdlp(true, true) {}
// void SetUp()
// {
// // 5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0]
// //(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2)
// X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
// y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
// fit(X, y);
// }
// void setProposal(bool value)
// {
// proposal = value;
// }
// void initCutPoints()
// {
// setCutPoints(cutPoints_t());
// }
// void initIndices()
// {
// indices = indices_t();
// }
// void initDiscretized()
// {
// xDiscretized = labels();
// }
// void checkSortedVector(samples& X_, indices_t indices_)
// {
// X = X_;
// indices = indices_;
// indices_t testSortedIndices = sortIndices(X);
// precision_t prev = X[testSortedIndices[0]];
// for (auto i = 0; i < X.size(); ++i) {
// EXPECT_EQ(testSortedIndices[i], indices[i]);
// EXPECT_LE(prev, X[testSortedIndices[i]]);
// prev = X[testSortedIndices[i]];
// }
// }
// void checkCutPoints(cutPoints_t& expected)
// {
// int expectedSize = expected.size();
// EXPECT_EQ(cutPoints.size(), expectedSize);
// for (auto i = 0; i < expectedSize; i++) {
// EXPECT_EQ(cutPoints[i].start, expected[i].start);
// EXPECT_EQ(cutPoints[i].end, expected[i].end);
// EXPECT_EQ(cutPoints[i].classNumber, expected[i].classNumber);
// EXPECT_NEAR(cutPoints[i].fromValue, expected[i].fromValue, precision);
// EXPECT_NEAR(cutPoints[i].toValue, expected[i].toValue, precision);
// }
// }
// template<typename T, typename A>
// void checkVectors(std::vector<T, A> const& expected, std::vector<T, A> const& computed)
// {
// EXPECT_EQ(expected.size(), computed.size());
// for (auto i = 0; i < expected.size(); i++) {
// EXPECT_EQ(expected[i], computed[i]);
// }
// }
//
// };
// TEST_F(TestFImdlp, FitErrorEmptyDataset)
// {
// X = samples();
// y = labels();
// EXPECT_THROW(fit(X, y), std::invalid_argument);
// }
// TEST_F(TestFImdlp, FitErrorDifferentSize)
// {
// X = { 1, 2, 3 };
// y = { 1, 2 };
// EXPECT_THROW(fit(X, y), std::invalid_argument);
// }
// TEST_F(TestFImdlp, SortIndices)
// {
// X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
// indices = { 4, 3, 6, 8, 2, 1, 5, 0, 9, 7 };
// checkSortedVector(X, indices);
// X = { 5.77, 5.88, 5.99 };
// indices = { 0, 1, 2 };
// checkSortedVector(X, indices);
// X = { 5.33, 5.22, 5.11 };
// indices = { 2, 1, 0 };
// checkSortedVector(X, indices);
// }
// TEST_F(TestFImdlp, EvaluateCutPoint)
// {
// cutPoint_t rest, candidate;
// rest = { 0, 10, -1, -1, 1000 };
// candidate = { 0, 4, -1, -1, 5.15 };
// EXPECT_FALSE(evaluateCutPoint(rest, candidate));
// }
// TEST_F(TestFImdlp, ComputeCutPointsOriginal)
// {
// cutPoints_t expected;
// expected = {
// { 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 },
// { 6, 10, -1, 5.45, 3.4028234663852886e+38 }
// };
// setCutPoints(cutPoints_t());
// computeCutPointsOriginal();
// checkCutPoints(expected);
// }
// TEST_F(TestFImdlp, ComputeCutPointsOriginalGCase)
// {
// cutPoints_t expected;
// expected = {
// { 0, 4, -1, -3.4028234663852886e+38, 3.4028234663852886e+38 },
// };
// X = { 0, 1, 2, 2 };
// y = { 1, 1, 1, 2 };
// fit(X, y);
// computeCutPointsOriginal();
// checkCutPoints(expected);
// }
// TEST_F(TestFImdlp, ComputeCutPointsProposal)
// {
// cutPoints_t expected;
// expected = {
// { 0, 4, -1, -3.4028234663852886e+38, 5.1 }, { 4, 6, -1, 5.1, 5.4 },
// { 6, 9, -1, 5.4, 5.85 },
// { 9, 10, -1, 5.85, 3.4028234663852886e+38 }
// };
// computeCutPointsProposal();
// checkCutPoints(expected);
// }
// TEST_F(TestFImdlp, ComputeCutPointsProposalGCase)
// {
// cutPoints_t expected;
// expected = {
// { 0, 3, -1, -3.4028234663852886e+38, 1.5 },
// { 3, 4, -1, 1.5, 3.4028234663852886e+38 }
// };
// X = { 0, 1, 2, 2 };
// y = { 1, 1, 1, 2 };
// fit(X, y);
// computeCutPointsProposal();
// checkCutPoints(expected);
// }
// TEST_F(TestFImdlp, DiscretizedValues)
// {
// labels computed, expected = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
// computed = getDiscretizedValues();
// checkVectors(expected, computed);
// }
// TEST_F(TestFImdlp, GetCutPoints)
// {
// samples computed, expected = { 5.15, 5.45, 3.4028234663852886e+38 };
// computeCutPointsOriginal();
// computed = getCutPoints();
// checkVectors(expected, computed);
// }
// TEST_F(TestFImdlp, Constructor)
// {
// samples X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
// labels y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
// setProposal(false);
// fit(X, y);
// computeCutPointsOriginal();
// cutPoints_t expected;
// vector<precision_t> computed = getCutPoints();
// expected = {
// { 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 },
// { 6, 10, -1, 5.45, 3.4028234663852886e+38 }
// };
// computed = getCutPoints();
// int expectedSize = expected.size();
// EXPECT_EQ(computed.size(), expected.size());
// for (auto i = 0; i < expectedSize; i++) {
// EXPECT_NEAR(computed[i], expected[i].toValue, .00000001);
// }
// }
//}

225
fimdlp/testcpp/datasets/iris.arff Executable file
View File

@@ -0,0 +1,225 @@
% 1. Title: Iris Plants Database
%
% 2. Sources:
% (a) Creator: R.A. Fisher
% (b) Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
% (c) Date: July, 1988
%
% 3. Past Usage:
% - Publications: too many to mention!!! Here are a few.
% 1. Fisher,R.A. "The use of multiple measurements in taxonomic problems"
% Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions
% to Mathematical Statistics" (John Wiley, NY, 1950).
% 2. Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.
% (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.
% 3. Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
% Structure and Classification Rule for Recognition in Partially Exposed
% Environments". IEEE Transactions on Pattern Analysis and Machine
% Intelligence, Vol. PAMI-2, No. 1, 67-71.
% -- Results:
% -- very low misclassification rates (0% for the setosa class)
% 4. Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE
% Transactions on Information Theory, May 1972, 431-433.
% -- Results:
% -- very low misclassification rates again
% 5. See also: 1988 MLC Proceedings, 54-64. Cheeseman et al's AUTOCLASS II
% conceptual clustering system finds 3 classes in the data.
%
% 4. Relevant Information:
% --- This is perhaps the best known database to be found in the pattern
% recognition literature. Fisher's paper is a classic in the field
% and is referenced frequently to this day. (See Duda & Hart, for
% example.) The data set contains 3 classes of 50 instances each,
% where each class refers to a type of iris plant. One class is
% linearly separable from the other 2; the latter are NOT linearly
% separable from each other.
% --- Predicted attribute: class of iris plant.
% --- This is an exceedingly simple domain.
%
% 5. Number of Instances: 150 (50 in each of three classes)
%
% 6. Number of Attributes: 4 numeric, predictive attributes and the class
%
% 7. Attribute Information:
% 1. sepal length in cm
% 2. sepal width in cm
% 3. petal length in cm
% 4. petal width in cm
% 5. class:
% -- Iris Setosa
% -- Iris Versicolour
% -- Iris Virginica
%
% 8. Missing Attribute Values: None
%
% Summary Statistics:
% Min Max Mean SD Class Correlation
% sepal length: 4.3 7.9 5.84 0.83 0.7826
% sepal width: 2.0 4.4 3.05 0.43 -0.4194
% petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)
% petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)
%
% 9. Class Distribution: 33.3% for each of 3 classes.
@RELATION iris
@ATTRIBUTE sepallength REAL
@ATTRIBUTE sepalwidth REAL
@ATTRIBUTE petallength REAL
@ATTRIBUTE petalwidth REAL
@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
@DATA
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
5.4,3.9,1.7,0.4,Iris-setosa
4.6,3.4,1.4,0.3,Iris-setosa
5.0,3.4,1.5,0.2,Iris-setosa
4.4,2.9,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.4,3.7,1.5,0.2,Iris-setosa
4.8,3.4,1.6,0.2,Iris-setosa
4.8,3.0,1.4,0.1,Iris-setosa
4.3,3.0,1.1,0.1,Iris-setosa
5.8,4.0,1.2,0.2,Iris-setosa
5.7,4.4,1.5,0.4,Iris-setosa
5.4,3.9,1.3,0.4,Iris-setosa
5.1,3.5,1.4,0.3,Iris-setosa
5.7,3.8,1.7,0.3,Iris-setosa
5.1,3.8,1.5,0.3,Iris-setosa
5.4,3.4,1.7,0.2,Iris-setosa
5.1,3.7,1.5,0.4,Iris-setosa
4.6,3.6,1.0,0.2,Iris-setosa
5.1,3.3,1.7,0.5,Iris-setosa
4.8,3.4,1.9,0.2,Iris-setosa
5.0,3.0,1.6,0.2,Iris-setosa
5.0,3.4,1.6,0.4,Iris-setosa
5.2,3.5,1.5,0.2,Iris-setosa
5.2,3.4,1.4,0.2,Iris-setosa
4.7,3.2,1.6,0.2,Iris-setosa
4.8,3.1,1.6,0.2,Iris-setosa
5.4,3.4,1.5,0.4,Iris-setosa
5.2,4.1,1.5,0.1,Iris-setosa
5.5,4.2,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.0,3.2,1.2,0.2,Iris-setosa
5.5,3.5,1.3,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
4.4,3.0,1.3,0.2,Iris-setosa
5.1,3.4,1.5,0.2,Iris-setosa
5.0,3.5,1.3,0.3,Iris-setosa
4.5,2.3,1.3,0.3,Iris-setosa
4.4,3.2,1.3,0.2,Iris-setosa
5.0,3.5,1.6,0.6,Iris-setosa
5.1,3.8,1.9,0.4,Iris-setosa
4.8,3.0,1.4,0.3,Iris-setosa
5.1,3.8,1.6,0.2,Iris-setosa
4.6,3.2,1.4,0.2,Iris-setosa
5.3,3.7,1.5,0.2,Iris-setosa
5.0,3.3,1.4,0.2,Iris-setosa
7.0,3.2,4.7,1.4,Iris-versicolor
6.4,3.2,4.5,1.5,Iris-versicolor
6.9,3.1,4.9,1.5,Iris-versicolor
5.5,2.3,4.0,1.3,Iris-versicolor
6.5,2.8,4.6,1.5,Iris-versicolor
5.7,2.8,4.5,1.3,Iris-versicolor
6.3,3.3,4.7,1.6,Iris-versicolor
4.9,2.4,3.3,1.0,Iris-versicolor
6.6,2.9,4.6,1.3,Iris-versicolor
5.2,2.7,3.9,1.4,Iris-versicolor
5.0,2.0,3.5,1.0,Iris-versicolor
5.9,3.0,4.2,1.5,Iris-versicolor
6.0,2.2,4.0,1.0,Iris-versicolor
6.1,2.9,4.7,1.4,Iris-versicolor
5.6,2.9,3.6,1.3,Iris-versicolor
6.7,3.1,4.4,1.4,Iris-versicolor
5.6,3.0,4.5,1.5,Iris-versicolor
5.8,2.7,4.1,1.0,Iris-versicolor
6.2,2.2,4.5,1.5,Iris-versicolor
5.6,2.5,3.9,1.1,Iris-versicolor
5.9,3.2,4.8,1.8,Iris-versicolor
6.1,2.8,4.0,1.3,Iris-versicolor
6.3,2.5,4.9,1.5,Iris-versicolor
6.1,2.8,4.7,1.2,Iris-versicolor
6.4,2.9,4.3,1.3,Iris-versicolor
6.6,3.0,4.4,1.4,Iris-versicolor
6.8,2.8,4.8,1.4,Iris-versicolor
6.7,3.0,5.0,1.7,Iris-versicolor
6.0,2.9,4.5,1.5,Iris-versicolor
5.7,2.6,3.5,1.0,Iris-versicolor
5.5,2.4,3.8,1.1,Iris-versicolor
5.5,2.4,3.7,1.0,Iris-versicolor
5.8,2.7,3.9,1.2,Iris-versicolor
6.0,2.7,5.1,1.6,Iris-versicolor
5.4,3.0,4.5,1.5,Iris-versicolor
6.0,3.4,4.5,1.6,Iris-versicolor
6.7,3.1,4.7,1.5,Iris-versicolor
6.3,2.3,4.4,1.3,Iris-versicolor
5.6,3.0,4.1,1.3,Iris-versicolor
5.5,2.5,4.0,1.3,Iris-versicolor
5.5,2.6,4.4,1.2,Iris-versicolor
6.1,3.0,4.6,1.4,Iris-versicolor
5.8,2.6,4.0,1.2,Iris-versicolor
5.0,2.3,3.3,1.0,Iris-versicolor
5.6,2.7,4.2,1.3,Iris-versicolor
5.7,3.0,4.2,1.2,Iris-versicolor
5.7,2.9,4.2,1.3,Iris-versicolor
6.2,2.9,4.3,1.3,Iris-versicolor
5.1,2.5,3.0,1.1,Iris-versicolor
5.7,2.8,4.1,1.3,Iris-versicolor
6.3,3.3,6.0,2.5,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
7.1,3.0,5.9,2.1,Iris-virginica
6.3,2.9,5.6,1.8,Iris-virginica
6.5,3.0,5.8,2.2,Iris-virginica
7.6,3.0,6.6,2.1,Iris-virginica
4.9,2.5,4.5,1.7,Iris-virginica
7.3,2.9,6.3,1.8,Iris-virginica
6.7,2.5,5.8,1.8,Iris-virginica
7.2,3.6,6.1,2.5,Iris-virginica
6.5,3.2,5.1,2.0,Iris-virginica
6.4,2.7,5.3,1.9,Iris-virginica
6.8,3.0,5.5,2.1,Iris-virginica
5.7,2.5,5.0,2.0,Iris-virginica
5.8,2.8,5.1,2.4,Iris-virginica
6.4,3.2,5.3,2.3,Iris-virginica
6.5,3.0,5.5,1.8,Iris-virginica
7.7,3.8,6.7,2.2,Iris-virginica
7.7,2.6,6.9,2.3,Iris-virginica
6.0,2.2,5.0,1.5,Iris-virginica
6.9,3.2,5.7,2.3,Iris-virginica
5.6,2.8,4.9,2.0,Iris-virginica
7.7,2.8,6.7,2.0,Iris-virginica
6.3,2.7,4.9,1.8,Iris-virginica
6.7,3.3,5.7,2.1,Iris-virginica
7.2,3.2,6.0,1.8,Iris-virginica
6.2,2.8,4.8,1.8,Iris-virginica
6.1,3.0,4.9,1.8,Iris-virginica
6.4,2.8,5.6,2.1,Iris-virginica
7.2,3.0,5.8,1.6,Iris-virginica
7.4,2.8,6.1,1.9,Iris-virginica
7.9,3.8,6.4,2.0,Iris-virginica
6.4,2.8,5.6,2.2,Iris-virginica
6.3,2.8,5.1,1.5,Iris-virginica
6.1,2.6,5.6,1.4,Iris-virginica
7.7,3.0,6.1,2.3,Iris-virginica
6.3,3.4,5.6,2.4,Iris-virginica
6.4,3.1,5.5,1.8,Iris-virginica
6.0,3.0,4.8,1.8,Iris-virginica
6.9,3.1,5.4,2.1,Iris-virginica
6.7,3.1,5.6,2.4,Iris-virginica
6.9,3.1,5.1,2.3,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
6.8,3.2,5.9,2.3,Iris-virginica
6.7,3.3,5.7,2.5,Iris-virginica
6.7,3.0,5.2,2.3,Iris-virginica
6.3,2.5,5.0,1.9,Iris-virginica
6.5,3.0,5.2,2.0,Iris-virginica
6.2,3.4,5.4,2.3,Iris-virginica
5.9,3.0,5.1,1.8,Iris-virginica
%
%
%

BIN
fimdlp/testcpp/main Executable file

Binary file not shown.

57
fimdlp/testcpp/main.cpp Normal file
View File

@@ -0,0 +1,57 @@
#include "ArffFiles.h"
#include <iostream>
#include <vector>
#include <iomanip>
#include "../CPPFImdlp.h"
using namespace std;
int main(int argc, char** argv)
{
ArffFiles file;
vector<string> lines;
string path = "/Users/rmontanana/Code/FImdlp/fimdlp/testcpp/datasets/";
map<string, bool > datasets = {
{"mfeat-factors", true},
{"iris", true},
{"letter", true},
{"kdd_JapaneseVowels", false}
};
if (argc != 2 || datasets.find(argv[1]) == datasets.end()) {
cout << "Usage: " << argv[0] << " {mfeat-factors, iris, letter, kdd_JapaneseVowels}" << endl;
return 1;
}
//file.load("datasets/mfeat-factors.arff", true);
//file.load("/Users/rmontanana/Code/FImdlp/fimdlp/testcpp/datasets/kdd_JapaneseVowels.arff", false);
//file.load("/Users/rmontanana/Code/FImdlp/fimdlp/testcpp/datasets/iris.arff", true);
file.load(path + argv[1] + ".arff", datasets[argv[1]]);
auto attributes = file.getAttributes();
int items = file.getSize();
cout << "Number of lines: " << items << endl;
cout << "Attributes: " << endl;
for (auto attribute : attributes) {
cout << "Name: " << get<0>(attribute) << " Type: " << get<1>(attribute) << endl;
}
cout << "Class name: " << file.getClassName() << endl;
cout << "Class type: " << file.getClassType() << endl;
cout << "Data: " << endl;
vector<vector<float>>& X = file.getX();
vector<int>& y = file.getY();
for (int i = 0; i < 50; i++) {
for (auto feature : X) {
cout << fixed << setprecision(1) << feature[i] << " ";
}
cout << y[i] << endl;
}
mdlp::CPPFImdlp test = mdlp::CPPFImdlp();
for (auto i = 0; i < attributes.size(); i++) {
cout << "Cut points for " << get<0>(attributes[i]) << endl;
cout << "--------------------------" << setprecision(3) << endl;
test.fit(X[i], y);
for (auto item : test.getCutPoints()) {
cout << item << endl;
}
}
return 0;
}

View File

@@ -0,0 +1,111 @@
#include "ArffFiles.h"
#include <fstream>
#include <sstream>
#include <map>
#include <iostream>
using namespace std;
ArffFiles::ArffFiles()
{
}
vector<string> ArffFiles::getLines()
{
return lines;
}
unsigned long int ArffFiles::getSize()
{
return lines.size();
}
vector<tuple<string, string>> ArffFiles::getAttributes()
{
return attributes;
}
string ArffFiles::getClassName()
{
return className;
}
string ArffFiles::getClassType()
{
return classType;
}
vector<vector<float>>& ArffFiles::getX()
{
return X;
}
vector<int>& ArffFiles::getY()
{
return y;
}
// Reads an ARFF file and builds the dataset exposed by getX() / getY().
// The last declared attribute is taken as the class.
//
// Throws invalid_argument if the file cannot be opened or declares no
// attributes. Exceptions raised while converting the data rows propagate
// to the caller.
void ArffFiles::load(string fileName)
{
    ifstream file(fileName);
    if (!file.is_open()) {
        throw invalid_argument("Unable to open file");
    }
    // Start from a clean state so the same object can load several files
    // without accumulating attributes and rows from a previous call.
    lines.clear();
    attributes.clear();
    X.clear();
    y.clear();
    className.clear();
    classType.clear();
    string rawLine;
    while (getline(file, rawLine)) {
        // Trimming removes the '\r' of CRLF files and turns whitespace-only
        // lines into empty ones so they are not mistaken for data rows.
        const string line = trim(rawLine);
        if (line.empty() || line[0] == '%') {
            continue;
        }
        if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
            // Fresh targets per line: a malformed declaration must not
            // inherit the name or type of the previous attribute.
            string keyword, attribute, type;
            stringstream ss(line);
            ss >> keyword >> attribute >> type;
            attributes.push_back(make_tuple(attribute, type));
            continue;
        }
        if (line[0] == '@') {
            // Other headers (@relation, @data, ...) carry nothing we store.
            continue;
        }
        lines.push_back(line);
    }
    file.close();
    if (attributes.empty()) {
        throw invalid_argument("No attributes found");
    }
    // Detach the class attribute from the list of features.
    className = get<0>(attributes.back());
    classType = get<1>(attributes.back());
    attributes.pop_back();
    generateDataset();
}
void ArffFiles::generateDataset()
{
X = vector<vector<float>>(lines.size(), vector<float>(attributes.size()));
vector<string> yy = vector<string>(lines.size(), "");
for (int i = 0; i < lines.size(); i++) {
stringstream ss(lines[i]);
string value;
int j = 0;
while (getline(ss, value, ',')) {
if (j == attributes.size()) {
yy[i] = value;
break;
}
X[i][j] = stof(value);
j++;
}
}
y = factorize(yy);
}
// Returns a copy of source without leading and trailing spaces, tabs,
// carriage returns and line feeds. A string made only of those characters
// yields an empty string.
string ArffFiles::trim(const string& source)
{
    const char* const blanks = " \n\r\t";
    const size_t first = source.find_first_not_of(blanks);
    if (first == string::npos) {
        return string();
    }
    const size_t last = source.find_last_not_of(blanks);
    return source.substr(first, last - first + 1);
}
// Encodes each label as an integer: distinct labels receive consecutive
// codes starting at 0, in order of first appearance. The result has one
// code per input label, in the same order.
vector<int> ArffFiles::factorize(const vector<string>& labels)
{
    vector<int> codes;
    codes.reserve(labels.size());
    map<string, int> codeByLabel;
    int nextCode = 0;
    for (const string& label : labels) {
        // emplace leaves the map untouched when the label is already known.
        const auto entry = codeByLabel.emplace(label, nextCode);
        if (entry.second) {
            ++nextCode;
        }
        codes.push_back(entry.first->second);
    }
    return codes;
}

View File

@@ -0,0 +1,28 @@
#ifndef ARFFFILES_H
#define ARFFFILES_H
#include <string>
#include <vector>
#include <tuple>
using namespace std;
// Loader for ARFF files: keeps the raw data lines, the attribute
// declarations and the numeric dataset built from them.
class ArffFiles {
public:
    ArffFiles();
    // Reads fileName and builds the dataset.
    void load(string fileName);
    // Raw data lines kept by load().
    vector<string> getLines();
    // Number of raw data lines.
    unsigned long int getSize();
    // Name and declared type of the class attribute.
    string getClassName();
    string getClassType();
    // Copy of source without leading/trailing whitespace.
    string trim(const string& source);
    // Feature matrix and encoded labels, returned by reference.
    vector<vector<float>>& getX();
    vector<int>& getY();
    // (name, type) pairs of the non-class attributes.
    vector<tuple<string, string>> getAttributes();
    // Maps every label to an integer code.
    vector<int> factorize(const vector<string>& labels);
private:
    vector<string> lines;
    vector<tuple<string, string>> attributes;
    string className, classType;
    vector<vector<float>> X;
    vector<int> y;
    // Fills X and y from lines.
    void generateDataset();
};
#endif

View File

@@ -0,0 +1,6 @@
# Builds the ARFF loader sample executable.
cmake_minimum_required(VERSION 3.24)
# Only a C++ toolchain is needed; do not probe for a C compiler.
project(main LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
# Fail at configure time instead of silently falling back to an older standard.
set(CMAKE_CXX_STANDARD_REQUIRED ON)
add_executable(main main.cpp ArffFiles.cpp)

View File

@@ -0,0 +1,30 @@
#include "ArffFiles.h"
#include <iostream>
#include <vector>
#include <iomanip>
using namespace std;
int main(int argc, char **argv) {
ArffFiles file;
vector<string> lines;
//file.load("datasets/mfeat-factors.arff");
file.load("/Users/rmontanana/Code/FImdlp/fimdlp/testcpp/datasets/mfeat-factors.arff");
cout << "Number of lines: " << file.getSize() << endl;
cout << "Attributes: " << endl;
for (auto attribute: file.getAttributes()) {
cout << "Name: " << get<0>(attribute) << " Type: " << get<1>(attribute) << endl;
}
cout << "Class name: " << file.getClassName() << endl;
cout << "Class type: " << file.getClassType() << endl;
cout << "Data: " << endl;
vector<vector<float>> &X = file.getX();
vector<int> &y = file.getY();
for (int i = 0; i < X.size(); i++) {
for (float value: X[i]) {
cout << fixed << setprecision(1) << value << " ";
}
cout << y[i] << endl;
}
return 0;
}

View File

@@ -4,7 +4,6 @@ from fimdlp.cppfimdlp import CFImdlp
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import time
from math import log2
from scipy.io import arff
import pandas as pd
@@ -44,65 +43,3 @@ print(test.get_cut_points())
clf = RandomForestClassifier(random_state=0)
print(clf.fit(Xt, y).score(Xt, y))
print(Xt)
# for proposal in [True, False]:
# X = data.data
# y = data.target
# print("*** Proposal: ", proposal)
# test = CFImdlp(debug=True, proposal=proposal)
# test.fit(X[:, 0], y)
# result = test.get_cut_points()
# for item in result:
# print(
# f"Class={item['classNumber']} - ({item['start']:3d}, "
# f"{item['end']:3d}) -> ({item['fromValue']:3.1f}, "
# f"{item['toValue']:3.1f}]"
# )
# print(test.get_discretized_values())
# print("+" * 40)
# X = np.array(
# [
# [5.1, 3.5, 1.4, 0.2],
# [5.2, 3.0, 1.4, 0.2],
# [5.3, 3.2, 1.3, 0.2],
# [5.4, 3.1, 1.5, 0.2],
# ]
# )
# y = np.array([0, 0, 0, 1])
# print(test.fit(X[:, 0], y).transform(X[:, 0]))
# result = test.get_cut_points()
# for item in result:
# print(
# f"Class={item['classNumber']} - ({item['start']:3d}, {item['end']:3d})"
# f" -> ({item['fromValue']:3.1f}, {item['toValue']:3.1f}]"
# )
# print("*" * 40)
# # print(Xs, ys)
# # print("**********************")
# # test = [(0, 3), (4, 4), (5, 5), (6, 8), (9, 9)]
# # print(ys)
# # for start, end in test:
# # print("Testing ", start, end, ys[:end], ys[end:])
# # print("Information gain: ", information_gain(ys, ys[:end], ys[end:]))
# # print(test.transform(X))
# # print(X)
# # print(indices)
# # print(np.array(X)[indices])
# # # k = test.cut_points(X[:, 0], y)
# # # print(k)
# # # k = test.cut_points_ant(X[:, 0], y)
# # # print(k)
# # # test.debug_points(X[:, 0], y)
# # X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9]
# # y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
# # indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7]
# # clf = CFImdlp(debug=True, proposal=False)
# # clf.fit(X, y)
# # print(clf.get_cut_points())
# # y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
# # # To check
# # indices2 = np.argsort(X)
# # Xs = np.array(X)[indices2]
# # ys = np.array(y)[indices2]
# kdd_JapaneseVowels