Refactor project structure and add Arff load and test

This commit is contained in:
2022-12-09 16:35:58 +01:00
parent e4cf72d0fe
commit 65de064fa9
20 changed files with 783 additions and 253 deletions

View File

@@ -19,6 +19,7 @@ namespace mdlp {
{ {
X = X_; X = X_;
y = y_; y = y_;
cutPoints.clear();
if (X.size() != y.size()) { if (X.size() != y.size()) {
throw invalid_argument("X and y must have the same size"); throw invalid_argument("X and y must have the same size");
} }

View File

@@ -1,6 +1,5 @@
#include "Metrics.h" #include "Metrics.h"
#include <set> #include <set>
#include <iostream>
using namespace std; using namespace std;
namespace mdlp { namespace mdlp {
Metrics::Metrics(labels& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t()) Metrics::Metrics(labels& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t())
@@ -19,6 +18,8 @@ namespace mdlp {
indices = indices_; indices = indices_;
y = y_; y = y_;
numClasses = computeNumClasses(0, indices.size()); numClasses = computeNumClasses(0, indices.size());
entropyCache.clear();
igCache.clear();
} }
precision_t Metrics::entropy(size_t start, size_t end) precision_t Metrics::entropy(size_t start, size_t end)
{ {
@@ -50,7 +51,6 @@ namespace mdlp {
int nElementsLeft = cut - start, nElementsRight = end - cut; int nElementsLeft = cut - start, nElementsRight = end - cut;
int nElements = end - start; int nElements = end - start;
if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) { if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) {
cout << "**********Cache IG hit for " << start << " " << end << endl;
return igCache[make_tuple(start, cut, end)]; return igCache[make_tuple(start, cut, end)];
} }
entropyInterval = entropy(start, end); entropyInterval = entropy(start, end);
@@ -62,13 +62,3 @@ namespace mdlp {
} }
} }
/*
cache_t entropyCache;
std::map<std::tuple<int, int>, double> c;
// Set the value at index (3, 5) to 7.8.
c[std::make_tuple(3, 5)] = 7.8;
// Print the value at index (3, 5).
std::cout << c[std::make_tuple(3, 5)] << std::endl;
*/

View File

@@ -0,0 +1,117 @@
#include "ArffFiles.h"
#include <fstream>
#include <sstream>
#include <map>
#include <iostream>
using namespace std;
/// Creates an empty loader; every member starts default-constructed.
ArffFiles::ArffFiles() = default;
/// Returns a copy of the raw data lines kept by the last load().
vector<string> ArffFiles::getLines()
{
    return this->lines;
}
unsigned long int ArffFiles::getSize()
{
return lines.size();
}
/// Returns a copy of the stored (name, type) attribute pairs.
vector<tuple<string, string>> ArffFiles::getAttributes()
{
    return this->attributes;
}
/// Returns the name of the attribute selected as the class by load().
string ArffFiles::getClassName()
{
    return this->className;
}
/// Returns the type token of the attribute selected as the class by load().
string ArffFiles::getClassType()
{
    return this->classType;
}
/// Returns a mutable reference to the feature matrix built by generateDataset().
vector<vector<float>>& ArffFiles::getX()
{
    return this->X;
}
/// Returns a mutable reference to the integer-encoded labels.
vector<int>& ArffFiles::getY()
{
    return this->y;
}
/**
 * Reads an ARFF file, stores its attribute declarations and raw data lines,
 * separates the class attribute and builds X / y through generateDataset().
 *
 * Lines that are empty, equal to "\r" or " ", or start with '%' are ignored.
 * Lines containing "@attribute" / "@ATTRIBUTE" contribute a (name, type)
 * pair; any other line starting with '@' is skipped; everything else is kept
 * as a data line.
 *
 * @param fileName  path of the ARFF file to read.
 * @param classLast true when the class is the last declared attribute,
 *                  false when it is the first one.
 * @throws invalid_argument if the file cannot be opened or declares no
 *         attributes.
 */
void ArffFiles::load(string fileName, bool classLast)
{
    ifstream file(fileName);
    if (!file.is_open()) {
        throw invalid_argument("Unable to open file");
    }
    // Drop whatever a previous load() left behind; otherwise a second call on
    // the same object would append to the earlier dataset.
    lines.clear();
    attributes.clear();
    string line;
    while (getline(file, line)) {
        if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
            continue;
        }
        if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
            // Fresh strings per declaration, so a truncated line cannot
            // inherit the name or type of the previous attribute.
            string keyword, attribute, type;
            stringstream ss(line);
            ss >> keyword >> attribute >> type;
            attributes.push_back(make_tuple(attribute, type));
            continue;
        }
        if (line[0] == '@') {
            continue;
        }
        lines.push_back(line);
    }
    file.close();
    if (attributes.empty()) {
        throw invalid_argument("No attributes found");
    }
    // Move the class attribute out of the feature list.
    if (classLast) {
        className = get<0>(attributes.back());
        classType = get<1>(attributes.back());
        attributes.pop_back();
    } else {
        className = get<0>(attributes.front());
        classType = get<1>(attributes.front());
        attributes.erase(attributes.begin());
    }
    generateDataset(classLast);
}
void ArffFiles::generateDataset(bool classLast)
{
X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
vector<string> yy = vector<string>(lines.size(), "");
int labelIndex = classLast ? attributes.size() : 0;
for (int i = 0; i < lines.size(); i++) {
stringstream ss(lines[i]);
string value;
int pos = 0, xIndex = 0;
while (getline(ss, value, ',')) {
if (pos++ == labelIndex) {
yy[i] = value;
} else {
X[xIndex++][i] = stof(value);
}
}
}
y = factorize(yy);
}
/**
 * Returns a copy of source without leading and trailing blanks
 * (space, newline, carriage return, tab). A string made only of blanks
 * yields an empty string.
 */
string ArffFiles::trim(const string& source)
{
    const char* const blanks = " \n\r\t";
    const string::size_type first = source.find_first_not_of(blanks);
    if (first == string::npos) {
        return string();
    }
    const string::size_type last = source.find_last_not_of(blanks);
    return source.substr(first, last - first + 1);
}
/**
 * Encodes string labels as integers: each distinct label receives the next
 * free code (0, 1, 2, ...) in order of first appearance.
 *
 * @param labels labels to encode.
 * @return one integer code per input label, in the same order.
 */
vector<int> ArffFiles::factorize(const vector<string>& labels)
{
    map<string, int> codes;
    vector<int> encoded;
    encoded.reserve(labels.size());
    for (const string& label : labels) {
        auto found = codes.find(label);
        if (found == codes.end()) {
            // The number of labels seen so far is the next unused code.
            const int next = static_cast<int>(codes.size());
            found = codes.emplace(label, next).first;
        }
        encoded.push_back(found->second);
    }
    return encoded;
}

View File

@@ -0,0 +1,28 @@
#ifndef ARFFFILES_H
#define ARFFFILES_H
#include <string>
#include <vector>
#include <tuple>
using namespace std;
// Loader for ARFF datasets: keeps the attribute declarations and raw data
// lines of a file and exposes the parsed features (X) and encoded labels (y).
class ArffFiles {
private:
// Raw data lines of the file (comment, blank and '@' lines excluded).
vector<string> lines;
// (name, type) of every feature attribute; the class attribute is removed by load().
vector<tuple<string, string>> attributes;
// Name and type token of the attribute used as class.
string className, classType;
// Feature values, one inner vector per attribute (attributes x lines).
vector<vector<float>> X;
// Integer-encoded class label of every data line.
vector<int> y;
// Fills X and y from `lines`; the flag tells whether the class is the last field.
void generateDataset(bool);
public:
ArffFiles();
// Reads the file at the given path; the flag (default true) tells whether
// the class is the last attribute (true) or the first one (false).
void load(string, bool = true);
// Copy of the raw data lines.
vector<string> getLines();
// Number of data lines.
unsigned long int getSize();
string getClassName();
string getClassType();
// Copy of the argument without leading/trailing space, '\n', '\r', '\t'.
string trim(const string&);
// Mutable references to the parsed dataset.
vector<vector<float>>& getX();
vector<int>& getY();
// Copy of the feature attribute (name, type) pairs.
vector<tuple<string, string>> getAttributes();
// Maps each distinct label to an integer in order of first appearance.
vector<int> factorize(const vector<string>& labels);
};
#endif

View File

@@ -1,177 +1,177 @@
#include "gtest/gtest.h" //#include "gtest/gtest.h"
#include "../Metrics.h" //#include "../Metrics.h"
#include "../CPPFImdlp.h" //#include "../CPPFImdlp.h"
namespace mdlp { //namespace mdlp {
class TestFImdlp : public CPPFImdlp, public testing::Test { // class TestFImdlp : public CPPFImdlp, public testing::Test {
public: // public:
TestFImdlp() : CPPFImdlp(true, 6, true) {} // TestFImdlp() : CPPFImdlp(true, true) {}
void SetUp() // void SetUp()
{ // {
// 5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0] // // 5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0]
//(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2) // //(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2)
X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; // X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; // y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
fit(X, y); // fit(X, y);
} // }
void setProposal(bool value) // void setProposal(bool value)
{ // {
proposal = value; // proposal = value;
} // }
void initCutPoints() // void initCutPoints()
{ // {
setCutPoints(cutPoints_t()); // setCutPoints(cutPoints_t());
} // }
void initIndices() // void initIndices()
{ // {
indices = indices_t(); // indices = indices_t();
} // }
void initDiscretized() // void initDiscretized()
{ // {
xDiscretized = labels(); // xDiscretized = labels();
} // }
void checkSortedVector(samples& X_, indices_t indices_) // void checkSortedVector(samples& X_, indices_t indices_)
{ // {
X = X_; // X = X_;
indices = indices_; // indices = indices_;
indices_t testSortedIndices = sortIndices(X); // indices_t testSortedIndices = sortIndices(X);
precision_t prev = X[testSortedIndices[0]]; // precision_t prev = X[testSortedIndices[0]];
for (auto i = 0; i < X.size(); ++i) { // for (auto i = 0; i < X.size(); ++i) {
EXPECT_EQ(testSortedIndices[i], indices[i]); // EXPECT_EQ(testSortedIndices[i], indices[i]);
EXPECT_LE(prev, X[testSortedIndices[i]]); // EXPECT_LE(prev, X[testSortedIndices[i]]);
prev = X[testSortedIndices[i]]; // prev = X[testSortedIndices[i]];
} // }
} // }
void checkCutPoints(cutPoints_t& expected) // void checkCutPoints(cutPoints_t& expected)
{ // {
int expectedSize = expected.size(); // int expectedSize = expected.size();
EXPECT_EQ(cutPoints.size(), expectedSize); // EXPECT_EQ(cutPoints.size(), expectedSize);
for (auto i = 0; i < expectedSize; i++) { // for (auto i = 0; i < expectedSize; i++) {
EXPECT_EQ(cutPoints[i].start, expected[i].start); // EXPECT_EQ(cutPoints[i].start, expected[i].start);
EXPECT_EQ(cutPoints[i].end, expected[i].end); // EXPECT_EQ(cutPoints[i].end, expected[i].end);
EXPECT_EQ(cutPoints[i].classNumber, expected[i].classNumber); // EXPECT_EQ(cutPoints[i].classNumber, expected[i].classNumber);
EXPECT_NEAR(cutPoints[i].fromValue, expected[i].fromValue, precision); // EXPECT_NEAR(cutPoints[i].fromValue, expected[i].fromValue, precision);
EXPECT_NEAR(cutPoints[i].toValue, expected[i].toValue, precision); // EXPECT_NEAR(cutPoints[i].toValue, expected[i].toValue, precision);
} // }
} // }
template<typename T, typename A> // template<typename T, typename A>
void checkVectors(std::vector<T, A> const& expected, std::vector<T, A> const& computed) // void checkVectors(std::vector<T, A> const& expected, std::vector<T, A> const& computed)
{ // {
EXPECT_EQ(expected.size(), computed.size()); // EXPECT_EQ(expected.size(), computed.size());
for (auto i = 0; i < expected.size(); i++) { // for (auto i = 0; i < expected.size(); i++) {
EXPECT_EQ(expected[i], computed[i]); // EXPECT_EQ(expected[i], computed[i]);
} // }
} // }
//
}; // };
TEST_F(TestFImdlp, FitErrorEmptyDataset) // TEST_F(TestFImdlp, FitErrorEmptyDataset)
{ // {
X = samples(); // X = samples();
y = labels(); // y = labels();
EXPECT_THROW(fit(X, y), std::invalid_argument); // EXPECT_THROW(fit(X, y), std::invalid_argument);
} // }
TEST_F(TestFImdlp, FitErrorDifferentSize) // TEST_F(TestFImdlp, FitErrorDifferentSize)
{ // {
X = { 1, 2, 3 }; // X = { 1, 2, 3 };
y = { 1, 2 }; // y = { 1, 2 };
EXPECT_THROW(fit(X, y), std::invalid_argument); // EXPECT_THROW(fit(X, y), std::invalid_argument);
} // }
TEST_F(TestFImdlp, SortIndices) // TEST_F(TestFImdlp, SortIndices)
{ // {
X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; // X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
indices = { 4, 3, 6, 8, 2, 1, 5, 0, 9, 7 }; // indices = { 4, 3, 6, 8, 2, 1, 5, 0, 9, 7 };
checkSortedVector(X, indices); // checkSortedVector(X, indices);
X = { 5.77, 5.88, 5.99 }; // X = { 5.77, 5.88, 5.99 };
indices = { 0, 1, 2 }; // indices = { 0, 1, 2 };
checkSortedVector(X, indices); // checkSortedVector(X, indices);
X = { 5.33, 5.22, 5.11 }; // X = { 5.33, 5.22, 5.11 };
indices = { 2, 1, 0 }; // indices = { 2, 1, 0 };
checkSortedVector(X, indices); // checkSortedVector(X, indices);
} // }
TEST_F(TestFImdlp, EvaluateCutPoint) // TEST_F(TestFImdlp, EvaluateCutPoint)
{ // {
cutPoint_t rest, candidate; // cutPoint_t rest, candidate;
rest = { 0, 10, -1, -1, 1000 }; // rest = { 0, 10, -1, -1, 1000 };
candidate = { 0, 4, -1, -1, 5.15 }; // candidate = { 0, 4, -1, -1, 5.15 };
EXPECT_FALSE(evaluateCutPoint(rest, candidate)); // EXPECT_FALSE(evaluateCutPoint(rest, candidate));
} // }
TEST_F(TestFImdlp, ComputeCutPointsOriginal) // TEST_F(TestFImdlp, ComputeCutPointsOriginal)
{ // {
cutPoints_t expected; // cutPoints_t expected;
expected = { // expected = {
{ 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 }, // { 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 },
{ 6, 10, -1, 5.45, 3.4028234663852886e+38 } // { 6, 10, -1, 5.45, 3.4028234663852886e+38 }
}; // };
setCutPoints(cutPoints_t()); // setCutPoints(cutPoints_t());
computeCutPointsOriginal(); // computeCutPointsOriginal();
checkCutPoints(expected); // checkCutPoints(expected);
} // }
TEST_F(TestFImdlp, ComputeCutPointsOriginalGCase) // TEST_F(TestFImdlp, ComputeCutPointsOriginalGCase)
{ // {
cutPoints_t expected; // cutPoints_t expected;
expected = { // expected = {
{ 0, 4, -1, -3.4028234663852886e+38, 3.4028234663852886e+38 }, // { 0, 4, -1, -3.4028234663852886e+38, 3.4028234663852886e+38 },
}; // };
X = { 0, 1, 2, 2 }; // X = { 0, 1, 2, 2 };
y = { 1, 1, 1, 2 }; // y = { 1, 1, 1, 2 };
fit(X, y); // fit(X, y);
computeCutPointsOriginal(); // computeCutPointsOriginal();
checkCutPoints(expected); // checkCutPoints(expected);
} // }
TEST_F(TestFImdlp, ComputeCutPointsProposal) // TEST_F(TestFImdlp, ComputeCutPointsProposal)
{ // {
cutPoints_t expected; // cutPoints_t expected;
expected = { // expected = {
{ 0, 4, -1, -3.4028234663852886e+38, 5.1 }, { 4, 6, -1, 5.1, 5.4 }, // { 0, 4, -1, -3.4028234663852886e+38, 5.1 }, { 4, 6, -1, 5.1, 5.4 },
{ 6, 9, -1, 5.4, 5.85 }, // { 6, 9, -1, 5.4, 5.85 },
{ 9, 10, -1, 5.85, 3.4028234663852886e+38 } // { 9, 10, -1, 5.85, 3.4028234663852886e+38 }
}; // };
computeCutPointsProposal(); // computeCutPointsProposal();
checkCutPoints(expected); // checkCutPoints(expected);
} // }
TEST_F(TestFImdlp, ComputeCutPointsProposalGCase) // TEST_F(TestFImdlp, ComputeCutPointsProposalGCase)
{ // {
cutPoints_t expected; // cutPoints_t expected;
expected = { // expected = {
{ 0, 3, -1, -3.4028234663852886e+38, 1.5 }, // { 0, 3, -1, -3.4028234663852886e+38, 1.5 },
{ 3, 4, -1, 1.5, 3.4028234663852886e+38 } // { 3, 4, -1, 1.5, 3.4028234663852886e+38 }
}; // };
X = { 0, 1, 2, 2 }; // X = { 0, 1, 2, 2 };
y = { 1, 1, 1, 2 }; // y = { 1, 1, 1, 2 };
fit(X, y); // fit(X, y);
computeCutPointsProposal(); // computeCutPointsProposal();
checkCutPoints(expected); // checkCutPoints(expected);
} // }
TEST_F(TestFImdlp, DiscretizedValues) // TEST_F(TestFImdlp, DiscretizedValues)
{ // {
labels computed, expected = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; // labels computed, expected = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
computed = getDiscretizedValues(); // computed = getDiscretizedValues();
checkVectors(expected, computed); // checkVectors(expected, computed);
} // }
TEST_F(TestFImdlp, GetCutPoints) // TEST_F(TestFImdlp, GetCutPoints)
{ // {
samples computed, expected = { 5.15, 5.45, 3.4028234663852886e+38 }; // samples computed, expected = { 5.15, 5.45, 3.4028234663852886e+38 };
computeCutPointsOriginal(); // computeCutPointsOriginal();
computed = getCutPoints(); // computed = getCutPoints();
checkVectors(expected, computed); // checkVectors(expected, computed);
} // }
TEST_F(TestFImdlp, Constructor) // TEST_F(TestFImdlp, Constructor)
{ // {
samples X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; // samples X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
labels y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; // labels y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
setProposal(false); // setProposal(false);
fit(X, y); // fit(X, y);
computeCutPointsOriginal(); // computeCutPointsOriginal();
cutPoints_t expected; // cutPoints_t expected;
vector<precision_t> computed = getCutPoints(); // vector<precision_t> computed = getCutPoints();
expected = { // expected = {
{ 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 }, // { 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 },
{ 6, 10, -1, 5.45, 3.4028234663852886e+38 } // { 6, 10, -1, 5.45, 3.4028234663852886e+38 }
}; // };
computed = getCutPoints(); // computed = getCutPoints();
int expectedSize = expected.size(); // int expectedSize = expected.size();
EXPECT_EQ(computed.size(), expected.size()); // EXPECT_EQ(computed.size(), expected.size());
for (auto i = 0; i < expectedSize; i++) { // for (auto i = 0; i < expectedSize; i++) {
EXPECT_NEAR(computed[i], expected[i].toValue, .00000001); // EXPECT_NEAR(computed[i], expected[i].toValue, .00000001);
} // }
} // }
} //}

225
fimdlp/testcpp/datasets/iris.arff Executable file
View File

@@ -0,0 +1,225 @@
% 1. Title: Iris Plants Database
%
% 2. Sources:
% (a) Creator: R.A. Fisher
% (b) Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
% (c) Date: July, 1988
%
% 3. Past Usage:
% - Publications: too many to mention!!! Here are a few.
% 1. Fisher,R.A. "The use of multiple measurements in taxonomic problems"
% Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions
% to Mathematical Statistics" (John Wiley, NY, 1950).
% 2. Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.
% (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.
% 3. Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
% Structure and Classification Rule for Recognition in Partially Exposed
% Environments". IEEE Transactions on Pattern Analysis and Machine
% Intelligence, Vol. PAMI-2, No. 1, 67-71.
% -- Results:
% -- very low misclassification rates (0% for the setosa class)
% 4. Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE
% Transactions on Information Theory, May 1972, 431-433.
% -- Results:
% -- very low misclassification rates again
% 5. See also: 1988 MLC Proceedings, 54-64. Cheeseman et al's AUTOCLASS II
% conceptual clustering system finds 3 classes in the data.
%
% 4. Relevant Information:
% --- This is perhaps the best known database to be found in the pattern
% recognition literature. Fisher's paper is a classic in the field
% and is referenced frequently to this day. (See Duda & Hart, for
% example.) The data set contains 3 classes of 50 instances each,
% where each class refers to a type of iris plant. One class is
% linearly separable from the other 2; the latter are NOT linearly
% separable from each other.
% --- Predicted attribute: class of iris plant.
% --- This is an exceedingly simple domain.
%
% 5. Number of Instances: 150 (50 in each of three classes)
%
% 6. Number of Attributes: 4 numeric, predictive attributes and the class
%
% 7. Attribute Information:
% 1. sepal length in cm
% 2. sepal width in cm
% 3. petal length in cm
% 4. petal width in cm
% 5. class:
% -- Iris Setosa
% -- Iris Versicolour
% -- Iris Virginica
%
% 8. Missing Attribute Values: None
%
% Summary Statistics:
% Min Max Mean SD Class Correlation
% sepal length: 4.3 7.9 5.84 0.83 0.7826
% sepal width: 2.0 4.4 3.05 0.43 -0.4194
% petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)
% petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)
%
% 9. Class Distribution: 33.3% for each of 3 classes.
@RELATION iris
@ATTRIBUTE sepallength REAL
@ATTRIBUTE sepalwidth REAL
@ATTRIBUTE petallength REAL
@ATTRIBUTE petalwidth REAL
@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
@DATA
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
5.4,3.9,1.7,0.4,Iris-setosa
4.6,3.4,1.4,0.3,Iris-setosa
5.0,3.4,1.5,0.2,Iris-setosa
4.4,2.9,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.4,3.7,1.5,0.2,Iris-setosa
4.8,3.4,1.6,0.2,Iris-setosa
4.8,3.0,1.4,0.1,Iris-setosa
4.3,3.0,1.1,0.1,Iris-setosa
5.8,4.0,1.2,0.2,Iris-setosa
5.7,4.4,1.5,0.4,Iris-setosa
5.4,3.9,1.3,0.4,Iris-setosa
5.1,3.5,1.4,0.3,Iris-setosa
5.7,3.8,1.7,0.3,Iris-setosa
5.1,3.8,1.5,0.3,Iris-setosa
5.4,3.4,1.7,0.2,Iris-setosa
5.1,3.7,1.5,0.4,Iris-setosa
4.6,3.6,1.0,0.2,Iris-setosa
5.1,3.3,1.7,0.5,Iris-setosa
4.8,3.4,1.9,0.2,Iris-setosa
5.0,3.0,1.6,0.2,Iris-setosa
5.0,3.4,1.6,0.4,Iris-setosa
5.2,3.5,1.5,0.2,Iris-setosa
5.2,3.4,1.4,0.2,Iris-setosa
4.7,3.2,1.6,0.2,Iris-setosa
4.8,3.1,1.6,0.2,Iris-setosa
5.4,3.4,1.5,0.4,Iris-setosa
5.2,4.1,1.5,0.1,Iris-setosa
5.5,4.2,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.0,3.2,1.2,0.2,Iris-setosa
5.5,3.5,1.3,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
4.4,3.0,1.3,0.2,Iris-setosa
5.1,3.4,1.5,0.2,Iris-setosa
5.0,3.5,1.3,0.3,Iris-setosa
4.5,2.3,1.3,0.3,Iris-setosa
4.4,3.2,1.3,0.2,Iris-setosa
5.0,3.5,1.6,0.6,Iris-setosa
5.1,3.8,1.9,0.4,Iris-setosa
4.8,3.0,1.4,0.3,Iris-setosa
5.1,3.8,1.6,0.2,Iris-setosa
4.6,3.2,1.4,0.2,Iris-setosa
5.3,3.7,1.5,0.2,Iris-setosa
5.0,3.3,1.4,0.2,Iris-setosa
7.0,3.2,4.7,1.4,Iris-versicolor
6.4,3.2,4.5,1.5,Iris-versicolor
6.9,3.1,4.9,1.5,Iris-versicolor
5.5,2.3,4.0,1.3,Iris-versicolor
6.5,2.8,4.6,1.5,Iris-versicolor
5.7,2.8,4.5,1.3,Iris-versicolor
6.3,3.3,4.7,1.6,Iris-versicolor
4.9,2.4,3.3,1.0,Iris-versicolor
6.6,2.9,4.6,1.3,Iris-versicolor
5.2,2.7,3.9,1.4,Iris-versicolor
5.0,2.0,3.5,1.0,Iris-versicolor
5.9,3.0,4.2,1.5,Iris-versicolor
6.0,2.2,4.0,1.0,Iris-versicolor
6.1,2.9,4.7,1.4,Iris-versicolor
5.6,2.9,3.6,1.3,Iris-versicolor
6.7,3.1,4.4,1.4,Iris-versicolor
5.6,3.0,4.5,1.5,Iris-versicolor
5.8,2.7,4.1,1.0,Iris-versicolor
6.2,2.2,4.5,1.5,Iris-versicolor
5.6,2.5,3.9,1.1,Iris-versicolor
5.9,3.2,4.8,1.8,Iris-versicolor
6.1,2.8,4.0,1.3,Iris-versicolor
6.3,2.5,4.9,1.5,Iris-versicolor
6.1,2.8,4.7,1.2,Iris-versicolor
6.4,2.9,4.3,1.3,Iris-versicolor
6.6,3.0,4.4,1.4,Iris-versicolor
6.8,2.8,4.8,1.4,Iris-versicolor
6.7,3.0,5.0,1.7,Iris-versicolor
6.0,2.9,4.5,1.5,Iris-versicolor
5.7,2.6,3.5,1.0,Iris-versicolor
5.5,2.4,3.8,1.1,Iris-versicolor
5.5,2.4,3.7,1.0,Iris-versicolor
5.8,2.7,3.9,1.2,Iris-versicolor
6.0,2.7,5.1,1.6,Iris-versicolor
5.4,3.0,4.5,1.5,Iris-versicolor
6.0,3.4,4.5,1.6,Iris-versicolor
6.7,3.1,4.7,1.5,Iris-versicolor
6.3,2.3,4.4,1.3,Iris-versicolor
5.6,3.0,4.1,1.3,Iris-versicolor
5.5,2.5,4.0,1.3,Iris-versicolor
5.5,2.6,4.4,1.2,Iris-versicolor
6.1,3.0,4.6,1.4,Iris-versicolor
5.8,2.6,4.0,1.2,Iris-versicolor
5.0,2.3,3.3,1.0,Iris-versicolor
5.6,2.7,4.2,1.3,Iris-versicolor
5.7,3.0,4.2,1.2,Iris-versicolor
5.7,2.9,4.2,1.3,Iris-versicolor
6.2,2.9,4.3,1.3,Iris-versicolor
5.1,2.5,3.0,1.1,Iris-versicolor
5.7,2.8,4.1,1.3,Iris-versicolor
6.3,3.3,6.0,2.5,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
7.1,3.0,5.9,2.1,Iris-virginica
6.3,2.9,5.6,1.8,Iris-virginica
6.5,3.0,5.8,2.2,Iris-virginica
7.6,3.0,6.6,2.1,Iris-virginica
4.9,2.5,4.5,1.7,Iris-virginica
7.3,2.9,6.3,1.8,Iris-virginica
6.7,2.5,5.8,1.8,Iris-virginica
7.2,3.6,6.1,2.5,Iris-virginica
6.5,3.2,5.1,2.0,Iris-virginica
6.4,2.7,5.3,1.9,Iris-virginica
6.8,3.0,5.5,2.1,Iris-virginica
5.7,2.5,5.0,2.0,Iris-virginica
5.8,2.8,5.1,2.4,Iris-virginica
6.4,3.2,5.3,2.3,Iris-virginica
6.5,3.0,5.5,1.8,Iris-virginica
7.7,3.8,6.7,2.2,Iris-virginica
7.7,2.6,6.9,2.3,Iris-virginica
6.0,2.2,5.0,1.5,Iris-virginica
6.9,3.2,5.7,2.3,Iris-virginica
5.6,2.8,4.9,2.0,Iris-virginica
7.7,2.8,6.7,2.0,Iris-virginica
6.3,2.7,4.9,1.8,Iris-virginica
6.7,3.3,5.7,2.1,Iris-virginica
7.2,3.2,6.0,1.8,Iris-virginica
6.2,2.8,4.8,1.8,Iris-virginica
6.1,3.0,4.9,1.8,Iris-virginica
6.4,2.8,5.6,2.1,Iris-virginica
7.2,3.0,5.8,1.6,Iris-virginica
7.4,2.8,6.1,1.9,Iris-virginica
7.9,3.8,6.4,2.0,Iris-virginica
6.4,2.8,5.6,2.2,Iris-virginica
6.3,2.8,5.1,1.5,Iris-virginica
6.1,2.6,5.6,1.4,Iris-virginica
7.7,3.0,6.1,2.3,Iris-virginica
6.3,3.4,5.6,2.4,Iris-virginica
6.4,3.1,5.5,1.8,Iris-virginica
6.0,3.0,4.8,1.8,Iris-virginica
6.9,3.1,5.4,2.1,Iris-virginica
6.7,3.1,5.6,2.4,Iris-virginica
6.9,3.1,5.1,2.3,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
6.8,3.2,5.9,2.3,Iris-virginica
6.7,3.3,5.7,2.5,Iris-virginica
6.7,3.0,5.2,2.3,Iris-virginica
6.3,2.5,5.0,1.9,Iris-virginica
6.5,3.0,5.2,2.0,Iris-virginica
6.2,3.4,5.4,2.3,Iris-virginica
5.9,3.0,5.1,1.8,Iris-virginica
%
%
%

BIN
fimdlp/testcpp/main Executable file

Binary file not shown.

57
fimdlp/testcpp/main.cpp Normal file
View File

@@ -0,0 +1,57 @@
#include "ArffFiles.h"
#include <iostream>
#include <vector>
#include <iomanip>
#include "../CPPFImdlp.h"
using namespace std;
int main(int argc, char** argv)
{
ArffFiles file;
vector<string> lines;
string path = "/Users/rmontanana/Code/FImdlp/fimdlp/testcpp/datasets/";
map<string, bool > datasets = {
{"mfeat-factors", true},
{"iris", true},
{"letter", true},
{"kdd_JapaneseVowels", false}
};
if (argc != 2 || datasets.find(argv[1]) == datasets.end()) {
cout << "Usage: " << argv[0] << " {mfeat-factors, iris, letter, kdd_JapaneseVowels}" << endl;
return 1;
}
//file.load("datasets/mfeat-factors.arff", true);
//file.load("/Users/rmontanana/Code/FImdlp/fimdlp/testcpp/datasets/kdd_JapaneseVowels.arff", false);
//file.load("/Users/rmontanana/Code/FImdlp/fimdlp/testcpp/datasets/iris.arff", true);
file.load(path + argv[1] + ".arff", datasets[argv[1]]);
auto attributes = file.getAttributes();
int items = file.getSize();
cout << "Number of lines: " << items << endl;
cout << "Attributes: " << endl;
for (auto attribute : attributes) {
cout << "Name: " << get<0>(attribute) << " Type: " << get<1>(attribute) << endl;
}
cout << "Class name: " << file.getClassName() << endl;
cout << "Class type: " << file.getClassType() << endl;
cout << "Data: " << endl;
vector<vector<float>>& X = file.getX();
vector<int>& y = file.getY();
for (int i = 0; i < 50; i++) {
for (auto feature : X) {
cout << fixed << setprecision(1) << feature[i] << " ";
}
cout << y[i] << endl;
}
mdlp::CPPFImdlp test = mdlp::CPPFImdlp();
for (auto i = 0; i < attributes.size(); i++) {
cout << "Cut points for " << get<0>(attributes[i]) << endl;
cout << "--------------------------" << setprecision(3) << endl;
test.fit(X[i], y);
for (auto item : test.getCutPoints()) {
cout << item << endl;
}
}
return 0;
}

View File

@@ -0,0 +1,111 @@
#include "ArffFiles.h"
#include <fstream>
#include <sstream>
#include <map>
#include <iostream>
using namespace std;
/// Creates an empty loader; every member starts default-constructed.
ArffFiles::ArffFiles() = default;
/// Returns a copy of the raw data lines kept by the last load().
vector<string> ArffFiles::getLines()
{
    return this->lines;
}
unsigned long int ArffFiles::getSize()
{
return lines.size();
}
/// Returns a copy of the stored (name, type) attribute pairs.
vector<tuple<string, string>> ArffFiles::getAttributes()
{
    return this->attributes;
}
/// Returns the name of the attribute selected as the class by load().
string ArffFiles::getClassName()
{
    return this->className;
}
/// Returns the type token of the attribute selected as the class by load().
string ArffFiles::getClassType()
{
    return this->classType;
}
/// Returns a mutable reference to the feature matrix built by generateDataset().
vector<vector<float>>& ArffFiles::getX()
{
    return this->X;
}
/// Returns a mutable reference to the integer-encoded labels.
vector<int>& ArffFiles::getY()
{
    return this->y;
}
/**
 * Reads an ARFF file, stores its attribute declarations and raw data lines,
 * takes the last declared attribute as the class and builds X / y through
 * generateDataset().
 *
 * Lines that are empty, equal to "\r" or " ", or start with '%' are ignored.
 * Lines containing "@attribute" / "@ATTRIBUTE" contribute a (name, type)
 * pair; any other line starting with '@' is skipped; everything else is kept
 * as a data line.
 *
 * @param fileName path of the ARFF file to read.
 * @throws invalid_argument if the file cannot be opened or declares no
 *         attributes.
 */
void ArffFiles::load(string fileName)
{
    ifstream file(fileName);
    if (!file.is_open()) {
        throw invalid_argument("Unable to open file");
    }
    // Drop whatever a previous load() left behind; otherwise a second call on
    // the same object would append to the earlier dataset.
    lines.clear();
    attributes.clear();
    string line;
    while (getline(file, line)) {
        if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
            continue;
        }
        if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
            // Fresh strings per declaration, so a truncated line cannot
            // inherit the name or type of the previous attribute.
            string keyword, attribute, type;
            stringstream ss(line);
            ss >> keyword >> attribute >> type;
            attributes.push_back(make_tuple(attribute, type));
            continue;
        }
        if (line[0] == '@') {
            continue;
        }
        lines.push_back(line);
    }
    file.close();
    if (attributes.empty()) {
        throw invalid_argument("No attributes found");
    }
    // The class is the last declared attribute; remove it from the features.
    className = get<0>(attributes.back());
    classType = get<1>(attributes.back());
    attributes.pop_back();
    generateDataset();
}
void ArffFiles::generateDataset()
{
X = vector<vector<float>>(lines.size(), vector<float>(attributes.size()));
vector<string> yy = vector<string>(lines.size(), "");
for (int i = 0; i < lines.size(); i++) {
stringstream ss(lines[i]);
string value;
int j = 0;
while (getline(ss, value, ',')) {
if (j == attributes.size()) {
yy[i] = value;
break;
}
X[i][j] = stof(value);
j++;
}
}
y = factorize(yy);
}
/**
 * Returns a copy of source without leading and trailing blanks
 * (space, newline, carriage return, tab). A string made only of blanks
 * yields an empty string.
 */
string ArffFiles::trim(const string& source)
{
    const char* const blanks = " \n\r\t";
    const string::size_type first = source.find_first_not_of(blanks);
    if (first == string::npos) {
        return string();
    }
    const string::size_type last = source.find_last_not_of(blanks);
    return source.substr(first, last - first + 1);
}
/**
 * Encodes string labels as integers: each distinct label receives the next
 * free code (0, 1, 2, ...) in order of first appearance.
 *
 * @param labels labels to encode.
 * @return one integer code per input label, in the same order.
 */
vector<int> ArffFiles::factorize(const vector<string>& labels)
{
    map<string, int> codes;
    vector<int> encoded;
    encoded.reserve(labels.size());
    for (const string& label : labels) {
        auto found = codes.find(label);
        if (found == codes.end()) {
            // The number of labels seen so far is the next unused code.
            const int next = static_cast<int>(codes.size());
            found = codes.emplace(label, next).first;
        }
        encoded.push_back(found->second);
    }
    return encoded;
}

View File

@@ -0,0 +1,28 @@
#ifndef ARFFFILES_H
#define ARFFFILES_H
#include <string>
#include <vector>
#include <tuple>
using namespace std;
// Loader for ARFF datasets: keeps the attribute declarations and raw data
// lines of a file and exposes the parsed features (X) and encoded labels (y).
class ArffFiles {
private:
// Raw data lines of the file (comment, blank and '@' lines excluded).
vector<string> lines;
// (name, type) of every feature attribute; the class attribute is removed by load().
vector<tuple<string, string>> attributes;
// Name and type token of the attribute used as class (the last one declared).
string className, classType;
// Feature values, one inner vector per data line (lines x attributes).
vector<vector<float>> X;
// Integer-encoded class label of every data line.
vector<int> y;
// Fills X and y from `lines`.
void generateDataset();
public:
ArffFiles();
// Reads the file at the given path and builds the dataset.
void load(string);
// Copy of the raw data lines.
vector<string> getLines();
// Number of data lines.
unsigned long int getSize();
string getClassName();
string getClassType();
// Copy of the argument without leading/trailing space, '\n', '\r', '\t'.
string trim(const string&);
// Mutable references to the parsed dataset.
vector<vector<float>>& getX();
vector<int>& getY();
// Copy of the feature attribute (name, type) pairs.
vector<tuple<string, string>> getAttributes();
// Maps each distinct label to an integer in order of first appearance.
vector<int> factorize(const vector<string>& labels);
};
#endif

View File

@@ -0,0 +1,6 @@
# Build script for the ARFF loader sample program.
cmake_minimum_required(VERSION 3.24)
project(main)
# Compile the sources as C++17.
set(CMAKE_CXX_STANDARD 17)
# Single executable made of the driver and the ARFF reader.
add_executable(main main.cpp ArffFiles.cpp)

View File

@@ -0,0 +1,30 @@
#include "ArffFiles.h"
#include <iostream>
#include <vector>
#include <iomanip>
using namespace std;
int main(int argc, char **argv) {
ArffFiles file;
vector<string> lines;
//file.load("datasets/mfeat-factors.arff");
file.load("/Users/rmontanana/Code/FImdlp/fimdlp/testcpp/datasets/mfeat-factors.arff");
cout << "Number of lines: " << file.getSize() << endl;
cout << "Attributes: " << endl;
for (auto attribute: file.getAttributes()) {
cout << "Name: " << get<0>(attribute) << " Type: " << get<1>(attribute) << endl;
}
cout << "Class name: " << file.getClassName() << endl;
cout << "Class type: " << file.getClassType() << endl;
cout << "Data: " << endl;
vector<vector<float>> &X = file.getX();
vector<int> &y = file.getY();
for (int i = 0; i < X.size(); i++) {
for (float value: X[i]) {
cout << fixed << setprecision(1) << value << " ";
}
cout << y[i] << endl;
}
return 0;
}

View File

@@ -4,7 +4,6 @@ from fimdlp.cppfimdlp import CFImdlp
from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import RandomForestClassifier
import numpy as np import numpy as np
import time import time
from math import log2
from scipy.io import arff from scipy.io import arff
import pandas as pd import pandas as pd
@@ -44,65 +43,3 @@ print(test.get_cut_points())
clf = RandomForestClassifier(random_state=0) clf = RandomForestClassifier(random_state=0)
print(clf.fit(Xt, y).score(Xt, y)) print(clf.fit(Xt, y).score(Xt, y))
print(Xt) print(Xt)
# for proposal in [True, False]:
# X = data.data
# y = data.target
# print("*** Proposal: ", proposal)
# test = CFImdlp(debug=True, proposal=proposal)
# test.fit(X[:, 0], y)
# result = test.get_cut_points()
# for item in result:
# print(
# f"Class={item['classNumber']} - ({item['start']:3d}, "
# f"{item['end']:3d}) -> ({item['fromValue']:3.1f}, "
# f"{item['toValue']:3.1f}]"
# )
# print(test.get_discretized_values())
# print("+" * 40)
# X = np.array(
# [
# [5.1, 3.5, 1.4, 0.2],
# [5.2, 3.0, 1.4, 0.2],
# [5.3, 3.2, 1.3, 0.2],
# [5.4, 3.1, 1.5, 0.2],
# ]
# )
# y = np.array([0, 0, 0, 1])
# print(test.fit(X[:, 0], y).transform(X[:, 0]))
# result = test.get_cut_points()
# for item in result:
# print(
# f"Class={item['classNumber']} - ({item['start']:3d}, {item['end']:3d})"
# f" -> ({item['fromValue']:3.1f}, {item['toValue']:3.1f}]"
# )
# print("*" * 40)
# # print(Xs, ys)
# # print("**********************")
# # test = [(0, 3), (4, 4), (5, 5), (6, 8), (9, 9)]
# # print(ys)
# # for start, end in test:
# # print("Testing ", start, end, ys[:end], ys[end:])
# # print("Information gain: ", information_gain(ys, ys[:end], ys[end:]))
# # print(test.transform(X))
# # print(X)
# # print(indices)
# # print(np.array(X)[indices])
# # # k = test.cut_points(X[:, 0], y)
# # # print(k)
# # # k = test.cut_points_ant(X[:, 0], y)
# # # print(k)
# # # test.debug_points(X[:, 0], y)
# # X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9]
# # y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
# # indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7]
# # clf = CFImdlp(debug=True, proposal=False)
# # clf.fit(X, y)
# # print(clf.get_cut_points())
# # y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
# # # To check
# # indices2 = np.argsort(X)
# # Xs = np.array(X)[indices2]
# # ys = np.array(y)[indices2]
# kdd_JapaneseVowels