diff --git a/fimdlp/CPPFImdlp.cpp b/fimdlp/CPPFImdlp.cpp index 8113c55..e5b6c4e 100644 --- a/fimdlp/CPPFImdlp.cpp +++ b/fimdlp/CPPFImdlp.cpp @@ -19,6 +19,7 @@ namespace mdlp { { X = X_; y = y_; + cutPoints.clear(); if (X.size() != y.size()) { throw invalid_argument("X and y must have the same size"); } diff --git a/fimdlp/Metrics.cpp b/fimdlp/Metrics.cpp index 041ecf4..f2e54f5 100644 --- a/fimdlp/Metrics.cpp +++ b/fimdlp/Metrics.cpp @@ -1,6 +1,5 @@ #include "Metrics.h" #include -#include using namespace std; namespace mdlp { Metrics::Metrics(labels& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t()) @@ -19,6 +18,8 @@ namespace mdlp { indices = indices_; y = y_; numClasses = computeNumClasses(0, indices.size()); + entropyCache.clear(); + igCache.clear(); } precision_t Metrics::entropy(size_t start, size_t end) { @@ -50,7 +51,6 @@ namespace mdlp { int nElementsLeft = cut - start, nElementsRight = end - cut; int nElements = end - start; if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) { - cout << "**********Cache IG hit for " << start << " " << end << endl; return igCache[make_tuple(start, cut, end)]; } entropyInterval = entropy(start, end); @@ -61,14 +61,4 @@ namespace mdlp { return iGain; } -} -/* - cache_t entropyCache; - std::map, double> c; - - // Set the value at index (3, 5) to 7.8. - c[std::make_tuple(3, 5)] = 7.8; - - // Print the value at index (3, 5). - std::cout << c[std::make_tuple(3, 5)] << std::endl; -*/ \ No newline at end of file +} \ No newline at end of file diff --git a/fimdlp/tests/bak/CPPFImdlp.cpp b/fimdlp/bak/CPPFImdlp.cpp similarity index 100% rename from fimdlp/tests/bak/CPPFImdlp.cpp rename to fimdlp/bak/CPPFImdlp.cpp diff --git a/fimdlp/tests/bak/CPPFImdlp.h b/fimdlp/bak/CPPFImdlp.h similarity index 100% rename from fimdlp/tests/bak/CPPFImdlp.h rename to fimdlp/bak/CPPFImdlp.h diff --git a/fimdlp/tests/bak/Metrics.cpp b/fimdlp/bak/Metrics.cpp similarity index 100% rename from fimdlp/tests/bak/Metrics.cpp rename to fimdlp/bak/Metrics.cpp diff --git a/fimdlp/tests/bak/Metrics.h b/fimdlp/bak/Metrics.h similarity index 100% rename from fimdlp/tests/bak/Metrics.h rename to fimdlp/bak/Metrics.h diff --git a/fimdlp/testcpp/ArffFiles.cpp b/fimdlp/testcpp/ArffFiles.cpp new file mode 100644 index 0000000..b8a8928 --- /dev/null +++ b/fimdlp/testcpp/ArffFiles.cpp @@ -0,0 +1,117 @@ +#include "ArffFiles.h" + +#include +#include +#include +#include + +using namespace std; + +ArffFiles::ArffFiles() +{ +} +vector ArffFiles::getLines() +{ + return lines; +} +unsigned long int ArffFiles::getSize() +{ + return lines.size(); +} +vector> ArffFiles::getAttributes() +{ + return attributes; +} +string ArffFiles::getClassName() +{ + return className; +} +string ArffFiles::getClassType() +{ + return classType; +} +vector>& ArffFiles::getX() +{ + return X; +} +vector& ArffFiles::getY() +{ + return y; +} +void ArffFiles::load(string fileName, bool classLast) +{ + ifstream file(fileName); + string keyword, attribute, type; + if (file.is_open()) { + string line; + while (getline(file, line)) { + if (line[0] == '%' || line.empty() || line == "\r" || line == " ") { + continue; + } + if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) { + stringstream ss(line); + ss >> keyword >> attribute >> type; + attributes.push_back(make_tuple(attribute, type)); + continue; + } + if (line[0] == '@') { + continue; + } + lines.push_back(line); + } + file.close(); + if (attributes.empty()) + throw invalid_argument("No attributes found"); + if (classLast) { + className = get<0>(attributes.back()); + classType = get<1>(attributes.back()); + attributes.pop_back(); + } else { + className = get<0>(attributes.front()); + classType = get<1>(attributes.front()); + attributes.erase(attributes.begin()); + } + generateDataset(classLast); + } else + throw invalid_argument("Unable to open file"); +} +void ArffFiles::generateDataset(bool classLast) +{ + X = vector>(attributes.size(), vector(lines.size())); + vector yy = vector(lines.size(), ""); + int labelIndex = classLast ? attributes.size() : 0; + for (int i = 0; i < lines.size(); i++) { + stringstream ss(lines[i]); + string value; + int pos = 0, xIndex = 0; + while (getline(ss, value, ',')) { + if (pos++ == labelIndex) { + yy[i] = value; + } else { + X[xIndex++][i] = stof(value); + } + } + } + y = factorize(yy); +} +string ArffFiles::trim(const string& source) +{ + string s(source); + s.erase(0, s.find_first_not_of(" \n\r\t")); + s.erase(s.find_last_not_of(" \n\r\t") + 1); + return s; +} +vector ArffFiles::factorize(const vector& labels) +{ + vector yy; + yy.reserve(labels.size()); + map labelMap; + int i = 0; + for (string label : labels) { + if (labelMap.find(label) == labelMap.end()) { + labelMap[label] = i++; + } + yy.push_back(labelMap[label]); + } + return yy; +} \ No newline at end of file diff --git a/fimdlp/testcpp/ArffFiles.h b/fimdlp/testcpp/ArffFiles.h new file mode 100644 index 0000000..317ebb5 --- /dev/null +++ b/fimdlp/testcpp/ArffFiles.h @@ -0,0 +1,28 @@ +#ifndef ARFFFILES_H +#define ARFFFILES_H +#include +#include +#include +using namespace std; +class ArffFiles { +private: + vector lines; + vector> attributes; + string className, classType; + vector> X; + vector y; + void generateDataset(bool); +public: + ArffFiles(); + void load(string, bool = true); + vector getLines(); + unsigned long int getSize(); + string getClassName(); + string getClassType(); + string trim(const string&); + vector>& getX(); + vector& getY(); + vector> getAttributes(); + vector factorize(const vector& labels); +}; +#endif \ No newline at end of file diff --git a/fimdlp/testcpp/FImdlp_unittest.cc b/fimdlp/testcpp/FImdlp_unittest.cc index 3bdc69d..173bd41 100644 --- a/fimdlp/testcpp/FImdlp_unittest.cc +++ b/fimdlp/testcpp/FImdlp_unittest.cc @@ -1,177 +1,177 @@ -#include "gtest/gtest.h" -#include "../Metrics.h" -#include "../CPPFImdlp.h" -namespace mdlp { - class TestFImdlp : public CPPFImdlp, public testing::Test { - public: - TestFImdlp() : CPPFImdlp(true, 6, true) {} - void SetUp() - { - // 5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0] - //(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2) - X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; - y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; - fit(X, y); - } - void setProposal(bool value) - { - proposal = value; - } - void initCutPoints() - { - setCutPoints(cutPoints_t()); - } - void initIndices() - { - indices = indices_t(); - } - void initDiscretized() - { - xDiscretized = labels(); - } - void checkSortedVector(samples& X_, indices_t indices_) - { - X = X_; - indices = indices_; - indices_t testSortedIndices = sortIndices(X); - precision_t prev = X[testSortedIndices[0]]; - for (auto i = 0; i < X.size(); ++i) { - EXPECT_EQ(testSortedIndices[i], indices[i]); - EXPECT_LE(prev, X[testSortedIndices[i]]); - prev = X[testSortedIndices[i]]; - } - } - void checkCutPoints(cutPoints_t& expected) - { - int expectedSize = expected.size(); - EXPECT_EQ(cutPoints.size(), expectedSize); - for (auto i = 0; i < expectedSize; i++) { - EXPECT_EQ(cutPoints[i].start, expected[i].start); - EXPECT_EQ(cutPoints[i].end, expected[i].end); - EXPECT_EQ(cutPoints[i].classNumber, expected[i].classNumber); - EXPECT_NEAR(cutPoints[i].fromValue, expected[i].fromValue, precision); - EXPECT_NEAR(cutPoints[i].toValue, expected[i].toValue, precision); - } - } - template - void checkVectors(std::vector const& expected, std::vector const& computed) - { - EXPECT_EQ(expected.size(), computed.size()); - for (auto i = 0; i < expected.size(); i++) { - EXPECT_EQ(expected[i], computed[i]); - } - } - - }; - TEST_F(TestFImdlp, FitErrorEmptyDataset) - { - X = samples(); - y = labels(); - EXPECT_THROW(fit(X, y), std::invalid_argument); - } - TEST_F(TestFImdlp, FitErrorDifferentSize) - { - X = { 1, 2, 3 }; - y = { 1, 2 }; - EXPECT_THROW(fit(X, y), std::invalid_argument); - } - TEST_F(TestFImdlp, SortIndices) - { - X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; - indices = { 4, 3, 6, 8, 2, 1, 5, 0, 9, 7 }; - checkSortedVector(X, indices); - X = { 5.77, 5.88, 5.99 }; - indices = { 0, 1, 2 }; - checkSortedVector(X, indices); - X = { 5.33, 5.22, 5.11 }; - indices = { 2, 1, 0 }; - checkSortedVector(X, indices); - } - TEST_F(TestFImdlp, EvaluateCutPoint) - { - cutPoint_t rest, candidate; - rest = { 0, 10, -1, -1, 1000 }; - candidate = { 0, 4, -1, -1, 5.15 }; - EXPECT_FALSE(evaluateCutPoint(rest, candidate)); - } - TEST_F(TestFImdlp, ComputeCutPointsOriginal) - { - cutPoints_t expected; - expected = { - { 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 }, - { 6, 10, -1, 5.45, 3.4028234663852886e+38 } - }; - setCutPoints(cutPoints_t()); - computeCutPointsOriginal(); - checkCutPoints(expected); - } - TEST_F(TestFImdlp, ComputeCutPointsOriginalGCase) - { - cutPoints_t expected; - expected = { - { 0, 4, -1, -3.4028234663852886e+38, 3.4028234663852886e+38 }, - }; - X = { 0, 1, 2, 2 }; - y = { 1, 1, 1, 2 }; - fit(X, y); - computeCutPointsOriginal(); - checkCutPoints(expected); - } - TEST_F(TestFImdlp, ComputeCutPointsProposal) - { - cutPoints_t expected; - expected = { - { 0, 4, -1, -3.4028234663852886e+38, 5.1 }, { 4, 6, -1, 5.1, 5.4 }, - { 6, 9, -1, 5.4, 5.85 }, - { 9, 10, -1, 5.85, 3.4028234663852886e+38 } - }; - computeCutPointsProposal(); - checkCutPoints(expected); - } - TEST_F(TestFImdlp, ComputeCutPointsProposalGCase) - { - cutPoints_t expected; - expected = { - { 0, 3, -1, -3.4028234663852886e+38, 1.5 }, - { 3, 4, -1, 1.5, 3.4028234663852886e+38 } - }; - X = { 0, 1, 2, 2 }; - y = { 1, 1, 1, 2 }; - fit(X, y); - computeCutPointsProposal(); - checkCutPoints(expected); - } - TEST_F(TestFImdlp, DiscretizedValues) - { - labels computed, expected = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - computed = getDiscretizedValues(); - checkVectors(expected, computed); - } - TEST_F(TestFImdlp, GetCutPoints) - { - samples computed, expected = { 5.15, 5.45, 3.4028234663852886e+38 }; - computeCutPointsOriginal(); - computed = getCutPoints(); - checkVectors(expected, computed); - } - TEST_F(TestFImdlp, Constructor) - { - samples X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; - labels y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; - setProposal(false); - fit(X, y); - computeCutPointsOriginal(); - cutPoints_t expected; - vector computed = getCutPoints(); - expected = { - { 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 }, - { 6, 10, -1, 5.45, 3.4028234663852886e+38 } - }; - computed = getCutPoints(); - int expectedSize = expected.size(); - EXPECT_EQ(computed.size(), expected.size()); - for (auto i = 0; i < expectedSize; i++) { - EXPECT_NEAR(computed[i], expected[i].toValue, .00000001); - } - } -} \ No newline at end of file +//#include "gtest/gtest.h" +//#include "../Metrics.h" +//#include "../CPPFImdlp.h" +//namespace mdlp { +// class TestFImdlp : public CPPFImdlp, public testing::Test { +// public: +// TestFImdlp() : CPPFImdlp(true, true) {} +// void SetUp() +// { +// // 5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0] +// //(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2) +// X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; +// y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; +// fit(X, y); +// } +// void setProposal(bool value) +// { +// proposal = value; +// } +// void initCutPoints() +// { +// setCutPoints(cutPoints_t()); +// } +// void initIndices() +// { +// indices = indices_t(); +// } +// void initDiscretized() +// { +// xDiscretized = labels(); +// } +// void checkSortedVector(samples& X_, indices_t indices_) +// { +// X = X_; +// indices = indices_; +// indices_t testSortedIndices = sortIndices(X); +// precision_t prev = X[testSortedIndices[0]]; +// for (auto i = 0; i < X.size(); ++i) { +// EXPECT_EQ(testSortedIndices[i], indices[i]); +// EXPECT_LE(prev, X[testSortedIndices[i]]); +// prev = X[testSortedIndices[i]]; +// } +// } +// void checkCutPoints(cutPoints_t& expected) +// { +// int expectedSize = expected.size(); +// EXPECT_EQ(cutPoints.size(), expectedSize); +// for (auto i = 0; i < expectedSize; i++) { +// EXPECT_EQ(cutPoints[i].start, expected[i].start); +// EXPECT_EQ(cutPoints[i].end, expected[i].end); +// EXPECT_EQ(cutPoints[i].classNumber, expected[i].classNumber); +// EXPECT_NEAR(cutPoints[i].fromValue, expected[i].fromValue, precision); +// EXPECT_NEAR(cutPoints[i].toValue, expected[i].toValue, precision); +// } +// } +// template +// void checkVectors(std::vector const& expected, std::vector const& computed) +// { +// EXPECT_EQ(expected.size(), computed.size()); +// for (auto i = 0; i < expected.size(); i++) { +// EXPECT_EQ(expected[i], computed[i]); +// } +// } +// +// }; +// TEST_F(TestFImdlp, FitErrorEmptyDataset) +// { +// X = samples(); +// y = labels(); +// EXPECT_THROW(fit(X, y), std::invalid_argument); +// } +// TEST_F(TestFImdlp, FitErrorDifferentSize) +// { +// X = { 1, 2, 3 }; +// y = { 1, 2 }; +// EXPECT_THROW(fit(X, y), std::invalid_argument); +// } +// TEST_F(TestFImdlp, SortIndices) +// { +// X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; +// indices = { 4, 3, 6, 8, 2, 1, 5, 0, 9, 7 }; +// checkSortedVector(X, indices); +// X = { 5.77, 5.88, 5.99 }; +// indices = { 0, 1, 2 }; +// checkSortedVector(X, indices); +// X = { 5.33, 5.22, 5.11 }; +// indices = { 2, 1, 0 }; +// checkSortedVector(X, indices); +// } +// TEST_F(TestFImdlp, EvaluateCutPoint) +// { +// cutPoint_t rest, candidate; +// rest = { 0, 10, -1, -1, 1000 }; +// candidate = { 0, 4, -1, -1, 5.15 }; +// EXPECT_FALSE(evaluateCutPoint(rest, candidate)); +// } +// TEST_F(TestFImdlp, ComputeCutPointsOriginal) +// { +// cutPoints_t expected; +// expected = { +// { 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 }, +// { 6, 10, -1, 5.45, 3.4028234663852886e+38 } +// }; +// setCutPoints(cutPoints_t()); +// computeCutPointsOriginal(); +// checkCutPoints(expected); +// } +// TEST_F(TestFImdlp, ComputeCutPointsOriginalGCase) +// { +// cutPoints_t expected; +// expected = { +// { 0, 4, -1, -3.4028234663852886e+38, 3.4028234663852886e+38 }, +// }; +// X = { 0, 1, 2, 2 }; +// y = { 1, 1, 1, 2 }; +// fit(X, y); +// computeCutPointsOriginal(); +// checkCutPoints(expected); +// } +// TEST_F(TestFImdlp, ComputeCutPointsProposal) +// { +// cutPoints_t expected; +// expected = { +// { 0, 4, -1, -3.4028234663852886e+38, 5.1 }, { 4, 6, -1, 5.1, 5.4 }, +// { 6, 9, -1, 5.4, 5.85 }, +// { 9, 10, -1, 5.85, 3.4028234663852886e+38 } +// }; +// computeCutPointsProposal(); +// checkCutPoints(expected); +// } +// TEST_F(TestFImdlp, ComputeCutPointsProposalGCase) +// { +// cutPoints_t expected; +// expected = { +// { 0, 3, -1, -3.4028234663852886e+38, 1.5 }, +// { 3, 4, -1, 1.5, 3.4028234663852886e+38 } +// }; +// X = { 0, 1, 2, 2 }; +// y = { 1, 1, 1, 2 }; +// fit(X, y); +// computeCutPointsProposal(); +// checkCutPoints(expected); +// } +// TEST_F(TestFImdlp, DiscretizedValues) +// { +// labels computed, expected = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; +// computed = getDiscretizedValues(); +// checkVectors(expected, computed); +// } +// TEST_F(TestFImdlp, GetCutPoints) +// { +// samples computed, expected = { 5.15, 5.45, 3.4028234663852886e+38 }; +// computeCutPointsOriginal(); +// computed = getCutPoints(); +// checkVectors(expected, computed); +// } +// TEST_F(TestFImdlp, Constructor) +// { +// samples X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; +// labels y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; +// setProposal(false); +// fit(X, y); +// computeCutPointsOriginal(); +// cutPoints_t expected; +// vector computed = getCutPoints(); +// expected = { +// { 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 }, +// { 6, 10, -1, 5.45, 3.4028234663852886e+38 } +// }; +// computed = getCutPoints(); +// int expectedSize = expected.size(); +// EXPECT_EQ(computed.size(), expected.size()); +// for (auto i = 0; i < expectedSize; i++) { +// EXPECT_NEAR(computed[i], expected[i].toValue, .00000001); +// } +// } +//} \ No newline at end of file diff --git a/fimdlp/testcpp/datasets/iris.arff b/fimdlp/testcpp/datasets/iris.arff new file mode 100755 index 0000000..780480c --- /dev/null +++ b/fimdlp/testcpp/datasets/iris.arff @@ -0,0 +1,225 @@ +% 1. Title: Iris Plants Database +% +% 2. Sources: +% (a) Creator: R.A. Fisher +% (b) Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov) +% (c) Date: July, 1988 +% +% 3. Past Usage: +% - Publications: too many to mention!!! Here are a few. +% 1. Fisher,R.A. "The use of multiple measurements in taxonomic problems" +% Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions +% to Mathematical Statistics" (John Wiley, NY, 1950). +% 2. Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis. +% (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218. +% 3. Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System +% Structure and Classification Rule for Recognition in Partially Exposed +% Environments". IEEE Transactions on Pattern Analysis and Machine +% Intelligence, Vol. PAMI-2, No. 1, 67-71. +% -- Results: +% -- very low misclassification rates (0% for the setosa class) +% 4. Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE +% Transactions on Information Theory, May 1972, 431-433. +% -- Results: +% -- very low misclassification rates again +% 5. See also: 1988 MLC Proceedings, 54-64. Cheeseman et al's AUTOCLASS II +% conceptual clustering system finds 3 classes in the data. +% +% 4. Relevant Information: +% --- This is perhaps the best known database to be found in the pattern +% recognition literature. Fisher's paper is a classic in the field +% and is referenced frequently to this day. (See Duda & Hart, for +% example.) The data set contains 3 classes of 50 instances each, +% where each class refers to a type of iris plant. One class is +% linearly separable from the other 2; the latter are NOT linearly +% separable from each other. +% --- Predicted attribute: class of iris plant. +% --- This is an exceedingly simple domain. +% +% 5. Number of Instances: 150 (50 in each of three classes) +% +% 6. Number of Attributes: 4 numeric, predictive attributes and the class +% +% 7. Attribute Information: +% 1. sepal length in cm +% 2. sepal width in cm +% 3. petal length in cm +% 4. petal width in cm +% 5. class: +% -- Iris Setosa +% -- Iris Versicolour +% -- Iris Virginica +% +% 8. Missing Attribute Values: None +% +% Summary Statistics: +% Min Max Mean SD Class Correlation +% sepal length: 4.3 7.9 5.84 0.83 0.7826 +% sepal width: 2.0 4.4 3.05 0.43 -0.4194 +% petal length: 1.0 6.9 3.76 1.76 0.9490 (high!) +% petal width: 0.1 2.5 1.20 0.76 0.9565 (high!) +% +% 9. Class Distribution: 33.3% for each of 3 classes. + +@RELATION iris + +@ATTRIBUTE sepallength REAL +@ATTRIBUTE sepalwidth REAL +@ATTRIBUTE petallength REAL +@ATTRIBUTE petalwidth REAL +@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica} + +@DATA +5.1,3.5,1.4,0.2,Iris-setosa +4.9,3.0,1.4,0.2,Iris-setosa +4.7,3.2,1.3,0.2,Iris-setosa +4.6,3.1,1.5,0.2,Iris-setosa +5.0,3.6,1.4,0.2,Iris-setosa +5.4,3.9,1.7,0.4,Iris-setosa +4.6,3.4,1.4,0.3,Iris-setosa +5.0,3.4,1.5,0.2,Iris-setosa +4.4,2.9,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.4,3.7,1.5,0.2,Iris-setosa +4.8,3.4,1.6,0.2,Iris-setosa +4.8,3.0,1.4,0.1,Iris-setosa +4.3,3.0,1.1,0.1,Iris-setosa +5.8,4.0,1.2,0.2,Iris-setosa +5.7,4.4,1.5,0.4,Iris-setosa +5.4,3.9,1.3,0.4,Iris-setosa +5.1,3.5,1.4,0.3,Iris-setosa +5.7,3.8,1.7,0.3,Iris-setosa +5.1,3.8,1.5,0.3,Iris-setosa +5.4,3.4,1.7,0.2,Iris-setosa +5.1,3.7,1.5,0.4,Iris-setosa +4.6,3.6,1.0,0.2,Iris-setosa +5.1,3.3,1.7,0.5,Iris-setosa +4.8,3.4,1.9,0.2,Iris-setosa +5.0,3.0,1.6,0.2,Iris-setosa +5.0,3.4,1.6,0.4,Iris-setosa +5.2,3.5,1.5,0.2,Iris-setosa +5.2,3.4,1.4,0.2,Iris-setosa +4.7,3.2,1.6,0.2,Iris-setosa +4.8,3.1,1.6,0.2,Iris-setosa +5.4,3.4,1.5,0.4,Iris-setosa +5.2,4.1,1.5,0.1,Iris-setosa +5.5,4.2,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.0,3.2,1.2,0.2,Iris-setosa +5.5,3.5,1.3,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +4.4,3.0,1.3,0.2,Iris-setosa +5.1,3.4,1.5,0.2,Iris-setosa +5.0,3.5,1.3,0.3,Iris-setosa +4.5,2.3,1.3,0.3,Iris-setosa +4.4,3.2,1.3,0.2,Iris-setosa +5.0,3.5,1.6,0.6,Iris-setosa +5.1,3.8,1.9,0.4,Iris-setosa +4.8,3.0,1.4,0.3,Iris-setosa +5.1,3.8,1.6,0.2,Iris-setosa +4.6,3.2,1.4,0.2,Iris-setosa +5.3,3.7,1.5,0.2,Iris-setosa +5.0,3.3,1.4,0.2,Iris-setosa +7.0,3.2,4.7,1.4,Iris-versicolor +6.4,3.2,4.5,1.5,Iris-versicolor +6.9,3.1,4.9,1.5,Iris-versicolor +5.5,2.3,4.0,1.3,Iris-versicolor +6.5,2.8,4.6,1.5,Iris-versicolor +5.7,2.8,4.5,1.3,Iris-versicolor +6.3,3.3,4.7,1.6,Iris-versicolor +4.9,2.4,3.3,1.0,Iris-versicolor +6.6,2.9,4.6,1.3,Iris-versicolor +5.2,2.7,3.9,1.4,Iris-versicolor +5.0,2.0,3.5,1.0,Iris-versicolor +5.9,3.0,4.2,1.5,Iris-versicolor +6.0,2.2,4.0,1.0,Iris-versicolor +6.1,2.9,4.7,1.4,Iris-versicolor +5.6,2.9,3.6,1.3,Iris-versicolor +6.7,3.1,4.4,1.4,Iris-versicolor +5.6,3.0,4.5,1.5,Iris-versicolor +5.8,2.7,4.1,1.0,Iris-versicolor +6.2,2.2,4.5,1.5,Iris-versicolor +5.6,2.5,3.9,1.1,Iris-versicolor +5.9,3.2,4.8,1.8,Iris-versicolor +6.1,2.8,4.0,1.3,Iris-versicolor +6.3,2.5,4.9,1.5,Iris-versicolor +6.1,2.8,4.7,1.2,Iris-versicolor +6.4,2.9,4.3,1.3,Iris-versicolor +6.6,3.0,4.4,1.4,Iris-versicolor +6.8,2.8,4.8,1.4,Iris-versicolor +6.7,3.0,5.0,1.7,Iris-versicolor +6.0,2.9,4.5,1.5,Iris-versicolor +5.7,2.6,3.5,1.0,Iris-versicolor +5.5,2.4,3.8,1.1,Iris-versicolor +5.5,2.4,3.7,1.0,Iris-versicolor +5.8,2.7,3.9,1.2,Iris-versicolor +6.0,2.7,5.1,1.6,Iris-versicolor +5.4,3.0,4.5,1.5,Iris-versicolor +6.0,3.4,4.5,1.6,Iris-versicolor +6.7,3.1,4.7,1.5,Iris-versicolor +6.3,2.3,4.4,1.3,Iris-versicolor +5.6,3.0,4.1,1.3,Iris-versicolor +5.5,2.5,4.0,1.3,Iris-versicolor +5.5,2.6,4.4,1.2,Iris-versicolor +6.1,3.0,4.6,1.4,Iris-versicolor +5.8,2.6,4.0,1.2,Iris-versicolor +5.0,2.3,3.3,1.0,Iris-versicolor +5.6,2.7,4.2,1.3,Iris-versicolor +5.7,3.0,4.2,1.2,Iris-versicolor +5.7,2.9,4.2,1.3,Iris-versicolor +6.2,2.9,4.3,1.3,Iris-versicolor +5.1,2.5,3.0,1.1,Iris-versicolor +5.7,2.8,4.1,1.3,Iris-versicolor +6.3,3.3,6.0,2.5,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +7.1,3.0,5.9,2.1,Iris-virginica +6.3,2.9,5.6,1.8,Iris-virginica +6.5,3.0,5.8,2.2,Iris-virginica +7.6,3.0,6.6,2.1,Iris-virginica +4.9,2.5,4.5,1.7,Iris-virginica +7.3,2.9,6.3,1.8,Iris-virginica +6.7,2.5,5.8,1.8,Iris-virginica +7.2,3.6,6.1,2.5,Iris-virginica +6.5,3.2,5.1,2.0,Iris-virginica +6.4,2.7,5.3,1.9,Iris-virginica +6.8,3.0,5.5,2.1,Iris-virginica +5.7,2.5,5.0,2.0,Iris-virginica +5.8,2.8,5.1,2.4,Iris-virginica +6.4,3.2,5.3,2.3,Iris-virginica +6.5,3.0,5.5,1.8,Iris-virginica +7.7,3.8,6.7,2.2,Iris-virginica +7.7,2.6,6.9,2.3,Iris-virginica +6.0,2.2,5.0,1.5,Iris-virginica +6.9,3.2,5.7,2.3,Iris-virginica +5.6,2.8,4.9,2.0,Iris-virginica +7.7,2.8,6.7,2.0,Iris-virginica +6.3,2.7,4.9,1.8,Iris-virginica +6.7,3.3,5.7,2.1,Iris-virginica +7.2,3.2,6.0,1.8,Iris-virginica +6.2,2.8,4.8,1.8,Iris-virginica +6.1,3.0,4.9,1.8,Iris-virginica +6.4,2.8,5.6,2.1,Iris-virginica +7.2,3.0,5.8,1.6,Iris-virginica +7.4,2.8,6.1,1.9,Iris-virginica +7.9,3.8,6.4,2.0,Iris-virginica +6.4,2.8,5.6,2.2,Iris-virginica +6.3,2.8,5.1,1.5,Iris-virginica +6.1,2.6,5.6,1.4,Iris-virginica +7.7,3.0,6.1,2.3,Iris-virginica +6.3,3.4,5.6,2.4,Iris-virginica +6.4,3.1,5.5,1.8,Iris-virginica +6.0,3.0,4.8,1.8,Iris-virginica +6.9,3.1,5.4,2.1,Iris-virginica +6.7,3.1,5.6,2.4,Iris-virginica +6.9,3.1,5.1,2.3,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +6.8,3.2,5.9,2.3,Iris-virginica +6.7,3.3,5.7,2.5,Iris-virginica +6.7,3.0,5.2,2.3,Iris-virginica +6.3,2.5,5.0,1.9,Iris-virginica +6.5,3.0,5.2,2.0,Iris-virginica +6.2,3.4,5.4,2.3,Iris-virginica +5.9,3.0,5.1,1.8,Iris-virginica +% +% +% diff --git a/kdd_JapaneseVowels.arff b/fimdlp/testcpp/datasets/kdd_JapaneseVowels.arff similarity index 100% rename from kdd_JapaneseVowels.arff rename to fimdlp/testcpp/datasets/kdd_JapaneseVowels.arff diff --git a/letter.arff b/fimdlp/testcpp/datasets/letter.arff similarity index 100% rename from letter.arff rename to fimdlp/testcpp/datasets/letter.arff diff --git a/mfeat-factors.arff b/fimdlp/testcpp/datasets/mfeat-factors.arff similarity index 100% rename from mfeat-factors.arff rename to fimdlp/testcpp/datasets/mfeat-factors.arff diff --git a/fimdlp/testcpp/main b/fimdlp/testcpp/main new file mode 100755 index 0000000..8159308 Binary files /dev/null and b/fimdlp/testcpp/main differ diff --git a/fimdlp/testcpp/main.cpp b/fimdlp/testcpp/main.cpp new file mode 100644 index 0000000..155a93f --- /dev/null +++ b/fimdlp/testcpp/main.cpp @@ -0,0 +1,57 @@ +#include "ArffFiles.h" +#include +#include +#include +#include "../CPPFImdlp.h" + +using namespace std; + +int main(int argc, char** argv) +{ + ArffFiles file; + vector lines; + string path = "/Users/rmontanana/Code/FImdlp/fimdlp/testcpp/datasets/"; + map datasets = { + {"mfeat-factors", true}, + {"iris", true}, + {"letter", true}, + {"kdd_JapaneseVowels", false} + }; + if (argc != 2 || datasets.find(argv[1]) == datasets.end()) { + cout << "Usage: " << argv[0] << " {mfeat-factors, iris, letter, kdd_JapaneseVowels}" << endl; + return 1; + } + + //file.load("datasets/mfeat-factors.arff", true); + //file.load("/Users/rmontanana/Code/FImdlp/fimdlp/testcpp/datasets/kdd_JapaneseVowels.arff", false); + //file.load("/Users/rmontanana/Code/FImdlp/fimdlp/testcpp/datasets/iris.arff", true); + file.load(path + argv[1] + ".arff", datasets[argv[1]]); + auto attributes = file.getAttributes(); + int items = file.getSize(); + cout << "Number of lines: " << items << endl; + cout << "Attributes: " << endl; + for (auto attribute : attributes) { + cout << "Name: " << get<0>(attribute) << " Type: " << get<1>(attribute) << endl; + } + cout << "Class name: " << file.getClassName() << endl; + cout << "Class type: " << file.getClassType() << endl; + cout << "Data: " << endl; + vector>& X = file.getX(); + vector& y = file.getY(); + for (int i = 0; i < 50; i++) { + for (auto feature : X) { + cout << fixed << setprecision(1) << feature[i] << " "; + } + cout << y[i] << endl; + } + mdlp::CPPFImdlp test = mdlp::CPPFImdlp(); + for (auto i = 0; i < attributes.size(); i++) { + cout << "Cut points for " << get<0>(attributes[i]) << endl; + cout << "--------------------------" << setprecision(3) << endl; + test.fit(X[i], y); + for (auto item : test.getCutPoints()) { + cout << item << endl; + } + } + return 0; +} diff --git a/fimdlp/testcpp/xx/ArffFiles.cpp b/fimdlp/testcpp/xx/ArffFiles.cpp new file mode 100644 index 0000000..a65576d --- /dev/null +++ b/fimdlp/testcpp/xx/ArffFiles.cpp @@ -0,0 +1,111 @@ +#include "ArffFiles.h" + +#include +#include +#include +#include + +using namespace std; + +ArffFiles::ArffFiles() +{ +} +vector ArffFiles::getLines() +{ + return lines; +} +unsigned long int ArffFiles::getSize() +{ + return lines.size(); +} +vector> ArffFiles::getAttributes() +{ + return attributes; +} +string ArffFiles::getClassName() +{ + return className; +} +string ArffFiles::getClassType() +{ + return classType; +} +vector>& ArffFiles::getX() +{ + return X; +} +vector& ArffFiles::getY() +{ + return y; +} +void ArffFiles::load(string fileName) +{ + ifstream file(fileName); + string keyword, attribute, type; + if (file.is_open()) { + string line; + while (getline(file, line)) { + if (line[0] == '%' || line.empty() || line == "\r" || line == " ") { + continue; + } + if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) { + stringstream ss(line); + ss >> keyword >> attribute >> type; + attributes.push_back(make_tuple(attribute, type)); + continue; + } + if (line[0] == '@') { + continue; + } + lines.push_back(line); + } + file.close(); + if (attributes.empty()) + throw invalid_argument("No attributes found"); + className = get<0>(attributes.back()); + classType = get<1>(attributes.back()); + attributes.pop_back(); + generateDataset(); + } else + throw invalid_argument("Unable to open file"); +} +void ArffFiles::generateDataset() +{ + X = vector>(lines.size(), vector(attributes.size())); + vector yy = vector(lines.size(), ""); + for (int i = 0; i < lines.size(); i++) { + stringstream ss(lines[i]); + string value; + int j = 0; + while (getline(ss, value, ',')) { + if (j == attributes.size()) { + yy[i] = value; + break; + } + X[i][j] = stof(value); + j++; + } + } + y = factorize(yy); +} +string ArffFiles::trim(const string& source) +{ + string s(source); + s.erase(0, s.find_first_not_of(" \n\r\t")); + s.erase(s.find_last_not_of(" \n\r\t") + 1); + return s; +} +vector ArffFiles::factorize(const vector& labels) +{ + vector yy; + yy.reserve(labels.size()); + map labelMap; + int i = 0; + for (string label : labels) { + if (labelMap.find(label) == labelMap.end()) { + labelMap[label] = i++; + } + yy.push_back(labelMap[label]); + } + return yy; +} \ No newline at end of file diff --git a/fimdlp/testcpp/xx/ArffFiles.h b/fimdlp/testcpp/xx/ArffFiles.h new file mode 100644 index 0000000..2788b84 --- /dev/null +++ b/fimdlp/testcpp/xx/ArffFiles.h @@ -0,0 +1,28 @@ +#ifndef ARFFFILES_H +#define ARFFFILES_H +#include +#include +#include +using namespace std; +class ArffFiles { +private: + vector lines; + vector> attributes; + string className, classType; + vector> X; + vector y; + void generateDataset(); +public: + ArffFiles(); + void load(string); + vector getLines(); + unsigned long int getSize(); + string getClassName(); + string getClassType(); + string trim(const string&); + vector>& getX(); + vector& getY(); + vector> getAttributes(); + vector factorize(const vector& labels); +}; +#endif \ No newline at end of file diff --git a/fimdlp/testcpp/xx/CMakeLists.txt b/fimdlp/testcpp/xx/CMakeLists.txt new file mode 100644 index 0000000..ba62e67 --- /dev/null +++ b/fimdlp/testcpp/xx/CMakeLists.txt @@ -0,0 +1,6 @@ +cmake_minimum_required(VERSION 3.24) +project(main) + +set(CMAKE_CXX_STANDARD 17) + +add_executable(main main.cpp ArffFiles.cpp) diff --git a/fimdlp/testcpp/xx/main.cpp b/fimdlp/testcpp/xx/main.cpp new file mode 100644 index 0000000..b220199 --- /dev/null +++ b/fimdlp/testcpp/xx/main.cpp @@ -0,0 +1,30 @@ +#include "ArffFiles.h" +#include +#include +#include + +using namespace std; + +int main(int argc, char **argv) { + ArffFiles file; + vector lines; + //file.load("datasets/mfeat-factors.arff"); + file.load("/Users/rmontanana/Code/FImdlp/fimdlp/testcpp/datasets/mfeat-factors.arff"); + cout << "Number of lines: " << file.getSize() << endl; + cout << "Attributes: " << endl; + for (auto attribute: file.getAttributes()) { + cout << "Name: " << get<0>(attribute) << " Type: " << get<1>(attribute) << endl; + } + cout << "Class name: " << file.getClassName() << endl; + cout << "Class type: " << file.getClassType() << endl; + cout << "Data: " << endl; + vector> &X = file.getX(); + vector &y = file.getY(); + for (int i = 0; i < X.size(); i++) { + for (float value: X[i]) { + cout << fixed << setprecision(1) << value << " "; + } + cout << y[i] << endl; + } + return 0; +} diff --git a/sample.py b/sample.py index 7473635..c216e3e 100644 --- a/sample.py +++ b/sample.py @@ -4,7 +4,6 @@ from fimdlp.cppfimdlp import CFImdlp from sklearn.ensemble import RandomForestClassifier import numpy as np import time -from math import log2 from scipy.io import arff import pandas as pd @@ -44,65 +43,3 @@ print(test.get_cut_points()) clf = RandomForestClassifier(random_state=0) print(clf.fit(Xt, y).score(Xt, y)) print(Xt) -# for proposal in [True, False]: -# X = data.data -# y = data.target -# print("*** Proposal: ", proposal) -# test = CFImdlp(debug=True, proposal=proposal) -# test.fit(X[:, 0], y) -# result = test.get_cut_points() -# for item in result: -# print( -# f"Class={item['classNumber']} - ({item['start']:3d}, " -# f"{item['end']:3d}) -> ({item['fromValue']:3.1f}, " -# f"{item['toValue']:3.1f}]" -# ) -# print(test.get_discretized_values()) -# print("+" * 40) -# X = np.array( -# [ -# [5.1, 3.5, 1.4, 0.2], -# [5.2, 3.0, 1.4, 0.2], -# [5.3, 3.2, 1.3, 0.2], -# [5.4, 3.1, 1.5, 0.2], -# ] -# ) -# y = np.array([0, 0, 0, 1]) -# print(test.fit(X[:, 0], y).transform(X[:, 0])) -# result = test.get_cut_points() -# for item in result: -# print( -# f"Class={item['classNumber']} - ({item['start']:3d}, {item['end']:3d})" -# f" -> ({item['fromValue']:3.1f}, {item['toValue']:3.1f}]" -# ) -# print("*" * 40) -# # print(Xs, ys) -# # print("**********************") -# # test = [(0, 3), (4, 4), (5, 5), (6, 8), (9, 9)] -# # print(ys) -# # for start, end in test: -# # print("Testing ", start, end, ys[:end], ys[end:]) -# # print("Information gain: ", information_gain(ys, ys[:end], ys[end:])) -# # print(test.transform(X)) -# # print(X) -# # print(indices) -# # print(np.array(X)[indices]) - - -# # # k = test.cut_points(X[:, 0], y) -# # # print(k) -# # # k = test.cut_points_ant(X[:, 0], y) -# # # print(k) -# # # test.debug_points(X[:, 0], y) -# # X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9] -# # y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2] -# # indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7] -# # clf = CFImdlp(debug=True, proposal=False) -# # clf.fit(X, y) -# # print(clf.get_cut_points()) -# # y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2] -# # # To check -# # indices2 = np.argsort(X) -# # Xs = np.array(X)[indices2] -# # ys = np.array(y)[indices2] -# kdd_JapaneseVowels