diff --git a/.gitignore b/.gitignore index 0318fe0..ceb5580 100644 --- a/.gitignore +++ b/.gitignore @@ -129,3 +129,5 @@ dmypy.json .pyre/ cfimdlp.cpp .vscode/* +**/.idea/* + diff --git a/Makefile b/Makefile index 529c057..342dcba 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ clean: ## Clean up if [ -f fimdlp/cppfimdlp.cpython-310-darwin.so ]; then rm fimdlp/cppfimdlp.cpython-310-darwin.so; fi; test: - cd fimdlp/testcpp && ./test.sh + cd fimdlp/testcpp && ./test lint: ## Lint and static-check black fimdlp diff --git a/fimdlp/CPPFImdlp.cpp b/fimdlp/CPPFImdlp.cpp index e1ed848..1238d5a 100644 --- a/fimdlp/CPPFImdlp.cpp +++ b/fimdlp/CPPFImdlp.cpp @@ -5,18 +5,26 @@ #include #include "Metrics.h" namespace mdlp { - CPPFImdlp::CPPFImdlp() : debug(false), precision(6) + std::ostream& operator << (std::ostream& os, const cutPoint_t& cut) + { + os << cut.classNumber << " -> (" << cut.start << ", " << cut.end << + ") - (" << cut.fromValue << ", " << cut.toValue << ") " + << std::endl; + return os; + + } + CPPFImdlp::CPPFImdlp() : proposed(true), precision(6), debug(false) { divider = pow(10, precision); } - CPPFImdlp::CPPFImdlp(int precision, bool debug) : debug(debug), precision(precision) + CPPFImdlp::CPPFImdlp(bool proposed, int precision, bool debug) : proposed(proposed), precision(precision), debug(debug) { divider = pow(10, precision); } CPPFImdlp::~CPPFImdlp() { } - std::vector CPPFImdlp::getCutPoints() + std::vector CPPFImdlp::getCutPoints() { return cutPoints; } @@ -32,7 +40,11 @@ namespace mdlp { this->xDiscretized = labels(X.size(), -1); this->numClasses = Metrics::numClasses(y, indices, 0, X.size()); - computeCutPoints(); + if (proposed) { + computeCutPointsProposed(); + } else { + computeCutPointsOriginal(); + } filterCutPoints(); applyCutPoints(); } @@ -64,7 +76,7 @@ namespace mdlp { } } } - bool CPPFImdlp::evaluateCutPoint(CutPoint_t rest, CutPoint_t candidate) + bool CPPFImdlp::evaluateCutPoint(cutPoint_t rest, cutPoint_t candidate) { int k, k1, k2; float ig, delta; @@ -73,7 +85,6 @@ namespace mdlp { if (N < 2) { return false; } - k = Metrics::numClasses(y, indices, rest.start, rest.end); k1 = Metrics::numClasses(y, indices, rest.start, candidate.end); k2 = Metrics::numClasses(y, indices, candidate.end, rest.end); @@ -83,15 +94,18 @@ namespace mdlp { ig = Metrics::informationGain(y, indices, rest.start, rest.end, candidate.end, numClasses); delta = log2(pow(3, k) - 2) - (k * ent - k1 * ent1 - k2 * ent2); float term = 1 / N * (log2(N - 1) + delta); - std::cout << candidate + if (debug) { + std::cout << "Rest: " << rest; + std::cout << "Candidate: " << candidate; std::cout << "k=" << k << " k1=" << k1 << " k2=" << k2 << " ent=" << ent << " ent1=" << ent1 << " ent2=" << ent2 << std::endl; - std::cout << "ig=" << ig << " delta=" << delta << " N " << N << " term " << term << std::endl; + std::cout << "ig=" << ig << " delta=" << delta << " N " << N << " term " << term << std::endl; + } return (ig > term); } void CPPFImdlp::filterCutPoints() { - std::vector filtered; - CutPoint_t rest; + cutPoints_t filtered; + cutPoint_t rest; int classNumber = 0; rest.start = 0; @@ -116,24 +130,25 @@ namespace mdlp { item.classNumber = classNumber++; filtered.push_back(item); first = false; + rest.start = item.end; } else { std::cout << "Rejected" << std::endl; lastReject = true; } } - if (!first) + if (!first) { filtered.back().toValue = std::numeric_limits::max(); - else { + filtered.back().end = X.size(); + } else { filtered.push_back(rest); } cutPoints = filtered; } - void CPPFImdlp::computeCutPoints() + void CPPFImdlp::computeCutPointsProposed() { - - std::vector cutPts; - CutPoint_t cutPoint; + cutPoints_t cutPts; + cutPoint_t cutPoint; indices_t cutIdx; float xPrev, xCur, xPivot; int yPrev, yCur, yPivot; @@ -196,38 +211,56 @@ namespace mdlp { } cutPoints = cutPts; } - void CPPFImdlp::computeCutPointsAnt() + void CPPFImdlp::computeCutPointsOriginal() { - samples cutPts; - labels cutIdx; - float xPrev, cutPoint; + cutPoints_t cutPts; + cutPoint_t cutPoint; + float xPrev = std::numeric_limits::lowest(); int yPrev; - size_t idxPrev; - xPrev = X.at(indices[0]); - yPrev = y.at(indices[0]); - idxPrev = indices[0]; - if (debug) { - std::cout << "Entropy: " << Metrics::entropy(y, indices, 0, y.size(), Metrics::numClasses(y, indices, 0, indices.size())) << std::endl; - } - for (auto index = indices.begin(); index != indices.end(); ++index) { + bool first = true; + // idxPrev is the index of the init instance of the cutPoint + size_t index, idxPrev = 0, idx = indices[0]; + xPrev = X[idx]; + yPrev = y[idx]; + for (index = 0; index < size_t(indices.size()) - 1; index++) { + idx = indices[index]; // Definition 2 Cut points are always on boundaries - if (y.at(*index) != yPrev && xPrev < X.at(*index)) { - cutPoint = round(divider * (X.at(*index) + xPrev) / 2) / divider; - if (debug) { - std::cout << "Cut point: " << (xPrev + X.at(*index)) / 2 << " //"; - std::cout << X.at(*index) << " -> " << y.at(*index) << " yPrev= " << yPrev; - std::cout << "* (" << X.at(*index) << ", " << xPrev << ")=" - << ((X.at(*index) + xPrev) / 2) << "idxPrev" - << idxPrev << std::endl; + if (y[idx] != yPrev && xPrev < X[idx]) { + if (first) { + first = false; + cutPoint.fromValue = std::numeric_limits::lowest(); + } else { + cutPoint.fromValue = cutPts.back().toValue; } + cutPoint.start = idxPrev; + cutPoint.end = index; + cutPoint.classNumber = -1; + cutPoint.toValue = round(divider * (X[idx] + xPrev) / 2) / divider; + if (debug) { + std::cout << "Cut point: " << cutPoint << " //"; + std::cout << X[idx] << " -> " << y[idx] << " yPrev= " + << yPrev << idxPrev << std::endl; + } + idxPrev = index; cutPts.push_back(cutPoint); - cutIdx.push_back(idxPrev); } - xPrev = X.at(*index); - yPrev = y.at(*index); - idxPrev = *index; + xPrev = X[idx]; + yPrev = y[idx]; } - // cutPoints = cutPts; + std::cout << "Came to here" << first << std::endl; + if (first) { + cutPoint.start = 0; + cutPoint.classNumber = -1; + cutPoint.fromValue = std::numeric_limits::lowest(); + cutPoint.toValue = std::numeric_limits::max(); + cutPoints.push_back(cutPoint); + } else + cutPts.back().toValue = std::numeric_limits::max(); + cutPts.back().end = X.size(); + if (debug) + for (auto cutPoint : cutPts) + std::cout << "Cut point: " << cutPoint << std::endl; + cutPoints = cutPts; } // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes indices_t CPPFImdlp::sortIndices(samples& X) diff --git a/fimdlp/CPPFImdlp.h b/fimdlp/CPPFImdlp.h index f0ebed9..f242eaf 100644 --- a/fimdlp/CPPFImdlp.h +++ b/fimdlp/CPPFImdlp.h @@ -5,29 +5,30 @@ namespace mdlp { class CPPFImdlp { private: - bool debug; + bool proposed; // proposed algorithm or original algorithm int precision; + bool debug; float divider; indices_t indices; // sorted indices to use with X and y samples X; labels y; labels xDiscretized; int numClasses; - std::vector cutPoints; + cutPoints_t cutPoints; protected: indices_t sortIndices(samples&); - void computeCutPointsAnt(); - void computeCutPoints(); - bool evaluateCutPoint(CutPoint_t, CutPoint_t); + void computeCutPointsOriginal(); + void computeCutPointsProposed(); + bool evaluateCutPoint(cutPoint_t, cutPoint_t); void filterCutPoints(); void applyCutPoints(); public: CPPFImdlp(); - CPPFImdlp(int, bool debug = false); + CPPFImdlp(bool, int, bool debug = false); ~CPPFImdlp(); - std::vector getCutPoints(); + cutPoints_t getCutPoints(); labels getDiscretizedValues(); void debugPoints(samples&, labels&); void fit(samples&, labels&); diff --git a/fimdlp/Metrics.cpp b/fimdlp/Metrics.cpp index 7f9b060..c26751f 100644 --- a/fimdlp/Metrics.cpp +++ b/fimdlp/Metrics.cpp @@ -41,7 +41,7 @@ namespace mdlp { entropy = Metrics::entropy(y, indices, start, end, nClasses); entropyLeft = Metrics::entropy(y, indices, start, cutPoint, nClassesLeft); entropyRight = Metrics::entropy(y, indices, cutPoint, end, nClassesRight); - iGain = entropy - (float)nElementsLeft / nElements * entropyLeft - (float)nElementsRight / nElements * entropyRight; + iGain = entropy - ((float)nElementsLeft * entropyLeft + (float)nElementsRight * entropyRight) / nElements; return iGain; } diff --git a/fimdlp/cfimdlp.pyx b/fimdlp/cfimdlp.pyx index ccce80f..61534df 100644 --- a/fimdlp/cfimdlp.pyx +++ b/fimdlp/cfimdlp.pyx @@ -10,7 +10,7 @@ cdef extern from "CPPFImdlp.h" namespace "mdlp": float fromValue, toValue; cdef cppclass CPPFImdlp: CPPFImdlp() except + - CPPFImdlp(int, bool) except + + CPPFImdlp(bool, int, bool) except + void fit(vector[float]&, vector[int]&) vector[int] transform(vector[float]&) vector[int] getDiscretizedValues() @@ -18,7 +18,7 @@ cdef extern from "CPPFImdlp.h" namespace "mdlp": void debugPoints(vector[float]&, vector[int]&) -class PCutPoint_t: +class PcutPoint_t: def __init__(self, start, end, fromValue, toValue): self.start = start self.end = end @@ -27,8 +27,9 @@ class PCutPoint_t: cdef class CFImdlp: cdef CPPFImdlp *thisptr - def __cinit__(self, precision=6, debug=False): - self.thisptr = new CPPFImdlp(precision, debug) + def __cinit__(self, precision=6, debug=False, proposed=True): + # Proposed or original algorithm + self.thisptr = new CPPFImdlp(proposed, precision, debug) def __dealloc__(self): del self.thisptr def fit(self, X, y): diff --git a/fimdlp/cppfimdlp.cpython-310-darwin.so b/fimdlp/cppfimdlp.cpython-310-darwin.so deleted file mode 100755 index cccc05b..0000000 Binary files a/fimdlp/cppfimdlp.cpython-310-darwin.so and /dev/null differ diff --git a/fimdlp/mdlp.py b/fimdlp/mdlp.py index 12eb272..090766a 100644 --- a/fimdlp/mdlp.py +++ b/fimdlp/mdlp.py @@ -61,7 +61,7 @@ class FImdlp(TransformerMixin, BaseEstimator): self.n_features_ = X.shape[1] self.X_ = X self.y_ = y - self.discretizer_ = CFImdlp(debug=False) + self.discretizer_ = CFImdlp(debug=True, proposed=False) return self def transform(self, X): @@ -104,19 +104,31 @@ class FImdlp(TransformerMixin, BaseEstimator): print("Cuts calculados en python: ", cuts) print("Cuts calculados en C++") print("Cut points for each feature in Iris dataset:") - for i in range(0, self.n_features_): + for i in range(0, 1): # datax = self.X_[np.argsort(self.X_[:, i]), i] # y_ = self.y_[np.argsort(self.X_[:, i])] datax = self.X_[:, i] y_ = self.y_ - Xcutpoints = self.discretizer_.cut_points(datax, y_) + self.discretizer_.fit(datax, y_) + Xcutpoints = self.discretizer_.get_cut_points() print( f"New ({len(Xcutpoints)}):{self.features_[i]:20s}: " - f"{Xcutpoints}" - ) - Xcutpoints = self.discretizer_.cut_points_ant(datax, y_) - print( - f"Ant ({len(Xcutpoints)}):{self.features_[i]:20s}: " - f"{Xcutpoints}" + f"{[i['toValue'] for i in Xcutpoints]}" ) + X_translated = [ + f"{i['classNumber']} - ({i['start']}, {i['end']}) - " + f"({i['fromValue']}, {i['toValue']})" + for i in Xcutpoints + ] + print(X_translated) + print("*******************************") + print("Disretized values:") + print(self.discretizer_.transform(datax)) + print("*******************************") + print("indices:", np.argsort(X[:, 0])) + # Xcutpoints = self.discretizer_.cut_points_ant(datax, y_) + # print( + # f"Ant ({len(Xcutpoints)}):{self.features_[i]:20s}: " + # f"{Xcutpoints}" + # ) return X diff --git a/fimdlp/testcpp/FImdlp_unittest.cc b/fimdlp/testcpp/FImdlp_unittest.cc index 61d5260..bc7f226 100644 --- a/fimdlp/testcpp/FImdlp_unittest.cc +++ b/fimdlp/testcpp/FImdlp_unittest.cc @@ -30,7 +30,7 @@ namespace mdlp { prev = X[testSortedIndices[i]]; } } - std::vector testCutPoints(samples& X, indices_t& indices, labels& y) + std::vector testCutPoints(samples& X, indices_t& indices, labels& y) { this->X = X; this->y = y; @@ -56,7 +56,7 @@ namespace mdlp { } // TEST_F(TestMetrics, EvaluateCutPoint) // { - // CutPoint_t rest, candidate; + // cutPoint_t rest, candidate; // rest.start = 0; // rest.end = 10; // candidate.start = 0; @@ -64,13 +64,13 @@ namespace mdlp { // float computed = evaluateCutPoint(rest, candidate); // ASSERT_NEAR(0.468996, computed, precision_test); // } - TEST_F(TestMetrics, ComputeCutPoints) + TEST_F(TestMetrics, ComputeCutPointsOriginal) { - std::vector computed, expected; - computeCutPoints(); + std::vector computed, expected; + computeCutPointsOriginal(); computed = getCutPoints(); for (auto cut : computed) { - std::cout << "(" << cut.start << ", " << cut.end << ") -> (" << cut.fromValue << ", " << cut.toValue << ")" << std::endl; + std::cout << cut.classNumber << " -> (" << cut.start << ", " << cut.end << ") -> (" << cut.fromValue << ", " << cut.toValue << ")" << std::endl; } } } \ No newline at end of file diff --git a/fimdlp/testcpp/test.sh b/fimdlp/testcpp/test.sh deleted file mode 100755 index e27cdde..0000000 --- a/fimdlp/testcpp/test.sh +++ /dev/null @@ -1,12 +0,0 @@ -cmake -S . -B build -Wno-dev -if test $? -ne 0; then - echo "Error in creating build commands." - exit 1 -fi -cmake --build build -if test $? -ne 0; then - echo "Error in build command." - exit 1 -fi -cd build -ctest --output-on-failure diff --git a/fimdlp/typesFImdlp.h b/fimdlp/typesFImdlp.h index 7214ce9..2a0e72c 100644 --- a/fimdlp/typesFImdlp.h +++ b/fimdlp/typesFImdlp.h @@ -2,14 +2,15 @@ #define TYPES_H #include namespace mdlp { - typedef std::vector samples; - typedef std::vector labels; - typedef std::vector indices_t; struct CutPointBody { size_t start, end; // indices of the sorted vector int classNumber; // class assigned to the cut point float fromValue, toValue; }; - typedef CutPointBody CutPoint_t; + typedef CutPointBody cutPoint_t; + typedef std::vector samples; + typedef std::vector labels; + typedef std::vector indices_t; + typedef std::vector cutPoints_t; } #endif \ No newline at end of file diff --git a/sample.py b/sample.py index fd969bb..6f7285d 100644 --- a/sample.py +++ b/sample.py @@ -2,6 +2,59 @@ from sklearn.datasets import load_iris from fimdlp.mdlp import FImdlp from fimdlp.cppfimdlp import CFImdlp import numpy as np +from math import log + + +def entropy(y: np.array) -> float: + """Compute entropy of a labels set + + Parameters + ---------- + y : np.array + set of labels + + Returns + ------- + float + entropy + """ + n_labels = len(y) + if n_labels <= 1: + return 0 + counts = np.bincount(y) + proportions = counts / n_labels + n_classes = np.count_nonzero(proportions) + if n_classes <= 1: + return 0 + entropy = 0.0 + # Compute standard entropy. + for prop in proportions: + if prop != 0.0: + entropy -= prop * log(prop, 2) + return entropy + + +def information_gain( + labels: np.array, labels_up: np.array, labels_dn: np.array +) -> float: + imp_prev = entropy(labels) + card_up = card_dn = imp_up = imp_dn = 0 + if labels_up is not None: + card_up = labels_up.shape[0] + imp_up = entropy(labels_up) + if labels_dn is not None: + card_dn = labels_dn.shape[0] if labels_dn is not None else 0 + imp_dn = entropy(labels_dn) + samples = card_up + card_dn + if samples == 0: + return 0.0 + else: + result = ( + imp_prev + - (card_up / samples) * imp_up + - (card_dn / samples) * imp_dn + ) + return result data = load_iris() @@ -10,26 +63,38 @@ y = data.target features = data.feature_names test = FImdlp() test.fit(X, y, features=features) -# test.transform(X) +test.transform(X) -test = CFImdlp(debug=False) -# k = test.cut_points(X[:, 0], y) -# print(k) -# k = test.cut_points_ant(X[:, 0], y) -# print(k) -# test.debug_points(X[:, 0], y) -X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9] -indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7] -y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2] -# test.fit(X[:, 0], y) -test.fit(X, y) -result = test.get_cut_points() -for item in result: - print( - f"Class={item['classNumber']} - ({item['start']:3d}, {item['end']:3d})" - f" -> ({item['fromValue']:3.1f}, {item['toValue']:3.1f}]" - ) -print(test.get_discretized_values()) +# test = CFImdlp(debug=False) +# # k = test.cut_points(X[:, 0], y) +# # print(k) +# # k = test.cut_points_ant(X[:, 0], y) +# # print(k) +# # test.debug_points(X[:, 0], y) +# X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9] +# indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7] +# y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2] +# # To check +# indices2 = np.argsort(X) +# Xs = np.array(X)[indices2] +# ys = np.array(y)[indices2] +# # test.fit(X[:, 0], y) +# test.fit(X, y) +# result = test.get_cut_points() +# for item in result: +# print( +# f"Class={item['classNumber']} - ({item['start']:3d}, {item['end']:3d})" +# f" -> ({item['fromValue']:3.1f}, {item['toValue']:3.1f}]" +# ) +# print(test.get_discretized_values()) + +# print(Xs, ys) +# print("**********************") +# test = [(0, 3), (4, 4), (5, 5), (6, 8), (9, 9)] +# print(ys) +# for start, end in test: +# print("Testing ", start, end, ys[:end], ys[end:]) +# print("Information gain: ", information_gain(ys, ys[:end], ys[end:])) # print(test.transform(X)) # print(X) # print(indices) diff --git a/test1.xlsx b/test1.xlsx new file mode 100644 index 0000000..2e6bcb2 Binary files /dev/null and b/test1.xlsx differ