From 7f4b09d2d63b7db0db9156b6b4aec2750ab28a84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Wed, 7 Dec 2022 01:26:38 +0100 Subject: [PATCH] Add good_cut filter --- fimdlp/CPPFImdlp.cpp | 34 +++++++++++++++++++++++++--------- fimdlp/CPPFImdlp.h | 1 + fimdlp/main.cpp | 21 ++++++++++++++++++--- 3 files changed, 44 insertions(+), 12 deletions(-) diff --git a/fimdlp/CPPFImdlp.cpp b/fimdlp/CPPFImdlp.cpp index b7ae995..be5819f 100644 --- a/fimdlp/CPPFImdlp.cpp +++ b/fimdlp/CPPFImdlp.cpp @@ -46,9 +46,9 @@ namespace mdlp { if (X.size() == 0 || y.size() == 0) { throw invalid_argument("X and y must have at least one element"); } - this->indices = sortIndices(X_); - this->xDiscretized = labels(X.size(), -1); - this->numClasses = Metrics::numClasses(y, indices, 0, X.size()); + indices = sortIndices(X_); + xDiscretized = labels(X.size(), -1); + numClasses = Metrics::numClasses(y, indices, 0, X.size()); if (proposal) { computeCutPointsProposal(); @@ -168,9 +168,9 @@ namespace mdlp { } while (idx < numElements && xCur == xPivot); // Check if the class changed and there are more than 1 element - if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur)) { + if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur) && goodCut(start, idx, numElements + 1)) { // Must we add the entropy criteria here? 
- // if (totalEntropy - (entropyLeft + entropyRight) < 0) { Accept cut point } + // if (totalEntropy - (entropyLeft + entropyRight) > 0) { Accept cut point } cutPoint.start = start; cutPoint.end = idx; start = idx; @@ -211,16 +211,17 @@ namespace mdlp { int yPrev; bool first = true; // idxPrev is the index of the init instance of the cutPoint - size_t index, idxPrev = 0, idx = indices[0]; + size_t index, idxPrev = 0, last, idx = indices[0]; xPrev = X[idx]; yPrev = y[idx]; - for (index = 0; index < size_t(indices.size()) - 1; index++) { + last = indices.size() - 1; + for (index = 0; index < last; index++) { idx = indices[index]; // Definition 2 Cut points are always on class boundaries && // there are more than 1 items in the interval - if (y[idx] != yPrev && xPrev < X[idx] && idxPrev != index - 1) { + // if (entropy of interval) > (entropyLeft + entropyRight)) { Accept cut point } (goodCut) + if (y[idx] != yPrev && xPrev < X[idx] && idxPrev != index - 1 && goodCut(idxPrev, idx, last + 1)) { // Must we add the entropy criteria here? 
- // if (totalEntropy - (entropyLeft + entropyRight) < 0) { Accept cut point } if (first) { first = false; cutPoint.fromValue = numeric_limits::lowest(); @@ -253,6 +254,21 @@ namespace mdlp { } cutPoints = cutPts; } + bool CPPFImdlp::goodCut(size_t start, size_t cut, size_t end) + { + /* + Store the entropies in a sparse square matrix (samples, samples) M[start, end] initialized to -1; if an entry has not been computed yet, compute it and store it + + + */ + float entropyLeft = Metrics::entropy(y, indices, start, cut, numClasses); + float entropyRight = Metrics::entropy(y, indices, cut, end, numClasses); + float entropyInterval = Metrics::entropy(y, indices, start, end, numClasses); + if (debug) + printf("Entropy L, R, T: L(%5.3g) + R(%5.3g) - T(%5.3g) \t", entropyLeft, entropyRight, entropyInterval); + //return (entropyInterval - (entropyLeft + entropyRight) > 0); + return true; + } // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes indices_t CPPFImdlp::sortIndices(samples& X_) { diff --git a/fimdlp/CPPFImdlp.h b/fimdlp/CPPFImdlp.h index 8965092..926b6e6 100644 --- a/fimdlp/CPPFImdlp.h +++ b/fimdlp/CPPFImdlp.h @@ -22,6 +22,7 @@ namespace mdlp { void computeCutPointsProposal(); bool evaluateCutPoint(cutPoint_t, cutPoint_t); void filterCutPoints(); + bool goodCut(size_t, size_t, size_t); // if the cut candidate reduces entropy public: CPPFImdlp(); diff --git a/fimdlp/main.cpp b/fimdlp/main.cpp index 8c918fd..7c1c4a1 100644 --- a/fimdlp/main.cpp +++ b/fimdlp/main.cpp @@ -19,19 +19,34 @@ int main() // Read the Data from the file // as String Vector + size_t col; vector row; string line, word; + vector> dataset = vector>(15, vector()); while (getline(fin, line)) { if (count++ > 215) { - row.clear(); stringstream ss(line); + col = 0; while (getline(ss, word, ',')) { - row.push_back(word); - cout << word << " "; + col = col % 15; + dataset[col].push_back(stof(word)); + cout << col << "-" << word << " "; + col++; } cout << endl; } } + 
labels y = labels(dataset[0].begin(), dataset[0].end()); + cout << "Column 0 (y): " << y.size() << endl; + for (auto item : y) { + cout << item << " "; + } + CPPFImdlp test = CPPFImdlp(false, 6, true); + test.fit(dataset[3], y); + cout << "Cut points: " << test.getCutPoints().size() << endl; + for (auto item : test.getCutPoints()) { + cout << item << " "; + } fin.close(); return 0; } \ No newline at end of file