Enhance cutpoints computation

This commit is contained in:
2022-12-02 19:22:13 +01:00
parent 5657c1cd9f
commit 97cd2243fa
13 changed files with 207 additions and 104 deletions

2
.gitignore vendored
View File

@@ -129,3 +129,5 @@ dmypy.json
.pyre/ .pyre/
cfimdlp.cpp cfimdlp.cpp
.vscode/* .vscode/*
**/.idea/*

View File

@@ -8,7 +8,7 @@ clean: ## Clean up
if [ -f fimdlp/cppfimdlp.cpython-310-darwin.so ]; then rm fimdlp/cppfimdlp.cpython-310-darwin.so; fi; if [ -f fimdlp/cppfimdlp.cpython-310-darwin.so ]; then rm fimdlp/cppfimdlp.cpython-310-darwin.so; fi;
test: test:
cd fimdlp/testcpp && ./test.sh cd fimdlp/testcpp && ./test
lint: ## Lint and static-check lint: ## Lint and static-check
black fimdlp black fimdlp

View File

@@ -5,18 +5,26 @@
#include <algorithm> #include <algorithm>
#include "Metrics.h" #include "Metrics.h"
namespace mdlp { namespace mdlp {
CPPFImdlp::CPPFImdlp() : debug(false), precision(6) std::ostream& operator << (std::ostream& os, const cutPoint_t& cut)
{
os << cut.classNumber << " -> (" << cut.start << ", " << cut.end <<
") - (" << cut.fromValue << ", " << cut.toValue << ") "
<< std::endl;
return os;
}
CPPFImdlp::CPPFImdlp() : proposed(true), precision(6), debug(false)
{ {
divider = pow(10, precision); divider = pow(10, precision);
} }
CPPFImdlp::CPPFImdlp(int precision, bool debug) : debug(debug), precision(precision) CPPFImdlp::CPPFImdlp(bool proposed, int precision, bool debug) : proposed(proposed), precision(precision), debug(debug)
{ {
divider = pow(10, precision); divider = pow(10, precision);
} }
CPPFImdlp::~CPPFImdlp() CPPFImdlp::~CPPFImdlp()
{ {
} }
std::vector<CutPoint_t> CPPFImdlp::getCutPoints() std::vector<cutPoint_t> CPPFImdlp::getCutPoints()
{ {
return cutPoints; return cutPoints;
} }
@@ -32,7 +40,11 @@ namespace mdlp {
this->xDiscretized = labels(X.size(), -1); this->xDiscretized = labels(X.size(), -1);
this->numClasses = Metrics::numClasses(y, indices, 0, X.size()); this->numClasses = Metrics::numClasses(y, indices, 0, X.size());
computeCutPoints(); if (proposed) {
computeCutPointsProposed();
} else {
computeCutPointsOriginal();
}
filterCutPoints(); filterCutPoints();
applyCutPoints(); applyCutPoints();
} }
@@ -64,7 +76,7 @@ namespace mdlp {
} }
} }
} }
bool CPPFImdlp::evaluateCutPoint(CutPoint_t rest, CutPoint_t candidate) bool CPPFImdlp::evaluateCutPoint(cutPoint_t rest, cutPoint_t candidate)
{ {
int k, k1, k2; int k, k1, k2;
float ig, delta; float ig, delta;
@@ -73,7 +85,6 @@ namespace mdlp {
if (N < 2) { if (N < 2) {
return false; return false;
} }
k = Metrics::numClasses(y, indices, rest.start, rest.end); k = Metrics::numClasses(y, indices, rest.start, rest.end);
k1 = Metrics::numClasses(y, indices, rest.start, candidate.end); k1 = Metrics::numClasses(y, indices, rest.start, candidate.end);
k2 = Metrics::numClasses(y, indices, candidate.end, rest.end); k2 = Metrics::numClasses(y, indices, candidate.end, rest.end);
@@ -83,15 +94,18 @@ namespace mdlp {
ig = Metrics::informationGain(y, indices, rest.start, rest.end, candidate.end, numClasses); ig = Metrics::informationGain(y, indices, rest.start, rest.end, candidate.end, numClasses);
delta = log2(pow(3, k) - 2) - (k * ent - k1 * ent1 - k2 * ent2); delta = log2(pow(3, k) - 2) - (k * ent - k1 * ent1 - k2 * ent2);
float term = 1 / N * (log2(N - 1) + delta); float term = 1 / N * (log2(N - 1) + delta);
std::cout << candidate if (debug) {
std::cout << "Rest: " << rest;
std::cout << "Candidate: " << candidate;
std::cout << "k=" << k << " k1=" << k1 << " k2=" << k2 << " ent=" << ent << " ent1=" << ent1 << " ent2=" << ent2 << std::endl; std::cout << "k=" << k << " k1=" << k1 << " k2=" << k2 << " ent=" << ent << " ent1=" << ent1 << " ent2=" << ent2 << std::endl;
std::cout << "ig=" << ig << " delta=" << delta << " N " << N << " term " << term << std::endl; std::cout << "ig=" << ig << " delta=" << delta << " N " << N << " term " << term << std::endl;
}
return (ig > term); return (ig > term);
} }
void CPPFImdlp::filterCutPoints() void CPPFImdlp::filterCutPoints()
{ {
std::vector<CutPoint_t> filtered; cutPoints_t filtered;
CutPoint_t rest; cutPoint_t rest;
int classNumber = 0; int classNumber = 0;
rest.start = 0; rest.start = 0;
@@ -116,24 +130,25 @@ namespace mdlp {
item.classNumber = classNumber++; item.classNumber = classNumber++;
filtered.push_back(item); filtered.push_back(item);
first = false; first = false;
rest.start = item.end;
} else { } else {
std::cout << "Rejected" << std::endl; std::cout << "Rejected" << std::endl;
lastReject = true; lastReject = true;
} }
} }
if (!first) if (!first) {
filtered.back().toValue = std::numeric_limits<float>::max(); filtered.back().toValue = std::numeric_limits<float>::max();
else { filtered.back().end = X.size();
} else {
filtered.push_back(rest); filtered.push_back(rest);
} }
cutPoints = filtered; cutPoints = filtered;
} }
void CPPFImdlp::computeCutPoints() void CPPFImdlp::computeCutPointsProposed()
{ {
cutPoints_t cutPts;
std::vector<CutPoint_t> cutPts; cutPoint_t cutPoint;
CutPoint_t cutPoint;
indices_t cutIdx; indices_t cutIdx;
float xPrev, xCur, xPivot; float xPrev, xCur, xPivot;
int yPrev, yCur, yPivot; int yPrev, yCur, yPivot;
@@ -196,38 +211,56 @@ namespace mdlp {
} }
cutPoints = cutPts; cutPoints = cutPts;
} }
void CPPFImdlp::computeCutPointsAnt() void CPPFImdlp::computeCutPointsOriginal()
{ {
samples cutPts; cutPoints_t cutPts;
labels cutIdx; cutPoint_t cutPoint;
float xPrev, cutPoint; float xPrev = std::numeric_limits<float>::lowest();
int yPrev; int yPrev;
size_t idxPrev; bool first = true;
xPrev = X.at(indices[0]); // idxPrev is the index of the init instance of the cutPoint
yPrev = y.at(indices[0]); size_t index, idxPrev = 0, idx = indices[0];
idxPrev = indices[0]; xPrev = X[idx];
if (debug) { yPrev = y[idx];
std::cout << "Entropy: " << Metrics::entropy(y, indices, 0, y.size(), Metrics::numClasses(y, indices, 0, indices.size())) << std::endl; for (index = 0; index < size_t(indices.size()) - 1; index++) {
} idx = indices[index];
for (auto index = indices.begin(); index != indices.end(); ++index) {
// Definition 2 Cut points are always on boundaries // Definition 2 Cut points are always on boundaries
if (y.at(*index) != yPrev && xPrev < X.at(*index)) { if (y[idx] != yPrev && xPrev < X[idx]) {
cutPoint = round(divider * (X.at(*index) + xPrev) / 2) / divider; if (first) {
if (debug) { first = false;
std::cout << "Cut point: " << (xPrev + X.at(*index)) / 2 << " //"; cutPoint.fromValue = std::numeric_limits<float>::lowest();
std::cout << X.at(*index) << " -> " << y.at(*index) << " yPrev= " << yPrev; } else {
std::cout << "* (" << X.at(*index) << ", " << xPrev << ")=" cutPoint.fromValue = cutPts.back().toValue;
<< ((X.at(*index) + xPrev) / 2) << "idxPrev"
<< idxPrev << std::endl;
} }
cutPoint.start = idxPrev;
cutPoint.end = index;
cutPoint.classNumber = -1;
cutPoint.toValue = round(divider * (X[idx] + xPrev) / 2) / divider;
if (debug) {
std::cout << "Cut point: " << cutPoint << " //";
std::cout << X[idx] << " -> " << y[idx] << " yPrev= "
<< yPrev << idxPrev << std::endl;
}
idxPrev = index;
cutPts.push_back(cutPoint); cutPts.push_back(cutPoint);
cutIdx.push_back(idxPrev);
} }
xPrev = X.at(*index); xPrev = X[idx];
yPrev = y.at(*index); yPrev = y[idx];
idxPrev = *index;
} }
// cutPoints = cutPts; std::cout << "Came to here" << first << std::endl;
if (first) {
cutPoint.start = 0;
cutPoint.classNumber = -1;
cutPoint.fromValue = std::numeric_limits<float>::lowest();
cutPoint.toValue = std::numeric_limits<float>::max();
cutPoints.push_back(cutPoint);
} else
cutPts.back().toValue = std::numeric_limits<float>::max();
cutPts.back().end = X.size();
if (debug)
for (auto cutPoint : cutPts)
std::cout << "Cut point: " << cutPoint << std::endl;
cutPoints = cutPts;
} }
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
indices_t CPPFImdlp::sortIndices(samples& X) indices_t CPPFImdlp::sortIndices(samples& X)

View File

@@ -5,29 +5,30 @@
namespace mdlp { namespace mdlp {
class CPPFImdlp { class CPPFImdlp {
private: private:
bool debug; bool proposed; // proposed algorithm or original algorithm
int precision; int precision;
bool debug;
float divider; float divider;
indices_t indices; // sorted indices to use with X and y indices_t indices; // sorted indices to use with X and y
samples X; samples X;
labels y; labels y;
labels xDiscretized; labels xDiscretized;
int numClasses; int numClasses;
std::vector<CutPoint_t> cutPoints; cutPoints_t cutPoints;
protected: protected:
indices_t sortIndices(samples&); indices_t sortIndices(samples&);
void computeCutPointsAnt(); void computeCutPointsOriginal();
void computeCutPoints(); void computeCutPointsProposed();
bool evaluateCutPoint(CutPoint_t, CutPoint_t); bool evaluateCutPoint(cutPoint_t, cutPoint_t);
void filterCutPoints(); void filterCutPoints();
void applyCutPoints(); void applyCutPoints();
public: public:
CPPFImdlp(); CPPFImdlp();
CPPFImdlp(int, bool debug = false); CPPFImdlp(bool, int, bool debug = false);
~CPPFImdlp(); ~CPPFImdlp();
std::vector<CutPoint_t> getCutPoints(); cutPoints_t getCutPoints();
labels getDiscretizedValues(); labels getDiscretizedValues();
void debugPoints(samples&, labels&); void debugPoints(samples&, labels&);
void fit(samples&, labels&); void fit(samples&, labels&);

View File

@@ -41,7 +41,7 @@ namespace mdlp {
entropy = Metrics::entropy(y, indices, start, end, nClasses); entropy = Metrics::entropy(y, indices, start, end, nClasses);
entropyLeft = Metrics::entropy(y, indices, start, cutPoint, nClassesLeft); entropyLeft = Metrics::entropy(y, indices, start, cutPoint, nClassesLeft);
entropyRight = Metrics::entropy(y, indices, cutPoint, end, nClassesRight); entropyRight = Metrics::entropy(y, indices, cutPoint, end, nClassesRight);
iGain = entropy - (float)nElementsLeft / nElements * entropyLeft - (float)nElementsRight / nElements * entropyRight; iGain = entropy - ((float)nElementsLeft * entropyLeft + (float)nElementsRight * entropyRight) / nElements;
return iGain; return iGain;
} }

View File

@@ -10,7 +10,7 @@ cdef extern from "CPPFImdlp.h" namespace "mdlp":
float fromValue, toValue; float fromValue, toValue;
cdef cppclass CPPFImdlp: cdef cppclass CPPFImdlp:
CPPFImdlp() except + CPPFImdlp() except +
CPPFImdlp(int, bool) except + CPPFImdlp(bool, int, bool) except +
void fit(vector[float]&, vector[int]&) void fit(vector[float]&, vector[int]&)
vector[int] transform(vector[float]&) vector[int] transform(vector[float]&)
vector[int] getDiscretizedValues() vector[int] getDiscretizedValues()
@@ -18,7 +18,7 @@ cdef extern from "CPPFImdlp.h" namespace "mdlp":
void debugPoints(vector[float]&, vector[int]&) void debugPoints(vector[float]&, vector[int]&)
class PCutPoint_t: class PcutPoint_t:
def __init__(self, start, end, fromValue, toValue): def __init__(self, start, end, fromValue, toValue):
self.start = start self.start = start
self.end = end self.end = end
@@ -27,8 +27,9 @@ class PCutPoint_t:
cdef class CFImdlp: cdef class CFImdlp:
cdef CPPFImdlp *thisptr cdef CPPFImdlp *thisptr
def __cinit__(self, precision=6, debug=False): def __cinit__(self, precision=6, debug=False, proposed=True):
self.thisptr = new CPPFImdlp(precision, debug) # Proposed or original algorithm
self.thisptr = new CPPFImdlp(proposed, precision, debug)
def __dealloc__(self): def __dealloc__(self):
del self.thisptr del self.thisptr
def fit(self, X, y): def fit(self, X, y):

View File

@@ -61,7 +61,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
self.n_features_ = X.shape[1] self.n_features_ = X.shape[1]
self.X_ = X self.X_ = X
self.y_ = y self.y_ = y
self.discretizer_ = CFImdlp(debug=False) self.discretizer_ = CFImdlp(debug=True, proposed=False)
return self return self
def transform(self, X): def transform(self, X):
@@ -104,19 +104,31 @@ class FImdlp(TransformerMixin, BaseEstimator):
print("Cuts calculados en python: ", cuts) print("Cuts calculados en python: ", cuts)
print("Cuts calculados en C++") print("Cuts calculados en C++")
print("Cut points for each feature in Iris dataset:") print("Cut points for each feature in Iris dataset:")
for i in range(0, self.n_features_): for i in range(0, 1):
# datax = self.X_[np.argsort(self.X_[:, i]), i] # datax = self.X_[np.argsort(self.X_[:, i]), i]
# y_ = self.y_[np.argsort(self.X_[:, i])] # y_ = self.y_[np.argsort(self.X_[:, i])]
datax = self.X_[:, i] datax = self.X_[:, i]
y_ = self.y_ y_ = self.y_
Xcutpoints = self.discretizer_.cut_points(datax, y_) self.discretizer_.fit(datax, y_)
Xcutpoints = self.discretizer_.get_cut_points()
print( print(
f"New ({len(Xcutpoints)}):{self.features_[i]:20s}: " f"New ({len(Xcutpoints)}):{self.features_[i]:20s}: "
f"{Xcutpoints}" f"{[i['toValue'] for i in Xcutpoints]}"
)
Xcutpoints = self.discretizer_.cut_points_ant(datax, y_)
print(
f"Ant ({len(Xcutpoints)}):{self.features_[i]:20s}: "
f"{Xcutpoints}"
) )
X_translated = [
f"{i['classNumber']} - ({i['start']}, {i['end']}) - "
f"({i['fromValue']}, {i['toValue']})"
for i in Xcutpoints
]
print(X_translated)
print("*******************************")
print("Disretized values:")
print(self.discretizer_.transform(datax))
print("*******************************")
print("indices:", np.argsort(X[:, 0]))
# Xcutpoints = self.discretizer_.cut_points_ant(datax, y_)
# print(
# f"Ant ({len(Xcutpoints)}):{self.features_[i]:20s}: "
# f"{Xcutpoints}"
# )
return X return X

View File

@@ -30,7 +30,7 @@ namespace mdlp {
prev = X[testSortedIndices[i]]; prev = X[testSortedIndices[i]];
} }
} }
std::vector<CutPoint_t> testCutPoints(samples& X, indices_t& indices, labels& y) std::vector<cutPoint_t> testCutPoints(samples& X, indices_t& indices, labels& y)
{ {
this->X = X; this->X = X;
this->y = y; this->y = y;
@@ -56,7 +56,7 @@ namespace mdlp {
} }
// TEST_F(TestMetrics, EvaluateCutPoint) // TEST_F(TestMetrics, EvaluateCutPoint)
// { // {
// CutPoint_t rest, candidate; // cutPoint_t rest, candidate;
// rest.start = 0; // rest.start = 0;
// rest.end = 10; // rest.end = 10;
// candidate.start = 0; // candidate.start = 0;
@@ -64,13 +64,13 @@ namespace mdlp {
// float computed = evaluateCutPoint(rest, candidate); // float computed = evaluateCutPoint(rest, candidate);
// ASSERT_NEAR(0.468996, computed, precision_test); // ASSERT_NEAR(0.468996, computed, precision_test);
// } // }
TEST_F(TestMetrics, ComputeCutPoints) TEST_F(TestMetrics, ComputeCutPointsOriginal)
{ {
std::vector<CutPoint_t> computed, expected; std::vector<cutPoint_t> computed, expected;
computeCutPoints(); computeCutPointsOriginal();
computed = getCutPoints(); computed = getCutPoints();
for (auto cut : computed) { for (auto cut : computed) {
std::cout << "(" << cut.start << ", " << cut.end << ") -> (" << cut.fromValue << ", " << cut.toValue << ")" << std::endl; std::cout << cut.classNumber << " -> (" << cut.start << ", " << cut.end << ") -> (" << cut.fromValue << ", " << cut.toValue << ")" << std::endl;
} }
} }
} }

View File

@@ -1,12 +0,0 @@
# Configure, build and run the tests of the CMake project in the current directory.
# Configure step: source tree is ".", build tree is "./build";
# -Wno-dev suppresses CMake developer warnings.
cmake -S . -B build -Wno-dev
# Stop with exit status 1 if the configure step returned a non-zero status.
if test $? -ne 0; then
echo "Error in creating build commands."
exit 1
fi
# Build step: compile using the build tree generated above.
cmake --build build
# Stop with exit status 1 if the build returned a non-zero status.
if test $? -ne 0; then
echo "Error in build command."
exit 1
fi
# Run the tests from inside the build tree; --output-on-failure prints the
# output of any test that fails.
cd build
ctest --output-on-failure

View File

@@ -2,14 +2,15 @@
#define TYPES_H #define TYPES_H
#include <vector> #include <vector>
namespace mdlp { namespace mdlp {
typedef std::vector<float> samples;
typedef std::vector<int> labels;
typedef std::vector<size_t> indices_t;
struct CutPointBody { struct CutPointBody {
size_t start, end; // indices of the sorted vector size_t start, end; // indices of the sorted vector
int classNumber; // class assigned to the cut point int classNumber; // class assigned to the cut point
float fromValue, toValue; float fromValue, toValue;
}; };
typedef CutPointBody CutPoint_t; typedef CutPointBody cutPoint_t;
typedef std::vector<float> samples;
typedef std::vector<int> labels;
typedef std::vector<size_t> indices_t;
typedef std::vector<cutPoint_t> cutPoints_t;
} }
#endif #endif

103
sample.py
View File

@@ -2,6 +2,59 @@ from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp from fimdlp.mdlp import FImdlp
from fimdlp.cppfimdlp import CFImdlp from fimdlp.cppfimdlp import CFImdlp
import numpy as np import numpy as np
from math import log
def entropy(y: np.array) -> float:
    """Compute the base-2 Shannon entropy of a set of labels.

    Parameters
    ----------
    y : np.array
        1-D set of labels; any dtype accepted by ``np.unique`` (integers,
        including negative ones, strings, ...)

    Returns
    -------
    float
        entropy in bits; 0.0 when ``y`` holds fewer than two samples or
        only one distinct label
    """
    n_labels = len(y)
    if n_labels <= 1:
        return 0.0
    # np.unique counts each distinct label, so (unlike np.bincount) it is
    # not limited to non-negative integers and never produces zero counts.
    _, counts = np.unique(y, return_counts=True)
    if len(counts) <= 1:
        return 0.0
    proportions = counts / n_labels
    # Accumulate -sum(p * log2(p)); the accumulator is deliberately not
    # named ``entropy`` so it does not shadow this function.
    result = 0.0
    for prop in proportions:
        result -= prop * log(prop, 2)
    return result
def information_gain(
    labels: np.array, labels_up: np.array, labels_dn: np.array
) -> float:
    """Information gain obtained by splitting ``labels`` into two parts.

    Parameters
    ----------
    labels : np.array
        labels of the whole set before the split
    labels_up : np.array
        labels of the first partition, or None (counted as empty)
    labels_dn : np.array
        labels of the second partition, or None (counted as empty)

    Returns
    -------
    float
        entropy of ``labels`` minus the size-weighted entropies of the two
        partitions; 0.0 when both partitions are empty
    """
    parent_entropy = entropy(labels)
    # (cardinality, entropy) of each partition; a missing partition
    # contributes nothing to the weighted sum.
    parts = []
    for subset in (labels_up, labels_dn):
        if subset is None:
            parts.append((0, 0))
        else:
            parts.append((subset.shape[0], entropy(subset)))
    (n_up, h_up), (n_dn, h_dn) = parts
    total = n_up + n_dn
    if total == 0:
        return 0.0
    return parent_entropy - (n_up / total) * h_up - (n_dn / total) * h_dn
data = load_iris() data = load_iris()
@@ -10,26 +63,38 @@ y = data.target
features = data.feature_names features = data.feature_names
test = FImdlp() test = FImdlp()
test.fit(X, y, features=features) test.fit(X, y, features=features)
# test.transform(X) test.transform(X)
test = CFImdlp(debug=False) # test = CFImdlp(debug=False)
# k = test.cut_points(X[:, 0], y) # # k = test.cut_points(X[:, 0], y)
# print(k) # # print(k)
# k = test.cut_points_ant(X[:, 0], y) # # k = test.cut_points_ant(X[:, 0], y)
# print(k) # # print(k)
# test.debug_points(X[:, 0], y) # # test.debug_points(X[:, 0], y)
X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9] # X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9]
indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7] # indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7]
y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2] # y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
# test.fit(X[:, 0], y) # # To check
test.fit(X, y) # indices2 = np.argsort(X)
result = test.get_cut_points() # Xs = np.array(X)[indices2]
for item in result: # ys = np.array(y)[indices2]
print( # # test.fit(X[:, 0], y)
f"Class={item['classNumber']} - ({item['start']:3d}, {item['end']:3d})" # test.fit(X, y)
f" -> ({item['fromValue']:3.1f}, {item['toValue']:3.1f}]" # result = test.get_cut_points()
) # for item in result:
print(test.get_discretized_values()) # print(
# f"Class={item['classNumber']} - ({item['start']:3d}, {item['end']:3d})"
# f" -> ({item['fromValue']:3.1f}, {item['toValue']:3.1f}]"
# )
# print(test.get_discretized_values())
# print(Xs, ys)
# print("**********************")
# test = [(0, 3), (4, 4), (5, 5), (6, 8), (9, 9)]
# print(ys)
# for start, end in test:
# print("Testing ", start, end, ys[:end], ys[end:])
# print("Information gain: ", information_gain(ys, ys[:end], ys[end:]))
# print(test.transform(X)) # print(test.transform(X))
# print(X) # print(X)
# print(indices) # print(indices)

BIN
test1.xlsx Normal file

Binary file not shown.