Enhance cutpoints computation

This commit is contained in:
2022-12-02 19:22:13 +01:00
parent 5657c1cd9f
commit 97cd2243fa
13 changed files with 207 additions and 104 deletions

2
.gitignore vendored
View File

@@ -129,3 +129,5 @@ dmypy.json
.pyre/
cfimdlp.cpp
.vscode/*
**/.idea/*

View File

@@ -8,7 +8,7 @@ clean: ## Clean up
if [ -f fimdlp/cppfimdlp.cpython-310-darwin.so ]; then rm fimdlp/cppfimdlp.cpython-310-darwin.so; fi;
test:
cd fimdlp/testcpp && ./test.sh
cd fimdlp/testcpp && ./test
lint: ## Lint and static-check
black fimdlp

View File

@@ -5,18 +5,26 @@
#include <algorithm>
#include "Metrics.h"
namespace mdlp {
CPPFImdlp::CPPFImdlp() : debug(false), precision(6)
// Writes a cut point as
//   "<classNumber> -> (<start>, <end>) - (<fromValue>, <toValue>) "
// followed by std::endl (newline + flush), and returns the stream so calls
// can be chained.
std::ostream& operator << (std::ostream& os, const cutPoint_t& cut)
{
    // class label, then the index range of the cut point
    os << cut.classNumber;
    os << " -> (" << cut.start << ", " << cut.end << ")";
    // value range of the cut point
    os << " - (" << cut.fromValue << ", " << cut.toValue << ") ";
    os << std::endl;
    return os;
}
CPPFImdlp::CPPFImdlp() : proposed(true), precision(6), debug(false)
{
divider = pow(10, precision);
}
CPPFImdlp::CPPFImdlp(int precision, bool debug) : debug(debug), precision(precision)
CPPFImdlp::CPPFImdlp(bool proposed, int precision, bool debug) : proposed(proposed), precision(precision), debug(debug)
{
divider = pow(10, precision);
}
CPPFImdlp::~CPPFImdlp()
{
}
std::vector<CutPoint_t> CPPFImdlp::getCutPoints()
std::vector<cutPoint_t> CPPFImdlp::getCutPoints()
{
return cutPoints;
}
@@ -32,7 +40,11 @@ namespace mdlp {
this->xDiscretized = labels(X.size(), -1);
this->numClasses = Metrics::numClasses(y, indices, 0, X.size());
computeCutPoints();
if (proposed) {
computeCutPointsProposed();
} else {
computeCutPointsOriginal();
}
filterCutPoints();
applyCutPoints();
}
@@ -64,7 +76,7 @@ namespace mdlp {
}
}
}
bool CPPFImdlp::evaluateCutPoint(CutPoint_t rest, CutPoint_t candidate)
bool CPPFImdlp::evaluateCutPoint(cutPoint_t rest, cutPoint_t candidate)
{
int k, k1, k2;
float ig, delta;
@@ -73,7 +85,6 @@ namespace mdlp {
if (N < 2) {
return false;
}
k = Metrics::numClasses(y, indices, rest.start, rest.end);
k1 = Metrics::numClasses(y, indices, rest.start, candidate.end);
k2 = Metrics::numClasses(y, indices, candidate.end, rest.end);
@@ -83,15 +94,18 @@ namespace mdlp {
ig = Metrics::informationGain(y, indices, rest.start, rest.end, candidate.end, numClasses);
delta = log2(pow(3, k) - 2) - (k * ent - k1 * ent1 - k2 * ent2);
float term = 1 / N * (log2(N - 1) + delta);
std::cout << candidate
if (debug) {
std::cout << "Rest: " << rest;
std::cout << "Candidate: " << candidate;
std::cout << "k=" << k << " k1=" << k1 << " k2=" << k2 << " ent=" << ent << " ent1=" << ent1 << " ent2=" << ent2 << std::endl;
std::cout << "ig=" << ig << " delta=" << delta << " N " << N << " term " << term << std::endl;
}
return (ig > term);
}
void CPPFImdlp::filterCutPoints()
{
std::vector<CutPoint_t> filtered;
CutPoint_t rest;
cutPoints_t filtered;
cutPoint_t rest;
int classNumber = 0;
rest.start = 0;
@@ -116,24 +130,25 @@ namespace mdlp {
item.classNumber = classNumber++;
filtered.push_back(item);
first = false;
rest.start = item.end;
} else {
std::cout << "Rejected" << std::endl;
lastReject = true;
}
}
if (!first)
if (!first) {
filtered.back().toValue = std::numeric_limits<float>::max();
else {
filtered.back().end = X.size();
} else {
filtered.push_back(rest);
}
cutPoints = filtered;
}
void CPPFImdlp::computeCutPoints()
void CPPFImdlp::computeCutPointsProposed()
{
std::vector<CutPoint_t> cutPts;
CutPoint_t cutPoint;
cutPoints_t cutPts;
cutPoint_t cutPoint;
indices_t cutIdx;
float xPrev, xCur, xPivot;
int yPrev, yCur, yPivot;
@@ -196,38 +211,56 @@ namespace mdlp {
}
cutPoints = cutPts;
}
void CPPFImdlp::computeCutPointsAnt()
void CPPFImdlp::computeCutPointsOriginal()
{
samples cutPts;
labels cutIdx;
float xPrev, cutPoint;
cutPoints_t cutPts;
cutPoint_t cutPoint;
float xPrev = std::numeric_limits<float>::lowest();
int yPrev;
size_t idxPrev;
xPrev = X.at(indices[0]);
yPrev = y.at(indices[0]);
idxPrev = indices[0];
if (debug) {
std::cout << "Entropy: " << Metrics::entropy(y, indices, 0, y.size(), Metrics::numClasses(y, indices, 0, indices.size())) << std::endl;
}
for (auto index = indices.begin(); index != indices.end(); ++index) {
bool first = true;
// idxPrev is the index of the init instance of the cutPoint
size_t index, idxPrev = 0, idx = indices[0];
xPrev = X[idx];
yPrev = y[idx];
for (index = 0; index < size_t(indices.size()) - 1; index++) {
idx = indices[index];
// Definition 2 Cut points are always on boundaries
if (y.at(*index) != yPrev && xPrev < X.at(*index)) {
cutPoint = round(divider * (X.at(*index) + xPrev) / 2) / divider;
if (y[idx] != yPrev && xPrev < X[idx]) {
if (first) {
first = false;
cutPoint.fromValue = std::numeric_limits<float>::lowest();
} else {
cutPoint.fromValue = cutPts.back().toValue;
}
cutPoint.start = idxPrev;
cutPoint.end = index;
cutPoint.classNumber = -1;
cutPoint.toValue = round(divider * (X[idx] + xPrev) / 2) / divider;
if (debug) {
std::cout << "Cut point: " << (xPrev + X.at(*index)) / 2 << " //";
std::cout << X.at(*index) << " -> " << y.at(*index) << " yPrev= " << yPrev;
std::cout << "* (" << X.at(*index) << ", " << xPrev << ")="
<< ((X.at(*index) + xPrev) / 2) << "idxPrev"
<< idxPrev << std::endl;
std::cout << "Cut point: " << cutPoint << " //";
std::cout << X[idx] << " -> " << y[idx] << " yPrev= "
<< yPrev << idxPrev << std::endl;
}
idxPrev = index;
cutPts.push_back(cutPoint);
cutIdx.push_back(idxPrev);
}
xPrev = X.at(*index);
yPrev = y.at(*index);
idxPrev = *index;
xPrev = X[idx];
yPrev = y[idx];
}
// cutPoints = cutPts;
std::cout << "Came to here" << first << std::endl;
if (first) {
cutPoint.start = 0;
cutPoint.classNumber = -1;
cutPoint.fromValue = std::numeric_limits<float>::lowest();
cutPoint.toValue = std::numeric_limits<float>::max();
cutPoints.push_back(cutPoint);
} else
cutPts.back().toValue = std::numeric_limits<float>::max();
cutPts.back().end = X.size();
if (debug)
for (auto cutPoint : cutPts)
std::cout << "Cut point: " << cutPoint << std::endl;
cutPoints = cutPts;
}
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
indices_t CPPFImdlp::sortIndices(samples& X)

View File

@@ -5,29 +5,30 @@
namespace mdlp {
class CPPFImdlp {
private:
bool debug;
bool proposed; // proposed algorithm or original algorithm
int precision;
bool debug;
float divider;
indices_t indices; // sorted indices to use with X and y
samples X;
labels y;
labels xDiscretized;
int numClasses;
std::vector<CutPoint_t> cutPoints;
cutPoints_t cutPoints;
protected:
indices_t sortIndices(samples&);
void computeCutPointsAnt();
void computeCutPoints();
bool evaluateCutPoint(CutPoint_t, CutPoint_t);
void computeCutPointsOriginal();
void computeCutPointsProposed();
bool evaluateCutPoint(cutPoint_t, cutPoint_t);
void filterCutPoints();
void applyCutPoints();
public:
CPPFImdlp();
CPPFImdlp(int, bool debug = false);
CPPFImdlp(bool, int, bool debug = false);
~CPPFImdlp();
std::vector<CutPoint_t> getCutPoints();
cutPoints_t getCutPoints();
labels getDiscretizedValues();
void debugPoints(samples&, labels&);
void fit(samples&, labels&);

View File

@@ -41,7 +41,7 @@ namespace mdlp {
entropy = Metrics::entropy(y, indices, start, end, nClasses);
entropyLeft = Metrics::entropy(y, indices, start, cutPoint, nClassesLeft);
entropyRight = Metrics::entropy(y, indices, cutPoint, end, nClassesRight);
iGain = entropy - (float)nElementsLeft / nElements * entropyLeft - (float)nElementsRight / nElements * entropyRight;
iGain = entropy - ((float)nElementsLeft * entropyLeft + (float)nElementsRight * entropyRight) / nElements;
return iGain;
}

View File

@@ -10,7 +10,7 @@ cdef extern from "CPPFImdlp.h" namespace "mdlp":
float fromValue, toValue;
cdef cppclass CPPFImdlp:
CPPFImdlp() except +
CPPFImdlp(int, bool) except +
CPPFImdlp(bool, int, bool) except +
void fit(vector[float]&, vector[int]&)
vector[int] transform(vector[float]&)
vector[int] getDiscretizedValues()
@@ -18,7 +18,7 @@ cdef extern from "CPPFImdlp.h" namespace "mdlp":
void debugPoints(vector[float]&, vector[int]&)
class PCutPoint_t:
class PcutPoint_t:
def __init__(self, start, end, fromValue, toValue):
self.start = start
self.end = end
@@ -27,8 +27,9 @@ class PCutPoint_t:
cdef class CFImdlp:
cdef CPPFImdlp *thisptr
def __cinit__(self, precision=6, debug=False):
self.thisptr = new CPPFImdlp(precision, debug)
def __cinit__(self, precision=6, debug=False, proposed=True):
# Proposed or original algorithm
self.thisptr = new CPPFImdlp(proposed, precision, debug)
def __dealloc__(self):
del self.thisptr
def fit(self, X, y):

View File

@@ -61,7 +61,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
self.n_features_ = X.shape[1]
self.X_ = X
self.y_ = y
self.discretizer_ = CFImdlp(debug=False)
self.discretizer_ = CFImdlp(debug=True, proposed=False)
return self
def transform(self, X):
@@ -104,19 +104,31 @@ class FImdlp(TransformerMixin, BaseEstimator):
print("Cuts calculados en python: ", cuts)
print("Cuts calculados en C++")
print("Cut points for each feature in Iris dataset:")
for i in range(0, self.n_features_):
for i in range(0, 1):
# datax = self.X_[np.argsort(self.X_[:, i]), i]
# y_ = self.y_[np.argsort(self.X_[:, i])]
datax = self.X_[:, i]
y_ = self.y_
Xcutpoints = self.discretizer_.cut_points(datax, y_)
self.discretizer_.fit(datax, y_)
Xcutpoints = self.discretizer_.get_cut_points()
print(
f"New ({len(Xcutpoints)}):{self.features_[i]:20s}: "
f"{Xcutpoints}"
)
Xcutpoints = self.discretizer_.cut_points_ant(datax, y_)
print(
f"Ant ({len(Xcutpoints)}):{self.features_[i]:20s}: "
f"{Xcutpoints}"
f"{[i['toValue'] for i in Xcutpoints]}"
)
X_translated = [
f"{i['classNumber']} - ({i['start']}, {i['end']}) - "
f"({i['fromValue']}, {i['toValue']})"
for i in Xcutpoints
]
print(X_translated)
print("*******************************")
print("Disretized values:")
print(self.discretizer_.transform(datax))
print("*******************************")
print("indices:", np.argsort(X[:, 0]))
# Xcutpoints = self.discretizer_.cut_points_ant(datax, y_)
# print(
# f"Ant ({len(Xcutpoints)}):{self.features_[i]:20s}: "
# f"{Xcutpoints}"
# )
return X

View File

@@ -30,7 +30,7 @@ namespace mdlp {
prev = X[testSortedIndices[i]];
}
}
std::vector<CutPoint_t> testCutPoints(samples& X, indices_t& indices, labels& y)
std::vector<cutPoint_t> testCutPoints(samples& X, indices_t& indices, labels& y)
{
this->X = X;
this->y = y;
@@ -56,7 +56,7 @@ namespace mdlp {
}
// TEST_F(TestMetrics, EvaluateCutPoint)
// {
// CutPoint_t rest, candidate;
// cutPoint_t rest, candidate;
// rest.start = 0;
// rest.end = 10;
// candidate.start = 0;
@@ -64,13 +64,13 @@ namespace mdlp {
// float computed = evaluateCutPoint(rest, candidate);
// ASSERT_NEAR(0.468996, computed, precision_test);
// }
TEST_F(TestMetrics, ComputeCutPoints)
TEST_F(TestMetrics, ComputeCutPointsOriginal)
{
std::vector<CutPoint_t> computed, expected;
computeCutPoints();
std::vector<cutPoint_t> computed, expected;
computeCutPointsOriginal();
computed = getCutPoints();
for (auto cut : computed) {
std::cout << "(" << cut.start << ", " << cut.end << ") -> (" << cut.fromValue << ", " << cut.toValue << ")" << std::endl;
std::cout << cut.classNumber << " -> (" << cut.start << ", " << cut.end << ") -> (" << cut.fromValue << ", " << cut.toValue << ")" << std::endl;
}
}
}

View File

@@ -1,12 +0,0 @@
cmake -S . -B build -Wno-dev
if test $? -ne 0; then
echo "Error in creating build commands."
exit 1
fi
cmake --build build
if test $? -ne 0; then
echo "Error in build command."
exit 1
fi
cd build
ctest --output-on-failure

View File

@@ -2,14 +2,15 @@
#define TYPES_H
#include <vector>
namespace mdlp {
typedef std::vector<float> samples;
typedef std::vector<int> labels;
typedef std::vector<size_t> indices_t;
// Plain record describing one cut point: an index range over the sorted
// samples, the class assigned to it, and the pair of values bounding it.
struct CutPointBody {
size_t start, end; // indices of the sorted vector
int classNumber; // class assigned to the cut point
// lower / upper value bounds of the cut point
// NOTE(review): whether each bound is inclusive is not visible here - confirm
float fromValue, toValue;
};
typedef CutPointBody CutPoint_t;
typedef CutPointBody cutPoint_t;
typedef std::vector<float> samples;
typedef std::vector<int> labels;
typedef std::vector<size_t> indices_t;
typedef std::vector<cutPoint_t> cutPoints_t;
}
#endif

103
sample.py
View File

@@ -2,6 +2,59 @@ from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp
from fimdlp.cppfimdlp import CFImdlp
import numpy as np
from math import log
def entropy(y: np.ndarray) -> float:
    """Compute the Shannon entropy (base 2) of a set of labels.

    Parameters
    ----------
    y : np.ndarray
        One-dimensional sequence of non-negative integer labels
        (a requirement of ``np.bincount``).

    Returns
    -------
    float
        Entropy in bits. ``0.0`` when ``y`` holds fewer than two samples
        or only one distinct label.
    """
    n_labels = len(y)
    if n_labels <= 1:
        return 0.0
    counts = np.bincount(y)
    # Drop empty bins up front: they contribute nothing to the entropy and
    # log2(0) is undefined.
    proportions = counts[counts > 0] / n_labels
    if proportions.size <= 1:
        return 0.0
    # Standard entropy: -sum(p * log2(p)); always return a built-in float.
    return float(-np.sum(proportions * np.log2(proportions)))
def information_gain(
    labels: np.ndarray, labels_up: np.ndarray, labels_dn: np.ndarray
) -> float:
    """Compute the information gain of splitting ``labels`` in two parts.

    Parameters
    ----------
    labels : np.ndarray
        Labels of the whole set, before the split.
    labels_up : np.ndarray
        Labels of the first partition; ``None`` is treated as empty.
    labels_dn : np.ndarray
        Labels of the second partition; ``None`` is treated as empty.

    Returns
    -------
    float
        ``entropy(labels)`` minus the size-weighted entropies of the two
        partitions, or ``0.0`` when both partitions are empty.
    """
    imp_prev = entropy(labels)
    card_up = card_dn = 0
    imp_up = imp_dn = 0.0
    if labels_up is not None:
        card_up = labels_up.shape[0]
        imp_up = entropy(labels_up)
    if labels_dn is not None:
        card_dn = labels_dn.shape[0]
        imp_dn = entropy(labels_dn)
    samples = card_up + card_dn
    # Nothing on either side of the split: avoid dividing by zero.
    if samples == 0:
        return 0.0
    return (
        imp_prev
        - (card_up / samples) * imp_up
        - (card_dn / samples) * imp_dn
    )
data = load_iris()
@@ -10,26 +63,38 @@ y = data.target
features = data.feature_names
test = FImdlp()
test.fit(X, y, features=features)
# test.transform(X)
test.transform(X)
test = CFImdlp(debug=False)
# k = test.cut_points(X[:, 0], y)
# print(k)
# k = test.cut_points_ant(X[:, 0], y)
# print(k)
# test.debug_points(X[:, 0], y)
X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9]
indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7]
y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
# test.fit(X[:, 0], y)
test.fit(X, y)
result = test.get_cut_points()
for item in result:
print(
f"Class={item['classNumber']} - ({item['start']:3d}, {item['end']:3d})"
f" -> ({item['fromValue']:3.1f}, {item['toValue']:3.1f}]"
)
print(test.get_discretized_values())
# test = CFImdlp(debug=False)
# # k = test.cut_points(X[:, 0], y)
# # print(k)
# # k = test.cut_points_ant(X[:, 0], y)
# # print(k)
# # test.debug_points(X[:, 0], y)
# X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9]
# indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7]
# y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
# # To check
# indices2 = np.argsort(X)
# Xs = np.array(X)[indices2]
# ys = np.array(y)[indices2]
# # test.fit(X[:, 0], y)
# test.fit(X, y)
# result = test.get_cut_points()
# for item in result:
# print(
# f"Class={item['classNumber']} - ({item['start']:3d}, {item['end']:3d})"
# f" -> ({item['fromValue']:3.1f}, {item['toValue']:3.1f}]"
# )
# print(test.get_discretized_values())
# print(Xs, ys)
# print("**********************")
# test = [(0, 3), (4, 4), (5, 5), (6, 8), (9, 9)]
# print(ys)
# for start, end in test:
# print("Testing ", start, end, ys[:end], ys[end:])
# print("Information gain: ", information_gain(ys, ys[:end], ys[end:]))
# print(test.transform(X))
# print(X)
# print(indices)

BIN
test1.xlsx Normal file

Binary file not shown.