Refactor to assemble the pieces

2022-12-04 14:05:00 +01:00
parent 34a69622bc
commit 31c9b8a3a3
8 changed files with 69 additions and 108 deletions

BIN
cppfimdlp.cpython-310-darwin.so Executable file

Binary file not shown.

View File

@@ -12,21 +12,24 @@ namespace mdlp {
return os;
}
CPPFImdlp::CPPFImdlp() : proposed(true), precision(6), debug(false)
CPPFImdlp::CPPFImdlp() : proposal(true), precision(6), debug(false)
{
divider = pow(10, precision);
numClasses = 0;
}
CPPFImdlp::CPPFImdlp(bool proposed, int precision, bool debug) : proposed(proposed), precision(precision), debug(debug)
CPPFImdlp::CPPFImdlp(bool proposal, int precision, bool debug) : proposal(proposal), precision(precision), debug(debug)
{
divider = pow(10, precision);
numClasses = 0;
}
CPPFImdlp::~CPPFImdlp()
= default;
std::vector<cutPoint_t> CPPFImdlp::getCutPoints()
samples CPPFImdlp::getCutPoints()
{
return cutPoints;
samples output(cutPoints.size());
std::transform(cutPoints.begin(), cutPoints.end(), output.begin(),
[](cutPoint_t cut) { return cut.toValue; });
return output;
}
labels CPPFImdlp::getDiscretizedValues()
{
@@ -48,28 +51,19 @@ namespace mdlp {
this->xDiscretized = labels(X.size(), -1);
this->numClasses = Metrics::numClasses(y, indices, 0, X.size());
if (proposed) {
computeCutPointsProposed();
if (proposal) {
computeCutPointsProposal();
} else {
computeCutPointsOriginal();
}
filterCutPoints();
applyCutPoints();
return *this;
}
labels& CPPFImdlp::transform(samples& X_)
{
indices_t indices_transform = sortIndices(X_);
applyCutPoints();
return xDiscretized;
}
void CPPFImdlp::applyCutPoints()
{
// Apply cut points to the input vector
for (auto cut : cutPoints) {
for (size_t i = cut.start; i < cut.end; i++) {
xDiscretized[indices[i]] = cut.classNumber;
}
}
return *this;
}
bool CPPFImdlp::evaluateCutPoint(cutPoint_t rest, cutPoint_t candidate)
{
@@ -142,7 +136,7 @@ namespace mdlp {
}
cutPoints = filtered;
}
void CPPFImdlp::computeCutPointsProposed()
void CPPFImdlp::computeCutPointsProposal()
{
cutPoints_t cutPts;
cutPoint_t cutPoint;
@@ -206,7 +200,7 @@ namespace mdlp {
if (debug) {
std::cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, numElements + 1, numClasses) << std::endl;
for (auto cutPt : cutPts)
std::cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << " :Proposed: Cut point: " << cutPt;
std::cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << " :Proposal: Cut point: " << cutPt;
}
cutPoints = cutPts;
}
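
For reference, the effect of the refactored getCutPoints() is to flatten each cutPoint_t interval down to its upper bound (toValue), so callers get a plain vector of thresholds instead of structs. A minimal Python sketch of that mapping; the CutPoint mock is hypothetical and only mirrors the fields used in the tests, with sample values taken from the ComputeCutPointsOriginal test below:

from dataclasses import dataclass

@dataclass
class CutPoint:
    # Hypothetical mirror of cutPoint_t: an index range, a class label,
    # and the [fromValue, toValue) interval it covers.
    start: int
    end: int
    classNumber: int
    fromValue: float
    toValue: float

def get_cut_points(cut_points):
    # Same idea as std::transform(..., [](cutPoint_t cut) { return cut.toValue; })
    return [cut.toValue for cut in cut_points]

cuts = [CutPoint(0, 4, -1, -3.4028235e+38, 5.15),
        CutPoint(4, 6, -1, 5.15, 5.45),
        CutPoint(6, 10, -1, 5.45, 3.4028235e+38)]
print(get_cut_points(cuts))  # [5.15, 5.45, 3.4028235e+38]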

View File

@@ -5,7 +5,7 @@
namespace mdlp {
class CPPFImdlp {
protected:
bool proposed; // proposed algorithm or original algorithm
bool proposal; // proposed algorithm or original algorithm
int precision;
bool debug;
float divider;
@@ -19,21 +19,20 @@ namespace mdlp {
void setCutPoints(cutPoints_t);
static indices_t sortIndices(samples&);
void computeCutPointsOriginal();
void computeCutPointsProposed();
void computeCutPointsProposal();
bool evaluateCutPoint(cutPoint_t, cutPoint_t);
void filterCutPoints();
void applyCutPoints();
public:
CPPFImdlp();
CPPFImdlp(bool, int, bool debug = false);
~CPPFImdlp();
cutPoints_t getCutPoints();
samples getCutPoints();
indices_t getIndices();
labels getDiscretizedValues();
void debugPoints(samples&, labels&);
CPPFImdlp& fit(samples&, labels&);
labels& transform(samples&);
labels transform(samples&);
};
}
#endif

View File

@@ -12,9 +12,8 @@ cdef extern from "CPPFImdlp.h" namespace "mdlp":
CPPFImdlp() except +
CPPFImdlp(bool, int, bool) except +
CPPFImdlp& fit(vector[float]&, vector[int]&)
vector[int] transform(vector[float]&)
vector[int] getDiscretizedValues()
vector[CutPointBody] getCutPoints()
vector[float] getCutPoints()
class PcutPoint_t:
@@ -26,16 +25,14 @@ class PcutPoint_t:
cdef class CFImdlp:
cdef CPPFImdlp *thisptr
def __cinit__(self, precision=6, debug=False, proposed=True):
# Proposed or original algorithm
self.thisptr = new CPPFImdlp(proposed, precision, debug)
def __cinit__(self, precision=6, debug=False, proposal=True):
# Proposal or original algorithm
self.thisptr = new CPPFImdlp(proposal, precision, debug)
def __dealloc__(self):
del self.thisptr
def fit(self, X, y):
self.thisptr.fit(X, y)
return self
def transform(self, X):
return self.thisptr.transform(X)
def get_discretized_values(self):
return self.thisptr.getDiscretizedValues()
def get_cut_points(self):
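
A quick usage sketch of the renamed wrapper, following the example script at the end of this commit; the cppfimdlp module name is inferred from the compiled extension above and is otherwise an assumption:

from sklearn.datasets import load_iris
from cppfimdlp import CFImdlp  # module name assumed from cppfimdlp.cpython-310-darwin.so

data = load_iris()
X, y = data.data, data.target

clf = CFImdlp(precision=6, debug=False, proposal=True)  # 'proposal' replaces 'proposed'
clf.fit(X[:, 0], y)            # fit() returns self
print(clf.get_cut_points())    # flat list of float thresholds, no cut-point structs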

View File

@@ -6,6 +6,9 @@ from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
class FImdlp(TransformerMixin, BaseEstimator):
def __init__(self, proposal=True):
self.proposal = proposal # proposed algorithm or original algorithm
"""Fayyad - Irani MDLP discretization algorithm.
Parameters
@@ -57,11 +60,18 @@ class FImdlp(TransformerMixin, BaseEstimator):
X, y = self._check_params_fit(
X, y, expected_args=["class_name", "features"], kwargs=kwargs
)
self.n_features_ = X.shape[1]
self.X_ = X
self.y_ = y
self.discretizer_ = CFImdlp(debug=True, proposed=False)
self.discretizer_ = [None] * self.n_features_
self.cut_points_ = [None] * self.n_features_
# Can do it in parallel
for feature in range(self.n_features_):
self.discretizer_[feature] = CFImdlp(proposal=self.proposal)
self.discretizer_[feature].fit(X[:, feature], y)
self.cut_points_[feature] = self.discretizer_[
feature
].get_cut_points()
return self
def transform(self, X):
@@ -91,6 +101,15 @@ class FImdlp(TransformerMixin, BaseEstimator):
raise ValueError(
"Shape of input is different from what was seen in `fit`"
)
result = np.zeros_like(X, dtype=np.int32) - 1
# Can do it in parallel
for feature in range(self.n_features_):
result[:, feature] = np.searchsorted(
self.cut_points_[feature], X[:, feature]
)
return result
def test(self):
print("Calculating cut points in python for first feature")
yz = self.y_.copy()
xz = X[:, 0].copy()
@@ -102,7 +121,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
print(f"Cut point: ({xz[i-1]}, {xz[i]}) ({yz[i-1]}, {yz[i]})")
cuts.append((xz[i] + xz[i - 1]) / 2)
print("Cuts calculados en python: ", cuts)
print("Cuts calculados en C++")
print("-- Cuts calculados en C++ --")
print("Cut points for each feature in Iris dataset:")
for i in range(0, 1):
# datax = self.X_[np.argsort(self.X_[:, i]), i]
@@ -123,12 +142,6 @@ class FImdlp(TransformerMixin, BaseEstimator):
print(X_translated)
print("*******************************")
print("Disretized values:")
print(self.discretizer_.transform(datax))
print(self.discretizer_.get_discretized_values())
print("*******************************")
print("indices:", np.argsort(X[:, 0]))
# Xcutpoints = self.discretizer_.cut_points_ant(datax, y_)
# print(
# f"Ant ({len(Xcutpoints)}):{self.features_[i]:20s}: "
# f"{Xcutpoints}"
# )
return X
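
The rewritten transform() delegates the binning itself to NumPy: for every feature, np.searchsorted maps each value to the number of learned cut points strictly below it. A standalone sketch with made-up thresholds:

import numpy as np

# Hypothetical cut points learned for one feature, and a few raw values.
cut_points = [5.15, 5.45, 5.85]
x = np.array([4.9, 5.15, 5.3, 5.7, 6.1])

# side='left' (the default) counts the cut points strictly below each value,
# which is what result[:, feature] = np.searchsorted(...) computes per column.
print(np.searchsorted(cut_points, x))  # [0 0 1 2 3]

Since every column of result is overwritten in the loop, the -1 fill from np.zeros_like(X) - 1 only acts as a sentinel for unprocessed entries.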

View File

@@ -37,6 +37,19 @@ namespace mdlp {
prev = X[testSortedIndices[i]];
}
}
void checkCutPoints(cutPoints_t& expected)
{
int expectedSize = expected.size();
EXPECT_EQ(cutPoints.size(), expectedSize);
for (auto i = 0; i < expectedSize; i++) {
EXPECT_EQ(cutPoints[i].start, expected[i].start);
EXPECT_EQ(cutPoints[i].end, expected[i].end);
EXPECT_EQ(cutPoints[i].classNumber, expected[i].classNumber);
EXPECT_NEAR(cutPoints[i].fromValue, expected[i].fromValue, precision);
EXPECT_NEAR(cutPoints[i].toValue, expected[i].toValue, precision);
}
}
};
TEST_F(TestFImdlp, SortIndices)
{
@@ -60,22 +73,13 @@ namespace mdlp {
TEST_F(TestFImdlp, ComputeCutPointsOriginal)
{
cutPoints_t computed, expected;
int expectedSize = 3;
expected = {
{ 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 },
{ 6, 10, -1, 5.45, 3.4028234663852886e+38 }
};
setCutPoints(cutPoints_t());
computeCutPointsOriginal();
computed = getCutPoints();
EXPECT_EQ(computed.size(), expectedSize);
for (auto i = 0; i < expectedSize; i++) {
EXPECT_EQ(computed[i].start, expected[i].start);
EXPECT_EQ(computed[i].end, expected[i].end);
EXPECT_EQ(computed[i].classNumber, expected[i].classNumber);
EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision);
EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision);
}
checkCutPoints(expected);
}
TEST_F(TestFImdlp, ComputeCutPointsOriginalGCase)
{
@@ -83,22 +87,13 @@ namespace mdlp {
expected = {
{ 0, 4, -1, -3.4028234663852886e+38, 3.4028234663852886e+38 },
};
int expectedSize = 1;
X = { 0, 1, 2, 2 };
y = { 1, 1, 1, 2 };
fit(X, y);
computeCutPointsOriginal();
computed = getCutPoints();
EXPECT_EQ(computed.size(), expectedSize);
for (auto i = 0; i < expectedSize; i++) {
EXPECT_EQ(computed[i].start, expected[i].start);
EXPECT_EQ(computed[i].end, expected[i].end);
EXPECT_EQ(computed[i].classNumber, expected[i].classNumber);
EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision);
EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision);
}
checkCutPoints(expected);
}
TEST_F(TestFImdlp, ComputeCutPointsProposed)
TEST_F(TestFImdlp, ComputeCutPointsProposal)
{
cutPoints_t computed, expected;
expected = {
@@ -106,57 +101,20 @@ namespace mdlp {
{ 6, 9, -1, 5.4, 5.85 },
{ 9, 10, -1, 5.85, 3.4028234663852886e+38 }
};
int expectedSize = 4;
computeCutPointsProposed();
computed = getCutPoints();
EXPECT_EQ(computed.size(), expectedSize);
for (auto i = 0; i < expectedSize; i++) {
EXPECT_EQ(computed[i].start, expected[i].start);
EXPECT_EQ(computed[i].end, expected[i].end);
EXPECT_EQ(computed[i].classNumber, expected[i].classNumber);
EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision);
EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision);
}
computeCutPointsProposal();
checkCutPoints(expected);
}
TEST_F(TestFImdlp, ComputeCutPointsProposedGCase)
TEST_F(TestFImdlp, ComputeCutPointsProposalGCase)
{
cutPoints_t computed, expected;
expected = {
{ 0, 3, -1, -3.4028234663852886e+38, 1.5 },
{ 3, 4, -1, 1.5, 3.4028234663852886e+38 }
};
int expectedSize = 2;
X = { 0, 1, 2, 2 };
y = { 1, 1, 1, 2 };
fit(X, y);
computeCutPointsProposed();
computed = getCutPoints();
EXPECT_EQ(computed.size(), expectedSize);
for (auto i = 0; i < expectedSize; i++) {
EXPECT_EQ(computed[i].start, expected[i].start);
EXPECT_EQ(computed[i].end, expected[i].end);
EXPECT_EQ(computed[i].classNumber, expected[i].classNumber);
EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision);
EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision);
}
}
TEST_F(TestFImdlp, ApplyCutPoints)
{
cutPoints_t expected = {
{ 0, 4, 17, -3.4028234663852886e+38, 5.1 }, { 4, 6, 31, 5.1, 5.4 },
{ 6, 8, 59, 5.4, 5.85 },
{ 8, 10, 41, 5.85, 3.4028234663852886e+38 }
};
setCutPoints(expected);
applyCutPoints();
labels expected_x = getDiscretizedValues();
indices_t indices_x = getIndices();
for (auto i = 0; i < 5; i++) {
std::cout << "cutPoint[" << i << "].start = " << expected[i].start << std::endl;
for (auto j = expected[i].start; j < expected[i].end; j++) {
std::cout << expected_x[j] << expected[i].classNumber << std::endl;
EXPECT_EQ(expected_x[indices_x[j]], expected[i].classNumber);
}
}
computeCutPointsProposal();
checkCutPoints(expected);
}
}

View File

@@ -65,11 +65,11 @@ features = data.feature_names
# test.fit(X, y, features=features)
# test.transform(X)
# test.get_cut_points()
for proposed in [True, False]:
for proposal in [True, False]:
X = data.data
y = data.target
print("*** Proposed: ", proposed)
test = CFImdlp(debug=True, proposed=proposed)
print("*** Proposal: ", proposal)
test = CFImdlp(debug=True, proposal=proposal)
test.fit(X[:, 0], y)
result = test.get_cut_points()
for item in result:
@@ -118,7 +118,7 @@ for proposed in [True, False]:
# X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9]
# y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
# indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7]
# clf = CFImdlp(debug=True, proposed=False)
# clf = CFImdlp(debug=True, proposal=False)
# clf.fit(X, y)
# print(clf.get_cut_points())
# y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]