mirror of
https://github.com/Doctorado-ML/FImdlp.git
synced 2025-08-16 16:05:52 +00:00
Refactor to ensemble the pieces
This commit is contained in:
BIN
cppfimdlp.cpython-310-darwin.so
Executable file
BIN
cppfimdlp.cpython-310-darwin.so
Executable file
Binary file not shown.
@@ -12,21 +12,24 @@ namespace mdlp {
|
||||
return os;
|
||||
|
||||
}
|
||||
CPPFImdlp::CPPFImdlp() : proposed(true), precision(6), debug(false)
|
||||
CPPFImdlp::CPPFImdlp() : proposal(true), precision(6), debug(false)
|
||||
{
|
||||
divider = pow(10, precision);
|
||||
numClasses = 0;
|
||||
}
|
||||
CPPFImdlp::CPPFImdlp(bool proposed, int precision, bool debug) : proposed(proposed), precision(precision), debug(debug)
|
||||
CPPFImdlp::CPPFImdlp(bool proposal, int precision, bool debug) : proposal(proposal), precision(precision), debug(debug)
|
||||
{
|
||||
divider = pow(10, precision);
|
||||
numClasses = 0;
|
||||
}
|
||||
CPPFImdlp::~CPPFImdlp()
|
||||
= default;
|
||||
std::vector<cutPoint_t> CPPFImdlp::getCutPoints()
|
||||
samples CPPFImdlp::getCutPoints()
|
||||
{
|
||||
return cutPoints;
|
||||
samples output(cutPoints.size());
|
||||
std::transform(cutPoints.begin(), cutPoints.end(), output.begin(),
|
||||
[](cutPoint_t cut) { return cut.toValue; });
|
||||
return output;
|
||||
}
|
||||
labels CPPFImdlp::getDiscretizedValues()
|
||||
{
|
||||
@@ -48,28 +51,19 @@ namespace mdlp {
|
||||
this->xDiscretized = labels(X.size(), -1);
|
||||
this->numClasses = Metrics::numClasses(y, indices, 0, X.size());
|
||||
|
||||
if (proposed) {
|
||||
computeCutPointsProposed();
|
||||
if (proposal) {
|
||||
computeCutPointsProposal();
|
||||
} else {
|
||||
computeCutPointsOriginal();
|
||||
}
|
||||
filterCutPoints();
|
||||
applyCutPoints();
|
||||
return *this;
|
||||
}
|
||||
labels& CPPFImdlp::transform(samples& X_)
|
||||
{
|
||||
indices_t indices_transform = sortIndices(X_);
|
||||
applyCutPoints();
|
||||
return xDiscretized;
|
||||
}
|
||||
void CPPFImdlp::applyCutPoints()
|
||||
{
|
||||
// Apply cut points to the input vector
|
||||
for (auto cut : cutPoints) {
|
||||
for (size_t i = cut.start; i < cut.end; i++) {
|
||||
xDiscretized[indices[i]] = cut.classNumber;
|
||||
}
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
bool CPPFImdlp::evaluateCutPoint(cutPoint_t rest, cutPoint_t candidate)
|
||||
{
|
||||
@@ -142,7 +136,7 @@ namespace mdlp {
|
||||
}
|
||||
cutPoints = filtered;
|
||||
}
|
||||
void CPPFImdlp::computeCutPointsProposed()
|
||||
void CPPFImdlp::computeCutPointsProposal()
|
||||
{
|
||||
cutPoints_t cutPts;
|
||||
cutPoint_t cutPoint;
|
||||
@@ -206,7 +200,7 @@ namespace mdlp {
|
||||
if (debug) {
|
||||
std::cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, numElements + 1, numClasses) << std::endl;
|
||||
for (auto cutPt : cutPts)
|
||||
std::cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << " :Proposed: Cut point: " << cutPt;
|
||||
std::cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << " :Proposal: Cut point: " << cutPt;
|
||||
}
|
||||
cutPoints = cutPts;
|
||||
}
|
||||
|
@@ -5,7 +5,7 @@
|
||||
namespace mdlp {
|
||||
class CPPFImdlp {
|
||||
protected:
|
||||
bool proposed; // proposed algorithm or original algorithm
|
||||
bool proposal; // proposed algorithm or original algorithm
|
||||
int precision;
|
||||
bool debug;
|
||||
float divider;
|
||||
@@ -19,21 +19,20 @@ namespace mdlp {
|
||||
void setCutPoints(cutPoints_t);
|
||||
static indices_t sortIndices(samples&);
|
||||
void computeCutPointsOriginal();
|
||||
void computeCutPointsProposed();
|
||||
void computeCutPointsProposal();
|
||||
bool evaluateCutPoint(cutPoint_t, cutPoint_t);
|
||||
void filterCutPoints();
|
||||
void applyCutPoints();
|
||||
|
||||
public:
|
||||
CPPFImdlp();
|
||||
CPPFImdlp(bool, int, bool debug = false);
|
||||
~CPPFImdlp();
|
||||
cutPoints_t getCutPoints();
|
||||
samples getCutPoints();
|
||||
indices_t getIndices();
|
||||
labels getDiscretizedValues();
|
||||
void debugPoints(samples&, labels&);
|
||||
CPPFImdlp& fit(samples&, labels&);
|
||||
labels& transform(samples&);
|
||||
labels transform(samples&);
|
||||
};
|
||||
}
|
||||
#endif
|
@@ -12,9 +12,8 @@ cdef extern from "CPPFImdlp.h" namespace "mdlp":
|
||||
CPPFImdlp() except +
|
||||
CPPFImdlp(bool, int, bool) except +
|
||||
CPPFImdlp& fit(vector[float]&, vector[int]&)
|
||||
vector[int] transform(vector[float]&)
|
||||
vector[int] getDiscretizedValues()
|
||||
vector[CutPointBody] getCutPoints()
|
||||
vector[float] getCutPoints()
|
||||
|
||||
|
||||
class PcutPoint_t:
|
||||
@@ -26,16 +25,14 @@ class PcutPoint_t:
|
||||
|
||||
cdef class CFImdlp:
|
||||
cdef CPPFImdlp *thisptr
|
||||
def __cinit__(self, precision=6, debug=False, proposed=True):
|
||||
# Proposed or original algorithm
|
||||
self.thisptr = new CPPFImdlp(proposed, precision, debug)
|
||||
def __cinit__(self, precision=6, debug=False, proposal=True):
|
||||
# Proposal or original algorithm
|
||||
self.thisptr = new CPPFImdlp(proposal, precision, debug)
|
||||
def __dealloc__(self):
|
||||
del self.thisptr
|
||||
def fit(self, X, y):
|
||||
self.thisptr.fit(X, y)
|
||||
return self
|
||||
def transform(self, X):
|
||||
return self.thisptr.transform(X)
|
||||
def get_discretized_values(self):
|
||||
return self.thisptr.getDiscretizedValues()
|
||||
def get_cut_points(self):
|
||||
|
Binary file not shown.
@@ -6,6 +6,9 @@ from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
|
||||
|
||||
|
||||
class FImdlp(TransformerMixin, BaseEstimator):
|
||||
def __init__(self, proposal=True):
|
||||
self.proposal = proposal # proposed algorithm or original algorithm
|
||||
|
||||
"""Fayyad - Irani MDLP discretization algorithm.
|
||||
|
||||
Parameters
|
||||
@@ -57,11 +60,18 @@ class FImdlp(TransformerMixin, BaseEstimator):
|
||||
X, y = self._check_params_fit(
|
||||
X, y, expected_args=["class_name", "features"], kwargs=kwargs
|
||||
)
|
||||
|
||||
self.n_features_ = X.shape[1]
|
||||
self.X_ = X
|
||||
self.y_ = y
|
||||
self.discretizer_ = CFImdlp(debug=True, proposed=False)
|
||||
self.discretizer_ = [None] * self.n_features_
|
||||
self.cut_points_ = [None] * self.n_features_
|
||||
# Can do it in parallel
|
||||
for feature in range(self.n_features_):
|
||||
self.discretizer_[feature] = CFImdlp(proposal=self.proposal)
|
||||
self.discretizer_[feature].fit(X[:, feature], y)
|
||||
self.cut_points_[feature] = self.discretizer_[
|
||||
feature
|
||||
].get_cut_points()
|
||||
return self
|
||||
|
||||
def transform(self, X):
|
||||
@@ -91,6 +101,15 @@ class FImdlp(TransformerMixin, BaseEstimator):
|
||||
raise ValueError(
|
||||
"Shape of input is different from what was seen in `fit`"
|
||||
)
|
||||
result = np.zeros_like(X, dtype=np.int32) - 1
|
||||
# Can do it in parallel
|
||||
for feature in range(self.n_features_):
|
||||
result[:, feature] = np.searchsorted(
|
||||
self.cut_points_[feature], X[:, feature]
|
||||
)
|
||||
return result
|
||||
|
||||
def test(self):
|
||||
print("Calculating cut points in python for first feature")
|
||||
yz = self.y_.copy()
|
||||
xz = X[:, 0].copy()
|
||||
@@ -102,7 +121,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
|
||||
print(f"Cut point: ({xz[i-1]}, {xz[i]}) ({yz[i-1]}, {yz[i]})")
|
||||
cuts.append((xz[i] + xz[i - 1]) / 2)
|
||||
print("Cuts calculados en python: ", cuts)
|
||||
print("Cuts calculados en C++")
|
||||
print("-- Cuts calculados en C++ --")
|
||||
print("Cut points for each feature in Iris dataset:")
|
||||
for i in range(0, 1):
|
||||
# datax = self.X_[np.argsort(self.X_[:, i]), i]
|
||||
@@ -123,12 +142,6 @@ class FImdlp(TransformerMixin, BaseEstimator):
|
||||
print(X_translated)
|
||||
print("*******************************")
|
||||
print("Disretized values:")
|
||||
print(self.discretizer_.transform(datax))
|
||||
print(self.discretizer_.get_discretized_values())
|
||||
print("*******************************")
|
||||
print("indices:", np.argsort(X[:, 0]))
|
||||
# Xcutpoints = self.discretizer_.cut_points_ant(datax, y_)
|
||||
# print(
|
||||
# f"Ant ({len(Xcutpoints)}):{self.features_[i]:20s}: "
|
||||
# f"{Xcutpoints}"
|
||||
# )
|
||||
return X
|
||||
|
@@ -37,6 +37,19 @@ namespace mdlp {
|
||||
prev = X[testSortedIndices[i]];
|
||||
}
|
||||
}
|
||||
void checkCutPoints(cutPoints_t& expected)
|
||||
{
|
||||
int expectedSize = expected.size();
|
||||
EXPECT_EQ(cutPoints.size(), expectedSize);
|
||||
for (auto i = 0; i < expectedSize; i++) {
|
||||
EXPECT_EQ(cutPoints[i].start, expected[i].start);
|
||||
EXPECT_EQ(cutPoints[i].end, expected[i].end);
|
||||
EXPECT_EQ(cutPoints[i].classNumber, expected[i].classNumber);
|
||||
EXPECT_NEAR(cutPoints[i].fromValue, expected[i].fromValue, precision);
|
||||
EXPECT_NEAR(cutPoints[i].toValue, expected[i].toValue, precision);
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
TEST_F(TestFImdlp, SortIndices)
|
||||
{
|
||||
@@ -60,22 +73,13 @@ namespace mdlp {
|
||||
TEST_F(TestFImdlp, ComputeCutPointsOriginal)
|
||||
{
|
||||
cutPoints_t computed, expected;
|
||||
int expectedSize = 3;
|
||||
expected = {
|
||||
{ 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 },
|
||||
{ 6, 10, -1, 5.45, 3.4028234663852886e+38 }
|
||||
};
|
||||
setCutPoints(cutPoints_t());
|
||||
computeCutPointsOriginal();
|
||||
computed = getCutPoints();
|
||||
EXPECT_EQ(computed.size(), expectedSize);
|
||||
for (auto i = 0; i < expectedSize; i++) {
|
||||
EXPECT_EQ(computed[i].start, expected[i].start);
|
||||
EXPECT_EQ(computed[i].end, expected[i].end);
|
||||
EXPECT_EQ(computed[i].classNumber, expected[i].classNumber);
|
||||
EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision);
|
||||
EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision);
|
||||
}
|
||||
checkCutPoints(expected);
|
||||
}
|
||||
TEST_F(TestFImdlp, ComputeCutPointsOriginalGCase)
|
||||
{
|
||||
@@ -83,22 +87,13 @@ namespace mdlp {
|
||||
expected = {
|
||||
{ 0, 4, -1, -3.4028234663852886e+38, 3.4028234663852886e+38 },
|
||||
};
|
||||
int expectedSize = 1;
|
||||
X = { 0, 1, 2, 2 };
|
||||
y = { 1, 1, 1, 2 };
|
||||
fit(X, y);
|
||||
computeCutPointsOriginal();
|
||||
computed = getCutPoints();
|
||||
EXPECT_EQ(computed.size(), expectedSize);
|
||||
for (auto i = 0; i < expectedSize; i++) {
|
||||
EXPECT_EQ(computed[i].start, expected[i].start);
|
||||
EXPECT_EQ(computed[i].end, expected[i].end);
|
||||
EXPECT_EQ(computed[i].classNumber, expected[i].classNumber);
|
||||
EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision);
|
||||
EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision);
|
||||
}
|
||||
checkCutPoints(expected);
|
||||
}
|
||||
TEST_F(TestFImdlp, ComputeCutPointsProposed)
|
||||
TEST_F(TestFImdlp, ComputeCutPointsProposal)
|
||||
{
|
||||
cutPoints_t computed, expected;
|
||||
expected = {
|
||||
@@ -106,57 +101,20 @@ namespace mdlp {
|
||||
{ 6, 9, -1, 5.4, 5.85 },
|
||||
{ 9, 10, -1, 5.85, 3.4028234663852886e+38 }
|
||||
};
|
||||
int expectedSize = 4;
|
||||
computeCutPointsProposed();
|
||||
computed = getCutPoints();
|
||||
EXPECT_EQ(computed.size(), expectedSize);
|
||||
for (auto i = 0; i < expectedSize; i++) {
|
||||
EXPECT_EQ(computed[i].start, expected[i].start);
|
||||
EXPECT_EQ(computed[i].end, expected[i].end);
|
||||
EXPECT_EQ(computed[i].classNumber, expected[i].classNumber);
|
||||
EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision);
|
||||
EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision);
|
||||
}
|
||||
computeCutPointsProposal();
|
||||
checkCutPoints(expected);
|
||||
}
|
||||
TEST_F(TestFImdlp, ComputeCutPointsProposedGCase)
|
||||
TEST_F(TestFImdlp, ComputeCutPointsProposalGCase)
|
||||
{
|
||||
cutPoints_t computed, expected;
|
||||
expected = {
|
||||
{ 0, 3, -1, -3.4028234663852886e+38, 1.5 },
|
||||
{ 3, 4, -1, 1.5, 3.4028234663852886e+38 }
|
||||
};
|
||||
int expectedSize = 2;
|
||||
X = { 0, 1, 2, 2 };
|
||||
y = { 1, 1, 1, 2 };
|
||||
fit(X, y);
|
||||
computeCutPointsProposed();
|
||||
computed = getCutPoints();
|
||||
EXPECT_EQ(computed.size(), expectedSize);
|
||||
for (auto i = 0; i < expectedSize; i++) {
|
||||
EXPECT_EQ(computed[i].start, expected[i].start);
|
||||
EXPECT_EQ(computed[i].end, expected[i].end);
|
||||
EXPECT_EQ(computed[i].classNumber, expected[i].classNumber);
|
||||
EXPECT_NEAR(computed[i].fromValue, expected[i].fromValue, precision);
|
||||
EXPECT_NEAR(computed[i].toValue, expected[i].toValue, precision);
|
||||
}
|
||||
}
|
||||
TEST_F(TestFImdlp, ApplyCutPoints)
|
||||
{
|
||||
cutPoints_t expected = {
|
||||
{ 0, 4, 17, -3.4028234663852886e+38, 5.1 }, { 4, 6, 31, 5.1, 5.4 },
|
||||
{ 6, 8, 59, 5.4, 5.85 },
|
||||
{ 8, 10, 41, 5.85, 3.4028234663852886e+38 }
|
||||
};
|
||||
setCutPoints(expected);
|
||||
applyCutPoints();
|
||||
labels expected_x = getDiscretizedValues();
|
||||
indices_t indices_x = getIndices();
|
||||
for (auto i = 0; i < 5; i++) {
|
||||
std::cout << "cutPoint[" << i << "].start = " << expected[i].start << std::endl;
|
||||
for (auto j = expected[i].start; j < expected[i].end; j++) {
|
||||
std::cout << expected_x[j] << expected[i].classNumber << std::endl;
|
||||
EXPECT_EQ(expected_x[indices_x[j]], expected[i].classNumber);
|
||||
}
|
||||
}
|
||||
computeCutPointsProposal();
|
||||
checkCutPoints(expected);
|
||||
}
|
||||
}
|
@@ -65,11 +65,11 @@ features = data.feature_names
|
||||
# test.fit(X, y, features=features)
|
||||
# test.transform(X)
|
||||
# test.get_cut_points()
|
||||
for proposed in [True, False]:
|
||||
for proposal in [True, False]:
|
||||
X = data.data
|
||||
y = data.target
|
||||
print("*** Proposed: ", proposed)
|
||||
test = CFImdlp(debug=True, proposed=proposed)
|
||||
print("*** Proposal: ", proposal)
|
||||
test = CFImdlp(debug=True, proposal=proposal)
|
||||
test.fit(X[:, 0], y)
|
||||
result = test.get_cut_points()
|
||||
for item in result:
|
||||
@@ -118,7 +118,7 @@ for proposed in [True, False]:
|
||||
# X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9]
|
||||
# y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
|
||||
# indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7]
|
||||
# clf = CFImdlp(debug=True, proposed=False)
|
||||
# clf = CFImdlp(debug=True, proposal=False)
|
||||
# clf.fit(X, y)
|
||||
# print(clf.get_cut_points())
|
||||
# y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
|
||||
|
Reference in New Issue
Block a user