mirror of
https://github.com/Doctorado-ML/FImdlp.git
synced 2025-08-17 00:15:52 +00:00
Enhance cutpoints computation
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -129,3 +129,5 @@ dmypy.json
|
|||||||
.pyre/
|
.pyre/
|
||||||
cfimdlp.cpp
|
cfimdlp.cpp
|
||||||
.vscode/*
|
.vscode/*
|
||||||
|
**/.idea/*
|
||||||
|
|
||||||
|
2
Makefile
2
Makefile
@@ -8,7 +8,7 @@ clean: ## Clean up
|
|||||||
if [ -f fimdlp/cppfimdlp.cpython-310-darwin.so ]; then rm fimdlp/cppfimdlp.cpython-310-darwin.so; fi;
|
if [ -f fimdlp/cppfimdlp.cpython-310-darwin.so ]; then rm fimdlp/cppfimdlp.cpython-310-darwin.so; fi;
|
||||||
|
|
||||||
test:
|
test:
|
||||||
cd fimdlp/testcpp && ./test.sh
|
cd fimdlp/testcpp && ./test
|
||||||
|
|
||||||
lint: ## Lint and static-check
|
lint: ## Lint and static-check
|
||||||
black fimdlp
|
black fimdlp
|
||||||
|
@@ -5,18 +5,26 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include "Metrics.h"
|
#include "Metrics.h"
|
||||||
namespace mdlp {
|
namespace mdlp {
|
||||||
CPPFImdlp::CPPFImdlp() : debug(false), precision(6)
|
std::ostream& operator << (std::ostream& os, const cutPoint_t& cut)
|
||||||
|
{
|
||||||
|
os << cut.classNumber << " -> (" << cut.start << ", " << cut.end <<
|
||||||
|
") - (" << cut.fromValue << ", " << cut.toValue << ") "
|
||||||
|
<< std::endl;
|
||||||
|
return os;
|
||||||
|
|
||||||
|
}
|
||||||
|
CPPFImdlp::CPPFImdlp() : proposed(true), precision(6), debug(false)
|
||||||
{
|
{
|
||||||
divider = pow(10, precision);
|
divider = pow(10, precision);
|
||||||
}
|
}
|
||||||
CPPFImdlp::CPPFImdlp(int precision, bool debug) : debug(debug), precision(precision)
|
CPPFImdlp::CPPFImdlp(bool proposed, int precision, bool debug) : proposed(proposed), precision(precision), debug(debug)
|
||||||
{
|
{
|
||||||
divider = pow(10, precision);
|
divider = pow(10, precision);
|
||||||
}
|
}
|
||||||
CPPFImdlp::~CPPFImdlp()
|
CPPFImdlp::~CPPFImdlp()
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
std::vector<CutPoint_t> CPPFImdlp::getCutPoints()
|
std::vector<cutPoint_t> CPPFImdlp::getCutPoints()
|
||||||
{
|
{
|
||||||
return cutPoints;
|
return cutPoints;
|
||||||
}
|
}
|
||||||
@@ -32,7 +40,11 @@ namespace mdlp {
|
|||||||
this->xDiscretized = labels(X.size(), -1);
|
this->xDiscretized = labels(X.size(), -1);
|
||||||
this->numClasses = Metrics::numClasses(y, indices, 0, X.size());
|
this->numClasses = Metrics::numClasses(y, indices, 0, X.size());
|
||||||
|
|
||||||
computeCutPoints();
|
if (proposed) {
|
||||||
|
computeCutPointsProposed();
|
||||||
|
} else {
|
||||||
|
computeCutPointsOriginal();
|
||||||
|
}
|
||||||
filterCutPoints();
|
filterCutPoints();
|
||||||
applyCutPoints();
|
applyCutPoints();
|
||||||
}
|
}
|
||||||
@@ -64,7 +76,7 @@ namespace mdlp {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
bool CPPFImdlp::evaluateCutPoint(CutPoint_t rest, CutPoint_t candidate)
|
bool CPPFImdlp::evaluateCutPoint(cutPoint_t rest, cutPoint_t candidate)
|
||||||
{
|
{
|
||||||
int k, k1, k2;
|
int k, k1, k2;
|
||||||
float ig, delta;
|
float ig, delta;
|
||||||
@@ -73,7 +85,6 @@ namespace mdlp {
|
|||||||
if (N < 2) {
|
if (N < 2) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
k = Metrics::numClasses(y, indices, rest.start, rest.end);
|
k = Metrics::numClasses(y, indices, rest.start, rest.end);
|
||||||
k1 = Metrics::numClasses(y, indices, rest.start, candidate.end);
|
k1 = Metrics::numClasses(y, indices, rest.start, candidate.end);
|
||||||
k2 = Metrics::numClasses(y, indices, candidate.end, rest.end);
|
k2 = Metrics::numClasses(y, indices, candidate.end, rest.end);
|
||||||
@@ -83,15 +94,18 @@ namespace mdlp {
|
|||||||
ig = Metrics::informationGain(y, indices, rest.start, rest.end, candidate.end, numClasses);
|
ig = Metrics::informationGain(y, indices, rest.start, rest.end, candidate.end, numClasses);
|
||||||
delta = log2(pow(3, k) - 2) - (k * ent - k1 * ent1 - k2 * ent2);
|
delta = log2(pow(3, k) - 2) - (k * ent - k1 * ent1 - k2 * ent2);
|
||||||
float term = 1 / N * (log2(N - 1) + delta);
|
float term = 1 / N * (log2(N - 1) + delta);
|
||||||
std::cout << candidate
|
if (debug) {
|
||||||
|
std::cout << "Rest: " << rest;
|
||||||
|
std::cout << "Candidate: " << candidate;
|
||||||
std::cout << "k=" << k << " k1=" << k1 << " k2=" << k2 << " ent=" << ent << " ent1=" << ent1 << " ent2=" << ent2 << std::endl;
|
std::cout << "k=" << k << " k1=" << k1 << " k2=" << k2 << " ent=" << ent << " ent1=" << ent1 << " ent2=" << ent2 << std::endl;
|
||||||
std::cout << "ig=" << ig << " delta=" << delta << " N " << N << " term " << term << std::endl;
|
std::cout << "ig=" << ig << " delta=" << delta << " N " << N << " term " << term << std::endl;
|
||||||
|
}
|
||||||
return (ig > term);
|
return (ig > term);
|
||||||
}
|
}
|
||||||
void CPPFImdlp::filterCutPoints()
|
void CPPFImdlp::filterCutPoints()
|
||||||
{
|
{
|
||||||
std::vector<CutPoint_t> filtered;
|
cutPoints_t filtered;
|
||||||
CutPoint_t rest;
|
cutPoint_t rest;
|
||||||
int classNumber = 0;
|
int classNumber = 0;
|
||||||
|
|
||||||
rest.start = 0;
|
rest.start = 0;
|
||||||
@@ -116,24 +130,25 @@ namespace mdlp {
|
|||||||
item.classNumber = classNumber++;
|
item.classNumber = classNumber++;
|
||||||
filtered.push_back(item);
|
filtered.push_back(item);
|
||||||
first = false;
|
first = false;
|
||||||
|
rest.start = item.end;
|
||||||
} else {
|
} else {
|
||||||
std::cout << "Rejected" << std::endl;
|
std::cout << "Rejected" << std::endl;
|
||||||
lastReject = true;
|
lastReject = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!first)
|
if (!first) {
|
||||||
filtered.back().toValue = std::numeric_limits<float>::max();
|
filtered.back().toValue = std::numeric_limits<float>::max();
|
||||||
else {
|
filtered.back().end = X.size();
|
||||||
|
} else {
|
||||||
filtered.push_back(rest);
|
filtered.push_back(rest);
|
||||||
}
|
}
|
||||||
|
|
||||||
cutPoints = filtered;
|
cutPoints = filtered;
|
||||||
}
|
}
|
||||||
void CPPFImdlp::computeCutPoints()
|
void CPPFImdlp::computeCutPointsProposed()
|
||||||
{
|
{
|
||||||
|
cutPoints_t cutPts;
|
||||||
std::vector<CutPoint_t> cutPts;
|
cutPoint_t cutPoint;
|
||||||
CutPoint_t cutPoint;
|
|
||||||
indices_t cutIdx;
|
indices_t cutIdx;
|
||||||
float xPrev, xCur, xPivot;
|
float xPrev, xCur, xPivot;
|
||||||
int yPrev, yCur, yPivot;
|
int yPrev, yCur, yPivot;
|
||||||
@@ -196,38 +211,56 @@ namespace mdlp {
|
|||||||
}
|
}
|
||||||
cutPoints = cutPts;
|
cutPoints = cutPts;
|
||||||
}
|
}
|
||||||
void CPPFImdlp::computeCutPointsAnt()
|
void CPPFImdlp::computeCutPointsOriginal()
|
||||||
{
|
{
|
||||||
samples cutPts;
|
cutPoints_t cutPts;
|
||||||
labels cutIdx;
|
cutPoint_t cutPoint;
|
||||||
float xPrev, cutPoint;
|
float xPrev = std::numeric_limits<float>::lowest();
|
||||||
int yPrev;
|
int yPrev;
|
||||||
size_t idxPrev;
|
bool first = true;
|
||||||
xPrev = X.at(indices[0]);
|
// idxPrev is the index of the init instance of the cutPoint
|
||||||
yPrev = y.at(indices[0]);
|
size_t index, idxPrev = 0, idx = indices[0];
|
||||||
idxPrev = indices[0];
|
xPrev = X[idx];
|
||||||
if (debug) {
|
yPrev = y[idx];
|
||||||
std::cout << "Entropy: " << Metrics::entropy(y, indices, 0, y.size(), Metrics::numClasses(y, indices, 0, indices.size())) << std::endl;
|
for (index = 0; index < size_t(indices.size()) - 1; index++) {
|
||||||
}
|
idx = indices[index];
|
||||||
for (auto index = indices.begin(); index != indices.end(); ++index) {
|
|
||||||
// Definition 2 Cut points are always on boundaries
|
// Definition 2 Cut points are always on boundaries
|
||||||
if (y.at(*index) != yPrev && xPrev < X.at(*index)) {
|
if (y[idx] != yPrev && xPrev < X[idx]) {
|
||||||
cutPoint = round(divider * (X.at(*index) + xPrev) / 2) / divider;
|
if (first) {
|
||||||
if (debug) {
|
first = false;
|
||||||
std::cout << "Cut point: " << (xPrev + X.at(*index)) / 2 << " //";
|
cutPoint.fromValue = std::numeric_limits<float>::lowest();
|
||||||
std::cout << X.at(*index) << " -> " << y.at(*index) << " yPrev= " << yPrev;
|
} else {
|
||||||
std::cout << "* (" << X.at(*index) << ", " << xPrev << ")="
|
cutPoint.fromValue = cutPts.back().toValue;
|
||||||
<< ((X.at(*index) + xPrev) / 2) << "idxPrev"
|
|
||||||
<< idxPrev << std::endl;
|
|
||||||
}
|
}
|
||||||
|
cutPoint.start = idxPrev;
|
||||||
|
cutPoint.end = index;
|
||||||
|
cutPoint.classNumber = -1;
|
||||||
|
cutPoint.toValue = round(divider * (X[idx] + xPrev) / 2) / divider;
|
||||||
|
if (debug) {
|
||||||
|
std::cout << "Cut point: " << cutPoint << " //";
|
||||||
|
std::cout << X[idx] << " -> " << y[idx] << " yPrev= "
|
||||||
|
<< yPrev << idxPrev << std::endl;
|
||||||
|
}
|
||||||
|
idxPrev = index;
|
||||||
cutPts.push_back(cutPoint);
|
cutPts.push_back(cutPoint);
|
||||||
cutIdx.push_back(idxPrev);
|
|
||||||
}
|
}
|
||||||
xPrev = X.at(*index);
|
xPrev = X[idx];
|
||||||
yPrev = y.at(*index);
|
yPrev = y[idx];
|
||||||
idxPrev = *index;
|
|
||||||
}
|
}
|
||||||
// cutPoints = cutPts;
|
std::cout << "Came to here" << first << std::endl;
|
||||||
|
if (first) {
|
||||||
|
cutPoint.start = 0;
|
||||||
|
cutPoint.classNumber = -1;
|
||||||
|
cutPoint.fromValue = std::numeric_limits<float>::lowest();
|
||||||
|
cutPoint.toValue = std::numeric_limits<float>::max();
|
||||||
|
cutPoints.push_back(cutPoint);
|
||||||
|
} else
|
||||||
|
cutPts.back().toValue = std::numeric_limits<float>::max();
|
||||||
|
cutPts.back().end = X.size();
|
||||||
|
if (debug)
|
||||||
|
for (auto cutPoint : cutPts)
|
||||||
|
std::cout << "Cut point: " << cutPoint << std::endl;
|
||||||
|
cutPoints = cutPts;
|
||||||
}
|
}
|
||||||
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
|
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
|
||||||
indices_t CPPFImdlp::sortIndices(samples& X)
|
indices_t CPPFImdlp::sortIndices(samples& X)
|
||||||
|
@@ -5,29 +5,30 @@
|
|||||||
namespace mdlp {
|
namespace mdlp {
|
||||||
class CPPFImdlp {
|
class CPPFImdlp {
|
||||||
private:
|
private:
|
||||||
bool debug;
|
bool proposed; // proposed algorithm or original algorithm
|
||||||
int precision;
|
int precision;
|
||||||
|
bool debug;
|
||||||
float divider;
|
float divider;
|
||||||
indices_t indices; // sorted indices to use with X and y
|
indices_t indices; // sorted indices to use with X and y
|
||||||
samples X;
|
samples X;
|
||||||
labels y;
|
labels y;
|
||||||
labels xDiscretized;
|
labels xDiscretized;
|
||||||
int numClasses;
|
int numClasses;
|
||||||
std::vector<CutPoint_t> cutPoints;
|
cutPoints_t cutPoints;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
indices_t sortIndices(samples&);
|
indices_t sortIndices(samples&);
|
||||||
void computeCutPointsAnt();
|
void computeCutPointsOriginal();
|
||||||
void computeCutPoints();
|
void computeCutPointsProposed();
|
||||||
bool evaluateCutPoint(CutPoint_t, CutPoint_t);
|
bool evaluateCutPoint(cutPoint_t, cutPoint_t);
|
||||||
void filterCutPoints();
|
void filterCutPoints();
|
||||||
void applyCutPoints();
|
void applyCutPoints();
|
||||||
|
|
||||||
public:
|
public:
|
||||||
CPPFImdlp();
|
CPPFImdlp();
|
||||||
CPPFImdlp(int, bool debug = false);
|
CPPFImdlp(bool, int, bool debug = false);
|
||||||
~CPPFImdlp();
|
~CPPFImdlp();
|
||||||
std::vector<CutPoint_t> getCutPoints();
|
cutPoints_t getCutPoints();
|
||||||
labels getDiscretizedValues();
|
labels getDiscretizedValues();
|
||||||
void debugPoints(samples&, labels&);
|
void debugPoints(samples&, labels&);
|
||||||
void fit(samples&, labels&);
|
void fit(samples&, labels&);
|
||||||
|
@@ -41,7 +41,7 @@ namespace mdlp {
|
|||||||
entropy = Metrics::entropy(y, indices, start, end, nClasses);
|
entropy = Metrics::entropy(y, indices, start, end, nClasses);
|
||||||
entropyLeft = Metrics::entropy(y, indices, start, cutPoint, nClassesLeft);
|
entropyLeft = Metrics::entropy(y, indices, start, cutPoint, nClassesLeft);
|
||||||
entropyRight = Metrics::entropy(y, indices, cutPoint, end, nClassesRight);
|
entropyRight = Metrics::entropy(y, indices, cutPoint, end, nClassesRight);
|
||||||
iGain = entropy - (float)nElementsLeft / nElements * entropyLeft - (float)nElementsRight / nElements * entropyRight;
|
iGain = entropy - ((float)nElementsLeft * entropyLeft + (float)nElementsRight * entropyRight) / nElements;
|
||||||
return iGain;
|
return iGain;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -10,7 +10,7 @@ cdef extern from "CPPFImdlp.h" namespace "mdlp":
|
|||||||
float fromValue, toValue;
|
float fromValue, toValue;
|
||||||
cdef cppclass CPPFImdlp:
|
cdef cppclass CPPFImdlp:
|
||||||
CPPFImdlp() except +
|
CPPFImdlp() except +
|
||||||
CPPFImdlp(int, bool) except +
|
CPPFImdlp(bool, int, bool) except +
|
||||||
void fit(vector[float]&, vector[int]&)
|
void fit(vector[float]&, vector[int]&)
|
||||||
vector[int] transform(vector[float]&)
|
vector[int] transform(vector[float]&)
|
||||||
vector[int] getDiscretizedValues()
|
vector[int] getDiscretizedValues()
|
||||||
@@ -18,7 +18,7 @@ cdef extern from "CPPFImdlp.h" namespace "mdlp":
|
|||||||
void debugPoints(vector[float]&, vector[int]&)
|
void debugPoints(vector[float]&, vector[int]&)
|
||||||
|
|
||||||
|
|
||||||
class PCutPoint_t:
|
class PcutPoint_t:
|
||||||
def __init__(self, start, end, fromValue, toValue):
|
def __init__(self, start, end, fromValue, toValue):
|
||||||
self.start = start
|
self.start = start
|
||||||
self.end = end
|
self.end = end
|
||||||
@@ -27,8 +27,9 @@ class PCutPoint_t:
|
|||||||
|
|
||||||
cdef class CFImdlp:
|
cdef class CFImdlp:
|
||||||
cdef CPPFImdlp *thisptr
|
cdef CPPFImdlp *thisptr
|
||||||
def __cinit__(self, precision=6, debug=False):
|
def __cinit__(self, precision=6, debug=False, proposed=True):
|
||||||
self.thisptr = new CPPFImdlp(precision, debug)
|
# Proposed or original algorithm
|
||||||
|
self.thisptr = new CPPFImdlp(proposed, precision, debug)
|
||||||
def __dealloc__(self):
|
def __dealloc__(self):
|
||||||
del self.thisptr
|
del self.thisptr
|
||||||
def fit(self, X, y):
|
def fit(self, X, y):
|
||||||
|
Binary file not shown.
@@ -61,7 +61,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
|
|||||||
self.n_features_ = X.shape[1]
|
self.n_features_ = X.shape[1]
|
||||||
self.X_ = X
|
self.X_ = X
|
||||||
self.y_ = y
|
self.y_ = y
|
||||||
self.discretizer_ = CFImdlp(debug=False)
|
self.discretizer_ = CFImdlp(debug=True, proposed=False)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def transform(self, X):
|
def transform(self, X):
|
||||||
@@ -104,19 +104,31 @@ class FImdlp(TransformerMixin, BaseEstimator):
|
|||||||
print("Cuts calculados en python: ", cuts)
|
print("Cuts calculados en python: ", cuts)
|
||||||
print("Cuts calculados en C++")
|
print("Cuts calculados en C++")
|
||||||
print("Cut points for each feature in Iris dataset:")
|
print("Cut points for each feature in Iris dataset:")
|
||||||
for i in range(0, self.n_features_):
|
for i in range(0, 1):
|
||||||
# datax = self.X_[np.argsort(self.X_[:, i]), i]
|
# datax = self.X_[np.argsort(self.X_[:, i]), i]
|
||||||
# y_ = self.y_[np.argsort(self.X_[:, i])]
|
# y_ = self.y_[np.argsort(self.X_[:, i])]
|
||||||
datax = self.X_[:, i]
|
datax = self.X_[:, i]
|
||||||
y_ = self.y_
|
y_ = self.y_
|
||||||
Xcutpoints = self.discretizer_.cut_points(datax, y_)
|
self.discretizer_.fit(datax, y_)
|
||||||
|
Xcutpoints = self.discretizer_.get_cut_points()
|
||||||
print(
|
print(
|
||||||
f"New ({len(Xcutpoints)}):{self.features_[i]:20s}: "
|
f"New ({len(Xcutpoints)}):{self.features_[i]:20s}: "
|
||||||
f"{Xcutpoints}"
|
f"{[i['toValue'] for i in Xcutpoints]}"
|
||||||
)
|
|
||||||
Xcutpoints = self.discretizer_.cut_points_ant(datax, y_)
|
|
||||||
print(
|
|
||||||
f"Ant ({len(Xcutpoints)}):{self.features_[i]:20s}: "
|
|
||||||
f"{Xcutpoints}"
|
|
||||||
)
|
)
|
||||||
|
X_translated = [
|
||||||
|
f"{i['classNumber']} - ({i['start']}, {i['end']}) - "
|
||||||
|
f"({i['fromValue']}, {i['toValue']})"
|
||||||
|
for i in Xcutpoints
|
||||||
|
]
|
||||||
|
print(X_translated)
|
||||||
|
print("*******************************")
|
||||||
|
print("Disretized values:")
|
||||||
|
print(self.discretizer_.transform(datax))
|
||||||
|
print("*******************************")
|
||||||
|
print("indices:", np.argsort(X[:, 0]))
|
||||||
|
# Xcutpoints = self.discretizer_.cut_points_ant(datax, y_)
|
||||||
|
# print(
|
||||||
|
# f"Ant ({len(Xcutpoints)}):{self.features_[i]:20s}: "
|
||||||
|
# f"{Xcutpoints}"
|
||||||
|
# )
|
||||||
return X
|
return X
|
||||||
|
@@ -30,7 +30,7 @@ namespace mdlp {
|
|||||||
prev = X[testSortedIndices[i]];
|
prev = X[testSortedIndices[i]];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
std::vector<CutPoint_t> testCutPoints(samples& X, indices_t& indices, labels& y)
|
std::vector<cutPoint_t> testCutPoints(samples& X, indices_t& indices, labels& y)
|
||||||
{
|
{
|
||||||
this->X = X;
|
this->X = X;
|
||||||
this->y = y;
|
this->y = y;
|
||||||
@@ -56,7 +56,7 @@ namespace mdlp {
|
|||||||
}
|
}
|
||||||
// TEST_F(TestMetrics, EvaluateCutPoint)
|
// TEST_F(TestMetrics, EvaluateCutPoint)
|
||||||
// {
|
// {
|
||||||
// CutPoint_t rest, candidate;
|
// cutPoint_t rest, candidate;
|
||||||
// rest.start = 0;
|
// rest.start = 0;
|
||||||
// rest.end = 10;
|
// rest.end = 10;
|
||||||
// candidate.start = 0;
|
// candidate.start = 0;
|
||||||
@@ -64,13 +64,13 @@ namespace mdlp {
|
|||||||
// float computed = evaluateCutPoint(rest, candidate);
|
// float computed = evaluateCutPoint(rest, candidate);
|
||||||
// ASSERT_NEAR(0.468996, computed, precision_test);
|
// ASSERT_NEAR(0.468996, computed, precision_test);
|
||||||
// }
|
// }
|
||||||
TEST_F(TestMetrics, ComputeCutPoints)
|
TEST_F(TestMetrics, ComputeCutPointsOriginal)
|
||||||
{
|
{
|
||||||
std::vector<CutPoint_t> computed, expected;
|
std::vector<cutPoint_t> computed, expected;
|
||||||
computeCutPoints();
|
computeCutPointsOriginal();
|
||||||
computed = getCutPoints();
|
computed = getCutPoints();
|
||||||
for (auto cut : computed) {
|
for (auto cut : computed) {
|
||||||
std::cout << "(" << cut.start << ", " << cut.end << ") -> (" << cut.fromValue << ", " << cut.toValue << ")" << std::endl;
|
std::cout << cut.classNumber << " -> (" << cut.start << ", " << cut.end << ") -> (" << cut.fromValue << ", " << cut.toValue << ")" << std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
@@ -1,12 +0,0 @@
|
|||||||
cmake -S . -B build -Wno-dev
|
|
||||||
if test $? -ne 0; then
|
|
||||||
echo "Error in creating build commands."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
cmake --build build
|
|
||||||
if test $? -ne 0; then
|
|
||||||
echo "Error in build command."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
cd build
|
|
||||||
ctest --output-on-failure
|
|
@@ -2,14 +2,15 @@
|
|||||||
#define TYPES_H
|
#define TYPES_H
|
||||||
#include <vector>
|
#include <vector>
|
||||||
namespace mdlp {
    // Description of one discretization interval.
    struct CutPointBody {
        size_t start, end; // indices of the sorted vector
        int classNumber; // class assigned to the cut point
        float fromValue, toValue; // bounds of the interval in feature values
    };
    typedef CutPointBody cutPoint_t;
    // Previous spelling of the alias, kept so that code written against the
    // old name still compiles.
    typedef CutPointBody CutPoint_t;
    typedef std::vector<float> samples;
    typedef std::vector<int> labels;
    typedef std::vector<size_t> indices_t;
    typedef std::vector<cutPoint_t> cutPoints_t;
}
|
||||||
#endif
|
#endif
|
103
sample.py
103
sample.py
@@ -2,6 +2,59 @@ from sklearn.datasets import load_iris
|
|||||||
from fimdlp.mdlp import FImdlp
|
from fimdlp.mdlp import FImdlp
|
||||||
from fimdlp.cppfimdlp import CFImdlp
|
from fimdlp.cppfimdlp import CFImdlp
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from math import log
|
||||||
|
|
||||||
|
|
||||||
|
def entropy(y: np.array) -> float:
    """Compute entropy of a labels set

    Parameters
    ----------
    y : np.array
        set of labels (non-negative integers, as required by np.bincount)

    Returns
    -------
    float
        entropy in bits; 0.0 when there are fewer than two labels or
        fewer than two distinct classes
    """
    n_labels = len(y)
    if n_labels <= 1:
        return 0.0
    proportions = np.bincount(y) / n_labels
    if np.count_nonzero(proportions) <= 1:
        return 0.0
    # Compute standard entropy, skipping classes that do not appear
    # (log is undefined at 0).
    return -sum(prop * log(prop, 2) for prop in proportions if prop != 0.0)
|
||||||
|
|
||||||
|
|
||||||
|
def information_gain(
    labels: np.array, labels_up: np.array, labels_dn: np.array
) -> float:
    """Compute the information gain of splitting a labels set in two

    Parameters
    ----------
    labels : np.array
        labels before the split
    labels_up : np.array
        labels of the first partition; None is treated as an empty partition
    labels_dn : np.array
        labels of the second partition; None is treated as an empty partition

    Returns
    -------
    float
        entropy of labels minus the size-weighted entropies of both
        partitions; 0.0 when both partitions are empty
    """
    imp_prev = entropy(labels)
    card_up = card_dn = imp_up = imp_dn = 0
    if labels_up is not None:
        card_up = labels_up.shape[0]
        imp_up = entropy(labels_up)
    if labels_dn is not None:
        card_dn = labels_dn.shape[0]
        imp_dn = entropy(labels_dn)
    samples = card_up + card_dn
    if samples == 0:
        # Nothing was split, so nothing was gained
        return 0.0
    return (
        imp_prev
        - (card_up / samples) * imp_up
        - (card_dn / samples) * imp_dn
    )
|
||||||
|
|
||||||
|
|
||||||
data = load_iris()
|
data = load_iris()
|
||||||
@@ -10,26 +63,38 @@ y = data.target
|
|||||||
features = data.feature_names
|
features = data.feature_names
|
||||||
test = FImdlp()
|
test = FImdlp()
|
||||||
test.fit(X, y, features=features)
|
test.fit(X, y, features=features)
|
||||||
# test.transform(X)
|
test.transform(X)
|
||||||
|
|
||||||
test = CFImdlp(debug=False)
|
# test = CFImdlp(debug=False)
|
||||||
# k = test.cut_points(X[:, 0], y)
|
# # k = test.cut_points(X[:, 0], y)
|
||||||
# print(k)
|
# # print(k)
|
||||||
# k = test.cut_points_ant(X[:, 0], y)
|
# # k = test.cut_points_ant(X[:, 0], y)
|
||||||
# print(k)
|
# # print(k)
|
||||||
# test.debug_points(X[:, 0], y)
|
# # test.debug_points(X[:, 0], y)
|
||||||
X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9]
|
# X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9]
|
||||||
indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7]
|
# indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7]
|
||||||
y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
|
# y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
|
||||||
# test.fit(X[:, 0], y)
|
# # To check
|
||||||
test.fit(X, y)
|
# indices2 = np.argsort(X)
|
||||||
result = test.get_cut_points()
|
# Xs = np.array(X)[indices2]
|
||||||
for item in result:
|
# ys = np.array(y)[indices2]
|
||||||
print(
|
# # test.fit(X[:, 0], y)
|
||||||
f"Class={item['classNumber']} - ({item['start']:3d}, {item['end']:3d})"
|
# test.fit(X, y)
|
||||||
f" -> ({item['fromValue']:3.1f}, {item['toValue']:3.1f}]"
|
# result = test.get_cut_points()
|
||||||
)
|
# for item in result:
|
||||||
print(test.get_discretized_values())
|
# print(
|
||||||
|
# f"Class={item['classNumber']} - ({item['start']:3d}, {item['end']:3d})"
|
||||||
|
# f" -> ({item['fromValue']:3.1f}, {item['toValue']:3.1f}]"
|
||||||
|
# )
|
||||||
|
# print(test.get_discretized_values())
|
||||||
|
|
||||||
|
# print(Xs, ys)
|
||||||
|
# print("**********************")
|
||||||
|
# test = [(0, 3), (4, 4), (5, 5), (6, 8), (9, 9)]
|
||||||
|
# print(ys)
|
||||||
|
# for start, end in test:
|
||||||
|
# print("Testing ", start, end, ys[:end], ys[end:])
|
||||||
|
# print("Information gain: ", information_gain(ys, ys[:end], ys[end:]))
|
||||||
# print(test.transform(X))
|
# print(test.transform(X))
|
||||||
# print(X)
|
# print(X)
|
||||||
# print(indices)
|
# print(indices)
|
||||||
|
BIN
test1.xlsx
Normal file
BIN
test1.xlsx
Normal file
Binary file not shown.
Reference in New Issue
Block a user