Test alternative

2022-12-08 20:16:00 +01:00
parent 89c7366c4e
commit 5d930accca
19 changed files with 14525 additions and 43 deletions

6
.gitignore vendored
View File

@@ -33,8 +33,8 @@ MANIFEST
*.manifest
*.spec
# Installer logs
pip-log.txt
# Installer log2s
pip-log2.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
@@ -56,7 +56,7 @@ coverage.xml
*.pot
# Django stuff:
*.log
*.log2
local_settings.py
db.sqlite3
db.sqlite3-journal

4993
c.txt Normal file

File diff suppressed because it is too large

232
debug.cpp
View File

@@ -11,21 +11,21 @@ for (auto cutPoint = cutIdx.begin(); cutPoint != cutIdx.end(); ++cutPoint) {
// << Metrics::informationGain(y, 0, y.size(), *cutPoint, Metrics::numClasses(y, 0, y.size())) << std::endl;
}
def test(self) :
def test(self):
print("Calculating cut points in python for first feature")
yz = self.y_.copy()
xz = X[:, 0].copy()
xz = xz[np.argsort(X[:, 0])]
yz = yz[np.argsort(X[:, 0])]
cuts = []
for i in range(1, len(yz)) :
for i in range(1, len(yz)):
if yz[i] != yz[i - 1] and xz[i - 1] < xz[i] :
print(f"Cut point: ({xz[i-1]}, {xz[i]}) ({yz[i-1]}, {yz[i]})")
cuts.append((xz[i] + xz[i - 1]) / 2)
print("Cuts calculados en python: ", cuts)
print("-- Cuts calculados en C++ --")
print("Cut points for each feature in Iris dataset:")
for i in range(0, 1) :
for i in range(0, 1):
# datax = self.X_[np.argsort(self.X_[:, i]), i]
# y_ = self.y_[np.argsort(self.X_[:, i])]
datax = self.X_[:, i]
@@ -46,4 +46,228 @@ def test(self) :
print("Disretized values:")
print(self.discretizer_.get_discretized_values())
print("*******************************")
return X
return X
c++
i : 0 4.3, 0
i : 1 4.4, 0
i : 2 4.4, 0
i : 3 4.4, 0
i : 4 4.5, 0
i : 5 4.6, 0
i : 6 4.6, 0
i : 7 4.6, 0
i : 8 4.6, 0
i : 9 4.7, 0
i : 10 4.7, 0
i : 11 4.8, 0
i : 12 4.8, 0
i : 13 4.8, 0
i : 14 4.8, 0
i : 15 4.8, 0
i : 16 4.9, 0
i : 17 4.9, 0
i : 18 4.9, 0
i : 19 4.9, 0
i : 20 4.9, 1
python
i : 0 4.3 0
i : 1 4.4 0
i : 2 4.4 0
i : 3 4.4 0
i : 4 4.5 0
i : 5 4.6 0
i : 6 4.6 0
i : 7 4.6 0
i : 8 4.6 0
i : 9 4.7 0
i : 10 4.7 0
i : 11 4.8 0
i : 12 4.8 0
i : 13 4.8 0
i : 14 4.8 0
i : 15 4.8 0
i : 16 4.9 1
i : 17 4.9 2
i : 18 4.9 0
i : 19 4.9 0
i : 20 4.9 0
idx : 20 entropy_left : 0 entropy_right : 0.488187 -> 0 150
idx : 21 entropy_left : 0.0670374 entropy_right : 0.489381 -> 0 150
idx : 22 entropy_left : 0.125003 entropy_right : 0.490573 -> 0 150
idx : 24 entropy_left : 0.11507 entropy_right : 0.482206 -> 0 150
idx : 25 entropy_left : 0.162294 entropy_right : 0.483488 -> 0 150
idx : 29 entropy_left : 0.141244 entropy_right : 0.462922 -> 0 150
idx : 30 entropy_left : 0.178924 entropy_right : 0.464386 -> 0 150
idx : 33 entropy_left : 0.163818 entropy_right : 0.444778 -> 0 150
idx : 34 entropy_left : 0.195735 entropy_right : 0.44637 -> 0 150
idx : 44 entropy_left : 0.154253 entropy_right : 0.339183 -> 0 150
idx : 45 entropy_left : 0.178924 entropy_right : 0.34098 -> 0 150
idx : 51 entropy_left : 0.159328 entropy_right : 0.217547 -> 0 150
idx : 52 entropy_left : 0.180508 entropy_right : 0.219019 -> 0 150
idx : 53 entropy_left : 0.177368 entropy_right : 0.189687 -> 0 150
idx : 58 entropy_left : 0.265229 entropy_right : 0.196677 -> 0 150
idx : 59 entropy_left : 0.261331 entropy_right : 0.162291 -> 0 150
idx : 61 entropy_left : 0.289819 entropy_right : 0.164857 -> 0 150
idx : 62 entropy_left : 0.302928 entropy_right : 0.166175 -> 0 150
idx : 68 entropy_left : 0.36831 entropy_right : 0.174607 -> 0 150
idx : 69 entropy_left : 0.364217 entropy_right : 0.131848 -> 0 150
idx : 70 entropy_left : 0.373248 entropy_right : 0.133048 -> 0 150
idx : 71 entropy_left : 0.381826 entropy_right : 0.134273 -> 0 150
idx : 72 entropy_left : 0.377855 entropy_right : 0.0805821 -> 0 150
idx : 74 entropy_left : 0.393817 entropy_right : 0.0822096 -> 0 150
idx : 75 entropy_left : 0.401218 entropy_right : 0.0830509 -> 0 150
idx : 76 entropy_left : 0.397415 entropy_right : 0 -> 0 150
idx : 77 entropy_left : 0.4045 entropy_right : 0 -> 0 150
idx : 78 entropy_left : 0.411247 entropy_right : 0 -> 0 150
idx : 79 entropy_left : 0.417674 entropy_right : 0 -> 0 150
idx : 81 entropy_left : 0.429626 entropy_right : 0 -> 0 150
idx : 83 entropy_left : 0.440472 entropy_right : 0 -> 0 150
idx : 84 entropy_left : 0.445513 entropy_right : 0 -> 0 150
idx : 87 entropy_left : 0.459246 entropy_right : 0 -> 0 150
idx : 88 entropy_left : 0.463395 entropy_right : 0 -> 0 150
idx : 89 entropy_left : 0.467347 entropy_right : 0 -> 0 150
idx : 91 entropy_left : 0.474691 entropy_right : 0 -> 0 150
idx : 95 entropy_left : 0.487368 entropy_right : 0 -> 0 150
idx : 97 entropy_left : 0.492813 entropy_right : 0 -> 0 150
idx : 99 entropy_left : 0.497728 entropy_right : 0 -> 0 150
idx : 101 entropy_left : 0.502156 entropy_right : 0 -> 0 150
idx : 102 entropy_left : 0.504201 entropy_right : 0 -> 0 150
idx : 104 entropy_left : 0.507973 entropy_right : 0 -> 0 150
idx : 105 entropy_left : 0.509709 entropy_right : 0 -> 0 150
idx : 106 entropy_left : 0.511351 entropy_right : 0 -> 0 150
idx : 107 entropy_left : 0.512902 entropy_right : 0 -> 0 150
idx : 109 entropy_left : 0.515747 entropy_right : 0 -> 0 150
idx : 110 entropy_left : 0.517047 entropy_right : 0 -> 0 150
idx : 113 entropy_left : 0.520497 entropy_right : 0 -> 0 150
idx : 114 entropy_left : 0.521506 entropy_right : 0 -> 0 150
idx : 117 entropy_left : 0.524149 entropy_right : 0 -> 0 150
idx : 118 entropy_left : 0.52491 entropy_right : 0 -> 0 150
idx : 120 entropy_left : 0.526264 entropy_right : 0 -> 0 150
idx : 122 entropy_left : 0.52741 entropy_right : 0 -> 0 150
idx : 127 entropy_left : 0.52946 entropy_right : 0 -> 0 150
idx : 130 entropy_left : 0.530197 entropy_right : 0 -> 0 150
idx : 132 entropy_left : 0.530507 entropy_right : 0 -> 0 150
idx : 133 entropy_left : 0.530611 entropy_right : 0 -> 0 150
idx : 134 entropy_left : 0.530684 entropy_right : 0 -> 0 150
idx : 135 entropy_left : 0.530726 entropy_right : 0 -> 0 150
idx : 137 entropy_left : 0.530721 entropy_right : 0 -> 0 150
idx : 138 entropy_left : 0.530677 entropy_right : 0 -> 0 150
cut : 5.5 index : 53
start : 0 cut : 53 end : 150
k = 3 k1 = 3 k2 = 3 ent = 0.528321 ent1 = 0.177368 ent2 = 0.189687
ig = 0.342987 delta = 4.16006 N 150 term 0.0758615
¡Ding! 5.5 53
idx : 20 entropy_left : 0 entropy_right : 1.5485806065228545 -> 0 150
idx : 21 entropy_left : 0.2761954276479391 entropy_right : 1.549829505666378 -> 0 150
idx : 22 entropy_left : 0.5304060778306042 entropy_right : 1.5511852922535474 -> 0 150
idx : 24 entropy_left : 0.4971501836369671 entropy_right : 1.5419822842863982 -> 0 150
idx : 25 entropy_left : 0.6395563653739031 entropy_right : 1.5433449229510985 -> 0 150
idx : 29 entropy_left : 0.574828144380386 entropy_right : 1.5202013991459298 -> 0 150
idx : 30 entropy_left : 0.6746799231474564 entropy_right : 1.521677608876836 -> 0 150
idx : 33 entropy_left : 0.6311718053929063 entropy_right : 1.4992098113026513 -> 0 150
idx : 34 entropy_left : 0.7085966983474103 entropy_right : 1.5007111828980744 -> 0 150
idx : 44 entropy_left : 0.5928251064639408 entropy_right : 1.3764263022492553 -> 0 150
idx : 45 entropy_left : 0.6531791627726858 entropy_right : 1.3779796176519241 -> 0 150
idx : 51 entropy_left : 0.5990326006132177 entropy_right : 1.2367928607774141 -> 0 150
idx : 52 entropy_left : 0.6496096346956632 entropy_right : 1.2377158231343603 -> 0 150
idx : 53 entropy_left : 0.6412482850735854 entropy_right : 1.2046986815511866 -> 0 150
idx : 58 entropy_left : 0.8211258609270055 entropy_right : 1.2056112071736118 -> 0 150
idx : 59 entropy_left : 0.8128223064150747 entropy_right : 1.167065448996099 -> 0 150
idx : 61 entropy_left : 0.8623538561746379 entropy_right : 1.1653351793699953 -> 0 150
idx : 62 entropy_left : 0.9353028851500502 entropy_right : 1.1687172769890006 -> 0 150
idx : 68 entropy_left : 1.031929035599206 entropy_right : 1.1573913563403753 -> 0 150
idx : 69 entropy_left : 1.0246284743137688 entropy_right : 1.109500797247481 -> 0 150
idx : 70 entropy_left : 1.036186417911213 entropy_right : 1.105866621101474 -> 0 150
idx : 71 entropy_left : 1.0895830429620594 entropy_right : 1.1104593064416028 -> 0 150
idx : 72 entropy_left : 1.0822273380873693 entropy_right : 1.0511407586429597 -> 0 150
idx : 74 entropy_left : 1.1015727511177442 entropy_right : 1.041722068095403 -> 0 150
idx : 75 entropy_left : 1.1457749842070042 entropy_right : 1.0462881865460743 -> 0 150
idx : 76 entropy_left : 1.1387129726704701 entropy_right : 0.9568886656798212 -> 0 150
idx : 77 entropy_left : 1.1468549240968817 entropy_right : 0.9505668528932196 -> 0 150
idx : 78 entropy_left : 1.1848333092150132 entropy_right : 0.9544340029249649 -> 0 150
idx : 79 entropy_left : 1.1918623939938016 entropy_right : 0.9477073729342066 -> 0 150
idx : 81 entropy_left : 1.2548698305334247 entropy_right : 0.9557589912150009 -> 0 150
idx : 83 entropy_left : 1.2659342914094807 entropy_right : 0.9411864371816835 -> 0 150
idx : 84 entropy_left : 1.2922669208691815 entropy_right : 0.9456603046006402 -> 0 150
idx : 87 entropy_left : 1.3041589171425696 entropy_right : 0.9182958340544896 -> 0 150
idx : 88 entropy_left : 1.327572716814381 entropy_right : 0.9235785996175947 -> 0 150
idx : 89 entropy_left : 1.330465426809402 entropy_right : 0.9127341558073343 -> 0 150
idx : 91 entropy_left : 1.3709454625942779 entropy_right : 0.9238422284571814 -> 0 150
idx : 95 entropy_left : 1.378063041001916 entropy_right : 0.8698926856041563 -> 0 150
idx : 97 entropy_left : 1.4115390027326744 entropy_right : 0.8835850861052532 -> 0 150
idx : 99 entropy_left : 1.4130351465796736 entropy_right : 0.8478617451660526 -> 0 150
idx : 101 entropy_left : 1.4412464483479606 entropy_right : 0.863120568566631 -> 0 150
idx : 102 entropy_left : 1.4415827640191903 entropy_right : 0.8426578772022391 -> 0 150
idx : 104 entropy_left : 1.4655411381577925 entropy_right : 0.8589810370425963 -> 0 150
idx : 105 entropy_left : 1.465665295753282 entropy_right : 0.8366407419411673 -> 0 150
idx : 106 entropy_left : 1.4762911618692924 entropy_right : 0.8453509366224365 -> 0 150
idx : 107 entropy_left : 1.4762132849962355 entropy_right : 0.8203636429576732 -> 0 150
idx : 109 entropy_left : 1.4951379218217782 entropy_right : 0.8390040613676977 -> 0 150
idx : 110 entropy_left : 1.4949188482339508 entropy_right : 0.8112781244591328 -> 0 150
idx : 113 entropy_left : 1.5183041104369397 entropy_right : 0.8418521897563207 -> 0 150
idx : 114 entropy_left : 1.51802714866133 entropy_right : 0.8112781244591328 -> 0 150
idx : 117 entropy_left : 1.5364854516368571 entropy_right : 0.8453509366224365 -> 0 150
idx : 118 entropy_left : 1.5361890331151247 entropy_right : 0.8112781244591328 -> 0 150
idx : 120 entropy_left : 1.5462566034163763 entropy_right : 0.8366407419411673 -> 0 150
idx : 122 entropy_left : 1.545378825051491 entropy_right : 0.74959525725948 -> 0 150
idx : 127 entropy_left : 1.5644893588382582 entropy_right : 0.828055725379504 -> 0 150
idx : 130 entropy_left : 1.562956340286807 entropy_right : 0.6098403047164004 -> 0 150
idx : 132 entropy_left : 1.5687623685201277 entropy_right : 0.6500224216483541 -> 0 150
idx : 133 entropy_left : 1.5680951037987416 entropy_right : 0.5225593745369408 -> 0 150
idx : 134 entropy_left : 1.5706540443736308 entropy_right : 0.5435644431995964 -> 0 150
idx : 135 entropy_left : 1.5699201014782036 entropy_right : 0.35335933502142136 -> 0 150
idx : 137 entropy_left : 1.5744201314186457 entropy_right : 0.39124356362925566 -> 0 150
idx : 138 entropy_left : 1.5736921054134685 entropy_right : 0 -> 0 150
¡Ding! 4.9 20
k = 2 k1 = 1 k2 = 2 ent = 0.5225593745369408 ent1 = 0 ent2 = 0.5435644431995964
ig = 0.010969310349085326 delta = 2.849365059382915 N 17 term 0.4029038270225244
idx : 135 entropy_left : 0 entropy_right : 0.35335933502142136 -> 134 150
idx : 137 entropy_left : 0.9182958340544896 entropy_right : 0.39124356362925566 -> 134 150
idx : 138 entropy_left : 1.0 entropy_right : 0 -> 134 150
start : 134 cut : 135 end : 150
k = 2 k1 = 1 k2 = 2 ent = 0.5435644431995964 ent1 = 0 ent2 = 0.35335933502142136
ig = 0.21229006661701388 delta = 2.426944705701254 N 16 term 0.39586470633186077
idx : 137 entropy_left : 0 entropy_right : 0.39124356362925566 -> 135 150
idx : 138 entropy_left : 0.9182958340544896 entropy_right : 0 -> 135 150
start : 135 cut : 137 end : 150
k = 2 k1 = 1 k2 = 2 ent = 0.35335933502142136 ent1 = 0 ent2 = 0.39124356362925566
ig = 0.01428157987606643 delta = 2.8831233792732727 N 15 term 0.44603188675539174
idx : 138 entropy_left : 0 entropy_right : 0 -> 137 150
start : 137 cut : 138 end : 150
k = 2 k1 = 1 k2 = 1 ent = 0.39124356362925566 ent1 = 0 ent2 = 0
ig = 0.39124356362925566 delta = 2.0248677947990927 N 13 term 0.4315254073477115
[[4.9, 5.2, 5.4, 6.75]]
cut : 1.4 index : 81
start : 50 cut : 81 end : 96
k = 2 k1 = 2 k2 = 1 ent = 0.151097 ent1 = 0.205593 ent2 = 0
ig = 0.0125455 delta = 2.91635 N 46 term 0.182787
idx : 80 entropy_left : 0 entropy_right : 0 -> 50 81
cut : 1.4 index : 80
start : 50 cut : 80 end : 81
k = 2 k1 = 1 k2 = 1 ent = 0.205593 ent1 = 0 ent2 = 0
ig = 0.205593 delta = 2.39617 N 31 term 0.235583
idx : 112 entropy_left : 0 entropy_right : 0.175565 -> 103 150
idx : 113 entropy_left : 0.468996 entropy_right : 0 -> 103 150
cut : 1.8 index : 112
start : 103 cut : 112 end : 150
k = 2 k1 = 1 k2 = 2 ent = 0.148549 ent1 = 0 ent2 = 0.175565
ig = 0.00660326 delta = 2.86139 N 47 term 0.178403
idx : 113 entropy_left : 0 entropy_right : 0 -> 112 150
cut : 1.8 index : 113
start : 112 cut : 113 end : 150
k = 2 k1 = 1 k2 = 1 ent = 0.175565 ent1 = 0 ent2 = 0
ig = 0.175565 delta = 2.45622 N 38 term 0.201728
[[4.900000095367432, 4.949999809265137, 5.0, 5.099999904632568, 5.199999809265137, 5.25, 5.400000095367432, 5.449999809265137,
5.5, 5.550000190734863, 5.599999904632568, 5.699999809265137, 5.800000190734863, 5.900000095367432, 5.949999809265137, 6.0, 6.050000190734863,
6.099999904632568, 6.149999618530273, 6.199999809265137, 6.25, 6.300000190734863, 6.400000095367432, 6.5, 6.550000190734863, 6.649999618530273, 6.699999809265137,
6.75, 6.800000190734863, 6.850000381469727, 6.900000095367432, 6.949999809265137, 7.050000190734863]]
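
The trace above contains everything needed to recheck the first accepted cut by hand. Below is a minimal Python sketch (not part of the commit; values copied from the C++ log for the candidate at index 53, start 0, end 150) that recomputes the MDLP acceptance test applied by mdlp() and confirms ig > term, which is why the cut at 5.5 is kept:

from math import log2

# Values printed above for the cut at index 53 over the interval [0, 150)
k, k1, k2 = 3, 3, 3
ent, ent1, ent2 = 0.528321, 0.177368, 0.189687
ig = 0.342987
N = 150.0
delta = log2(pow(3, k) - 2) - (k * ent - k1 * ent1 - k2 * ent2)
term = 1 / N * (log2(N - 1) + delta)
print(delta, term, ig > term)  # ~4.16006, ~0.0758615, True -> cut accepted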

View File

@@ -13,12 +13,12 @@ namespace mdlp {
return os;
}
CPPFImdlp::CPPFImdlp() : proposal(true), precision(6), debug(false)
CPPFImdlp::CPPFImdlp(): proposal(true), precision(6), debug(false)
{
divider = pow(10, precision);
numClasses = 0;
}
CPPFImdlp::CPPFImdlp(bool proposal, int precision, bool debug) : proposal(proposal), precision(precision), debug(debug)
CPPFImdlp::CPPFImdlp(bool proposal, int precision, bool debug): proposal(proposal), precision(precision), debug(debug)
{
divider = pow(10, precision);
numClasses = 0;

View File

@@ -26,7 +26,7 @@ namespace mdlp {
entropy -= p * log2(p);
}
}
return entropy;
return entropy < 0 ? 0 : entropy;
}
float Metrics::informationGain(labels& y, indices_t& indices, size_t start, size_t end, size_t cutPoint, int nClasses)
{
@@ -45,3 +45,13 @@ namespace mdlp {
}
}
/*
cache_t entropyCache;
std::map<std::tuple<int, int>, double> c;
// Set the value at index (3, 5) to 7.8.
c[std::make_tuple(3, 5)] = 7.8;
// Print the value at index (3, 5).
std::cout << c[std::make_tuple(3, 5)] << std::endl;
*/

159
fimdlp/ccFImdlp.cc Normal file
View File

@@ -0,0 +1,159 @@
#include "ccFImdlp.h"
#include <numeric>
#include <iostream>
#include <algorithm>
#include <set>
#include <limits>
#include <stdexcept>
#include "ccMetrics.h"
namespace mdlp {
CPPFImdlp::CPPFImdlp(): proposal(true), precision(6), debug(false), divider(pow(10, precision)), indices(indices_t()), y(labels()), metrics(Metrics(y, indices))
{
}
CPPFImdlp::CPPFImdlp(bool proposal, int precision, bool debug): proposal(proposal), precision(precision), debug(debug), divider(pow(10, precision)), indices(indices_t()), y(labels()), metrics(Metrics(y, indices))
{
}
CPPFImdlp::~CPPFImdlp()
= default;
CPPFImdlp& CPPFImdlp::fitx(samples& X_, labels& y_)
{
X = X_;
y = y_;
if (X.size() != y.size()) {
throw invalid_argument("X and y must have the same size");
}
if (X.size() == 0 || y.size() == 0) {
throw invalid_argument("X and y must have at least one element");
}
indices = sortIndices(X_);
metrics.setData(y, indices);
computeCutPointsRecursive(0, X.size());
//simulateCutPointsRecursive();
return *this;
}
void CPPFImdlp::simulateCutPointsRecursive()
{
cutPoints_t jobs = cutPoints_t();
jobs.push_back(cutPoint_t({ 0, X.size() }));
while (jobs.size() > 0) {
auto interval = jobs.back();
jobs.pop_back();
//cout << "start: " << interval.start << " end: " << interval.end << endl;
auto cut = getCandidateSimulate(interval.start, interval.end);
if (cut == -1 || !mdlp(interval.start, cut, interval.end)) {
if (interval.start != 0)
xCutPoints.push_back(xcutPoint_t({ interval.start, (X[indices[interval.start]] + X[indices[interval.start - 1]]) / 2 }));
if (interval.end != X.size())
xCutPoints.push_back(xcutPoint_t({ interval.end, (X[indices[interval.end]] + X[indices[interval.end - 1]]) / 2 }));
continue;
}
jobs.push_back(cutPoint_t({ interval.start, size_t(cut) }));
jobs.push_back(cutPoint_t({ size_t(cut), interval.end }));
}
}
void CPPFImdlp::computeCutPointsRecursive(size_t start, size_t end)
{
xcutPoint_t cut;
//cout << "start: " << start << " end: " << end << endl;
if (end - start < 2)
return;
cut = getCandidate(start, end);
if (cut.value == -1 || !mdlp(start, cut.index, end)) {
// cut.value == -1 means that there is no candidate in the interval
// that enhances the information gain
//cout << "¡Ding! " << cut.value << " " << cut.index << endl;
if (start != 0)
xCutPoints.push_back(xcutPoint_t({ start, (X[indices[start]] + X[indices[start - 1]]) / 2 }));
if (end != X.size())
xCutPoints.push_back(xcutPoint_t({ end, (X[indices[end]] + X[indices[end - 1]]) / 2 }));
return;
}
computeCutPointsRecursive(start, cut.index);
computeCutPointsRecursive(cut.index, end);
}
xcutPoint_t CPPFImdlp::getCandidate(size_t start, size_t end)
{
xcutPoint_t candidate;
int elements = end - start;
candidate.value = -1;
candidate.index = -1;
float entropy_left, entropy_right, minEntropy = numeric_limits<float>::max();
for (auto idx = start + 1; idx < end; idx++) {
if (y[indices[idx]] == y[indices[idx - 1]])
continue;
entropy_left = float(idx - start) / elements * metrics.entropy(start, idx);
entropy_right = float(end - idx) / elements * metrics.entropy(idx, end);
if (entropy_left + entropy_right < minEntropy) {
minEntropy = entropy_left + entropy_right;
candidate.value = (X[indices[idx]] + X[indices[idx - 1]]) / 2;
candidate.index = idx;
}
}
return candidate;
}
int CPPFImdlp::getCandidateSimulate(size_t start, size_t end)
{
int candidate = -1;
int elements = end - start;
float entropy_left, entropy_right, minEntropy = numeric_limits<float>::max();
for (auto idx = start + 1; idx < end; idx++) {
if (y[indices[idx]] == y[indices[idx - 1]])
continue;
entropy_left = float(idx - start) / elements * metrics.entropy(start, idx);
entropy_right = float(end - idx) / elements * metrics.entropy(idx, end);
if (minEntropy > entropy_left + entropy_right) {
minEntropy = entropy_left + entropy_right;
candidate = idx;
}
}
return candidate;
}
bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end)
{
int k, k1, k2;
float ig, delta;
float ent, ent1, ent2;
auto N = float(end - start);
if (N < 2) {
return false;
}
k = metrics.computeNumClasses(start, end);
k1 = metrics.computeNumClasses(start, cut);
k2 = metrics.computeNumClasses(cut, end);
ent = metrics.entropy(start, end);
ent1 = metrics.entropy(start, cut);
ent2 = metrics.entropy(cut, end);
ig = metrics.informationGain(start, cut, end);
delta = log2(pow(3, float(k)) - 2) - (float(k) * ent - float(k1) * ent1 - float(k2) * ent2);
float term = 1 / N * (log2(N - 1) + delta);
if (debug) {
cout << "start: " << start << " cut: " << cut << " end: " << end << endl;
cout << "k=" << k << " k1=" << k1 << " k2=" << k2 << " ent=" << ent << " ent1=" << ent1 << " ent2=" << ent2 << endl;
cout << "ig=" << ig << " delta=" << delta << " N " << N << " term " << term << endl;
}
return ig > term;
}
samples CPPFImdlp::getCutPointsx()
{
// Remove duplicates and sort
samples output(xCutPoints.size());
set<float> s;
unsigned size = xCutPoints.size();
for (unsigned i = 0; i < size; i++)
s.insert(xCutPoints[i].value);
output.assign(s.begin(), s.end());
sort(output.begin(), output.end());
return output;
}
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
indices_t CPPFImdlp::sortIndices(samples& X_)
{
indices_t idx(X_.size());
iota(idx.begin(), idx.end(), 0);
// a single sort of the index vector by the corresponding X_ values is enough
sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2)
{ return X_[i1] < X_[i2]; });
return idx;
}
}
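
For reference, the acceptance rule that mdlp() implements is the Fayyad–Irani MDLP stopping criterion; in the notation of the code (ent/ent1/ent2, k/k1/k2, ig, term), a candidate cut T splitting interval S into S1 and S2 is kept when

\[
\mathrm{Gain}(S;T) \;>\; \frac{\log_2(N-1) + \Delta(S;T)}{N},
\qquad
\Delta(S;T) \;=\; \log_2\!\bigl(3^{k}-2\bigr) \;-\; \bigl[k\,\mathrm{Ent}(S) - k_1\,\mathrm{Ent}(S_1) - k_2\,\mathrm{Ent}(S_2)\bigr],
\]

with N the number of samples in S and k, k1, k2 the number of distinct classes in S, S1 and S2.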

35
fimdlp/ccFImdlp.h Normal file
View File

@@ -0,0 +1,35 @@
#ifndef CCFIMDLP_H
#define CCFIMDLP_H
#include "typesFImdlp.h"
#include "ccMetrics.h"
#include <utility>
namespace mdlp {
class CPPFImdlp {
protected:
bool proposal; // proposed algorithm or original algorithm
int precision;
bool debug;
float divider;
indices_t indices; // sorted indices to use with X and y
samples X;
labels y;
Metrics metrics;
xcutPoints_t xCutPoints;
static indices_t sortIndices(samples&);
void computeCutPointsRecursive(size_t, size_t);
xcutPoint_t getCandidate(size_t, size_t);
bool mdlp(size_t, size_t, size_t);
void simulateCutPointsRecursive();
int getCandidateSimulate(size_t, size_t);
public:
CPPFImdlp();
CPPFImdlp(bool, int, bool debug = false);
~CPPFImdlp();
indices_t getIndices();
CPPFImdlp& fitx(samples&, labels&);
samples getCutPointsx();
};
}
#endif

74
fimdlp/ccMetrics.cc Normal file
View File

@@ -0,0 +1,74 @@
#include "ccMetrics.h"
#include <set>
#include <iostream>
using namespace std;
namespace mdlp {
Metrics::Metrics(labels& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t())
{
}
int Metrics::computeNumClasses(size_t start, size_t end)
{
set<int> nClasses;
for (auto i = start; i < end; ++i) {
nClasses.insert(y[indices[i]]);
}
return nClasses.size();
}
void Metrics::setData(labels& y_, indices_t& indices_)
{
indices = indices_;
y = y_;
numClasses = computeNumClasses(0, indices.size());
}
float Metrics::entropy(size_t start, size_t end)
{
float p, ventropy = 0;
int nElements = 0;
labels counts(numClasses + 1, 0);
if (end - start < 2)
return 0;
if (entropyCache.find(make_tuple(start, end)) != entropyCache.end()) {
return entropyCache[make_tuple(start, end)];
}
for (auto i = &indices[start]; i != &indices[end]; ++i) {
counts[y[*i]]++;
nElements++;
}
for (auto count : counts) {
if (count > 0) {
p = (float)count / nElements;
ventropy -= p * log2(p);
}
}
entropyCache[make_tuple(start, end)] = ventropy;
return ventropy;
}
float Metrics::informationGain(size_t start, size_t cut, size_t end)
{
float iGain;
float entropyInterval, entropyLeft, entropyRight;
int nElementsLeft = cut - start, nElementsRight = end - cut;
int nElements = end - start;
if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) {
cout << "**********Cache IG hit for " << start << " " << end << endl;
return igCache[make_tuple(start, cut, end)];
}
entropyInterval = entropy(start, end);
entropyLeft = entropy(start, cut);
entropyRight = entropy(cut, end);
iGain = entropyInterval - ((float)nElementsLeft * entropyLeft + (float)nElementsRight * entropyRight) / nElements;
igCache[make_tuple(start, cut, end)] = iGain;
return iGain;
}
}
/*
cache_t entropyCache;
std::map<std::tuple<int, int>, double> c;
// Set the value at index (3, 5) to 7.8.
c[std::make_tuple(3, 5)] = 7.8;
// Print the value at index (3, 5).
std::cout << c[std::make_tuple(3, 5)] << std::endl;
*/
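
The two quantities Metrics caches (keyed by the (start, end) and (start, cut, end) tuples typedef'd in typesFImdlp.h) are the class entropy of an index interval and the information gain induced by a cut; written out, entropy() and informationGain() compute

\[
\mathrm{Ent}(S) \;=\; -\sum_{c} p_c \log_2 p_c,
\qquad
\mathrm{Gain}(S;T) \;=\; \mathrm{Ent}(S) \;-\; \frac{|S_1|}{|S|}\,\mathrm{Ent}(S_1) \;-\; \frac{|S_2|}{|S|}\,\mathrm{Ent}(S_2),
\]

where p_c is the proportion of class c inside the interval and S1, S2 are the two sub-intervals on either side of the cut.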

21
fimdlp/ccMetrics.h Normal file
View File

@@ -0,0 +1,21 @@
#ifndef CCMETRICS_H
#define CCMETRICS_H
#include "typesFImdlp.h"
#include <cmath>
namespace mdlp {
class Metrics {
protected:
labels& y;
indices_t& indices;
int numClasses;
cacheEnt_t entropyCache;
cacheIg_t igCache;
public:
Metrics(labels&, indices_t&);
void setData(labels&, indices_t&);
int computeNumClasses(size_t, size_t);
float entropy(size_t, size_t);
float informationGain(size_t, size_t, size_t);
};
}
#endif

View File

@@ -3,7 +3,7 @@
from libcpp.vector cimport vector
from libcpp cimport bool
cdef extern from "CPPFImdlp.h" namespace "mdlp":
cdef extern from "ccFImdlp.h" namespace "mdlp":
cdef struct CutPointBody:
size_t start, end;
int classNumber;
@@ -11,9 +11,8 @@ cdef extern from "CPPFImdlp.h" namespace "mdlp":
cdef cppclass CPPFImdlp:
CPPFImdlp() except +
CPPFImdlp(bool, int, bool) except +
CPPFImdlp& fit(vector[float]&, vector[int]&)
vector[int] getDiscretizedValues()
vector[float] getCutPoints()
CPPFImdlp& fitx(vector[float]&, vector[int]&)
vector[float] getCutPointsx()
class PcutPoint_t:
@@ -31,10 +30,8 @@ cdef class CFImdlp:
def __dealloc__(self):
del self.thisptr
def fit(self, X, y):
self.thisptr.fit(X, y)
self.thisptr.fitx(X, y)
return self
def get_discretized_values(self):
return self.thisptr.getDiscretizedValues()
def get_cut_points(self):
return self.thisptr.getCutPoints()
return self.thisptr.getCutPointsx()
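
A minimal usage sketch of the rebuilt extension follows (not part of the commit). It assumes the CFImdlp __cinit__ accepts the proposal/debug keywords used in fimdlp/mdlp.py and that one feature column is discretized at a time, as FImdlp.fit does:

from sklearn.datasets import load_iris
from fimdlp.cppfimdlp import CFImdlp

X, y = load_iris(return_X_y=True)
clf = CFImdlp(proposal=True, debug=False)   # keyword names as used in mdlp.py (assumption)
clf.fit(X[:, 0], y)                         # fit() now forwards to the C++ fitx()
print(clf.get_cut_points())                 # forwards to getCutPointsx(); sorted, de-duplicated floats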

36
fimdlp/m2.cpp Normal file
View File

@@ -0,0 +1,36 @@
#include <vector>
using namespace std;
struct CutPointBody {
size_t start, end; // indices of the sorted vector
int classNumber; // class assigned to the cut point
float fromValue, toValue;
};
typedef CutPointBody cutPoint_t;
typedef vector<float> samples;
typedef vector<int> labels;
typedef vector<size_t> indices_t;
typedef vector<cutPoint_t> cutPoints_t;
//typedef std::map<std::tuple<int, int>, float> cache_t;
struct cutPointStruct {
size_t index;
float value;
};
typedef cutPointStruct xcutPoint_t;
typedef vector<xcutPoint_t> xcutPoints_t;
class Metrics {
private:
labels& y;
indices_t& indices;
int numClasses;
public:
Metrics(labels&, indices_t&);
int computeNumClasses(size_t, size_t);
float entropy(size_t, size_t);
float informationGain(size_t, size_t, size_t);
};
Metrics::Metrics(labels& y_, indices_t& indices_) : y(y_), indices(indices_)
{
numClasses = computeNumClasses(0, indices.size());
}

View File

@@ -86,7 +86,9 @@ class FImdlp(TransformerMixin, BaseEstimator):
self.cut_points_ = [None] * self.n_features_
# Can do it in parallel
for feature in self.features_:
self.discretizer_[feature] = PyFImdlp(proposal=self.proposal)
self.discretizer_[feature] = CFImdlp(
proposal=self.proposal, debug=False
)
self.discretizer_[feature].fit(X[:, feature], y)
self.cut_points_[feature] = self.discretizer_[
feature
@@ -132,10 +134,10 @@ class FImdlp(TransformerMixin, BaseEstimator):
# Check that the input is of the same shape as the one passed
# during fit.
if X.shape[1] != self.n_features_:
raise ValueError(
"Shape of input is different from what was seen in `fit`"
)
# if X.shape[1] != self.n_features_:
# raise ValueError(
# "Shape of input is different from what was seen in `fit`"
# )
result = np.zeros_like(X, dtype=np.int32) - 1
# Can do it in parallel
for feature in range(self.n_features_):
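
Since FImdlp keeps the TransformerMixin/BaseEstimator interface, the discretizer composes directly with downstream estimators. A short sketch (not part of the commit; it only relies on the fit/transform API shown in this diff and the default constructor, as in the sample script further below):

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from fimdlp.mdlp import FImdlp

X, y = load_iris(return_X_y=True)
pipe = Pipeline([("disc", FImdlp()), ("clf", RandomForestClassifier(random_state=0))])
print(pipe.fit(X, y).score(X, y))  # discretize every feature, then classify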

View File

@@ -1,14 +1,15 @@
import numpy as np
from math import log
from math import log2
from types import SimpleNamespace
class PyFImdlp:
def __init__(self, proposal=True):
def __init__(self, proposal=True, debug=False):
self.proposal = proposal
self.n_features_ = None
self.X_ = None
self.y_ = None
self.debug = debug
self.features_ = None
self.cut_points_ = []
self.entropy_cache = {}
@@ -17,9 +18,315 @@ class PyFImdlp:
def fit(self, X, y):
self.n_features_ = len(X)
self.indices_ = np.argsort(X)
self.use_indices = True
self.X_ = X[self.indices_] if not self.use_indices else X
self.y_ = y[self.indices_] if not self.use_indices else y
self.use_indices = False
X = [
4.3,
4.4,
4.4,
4.4,
4.5,
4.6,
4.6,
4.6,
4.6,
4.7,
4.7,
4.8,
4.8,
4.8,
4.8,
4.8,
4.9,
4.9,
4.9,
4.9,
4.9,
4.9,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5.1,
5.1,
5.1,
5.1,
5.1,
5.1,
5.1,
5.1,
5.1,
5.2,
5.2,
5.2,
5.2,
5.3,
5.4,
5.4,
5.4,
5.4,
5.4,
5.4,
5.5,
5.5,
5.5,
5.5,
5.5,
5.5,
5.5,
5.6,
5.6,
5.6,
5.6,
5.6,
5.6,
5.7,
5.7,
5.7,
5.7,
5.7,
5.7,
5.7,
5.7,
5.8,
5.8,
5.8,
5.8,
5.8,
5.8,
5.8,
5.9,
5.9,
5.9,
6,
6,
6,
6,
6,
6,
6.1,
6.1,
6.1,
6.1,
6.1,
6.1,
6.2,
6.2,
6.2,
6.2,
6.3,
6.3,
6.3,
6.3,
6.3,
6.3,
6.3,
6.3,
6.3,
6.4,
6.4,
6.4,
6.4,
6.4,
6.4,
6.4,
6.5,
6.5,
6.5,
6.5,
6.5,
6.6,
6.6,
6.7,
6.7,
6.7,
6.7,
6.7,
6.7,
6.7,
6.7,
6.8,
6.8,
6.8,
6.9,
6.9,
6.9,
6.9,
7,
7.1,
7.2,
7.2,
7.2,
7.3,
7.4,
7.6,
7.7,
7.7,
7.7,
7.7,
7.9,
]
y = [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
1,
2,
0,
0,
1,
0,
0,
0,
0,
1,
0,
0,
0,
1,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
1,
0,
0,
0,
0,
0,
0,
1,
0,
1,
1,
1,
1,
1,
0,
1,
1,
2,
1,
1,
1,
1,
1,
1,
0,
1,
2,
0,
1,
1,
2,
0,
1,
2,
1,
2,
2,
1,
1,
2,
1,
1,
1,
2,
1,
2,
2,
1,
1,
1,
1,
2,
2,
1,
1,
2,
2,
1,
2,
2,
1,
2,
1,
2,
2,
1,
2,
2,
2,
1,
2,
2,
2,
1,
2,
2,
1,
1,
2,
2,
2,
2,
2,
1,
1,
1,
2,
2,
1,
2,
1,
2,
2,
1,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
]
# self.X_ = X[self.indices_] if not self.use_indices else X
# self.y_ = y[self.indices_] if not self.use_indices else y
self.X_ = X
self.y_ = y
self.compute_cut_points(0, len(y))
return self
@@ -27,9 +334,11 @@ class PyFImdlp:
return sorted(list(set([cut.value for cut in self.cut_points_])))
def compute_cut_points(self, start, end):
# print((start, end))
cut = self.get_candidate(start, end)
if cut.value is None:
return
print("cut: ", cut.value, " index: ", cut.index)
if self.mdlp(cut, start, end):
print("¡Ding!", cut.value, cut.index)
self.cut_points_.append(cut)
@@ -45,10 +354,26 @@ class PyFImdlp:
ent1 = self.entropy(start, cut.index)
ent2 = self.entropy(cut.index, end)
ig = self.information_gain(start, cut.index, end)
delta = log(pow(3, k) - 2, 2) - (
delta = log2(pow(3, k) - 2) - (
float(k) * ent - float(k1) * ent1 - float(k2) * ent2
)
term = 1 / N * (log(N - 1, 2) + delta)
term = 1 / N * (log2(N - 1) + delta)
print("start: ", start, " cut: ", cut.index, " end: ", end)
print(
"k=",
k,
" k1=",
k1,
" k2=",
k2,
" ent=",
ent,
" ent1=",
ent1,
" ent2=",
ent2,
)
print("ig=", ig, " delta=", delta, " N ", N, " term ", term)
return ig > term
def num_classes(self, start, end):
@@ -88,6 +413,18 @@ class PyFImdlp:
entropy_left = self.entropy(start, idx)
entropy_right = self.entropy(idx, end)
entropy_cut = entropy_left + entropy_right
print(
"idx: ",
idx,
" entropy_left: ",
entropy_left,
" entropy_right : ",
entropy_right,
" -> ",
start,
" ",
end,
)
if entropy_cut < minEntropy:
minEntropy = entropy_cut
candidate.index = idx
@@ -118,7 +455,7 @@ class PyFImdlp:
# Compute standard entropy.
for prop in proportions:
if prop != 0.0:
entropy -= prop * log(prop, 2)
entropy -= prop * log2(prop)
self.entropy_cache[(start, end)] = entropy
return entropy

View File

@@ -1,18 +1,25 @@
#ifndef TYPES_H
#define TYPES_H
#include <vector>
#include <map>
#include <tuple>
using namespace std;
namespace mdlp {
struct CutPointBody {
size_t start, end; // indices of the sorted vector
int classNumber; // class assigned to the cut point
float fromValue, toValue;
};
typedef CutPointBody cutPoint_t;
typedef vector<float> samples;
typedef vector<int> labels;
typedef vector<size_t> indices_t;
typedef vector<cutPoint_t> cutPoints_t;
typedef map<tuple<int, int>, float> cacheEnt_t;
typedef map<tuple<int, int, int>, float> cacheIg_t;
struct cutPointStruct {
size_t index;
float value;
};
typedef cutPointStruct xcutPoint_t;
typedef vector<xcutPoint_t> xcutPoints_t;
}
#endif

8557
p.txt Normal file

File diff suppressed because it is too large

View File

@@ -1,8 +1,13 @@
from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp
from fimdlp.cppfimdlp import CFImdlp
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from math import log
import time
from math import log2
from scipy.io import arff
import pandas as pd
def entropy(y: np.array) -> float:
@@ -30,7 +35,7 @@ def entropy(y: np.array) -> float:
# Compute standard entropy.
for prop in proportions:
if prop != 0.0:
entropy -= prop * log(prop, 2)
entropy -= prop * log2(prop)
return entropy
@@ -57,14 +62,37 @@ def information_gain(
return result
data = load_iris()
X = data.data
y = data.target
features = data.feature_names
class_name = "speaker"
file_name = "kdd_JapaneseVowels.arff"
data = arff.loadarff(file_name)
df = pd.DataFrame(data[0])
df.dropna(axis=0, how="any", inplace=True)
dataset = df
X = df.drop(class_name, axis=1)
features = X.columns
class_name = class_name
y, _ = pd.factorize(df[class_name])
X = X.to_numpy()
# data = load_iris()
# X = data.data
# y = data.target
# features = data.feature_names
test = FImdlp()
test.fit(X, y)
test.transform(X)
now = time.time()
test.fit(X, y, features=[i for i in (range(3, 14))])
fit_time = time.time()
print("Fitting: ", fit_time - now)
now = time.time()
Xt = test.transform(X)
print("Transforming: ", time.time() - now)
print(test.get_cut_points())
clf = RandomForestClassifier(random_state=0)
print(clf.fit(Xt, y).score(Xt, y))
# for proposal in [True, False]:
# X = data.data
# y = data.target

View File

@@ -12,8 +12,10 @@ setup(
name="cppfimdlp",
sources=[
"fimdlp/cfimdlp.pyx",
"fimdlp/CPPFImdlp.cpp",
"fimdlp/Metrics.cpp",
# "fimdlp/CPPFImdlp.cpp",
# "fimdlp/Metrics.cpp",
"fimdlp/ccMetrics.cc",
"fimdlp/ccFImdlp.cc",
],
language="c++",
include_dirs=["fimdlp"],