mirror of
https://github.com/Doctorado-ML/FImdlp.git
synced 2025-08-16 16:05:52 +00:00
Test alternative
This commit is contained in:
6
.gitignore
vendored
6
.gitignore
vendored
@@ -33,8 +33,8 @@ MANIFEST
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
# Installer log2s
|
||||
pip-log2.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
@@ -56,7 +56,7 @@ coverage.xml
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
*.log2
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
232
debug.cpp
232
debug.cpp
@@ -11,21 +11,21 @@ for (auto cutPoint = cutIdx.begin(); cutPoint != cutIdx.end(); ++cutPoint) {
|
||||
// << Metrics::informationGain(y, 0, y.size(), *cutPoint, Metrics::numClasses(y, 0, y.size())) << std::endl;
|
||||
}
|
||||
|
||||
def test(self) :
|
||||
def test(self):
|
||||
print("Calculating cut points in python for first feature")
|
||||
yz = self.y_.copy()
|
||||
xz = X[:, 0].copy()
|
||||
xz = xz[np.argsort(X[:, 0])]
|
||||
yz = yz[np.argsort(X[:, 0])]
|
||||
cuts = []
|
||||
for i in range(1, len(yz)) :
|
||||
for i in range(1, len(yz)):
|
||||
if yz[i] != yz[i - 1] and xz[i - 1] < xz[i] :
|
||||
print(f"Cut point: ({xz[i-1]}, {xz[i]}) ({yz[i-1]}, {yz[i]})")
|
||||
cuts.append((xz[i] + xz[i - 1]) / 2)
|
||||
print("Cuts calculados en python: ", cuts)
|
||||
print("-- Cuts calculados en C++ --")
|
||||
print("Cut points for each feature in Iris dataset:")
|
||||
for i in range(0, 1) :
|
||||
for i in range(0, 1):
|
||||
# datax = self.X_[np.argsort(self.X_[:, i]), i]
|
||||
# y_ = self.y_[np.argsort(self.X_[:, i])]
|
||||
datax = self.X_[:, i]
|
||||
@@ -46,4 +46,228 @@ def test(self) :
|
||||
print("Disretized values:")
|
||||
print(self.discretizer_.get_discretized_values())
|
||||
print("*******************************")
|
||||
return X
|
||||
return X
|
||||
|
||||
c++
|
||||
i: 0 4.3, 0
|
||||
i : 1 4.4, 0
|
||||
i : 2 4.4, 0
|
||||
i : 3 4.4, 0
|
||||
i : 4 4.5, 0
|
||||
i : 5 4.6, 0
|
||||
i : 6 4.6, 0
|
||||
i : 7 4.6, 0
|
||||
i : 8 4.6, 0
|
||||
i : 9 4.7, 0
|
||||
i : 10 4.7, 0
|
||||
i : 11 4.8, 0
|
||||
i : 12 4.8, 0
|
||||
i : 13 4.8, 0
|
||||
i : 14 4.8, 0
|
||||
i : 15 4.8, 0
|
||||
i : 16 4.9, 0
|
||||
i : 17 4.9, 0
|
||||
i : 18 4.9, 0
|
||||
i : 19 4.9, 0
|
||||
i : 20 4.9, 1
|
||||
|
||||
python
|
||||
i : 0 4.3 0
|
||||
i : 1 4.4 0
|
||||
i : 2 4.4 0
|
||||
i : 3 4.4 0
|
||||
i : 4 4.5 0
|
||||
i : 5 4.6 0
|
||||
i : 6 4.6 0
|
||||
i : 7 4.6 0
|
||||
i : 8 4.6 0
|
||||
i : 9 4.7 0
|
||||
i : 10 4.7 0
|
||||
i : 11 4.8 0
|
||||
i : 12 4.8 0
|
||||
i : 13 4.8 0
|
||||
i : 14 4.8 0
|
||||
i : 15 4.8 0
|
||||
i : 16 4.9 1
|
||||
i : 17 4.9 2
|
||||
i : 18 4.9 0
|
||||
i : 19 4.9 0
|
||||
i : 20 4.9 0
|
||||
|
||||
|
||||
|
||||
idx: 20 entropy_left : 0 entropy_right : 0.488187 -> 0 150
|
||||
idx : 21 entropy_left : 0.0670374 entropy_right : 0.489381 -> 0 150
|
||||
idx : 22 entropy_left : 0.125003 entropy_right : 0.490573 -> 0 150
|
||||
idx : 24 entropy_left : 0.11507 entropy_right : 0.482206 -> 0 150
|
||||
idx : 25 entropy_left : 0.162294 entropy_right : 0.483488 -> 0 150
|
||||
idx : 29 entropy_left : 0.141244 entropy_right : 0.462922 -> 0 150
|
||||
idx : 30 entropy_left : 0.178924 entropy_right : 0.464386 -> 0 150
|
||||
idx : 33 entropy_left : 0.163818 entropy_right : 0.444778 -> 0 150
|
||||
idx : 34 entropy_left : 0.195735 entropy_right : 0.44637 -> 0 150
|
||||
idx : 44 entropy_left : 0.154253 entropy_right : 0.339183 -> 0 150
|
||||
idx : 45 entropy_left : 0.178924 entropy_right : 0.34098 -> 0 150
|
||||
idx : 51 entropy_left : 0.159328 entropy_right : 0.217547 -> 0 150
|
||||
idx : 52 entropy_left : 0.180508 entropy_right : 0.219019 -> 0 150
|
||||
idx : 53 entropy_left : 0.177368 entropy_right : 0.189687 -> 0 150
|
||||
idx : 58 entropy_left : 0.265229 entropy_right : 0.196677 -> 0 150
|
||||
idx : 59 entropy_left : 0.261331 entropy_right : 0.162291 -> 0 150
|
||||
idx : 61 entropy_left : 0.289819 entropy_right : 0.164857 -> 0 150
|
||||
idx : 62 entropy_left : 0.302928 entropy_right : 0.166175 -> 0 150
|
||||
idx : 68 entropy_left : 0.36831 entropy_right : 0.174607 -> 0 150
|
||||
idx : 69 entropy_left : 0.364217 entropy_right : 0.131848 -> 0 150
|
||||
idx : 70 entropy_left : 0.373248 entropy_right : 0.133048 -> 0 150
|
||||
idx : 71 entropy_left : 0.381826 entropy_right : 0.134273 -> 0 150
|
||||
idx : 72 entropy_left : 0.377855 entropy_right : 0.0805821 -> 0 150
|
||||
idx : 74 entropy_left : 0.393817 entropy_right : 0.0822096 -> 0 150
|
||||
idx : 75 entropy_left : 0.401218 entropy_right : 0.0830509 -> 0 150
|
||||
idx : 76 entropy_left : 0.397415 entropy_right : 0 -> 0 150
|
||||
idx : 77 entropy_left : 0.4045 entropy_right : 0 -> 0 150
|
||||
idx : 78 entropy_left : 0.411247 entropy_right : 0 -> 0 150
|
||||
idx : 79 entropy_left : 0.417674 entropy_right : 0 -> 0 150
|
||||
idx : 81 entropy_left : 0.429626 entropy_right : 0 -> 0 150
|
||||
idx : 83 entropy_left : 0.440472 entropy_right : 0 -> 0 150
|
||||
idx : 84 entropy_left : 0.445513 entropy_right : 0 -> 0 150
|
||||
idx : 87 entropy_left : 0.459246 entropy_right : 0 -> 0 150
|
||||
idx : 88 entropy_left : 0.463395 entropy_right : 0 -> 0 150
|
||||
idx : 89 entropy_left : 0.467347 entropy_right : 0 -> 0 150
|
||||
idx : 91 entropy_left : 0.474691 entropy_right : 0 -> 0 150
|
||||
idx : 95 entropy_left : 0.487368 entropy_right : 0 -> 0 150
|
||||
idx : 97 entropy_left : 0.492813 entropy_right : 0 -> 0 150
|
||||
idx : 99 entropy_left : 0.497728 entropy_right : 0 -> 0 150
|
||||
idx : 101 entropy_left : 0.502156 entropy_right : 0 -> 0 150
|
||||
idx : 102 entropy_left : 0.504201 entropy_right : 0 -> 0 150
|
||||
idx : 104 entropy_left : 0.507973 entropy_right : 0 -> 0 150
|
||||
idx : 105 entropy_left : 0.509709 entropy_right : 0 -> 0 150
|
||||
idx : 106 entropy_left : 0.511351 entropy_right : 0 -> 0 150
|
||||
idx : 107 entropy_left : 0.512902 entropy_right : 0 -> 0 150
|
||||
idx : 109 entropy_left : 0.515747 entropy_right : 0 -> 0 150
|
||||
idx : 110 entropy_left : 0.517047 entropy_right : 0 -> 0 150
|
||||
idx : 113 entropy_left : 0.520497 entropy_right : 0 -> 0 150
|
||||
idx : 114 entropy_left : 0.521506 entropy_right : 0 -> 0 150
|
||||
idx : 117 entropy_left : 0.524149 entropy_right : 0 -> 0 150
|
||||
idx : 118 entropy_left : 0.52491 entropy_right : 0 -> 0 150
|
||||
idx : 120 entropy_left : 0.526264 entropy_right : 0 -> 0 150
|
||||
idx : 122 entropy_left : 0.52741 entropy_right : 0 -> 0 150
|
||||
idx : 127 entropy_left : 0.52946 entropy_right : 0 -> 0 150
|
||||
idx : 130 entropy_left : 0.530197 entropy_right : 0 -> 0 150
|
||||
idx : 132 entropy_left : 0.530507 entropy_right : 0 -> 0 150
|
||||
idx : 133 entropy_left : 0.530611 entropy_right : 0 -> 0 150
|
||||
idx : 134 entropy_left : 0.530684 entropy_right : 0 -> 0 150
|
||||
idx : 135 entropy_left : 0.530726 entropy_right : 0 -> 0 150
|
||||
idx : 137 entropy_left : 0.530721 entropy_right : 0 -> 0 150
|
||||
idx : 138 entropy_left : 0.530677 entropy_right : 0 -> 0 150
|
||||
cut : 5.5 index : 53
|
||||
start : 0 cut : 53 end : 150
|
||||
k = 3 k1 = 3 k2 = 3 ent = 0.528321 ent1 = 0.177368 ent2 = 0.189687
|
||||
ig = 0.342987 delta = 4.16006 N 150 term 0.0758615
|
||||
¡Ding!5.5 53
|
||||
|
||||
|
||||
idx : 20 entropy_left : 0 entropy_right : 1.5485806065228545 -> 0 150
|
||||
idx : 21 entropy_left : 0.2761954276479391 entropy_right : 1.549829505666378 -> 0 150
|
||||
idx : 22 entropy_left : 0.5304060778306042 entropy_right : 1.5511852922535474 -> 0 150
|
||||
idx : 24 entropy_left : 0.4971501836369671 entropy_right : 1.5419822842863982 -> 0 150
|
||||
idx : 25 entropy_left : 0.6395563653739031 entropy_right : 1.5433449229510985 -> 0 150
|
||||
idx : 29 entropy_left : 0.574828144380386 entropy_right : 1.5202013991459298 -> 0 150
|
||||
idx : 30 entropy_left : 0.6746799231474564 entropy_right : 1.521677608876836 -> 0 150
|
||||
idx : 33 entropy_left : 0.6311718053929063 entropy_right : 1.4992098113026513 -> 0 150
|
||||
idx : 34 entropy_left : 0.7085966983474103 entropy_right : 1.5007111828980744 -> 0 150
|
||||
idx : 44 entropy_left : 0.5928251064639408 entropy_right : 1.3764263022492553 -> 0 150
|
||||
idx : 45 entropy_left : 0.6531791627726858 entropy_right : 1.3779796176519241 -> 0 150
|
||||
idx : 51 entropy_left : 0.5990326006132177 entropy_right : 1.2367928607774141 -> 0 150
|
||||
idx : 52 entropy_left : 0.6496096346956632 entropy_right : 1.2377158231343603 -> 0 150
|
||||
idx : 53 entropy_left : 0.6412482850735854 entropy_right : 1.2046986815511866 -> 0 150
|
||||
idx : 58 entropy_left : 0.8211258609270055 entropy_right : 1.2056112071736118 -> 0 150
|
||||
idx : 59 entropy_left : 0.8128223064150747 entropy_right : 1.167065448996099 -> 0 150
|
||||
idx : 61 entropy_left : 0.8623538561746379 entropy_right : 1.1653351793699953 -> 0 150
|
||||
idx : 62 entropy_left : 0.9353028851500502 entropy_right : 1.1687172769890006 -> 0 150
|
||||
idx : 68 entropy_left : 1.031929035599206 entropy_right : 1.1573913563403753 -> 0 150
|
||||
idx : 69 entropy_left : 1.0246284743137688 entropy_right : 1.109500797247481 -> 0 150
|
||||
idx : 70 entropy_left : 1.036186417911213 entropy_right : 1.105866621101474 -> 0 150
|
||||
idx : 71 entropy_left : 1.0895830429620594 entropy_right : 1.1104593064416028 -> 0 150
|
||||
idx : 72 entropy_left : 1.0822273380873693 entropy_right : 1.0511407586429597 -> 0 150
|
||||
idx : 74 entropy_left : 1.1015727511177442 entropy_right : 1.041722068095403 -> 0 150
|
||||
idx : 75 entropy_left : 1.1457749842070042 entropy_right : 1.0462881865460743 -> 0 150
|
||||
idx : 76 entropy_left : 1.1387129726704701 entropy_right : 0.9568886656798212 -> 0 150
|
||||
idx : 77 entropy_left : 1.1468549240968817 entropy_right : 0.9505668528932196 -> 0 150
|
||||
idx : 78 entropy_left : 1.1848333092150132 entropy_right : 0.9544340029249649 -> 0 150
|
||||
idx : 79 entropy_left : 1.1918623939938016 entropy_right : 0.9477073729342066 -> 0 150
|
||||
idx : 81 entropy_left : 1.2548698305334247 entropy_right : 0.9557589912150009 -> 0 150
|
||||
idx : 83 entropy_left : 1.2659342914094807 entropy_right : 0.9411864371816835 -> 0 150
|
||||
idx : 84 entropy_left : 1.2922669208691815 entropy_right : 0.9456603046006402 -> 0 150
|
||||
idx : 87 entropy_left : 1.3041589171425696 entropy_right : 0.9182958340544896 -> 0 150
|
||||
idx : 88 entropy_left : 1.327572716814381 entropy_right : 0.9235785996175947 -> 0 150
|
||||
idx : 89 entropy_left : 1.330465426809402 entropy_right : 0.9127341558073343 -> 0 150
|
||||
idx : 91 entropy_left : 1.3709454625942779 entropy_right : 0.9238422284571814 -> 0 150
|
||||
idx : 95 entropy_left : 1.378063041001916 entropy_right : 0.8698926856041563 -> 0 150
|
||||
idx : 97 entropy_left : 1.4115390027326744 entropy_right : 0.8835850861052532 -> 0 150
|
||||
idx : 99 entropy_left : 1.4130351465796736 entropy_right : 0.8478617451660526 -> 0 150
|
||||
idx : 101 entropy_left : 1.4412464483479606 entropy_right : 0.863120568566631 -> 0 150
|
||||
idx : 102 entropy_left : 1.4415827640191903 entropy_right : 0.8426578772022391 -> 0 150
|
||||
idx : 104 entropy_left : 1.4655411381577925 entropy_right : 0.8589810370425963 -> 0 150
|
||||
idx : 105 entropy_left : 1.465665295753282 entropy_right : 0.8366407419411673 -> 0 150
|
||||
idx : 106 entropy_left : 1.4762911618692924 entropy_right : 0.8453509366224365 -> 0 150
|
||||
idx : 107 entropy_left : 1.4762132849962355 entropy_right : 0.8203636429576732 -> 0 150
|
||||
idx : 109 entropy_left : 1.4951379218217782 entropy_right : 0.8390040613676977 -> 0 150
|
||||
idx : 110 entropy_left : 1.4949188482339508 entropy_right : 0.8112781244591328 -> 0 150
|
||||
idx : 113 entropy_left : 1.5183041104369397 entropy_right : 0.8418521897563207 -> 0 150
|
||||
idx : 114 entropy_left : 1.51802714866133 entropy_right : 0.8112781244591328 -> 0 150
|
||||
idx : 117 entropy_left : 1.5364854516368571 entropy_right : 0.8453509366224365 -> 0 150
|
||||
idx : 118 entropy_left : 1.5361890331151247 entropy_right : 0.8112781244591328 -> 0 150
|
||||
idx : 120 entropy_left : 1.5462566034163763 entropy_right : 0.8366407419411673 -> 0 150
|
||||
idx : 122 entropy_left : 1.545378825051491 entropy_right : 0.74959525725948 -> 0 150
|
||||
idx : 127 entropy_left : 1.5644893588382582 entropy_right : 0.828055725379504 -> 0 150
|
||||
idx : 130 entropy_left : 1.562956340286807 entropy_right : 0.6098403047164004 -> 0 150
|
||||
idx : 132 entropy_left : 1.5687623685201277 entropy_right : 0.6500224216483541 -> 0 150
|
||||
idx : 133 entropy_left : 1.5680951037987416 entropy_right : 0.5225593745369408 -> 0 150
|
||||
idx : 134 entropy_left : 1.5706540443736308 entropy_right : 0.5435644431995964 -> 0 150
|
||||
idx : 135 entropy_left : 1.5699201014782036 entropy_right : 0.35335933502142136 -> 0 150
|
||||
idx : 137 entropy_left : 1.5744201314186457 entropy_right : 0.39124356362925566 -> 0 150
|
||||
idx : 138 entropy_left : 1.5736921054134685 entropy_right : 0 -> 0 150
|
||||
¡Ding!4.9 20
|
||||
|
||||
k = 2 k1 = 1 k2 = 2 ent = 0.5225593745369408 ent1 = 0 ent2 = 0.5435644431995964
|
||||
ig = 0.010969310349085326 delta = 2.849365059382915 N 17 term 0.4029038270225244
|
||||
idx : 135 entropy_left : 0 entropy_right : 0.35335933502142136 -> 134 150
|
||||
idx : 137 entropy_left : 0.9182958340544896 entropy_right : 0.39124356362925566 -> 134 150
|
||||
idx : 138 entropy_left : 1.0 entropy_right : 0 -> 134 150
|
||||
start : 134 cut : 135 end : 150
|
||||
k = 2 k1 = 1 k2 = 2 ent = 0.5435644431995964 ent1 = 0 ent2 = 0.35335933502142136
|
||||
ig = 0.21229006661701388 delta = 2.426944705701254 N 16 term 0.39586470633186077
|
||||
idx : 137 entropy_left : 0 entropy_right : 0.39124356362925566 -> 135 150
|
||||
idx : 138 entropy_left : 0.9182958340544896 entropy_right : 0 -> 135 150
|
||||
start : 135 cut : 137 end : 150
|
||||
k = 2 k1 = 1 k2 = 2 ent = 0.35335933502142136 ent1 = 0 ent2 = 0.39124356362925566
|
||||
ig = 0.01428157987606643 delta = 2.8831233792732727 N 15 term 0.44603188675539174
|
||||
idx : 138 entropy_left : 0 entropy_right : 0 -> 137 150
|
||||
start : 137 cut : 138 end : 150
|
||||
k = 2 k1 = 1 k2 = 1 ent = 0.39124356362925566 ent1 = 0 ent2 = 0
|
||||
ig = 0.39124356362925566 delta = 2.0248677947990927 N 13 term 0.4315254073477115
|
||||
[[4.9, 5.2, 5.4, 6.75]]
|
||||
|
||||
|
||||
cut : 1.4 index : 81
|
||||
start : 50 cut : 81 end : 96
|
||||
k = 2 k1 = 2 k2 = 1 ent = 0.151097 ent1 = 0.205593 ent2 = 0
|
||||
ig = 0.0125455 delta = 2.91635 N 46 term 0.182787
|
||||
idx : 80 entropy_left : 0 entropy_right : 0 -> 50 81
|
||||
cut : 1.4 index : 80
|
||||
start : 50 cut : 80 end : 81
|
||||
k = 2 k1 = 1 k2 = 1 ent = 0.205593 ent1 = 0 ent2 = 0
|
||||
ig = 0.205593 delta = 2.39617 N 31 term 0.235583
|
||||
idx : 112 entropy_left : 0 entropy_right : 0.175565 -> 103 150
|
||||
idx : 113 entropy_left : 0.468996 entropy_right : 0 -> 103 150
|
||||
cut : 1.8 index : 112
|
||||
start : 103 cut : 112 end : 150
|
||||
k = 2 k1 = 1 k2 = 2 ent = 0.148549 ent1 = 0 ent2 = 0.175565
|
||||
ig = 0.00660326 delta = 2.86139 N 47 term 0.178403
|
||||
idx : 113 entropy_left : 0 entropy_right : 0 -> 112 150
|
||||
cut : 1.8 index : 113
|
||||
start : 112 cut : 113 end : 150
|
||||
k = 2 k1 = 1 k2 = 1 ent = 0.175565 ent1 = 0 ent2 = 0
|
||||
ig = 0.175565 delta = 2.45622 N 38 term 0.201728
|
||||
[[4.900000095367432, 4.949999809265137, 5.0, 5.099999904632568, 5.199999809265137, 5.25, 5.400000095367432, 5.449999809265137,
|
||||
5.5, 5.550000190734863, 5.599999904632568, 5.699999809265137, 5.800000190734863, 5.900000095367432, 5.949999809265137, 6.0, 6.050000190734863,
|
||||
6.099999904632568, 6.149999618530273, 6.199999809265137, 6.25, 6.300000190734863, 6.400000095367432, 6.5, 6.550000190734863, 6.649999618530273, 6.699999809265137,
|
||||
6.75, 6.800000190734863, 6.850000381469727, 6.900000095367432, 6.949999809265137, 7.050000190734863]]
|
@@ -13,12 +13,12 @@ namespace mdlp {
|
||||
return os;
|
||||
|
||||
}
|
||||
CPPFImdlp::CPPFImdlp() : proposal(true), precision(6), debug(false)
|
||||
CPPFImdlp::CPPFImdlp(): proposal(true), precision(6), debug(false)
|
||||
{
|
||||
divider = pow(10, precision);
|
||||
numClasses = 0;
|
||||
}
|
||||
CPPFImdlp::CPPFImdlp(bool proposal, int precision, bool debug) : proposal(proposal), precision(precision), debug(debug)
|
||||
CPPFImdlp::CPPFImdlp(bool proposal, int precision, bool debug): proposal(proposal), precision(precision), debug(debug)
|
||||
{
|
||||
divider = pow(10, precision);
|
||||
numClasses = 0;
|
||||
|
@@ -26,7 +26,7 @@ namespace mdlp {
|
||||
entropy -= p * log2(p);
|
||||
}
|
||||
}
|
||||
return entropy;
|
||||
return entropy < 0 ? 0 : entropy;
|
||||
}
|
||||
float Metrics::informationGain(labels& y, indices_t& indices, size_t start, size_t end, size_t cutPoint, int nClasses)
|
||||
{
|
||||
@@ -45,3 +45,13 @@ namespace mdlp {
|
||||
}
|
||||
|
||||
}
|
||||
/*
|
||||
cache_t entropyCache;
|
||||
std::map<std::tuple<int, int>, double> c;
|
||||
|
||||
// Set the value at index (3, 5) to 7.8.
|
||||
c[std::make_tuple(3, 5)] = 7.8;
|
||||
|
||||
// Print the value at index (3, 5).
|
||||
std::cout << c[std::make_tuple(3, 5)] << std::endl;
|
||||
*/
|
159
fimdlp/ccFImdlp.cc
Normal file
159
fimdlp/ccFImdlp.cc
Normal file
@@ -0,0 +1,159 @@
|
||||
#include "ccFImdlp.h"
|
||||
#include <numeric>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
#include "ccMetrics.h"
|
||||
|
||||
namespace mdlp {
|
||||
// Builds a discretizer with the default settings (proposal = true, precision = 6,
// debug = false) by delegating to the three-argument constructor, which performs the
// actual member initialisation.
CPPFImdlp::CPPFImdlp(): CPPFImdlp(true, 6, false)
{
}
|
||||
// Stores the configuration and leaves the data members empty until fitx() is called.
//   proposal  - proposed algorithm or original algorithm (only stored here)
//   precision - number of decimals; divider is initialised to 10^precision
//   debug     - when true, mdlp() prints its intermediate values to stdout
// metrics is bound to the member vectors y and indices, both still empty at this point.
CPPFImdlp::CPPFImdlp(bool proposal, int precision, bool debug):
    proposal(proposal),
    precision(precision),
    debug(debug),
    divider(pow(10, precision)),
    indices(),
    y(),
    metrics(y, indices)
{
}
|
||||
// The compiler-generated destructor is used; no manual cleanup is performed here.
CPPFImdlp::~CPPFImdlp() = default;
|
||||
|
||||
// Computes the cut points for one feature.
//   X_ - feature values, one per sample
//   y_ - class label of each sample, same length as X_
// Returns *this so calls can be chained.
// Throws invalid_argument when the sizes differ or when the input is empty.
CPPFImdlp& CPPFImdlp::fitx(samples& X_, labels& y_)
{
    // Validate before touching any member, so a rejected call leaves the object unchanged.
    if (X_.size() != y_.size()) {
        throw invalid_argument("X and y must have the same size");
    }
    if (X_.empty()) {
        throw invalid_argument("X and y must have at least one element");
    }
    X = X_;
    y = y_;
    // Forget the boundaries of any previous fit; otherwise they would be returned
    // by getCutPointsx() together with the new ones.
    xCutPoints.clear();
    indices = sortIndices(X_);
    metrics.setData(y, indices);
    // simulateCutPointsRecursive() is the explicit-stack alternative to this call.
    computeCutPointsRecursive(0, X.size());
    return *this;
}
|
||||
void CPPFImdlp::simulateCutPointsRecursive()
|
||||
{
|
||||
cutPoints_t jobs = cutPoints_t();
|
||||
jobs.push_back(cutPoint_t({ 0, X.size() }));
|
||||
while (jobs.size() > 0) {
|
||||
auto interval = jobs.back();
|
||||
jobs.pop_back();
|
||||
//cout << "start: " << interval.start << " end: " << interval.end << endl;
|
||||
auto cut = getCandidateSimulate(interval.start, interval.end);
|
||||
if (cut == -1 || !mdlp(interval.start, cut, interval.end)) {
|
||||
if (interval.start != 0)
|
||||
xCutPoints.push_back(xcutPoint_t({ interval.start, (X[indices[interval.start]] + X[indices[interval.start - 1]]) / 2 }));
|
||||
if (interval.end != X.size())
|
||||
xCutPoints.push_back(xcutPoint_t({ interval.end, (X[indices[interval.end]] + X[indices[interval.end - 1]]) / 2 }));
|
||||
continue;
|
||||
}
|
||||
jobs.push_back(cutPoint_t({ interval.start, size_t(cut) }));
|
||||
jobs.push_back(cutPoint_t({ size_t(cut), interval.end }));
|
||||
|
||||
}
|
||||
}
|
||||
void CPPFImdlp::computeCutPointsRecursive(size_t start, size_t end)
|
||||
{
|
||||
xcutPoint_t cut;
|
||||
//cout << "start: " << start << " end: " << end << endl;
|
||||
if (end - start < 2)
|
||||
return;
|
||||
cut = getCandidate(start, end);
|
||||
if (cut.value == -1 || !mdlp(start, cut.index, end)) {
|
||||
// cut.value == -1 means that there is no candidate in the interval
|
||||
// that enhances the information gain
|
||||
//cout << "¡Ding! " << cut.value << " " << cut.index << endl;
|
||||
if (start != 0)
|
||||
xCutPoints.push_back(xcutPoint_t({ start, (X[indices[start]] + X[indices[start - 1]]) / 2 }));
|
||||
if (end != X.size())
|
||||
xCutPoints.push_back(xcutPoint_t({ end, (X[indices[end]] + X[indices[end - 1]]) / 2 }));
|
||||
return;
|
||||
}
|
||||
computeCutPointsRecursive(start, cut.index);
|
||||
computeCutPointsRecursive(cut.index, end);
|
||||
}
|
||||
// Looks for the class boundary inside [start, end) whose split yields the lowest
// size-weighted sum of the entropies of both halves.
// Returns the boundary position and the midpoint of the two neighbouring values;
// value and index are both left at -1 when the interval has no class boundary.
xcutPoint_t CPPFImdlp::getCandidate(size_t start, size_t end)
{
    xcutPoint_t best;
    best.value = -1;
    best.index = -1;
    const int span = end - start;
    float lowest = numeric_limits<float>::max();
    for (size_t pos = start + 1; pos < end; ++pos) {
        // Only positions where the label changes are considered.
        if (y[indices[pos]] != y[indices[pos - 1]]) {
            const float left = float(pos - start) / span * metrics.entropy(start, pos);
            const float right = float(end - pos) / span * metrics.entropy(pos, end);
            if (left + right < lowest) {
                lowest = left + right;
                best.value = (X[indices[pos]] + X[indices[pos - 1]]) / 2;
                best.index = pos;
            }
        }
    }
    return best;
}
|
||||
int CPPFImdlp::getCandidateSimulate(size_t start, size_t end)
|
||||
{
|
||||
int candidate = -1;
|
||||
int elements = end - start;
|
||||
float entropy_left, entropy_right, minEntropy = numeric_limits<float>::max();
|
||||
for (auto idx = start + 1; idx < end; idx++) {
|
||||
if (y[indices[idx]] == y[indices[idx - 1]])
|
||||
continue;
|
||||
entropy_left = float(idx - start) / elements * metrics.entropy(start, idx);
|
||||
entropy_right = float(end - idx) / elements * metrics.entropy(idx, end);
|
||||
if (minEntropy > entropy_left + entropy_right) {
|
||||
minEntropy = entropy_left + entropy_right;
|
||||
candidate = idx;
|
||||
}
|
||||
}
|
||||
return candidate;
|
||||
}
|
||||
bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end)
|
||||
{
|
||||
int k, k1, k2;
|
||||
float ig, delta;
|
||||
float ent, ent1, ent2;
|
||||
auto N = float(end - start);
|
||||
if (N < 2) {
|
||||
return false;
|
||||
}
|
||||
k = metrics.computeNumClasses(start, end);
|
||||
k1 = metrics.computeNumClasses(start, cut);
|
||||
k2 = metrics.computeNumClasses(cut, end);
|
||||
ent = metrics.entropy(start, end);
|
||||
ent1 = metrics.entropy(start, cut);
|
||||
ent2 = metrics.entropy(cut, end);
|
||||
ig = metrics.informationGain(start, cut, end);
|
||||
delta = log2(pow(3, float(k)) - 2) - (float(k) * ent - float(k1) * ent1 - float(k2) * ent2);
|
||||
float term = 1 / N * (log2(N - 1) + delta);
|
||||
if (debug) {
|
||||
cout << "start: " << start << " cut: " << cut << " end: " << end << endl;
|
||||
cout << "k=" << k << " k1=" << k1 << " k2=" << k2 << " ent=" << ent << " ent1=" << ent1 << " ent2=" << ent2 << endl;
|
||||
cout << "ig=" << ig << " delta=" << delta << " N " << N << " term " << term << endl;
|
||||
}
|
||||
return ig > term;
|
||||
}
|
||||
// Returns the values of the recorded cut points, without duplicates and in ascending order.
samples CPPFImdlp::getCutPointsx()
{
    // A set both removes duplicates and keeps its elements ordered, so the result
    // needs no further sorting.
    set<float> uniqueValues;
    for (const auto& cutPoint : xCutPoints)
        uniqueValues.insert(cutPoint.value);
    return samples(uniqueValues.begin(), uniqueValues.end());
}
|
||||
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
|
||||
indices_t CPPFImdlp::sortIndices(samples& X_)
|
||||
{
|
||||
indices_t idx(X_.size());
|
||||
iota(idx.begin(), idx.end(), 0);
|
||||
for (size_t i = 0; i < X_.size(); i++)
|
||||
sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2)
|
||||
{ return X_[i1] < X_[i2]; });
|
||||
return idx;
|
||||
}
|
||||
}
|
35
fimdlp/ccFImdlp.h
Normal file
35
fimdlp/ccFImdlp.h
Normal file
@@ -0,0 +1,35 @@
|
||||
#ifndef CCFIMDLP_H
|
||||
#define CCFIMDLP_H
|
||||
#include "typesFImdlp.h"
|
||||
#include "ccMetrics.h"
|
||||
#include <utility>
|
||||
namespace mdlp {
|
||||
class CPPFImdlp {
|
||||
protected:
|
||||
bool proposal; // proposed algorithm or original algorithm
|
||||
int precision;
|
||||
bool debug;
|
||||
float divider;
|
||||
indices_t indices; // sorted indices to use with X and y
|
||||
samples X;
|
||||
labels y;
|
||||
Metrics metrics;
|
||||
xcutPoints_t xCutPoints;
|
||||
|
||||
static indices_t sortIndices(samples&);
|
||||
void computeCutPointsRecursive(size_t, size_t);
|
||||
xcutPoint_t getCandidate(size_t, size_t);
|
||||
bool mdlp(size_t, size_t, size_t);
|
||||
void simulateCutPointsRecursive();
|
||||
int getCandidateSimulate(size_t, size_t);
|
||||
|
||||
public:
|
||||
CPPFImdlp();
|
||||
CPPFImdlp(bool, int, bool debug = false);
|
||||
~CPPFImdlp();
|
||||
indices_t getIndices();
|
||||
CPPFImdlp& fitx(samples&, labels&);
|
||||
samples getCutPointsx();
|
||||
};
|
||||
}
|
||||
#endif
|
74
fimdlp/ccMetrics.cc
Normal file
74
fimdlp/ccMetrics.cc
Normal file
@@ -0,0 +1,74 @@
|
||||
#include "ccMetrics.h"
|
||||
#include <set>
|
||||
#include <iostream>
|
||||
using namespace std;
|
||||
namespace mdlp {
|
||||
// Binds the object to the caller's label and index vectors (kept by reference, not copied),
// counts the distinct classes they currently describe and starts with empty caches.
Metrics::Metrics(labels& y_, indices_t& indices_):
    y(y_),
    indices(indices_),
    numClasses(computeNumClasses(0, indices.size())),
    entropyCache(),
    igCache()
{
}
|
||||
int Metrics::computeNumClasses(size_t start, size_t end)
|
||||
{
|
||||
set<int> nClasses;
|
||||
for (auto i = start; i < end; ++i) {
|
||||
nClasses.insert(y[indices[i]]);
|
||||
}
|
||||
return nClasses.size();
|
||||
}
|
||||
// Replaces the data the metrics are computed on.
// y and indices are reference members, so the assignments below copy the contents of the
// arguments into the vectors bound at construction; they do not rebind the references.
void Metrics::setData(labels& y_, indices_t& indices_)
{
    indices = indices_;
    y = y_;
    numClasses = computeNumClasses(0, indices.size());
    // Cached results are keyed by position only, so values computed for the previous
    // data would otherwise be returned for the new data.
    entropyCache.clear();
    igCache.clear();
}
|
||||
float Metrics::entropy(size_t start, size_t end)
|
||||
{
|
||||
float p, ventropy = 0;
|
||||
int nElements = 0;
|
||||
labels counts(numClasses + 1, 0);
|
||||
if (end - start < 2)
|
||||
return 0;
|
||||
if (entropyCache.find(make_tuple(start, end)) != entropyCache.end()) {
|
||||
return entropyCache[make_tuple(start, end)];
|
||||
}
|
||||
for (auto i = &indices[start]; i != &indices[end]; ++i) {
|
||||
counts[y[*i]]++;
|
||||
nElements++;
|
||||
}
|
||||
for (auto count : counts) {
|
||||
if (count > 0) {
|
||||
p = (float)count / nElements;
|
||||
ventropy -= p * log2(p);
|
||||
}
|
||||
}
|
||||
entropyCache[make_tuple(start, end)] = ventropy;
|
||||
return ventropy;
|
||||
}
|
||||
float Metrics::informationGain(size_t start, size_t cut, size_t end)
|
||||
{
|
||||
float iGain;
|
||||
float entropyInterval, entropyLeft, entropyRight;
|
||||
int nElementsLeft = cut - start, nElementsRight = end - cut;
|
||||
int nElements = end - start;
|
||||
if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) {
|
||||
cout << "**********Cache IG hit for " << start << " " << end << endl;
|
||||
return igCache[make_tuple(start, cut, end)];
|
||||
}
|
||||
entropyInterval = entropy(start, end);
|
||||
entropyLeft = entropy(start, cut);
|
||||
entropyRight = entropy(cut, end);
|
||||
iGain = entropyInterval - ((float)nElementsLeft * entropyLeft + (float)nElementsRight * entropyRight) / nElements;
|
||||
igCache[make_tuple(start, cut, end)] = iGain;
|
||||
return iGain;
|
||||
}
|
||||
|
||||
}
|
||||
/*
|
||||
cache_t entropyCache;
|
||||
std::map<std::tuple<int, int>, double> c;
|
||||
|
||||
// Set the value at index (3, 5) to 7.8.
|
||||
c[std::make_tuple(3, 5)] = 7.8;
|
||||
|
||||
// Print the value at index (3, 5).
|
||||
std::cout << c[std::make_tuple(3, 5)] << std::endl;
|
||||
*/
|
21
fimdlp/ccMetrics.h
Normal file
21
fimdlp/ccMetrics.h
Normal file
@@ -0,0 +1,21 @@
|
||||
#ifndef CCMETRICS_H
|
||||
#define CCMETRICS_H
|
||||
#include "typesFImdlp.h"
|
||||
#include <cmath>
|
||||
namespace mdlp {
|
||||
class Metrics {
|
||||
protected:
|
||||
labels& y;
|
||||
indices_t& indices;
|
||||
int numClasses;
|
||||
cacheEnt_t entropyCache;
|
||||
cacheIg_t igCache;
|
||||
public:
|
||||
Metrics(labels&, indices_t&);
|
||||
void setData(labels&, indices_t&);
|
||||
int computeNumClasses(size_t, size_t);
|
||||
float entropy(size_t, size_t);
|
||||
float informationGain(size_t, size_t, size_t);
|
||||
};
|
||||
}
|
||||
#endif
|
@@ -3,7 +3,7 @@
|
||||
from libcpp.vector cimport vector
|
||||
from libcpp cimport bool
|
||||
|
||||
cdef extern from "CPPFImdlp.h" namespace "mdlp":
|
||||
cdef extern from "ccFImdlp.h" namespace "mdlp":
|
||||
cdef struct CutPointBody:
|
||||
size_t start, end;
|
||||
int classNumber;
|
||||
@@ -11,9 +11,8 @@ cdef extern from "CPPFImdlp.h" namespace "mdlp":
|
||||
cdef cppclass CPPFImdlp:
|
||||
CPPFImdlp() except +
|
||||
CPPFImdlp(bool, int, bool) except +
|
||||
CPPFImdlp& fit(vector[float]&, vector[int]&)
|
||||
vector[int] getDiscretizedValues()
|
||||
vector[float] getCutPoints()
|
||||
CPPFImdlp& fitx(vector[float]&, vector[int]&)
|
||||
vector[float] getCutPointsx()
|
||||
|
||||
|
||||
class PcutPoint_t:
|
||||
@@ -31,10 +30,8 @@ cdef class CFImdlp:
|
||||
def __dealloc__(self):
|
||||
del self.thisptr
|
||||
def fit(self, X, y):
|
||||
self.thisptr.fit(X, y)
|
||||
self.thisptr.fitx(X, y)
|
||||
return self
|
||||
def get_discretized_values(self):
|
||||
return self.thisptr.getDiscretizedValues()
|
||||
def get_cut_points(self):
|
||||
return self.thisptr.getCutPoints()
|
||||
return self.thisptr.getCutPointsx()
|
||||
|
Binary file not shown.
36
fimdlp/m2.cpp
Normal file
36
fimdlp/m2.cpp
Normal file
@@ -0,0 +1,36 @@
|
||||
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
struct CutPointBody {
    size_t start; // indices of the sorted vector
    size_t end;
    int classNumber; // class assigned to the cut point
    float fromValue;
    float toValue;
};
using cutPoint_t = CutPointBody;
using samples = vector<float>;
using labels = vector<int>;
using indices_t = vector<size_t>;
using cutPoints_t = vector<cutPoint_t>;
//typedef std::map<std::tuple<int, int>, float> cache_t;
// Position/value pair; both fields are plain data.
struct cutPointStruct {
    size_t index;
    float value;
};
using xcutPoint_t = cutPointStruct;
using xcutPoints_t = vector<xcutPoint_t>;
|
||||
class Metrics {
|
||||
private:
|
||||
labels& y;
|
||||
indices_t& indices;
|
||||
int numClasses;
|
||||
public:
|
||||
Metrics(labels&, indices_t&);
|
||||
int computeNumClasses(size_t, size_t);
|
||||
float entropy(size_t, size_t);
|
||||
float informationGain(size_t, size_t, size_t);
|
||||
};
|
||||
// Binds the references and counts the classes over the whole index range.
// numClasses is declared after y and indices, so both are already bound when it is computed.
Metrics::Metrics(labels& y_, indices_t& indices_):
    y(y_),
    indices(indices_),
    numClasses(computeNumClasses(0, indices.size()))
{
}
|
@@ -86,7 +86,9 @@ class FImdlp(TransformerMixin, BaseEstimator):
|
||||
self.cut_points_ = [None] * self.n_features_
|
||||
# Can do it in parallel
|
||||
for feature in self.features_:
|
||||
self.discretizer_[feature] = PyFImdlp(proposal=self.proposal)
|
||||
self.discretizer_[feature] = CFImdlp(
|
||||
proposal=self.proposal, debug=False
|
||||
)
|
||||
self.discretizer_[feature].fit(X[:, feature], y)
|
||||
self.cut_points_[feature] = self.discretizer_[
|
||||
feature
|
||||
@@ -132,10 +134,10 @@ class FImdlp(TransformerMixin, BaseEstimator):
|
||||
|
||||
# Check that the input is of the same shape as the one passed
|
||||
# during fit.
|
||||
if X.shape[1] != self.n_features_:
|
||||
raise ValueError(
|
||||
"Shape of input is different from what was seen in `fit`"
|
||||
)
|
||||
# if X.shape[1] != self.n_features_:
|
||||
# raise ValueError(
|
||||
# "Shape of input is different from what was seen in `fit`"
|
||||
# )
|
||||
result = np.zeros_like(X, dtype=np.int32) - 1
|
||||
# Can do it in parallel
|
||||
for feature in range(self.n_features_):
|
||||
|
@@ -1,14 +1,15 @@
|
||||
import numpy as np
|
||||
from math import log
|
||||
from math import log2
|
||||
from types import SimpleNamespace
|
||||
|
||||
|
||||
class PyFImdlp:
|
||||
def __init__(self, proposal=True):
|
||||
def __init__(self, proposal=True, debug=False):
|
||||
self.proposal = proposal
|
||||
self.n_features_ = None
|
||||
self.X_ = None
|
||||
self.y_ = None
|
||||
self.debug = debug
|
||||
self.features_ = None
|
||||
self.cut_points_ = []
|
||||
self.entropy_cache = {}
|
||||
@@ -17,9 +18,315 @@ class PyFImdlp:
|
||||
def fit(self, X, y):
|
||||
self.n_features_ = len(X)
|
||||
self.indices_ = np.argsort(X)
|
||||
self.use_indices = True
|
||||
self.X_ = X[self.indices_] if not self.use_indices else X
|
||||
self.y_ = y[self.indices_] if not self.use_indices else y
|
||||
self.use_indices = False
|
||||
X = [
|
||||
4.3,
|
||||
4.4,
|
||||
4.4,
|
||||
4.4,
|
||||
4.5,
|
||||
4.6,
|
||||
4.6,
|
||||
4.6,
|
||||
4.6,
|
||||
4.7,
|
||||
4.7,
|
||||
4.8,
|
||||
4.8,
|
||||
4.8,
|
||||
4.8,
|
||||
4.8,
|
||||
4.9,
|
||||
4.9,
|
||||
4.9,
|
||||
4.9,
|
||||
4.9,
|
||||
4.9,
|
||||
5,
|
||||
5,
|
||||
5,
|
||||
5,
|
||||
5,
|
||||
5,
|
||||
5,
|
||||
5,
|
||||
5,
|
||||
5,
|
||||
5.1,
|
||||
5.1,
|
||||
5.1,
|
||||
5.1,
|
||||
5.1,
|
||||
5.1,
|
||||
5.1,
|
||||
5.1,
|
||||
5.1,
|
||||
5.2,
|
||||
5.2,
|
||||
5.2,
|
||||
5.2,
|
||||
5.3,
|
||||
5.4,
|
||||
5.4,
|
||||
5.4,
|
||||
5.4,
|
||||
5.4,
|
||||
5.4,
|
||||
5.5,
|
||||
5.5,
|
||||
5.5,
|
||||
5.5,
|
||||
5.5,
|
||||
5.5,
|
||||
5.5,
|
||||
5.6,
|
||||
5.6,
|
||||
5.6,
|
||||
5.6,
|
||||
5.6,
|
||||
5.6,
|
||||
5.7,
|
||||
5.7,
|
||||
5.7,
|
||||
5.7,
|
||||
5.7,
|
||||
5.7,
|
||||
5.7,
|
||||
5.7,
|
||||
5.8,
|
||||
5.8,
|
||||
5.8,
|
||||
5.8,
|
||||
5.8,
|
||||
5.8,
|
||||
5.8,
|
||||
5.9,
|
||||
5.9,
|
||||
5.9,
|
||||
6,
|
||||
6,
|
||||
6,
|
||||
6,
|
||||
6,
|
||||
6,
|
||||
6.1,
|
||||
6.1,
|
||||
6.1,
|
||||
6.1,
|
||||
6.1,
|
||||
6.1,
|
||||
6.2,
|
||||
6.2,
|
||||
6.2,
|
||||
6.2,
|
||||
6.3,
|
||||
6.3,
|
||||
6.3,
|
||||
6.3,
|
||||
6.3,
|
||||
6.3,
|
||||
6.3,
|
||||
6.3,
|
||||
6.3,
|
||||
6.4,
|
||||
6.4,
|
||||
6.4,
|
||||
6.4,
|
||||
6.4,
|
||||
6.4,
|
||||
6.4,
|
||||
6.5,
|
||||
6.5,
|
||||
6.5,
|
||||
6.5,
|
||||
6.5,
|
||||
6.6,
|
||||
6.6,
|
||||
6.7,
|
||||
6.7,
|
||||
6.7,
|
||||
6.7,
|
||||
6.7,
|
||||
6.7,
|
||||
6.7,
|
||||
6.7,
|
||||
6.8,
|
||||
6.8,
|
||||
6.8,
|
||||
6.9,
|
||||
6.9,
|
||||
6.9,
|
||||
6.9,
|
||||
7,
|
||||
7.1,
|
||||
7.2,
|
||||
7.2,
|
||||
7.2,
|
||||
7.3,
|
||||
7.4,
|
||||
7.6,
|
||||
7.7,
|
||||
7.7,
|
||||
7.7,
|
||||
7.7,
|
||||
7.9,
|
||||
]
|
||||
y = [
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
0,
|
||||
0,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
1,
|
||||
0,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
0,
|
||||
1,
|
||||
1,
|
||||
2,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
0,
|
||||
1,
|
||||
1,
|
||||
2,
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
1,
|
||||
2,
|
||||
2,
|
||||
1,
|
||||
1,
|
||||
2,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
2,
|
||||
1,
|
||||
2,
|
||||
2,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
2,
|
||||
2,
|
||||
1,
|
||||
1,
|
||||
2,
|
||||
2,
|
||||
1,
|
||||
2,
|
||||
2,
|
||||
1,
|
||||
2,
|
||||
1,
|
||||
2,
|
||||
2,
|
||||
1,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
1,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
1,
|
||||
2,
|
||||
2,
|
||||
1,
|
||||
1,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
2,
|
||||
2,
|
||||
1,
|
||||
2,
|
||||
1,
|
||||
2,
|
||||
2,
|
||||
1,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
]
|
||||
# self.X_ = X[self.indices_] if not self.use_indices else X
|
||||
# self.y_ = y[self.indices_] if not self.use_indices else y
|
||||
self.X_ = X
|
||||
self.y_ = y
|
||||
self.compute_cut_points(0, len(y))
|
||||
return self
|
||||
|
||||
@@ -27,9 +334,11 @@ class PyFImdlp:
|
||||
return sorted(list(set([cut.value for cut in self.cut_points_])))
|
||||
|
||||
def compute_cut_points(self, start, end):
|
||||
# print((start, end))
|
||||
cut = self.get_candidate(start, end)
|
||||
if cut.value is None:
|
||||
return
|
||||
print("cut: ", cut.value, " index: ", cut.index)
|
||||
if self.mdlp(cut, start, end):
|
||||
print("¡Ding!", cut.value, cut.index)
|
||||
self.cut_points_.append(cut)
|
||||
@@ -45,10 +354,26 @@ class PyFImdlp:
|
||||
ent1 = self.entropy(start, cut.index)
|
||||
ent2 = self.entropy(cut.index, end)
|
||||
ig = self.information_gain(start, cut.index, end)
|
||||
delta = log(pow(3, k) - 2, 2) - (
|
||||
delta = log2(pow(3, k) - 2, 2) - (
|
||||
float(k) * ent - float(k1) * ent1 - float(k2) * ent2
|
||||
)
|
||||
term = 1 / N * (log(N - 1, 2) + delta)
|
||||
term = 1 / N * (log2(N - 1, 2) + delta)
|
||||
print("start: ", start, " cut: ", cut.index, " end: ", end)
|
||||
print(
|
||||
"k=",
|
||||
k,
|
||||
" k1=",
|
||||
k1,
|
||||
" k2=",
|
||||
k2,
|
||||
" ent=",
|
||||
ent,
|
||||
" ent1=",
|
||||
ent1,
|
||||
" ent2=",
|
||||
ent2,
|
||||
)
|
||||
print("ig=", ig, " delta=", delta, " N ", N, " term ", term)
|
||||
return ig > term
|
||||
|
||||
def num_classes(self, start, end):
|
||||
@@ -88,6 +413,18 @@ class PyFImdlp:
|
||||
entropy_left = self.entropy(start, idx)
|
||||
entropy_right = self.entropy(idx, end)
|
||||
entropy_cut = entropy_left + entropy_right
|
||||
print(
|
||||
"idx: ",
|
||||
idx,
|
||||
" entropy_left: ",
|
||||
entropy_left,
|
||||
" entropy_right : ",
|
||||
entropy_right,
|
||||
" -> ",
|
||||
start,
|
||||
" ",
|
||||
end,
|
||||
)
|
||||
if entropy_cut < minEntropy:
|
||||
minEntropy = entropy_cut
|
||||
candidate.index = idx
|
||||
@@ -118,7 +455,7 @@ class PyFImdlp:
|
||||
# Compute standard entropy.
|
||||
for prop in proportions:
|
||||
if prop != 0.0:
|
||||
entropy -= prop * log(prop, 2)
|
||||
entropy -= prop * log2(prop, 2)
|
||||
self.entropy_cache[(start, end)] = entropy
|
||||
return entropy
|
||||
|
||||
|
@@ -1,18 +1,25 @@
|
||||
#ifndef TYPES_H
|
||||
#define TYPES_H
|
||||
#include <vector>
|
||||
#include <map>
|
||||
|
||||
using namespace std;
|
||||
namespace mdlp {
|
||||
struct CutPointBody {
|
||||
size_t start, end; // indices of the sorted vector
|
||||
int classNumber; // class assigned to the cut point
|
||||
float fromValue, toValue;
|
||||
};
|
||||
typedef CutPointBody cutPoint_t;
|
||||
typedef vector<float> samples;
|
||||
typedef vector<int> labels;
|
||||
typedef vector<size_t> indices_t;
|
||||
typedef vector<cutPoint_t> cutPoints_t;
|
||||
typedef map<tuple<int, int>, float> cacheEnt_t;
|
||||
typedef map<tuple<int, int, int>, float> cacheIg_t;
|
||||
struct cutPointStruct {
|
||||
size_t index;
|
||||
float value;
|
||||
};
|
||||
typedef cutPointStruct xcutPoint_t;
|
||||
typedef vector<xcutPoint_t> xcutPoints_t;
|
||||
}
|
||||
#endif
|
44
sample.py
44
sample.py
@@ -1,8 +1,13 @@
|
||||
from sklearn.datasets import load_iris
|
||||
from fimdlp.mdlp import FImdlp
|
||||
from fimdlp.cppfimdlp import CFImdlp
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
import numpy as np
|
||||
from math import log
|
||||
import time
|
||||
from math import log2
|
||||
|
||||
from scipy.io import arff
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def entropy(y: np.array) -> float:
|
||||
@@ -30,7 +35,7 @@ def entropy(y: np.array) -> float:
|
||||
# Compute standard entropy.
|
||||
for prop in proportions:
|
||||
if prop != 0.0:
|
||||
entropy -= prop * log(prop, 2)
|
||||
entropy -= prop * log2(prop, 2)
|
||||
return entropy
|
||||
|
||||
|
||||
@@ -57,14 +62,37 @@ def information_gain(
|
||||
return result
|
||||
|
||||
|
||||
data = load_iris()
|
||||
X = data.data
|
||||
y = data.target
|
||||
features = data.feature_names
|
||||
class_name = "speaker"
|
||||
file_name = "kdd_JapaneseVowels.arff"
|
||||
data = arff.loadarff(file_name)
|
||||
df = pd.DataFrame(data[0])
|
||||
df.dropna(axis=0, how="any", inplace=True)
|
||||
dataset = df
|
||||
X = df.drop(class_name, axis=1)
|
||||
features = X.columns
|
||||
class_name = class_name
|
||||
y, _ = pd.factorize(df[class_name])
|
||||
X = X.to_numpy()
|
||||
|
||||
# data = load_iris()
|
||||
# X = data.data
|
||||
# y = data.target
|
||||
# features = data.feature_names
|
||||
|
||||
|
||||
test = FImdlp()
|
||||
test.fit(X, y)
|
||||
test.transform(X)
|
||||
now = time.time()
|
||||
test.fit(X, y, features=[i for i in (range(3, 14))])
|
||||
fit_time = time.time()
|
||||
print("Fitting: ", fit_time - now)
|
||||
now = time.time()
|
||||
Xt = test.transform(X)
|
||||
print("Transforming: ", time.time() - now)
|
||||
print(test.get_cut_points())
|
||||
|
||||
clf = RandomForestClassifier(random_state=0)
|
||||
print(clf.fit(Xt, y).score(Xt, y))
|
||||
|
||||
# for proposal in [True, False]:
|
||||
# X = data.data
|
||||
# y = data.target
|
||||
|
Reference in New Issue
Block a user