Add Entropy method

2025-08-17 00:15:52 +00:00 · 2022-11-28 10:15:03 +01:00
parent a54c774f95
commit 3d27c4c3b7
6 changed files with 74 additions and 16 deletions
--- a/fimdlp/CPPFImdlp.cpp
+++ b/fimdlp/CPPFImdlp.cpp
@@ -1,6 +1,7 @@
 #include "CPPFImdlp.h"
 #include <numeric>
 #include <iostream>
+#include "Metrics.h"
 namespace CPPFImdlp
 {
    CPPFImdlp::CPPFImdlp() : debug(false), precision(6)
@@ -17,33 +18,35 @@ namespace CPPFImdlp
    std::vector<float> CPPFImdlp::cutPoints(std::vector<float> &X, std::vector<int> &y)
    {
        std::vector<float> cutPts;
-        float antx, cutPoint;
-        int anty;
+        float xPrev, cutPoint; // xPrev: feature value of the previously visited sample; cutPoint: candidate being emitted
+        int yPrev; // label of the previously visited sample
        std::vector<size_t> indices = sortIndices(X);
-        antx = X.at(indices[0]);
-        anty = y.at(indices[0]);
-        for (auto index = indices.begin(); index != indices.end(); ++index)
-        {
-            // std::cout << X.at(*index) << " -> " << y.at(*index) << " // ";
-            //  Definition 2 Cut points are always on boundaries
-            if (y.at(*index) != anty && antx < X.at(*index))
-            //  Weka implementation
-            // if (antx < X.at(*index))
-            {
-                cutPoint = round((X.at(*index) + antx) / 2 * divider) / divider;
+        xPrev = X.at(indices[0]); // seed with the first sample of the sorted order; NOTE(review): indices[0] is read unconditionally, so an empty X is not handled here
+        yPrev = y.at(indices[0]); // label that belongs to that first sample
        if (debug)
        {
-                    std::cout << "Cut point: " << (antx + X.at(*index)) / 2 << " //";
-                    std::cout << X.at(*index) << " -> " << y.at(*index) << " anty= " << anty;
-                    std::cout << "* (" << X.at(*index) << ", " << antx << ")=" << ((X.at(*index) + antx) / 2) << std::endl;
+            std::cout << "Entropy: " << Metrics::entropy(y, 0, y.size(), Metrics::numClasses(y)) << std::endl; // debug only: entropy over the whole label vector y
+        }
+        for (auto index = indices.begin(); index != indices.end(); ++index) // visit the samples in the order returned by sortIndices(X) (an argsort of X)
+        {
+            //  Definition 2 Cut points are always on boundaries: emit one only when the label changes and the value strictly increases
+            if (y.at(*index) != yPrev && xPrev < X.at(*index))
+            {
+                cutPoint = round((X.at(*index) + xPrev) / 2 * divider) / divider; // midpoint of the two values, rounded through divider (member defined outside this hunk; presumably derived from precision, confirm)
+                if (debug)
+                {
+                    std::cout << "Cut point: " << (xPrev + X.at(*index)) / 2 << " //";
+                    std::cout << X.at(*index) << " -> " << y.at(*index) << " yPrev= " << yPrev;
+                    std::cout << "* (" << X.at(*index) << ", " << xPrev << ")=" << ((X.at(*index) + xPrev) / 2) << std::endl;
                }
                cutPts.push_back(cutPoint);
            }
-            antx = X.at(*index);
-            anty = y.at(*index);
+            xPrev = X.at(*index); // remember the current value for the next comparison
+            yPrev = y.at(*index); // remember the current label for the next comparison
        }
        return cutPts;
    }
+    // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
    std::vector<size_t> CPPFImdlp::sortIndices(std::vector<float> &X)
    {
        std::vector<size_t> idx(X.size());
--- a/fimdlp/Metrics.cpp
+++ b/fimdlp/Metrics.cpp
@@ -0,0 +1,40 @@
+#include "Metrics.h"
+#include <cmath>
+#include <set>
+namespace CPPFImdlp
+{
+    // Metrics only offers static helpers, so there is nothing to initialise.
+    Metrics::Metrics()
+    {
+    }
+    // Shannon entropy (base 2) of the class labels stored in y[start, end).
+    //
+    //   y        class label of every sample; labels index the count table, so they
+    //            must be non-negative
+    //   start    index of the first label to consider (inclusive)
+    //   end      index one past the last label to consider (exclusive)
+    //   nClasses initial size of the count table; the table grows when a label
+    //            greater than or equal to nClasses shows up
+    //
+    // Returns the entropy in bits, 0 when the range is empty.
+    // Throws std::out_of_range (through vector::at) when the range falls outside y
+    // or a label is negative.
+    float Metrics::entropy(std::vector<int> &y, int start, int end, int nClasses)
+    {
+        const int nElements = end - start;
+        if (nElements <= 0)
+        {
+            return 0;
+        }
+        std::vector<int> counts(nClasses > 0 ? nClasses : 0, 0);
+        for (int i = start; i < end; ++i)
+        {
+            const int label = y.at(i);
+            // The number of classes does not bound the label values (labels {1, 2} are
+            // two classes but need three slots), so enlarge the table on demand instead
+            // of writing past its end.
+            if (label >= 0 && static_cast<size_t>(label) >= counts.size())
+            {
+                counts.resize(static_cast<size_t>(label) + 1, 0);
+            }
+            // A negative label converts to a huge index, which makes at() throw.
+            ++counts.at(static_cast<size_t>(label));
+        }
+        float result = 0;
+        for (const int count : counts)
+        {
+            // Classes without samples contribute nothing (and log2(0) is undefined).
+            if (count > 0)
+            {
+                const float p = static_cast<float>(count) / nElements;
+                result -= p * std::log2(p);
+            }
+        }
+        return result;
+    }
+    // Number of distinct class labels present in y.
+    //
+    //   y  class label of every sample
+    //
+    // Returns how many different values y holds, 0 when y is empty.
+    int Metrics::numClasses(std::vector<int> &y)
+    {
+        // A set keeps a single entry per label, no matter how often or in which order
+        // the labels appear, so its size is the number of classes.
+        const std::set<int> labels(y.begin(), y.end());
+        return static_cast<int>(labels.size());
+    }
+}
--- a/fimdlp/Metrics.h
+++ b/fimdlp/Metrics.h
@@ -0,0 +1,16 @@
+#ifndef METRICS_H
+#define METRICS_H
+#include <vector>
+#include <Python.h>
+#include <utility>
+namespace CPPFImdlp
+{
+    // Static helpers that compute metrics over vectors of integer class labels.
+    class Metrics
+    {
+    public:
+        Metrics();
+
+        // Entropy of the labels in y[start, end); nClasses sizes the per-class count table.
+        static float entropy(std::vector<int> &y, int start, int end, int nClasses);
+
+        // Number of classes reported for the labels in y.
+        static int numClasses(std::vector<int> &y);
+    };
+}
+#endif
--- a/fimdlp/cppfimdlp.cpython-310-darwin.so
+++ b/fimdlp/cppfimdlp.cpython-310-darwin.so
--- a/fimdlp/mdlp.py
+++ b/fimdlp/mdlp.py
@@ -95,10 +95,8 @@ class FImdlp(TransformerMixin, BaseEstimator):
        print("Cut points for each feature in Iris dataset:")
        yz = self.y_.copy()
        xz = X[:, 0].copy()
-        xzz = self.discretizer_.sort_vectors(xz, yz)
        print("Xz: ", xz)
        print("Yz: ", yz)
-        print("Xzz: ", xzz)
        print("Solución:")
        print("Xz*: ", np.sort(X[:, 0]))
        print("yz*: ", yz[np.argsort(X[:, 0])])
--- a/setup.py
+++ b/setup.py
@@ -13,6 +13,7 @@ setup(
            sources=[
                "fimdlp/cfimdlp.pyx",
                "fimdlp/CPPFImdlp.cpp",
+                "fimdlp/Metrics.cpp",
            ],
            language="c++",
            include_dirs=["fimdlp"],