Compare commits

..

2 Commits

SHA1         Message                                       Date
830265d91b   Fix xgboost error in predict/predict_proba    2025-04-12 17:48:23 +02:00
761f57be6c   Update tests                                   2025-01-09 11:25:19 +01:00
16 changed files with 263 additions and 71 deletions

.gitmodules

@@ -3,11 +3,8 @@
 	path = lib/json
 	url = https://github.com/nlohmann/json.git
 [submodule "lib/catch2"]
-	path = tests/lib/catch2
+	path = lib/catch2
 	url = https://github.com/catchorg/Catch2.git
 [submodule "lib/mdlp"]
-	path = tests/lib/mdlp
+	path = lib/mdlp
 	url = https://github.com/rmontanana/mdlp
-[submodule "tests/lib/Files"]
-	path = tests/lib/Files
-	url = https://github.com/rmontanana/ArffFiles

CMakeLists.txt

@@ -78,9 +78,9 @@ add_subdirectory(pyclfs)
 # -------
 if (ENABLE_TESTING)
   MESSAGE("Testing enabled")
-  add_git_submodule(tests/lib/catch2)
-  add_git_submodule(tests/lib/mdlp)
-  add_subdirectory(tests/lib/Files)
+  add_git_submodule(lib/catch2)
+  add_git_submodule(lib/mdlp)
+  add_subdirectory(lib/Files)
   include(CTest)
   add_subdirectory(tests)
 endif (ENABLE_TESTING)

lib/Files/ArffFiles.cc Normal file

@@ -0,0 +1,168 @@
#include "ArffFiles.h"
#include <fstream>
#include <sstream>
#include <map>
#include <iostream>
ArffFiles::ArffFiles() = default;
std::vector<std::string> ArffFiles::getLines() const
{
return lines;
}
unsigned long int ArffFiles::getSize() const
{
return lines.size();
}
std::vector<std::pair<std::string, std::string>> ArffFiles::getAttributes() const
{
return attributes;
}
std::string ArffFiles::getClassName() const
{
return className;
}
std::string ArffFiles::getClassType() const
{
return classType;
}
std::vector<std::vector<float>>& ArffFiles::getX()
{
return X;
}
std::vector<int>& ArffFiles::getY()
{
return y;
}
void ArffFiles::loadCommon(std::string fileName)
{
std::ifstream file(fileName);
if (!file.is_open()) {
throw std::invalid_argument("Unable to open file");
}
std::string line;
std::string keyword;
std::string attribute;
std::string type;
std::string type_w;
while (getline(file, line)) {
if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
continue;
}
if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) {
std::stringstream ss(line);
ss >> keyword >> attribute;
type = "";
while (ss >> type_w)
type += type_w + " ";
attributes.emplace_back(trim(attribute), trim(type));
continue;
}
if (line[0] == '@') {
continue;
}
lines.push_back(line);
}
file.close();
if (attributes.empty())
throw std::invalid_argument("No attributes found");
}
void ArffFiles::load(const std::string& fileName, bool classLast)
{
int labelIndex;
loadCommon(fileName);
if (classLast) {
className = std::get<0>(attributes.back());
classType = std::get<1>(attributes.back());
attributes.pop_back();
labelIndex = static_cast<int>(attributes.size());
} else {
className = std::get<0>(attributes.front());
classType = std::get<1>(attributes.front());
attributes.erase(attributes.begin());
labelIndex = 0;
}
generateDataset(labelIndex);
}
void ArffFiles::load(const std::string& fileName, const std::string& name)
{
int labelIndex;
loadCommon(fileName);
bool found = false;
for (int i = 0; i < attributes.size(); ++i) {
if (attributes[i].first == name) {
className = std::get<0>(attributes[i]);
classType = std::get<1>(attributes[i]);
attributes.erase(attributes.begin() + i);
labelIndex = i;
found = true;
break;
}
}
if (!found) {
throw std::invalid_argument("Class name not found");
}
generateDataset(labelIndex);
}
void ArffFiles::generateDataset(int labelIndex)
{
X = std::vector<std::vector<float>>(attributes.size(), std::vector<float>(lines.size()));
auto yy = std::vector<std::string>(lines.size(), "");
auto removeLines = std::vector<int>(); // Lines with missing values
for (size_t i = 0; i < lines.size(); i++) {
std::stringstream ss(lines[i]);
std::string value;
int pos = 0;
int xIndex = 0;
while (getline(ss, value, ',')) {
if (pos++ == labelIndex) {
yy[i] = value;
} else {
if (value == "?") {
X[xIndex++][i] = -1;
removeLines.push_back(i);
} else
X[xIndex++][i] = stof(value);
}
}
}
for (auto i : removeLines) {
yy.erase(yy.begin() + i);
for (auto& x : X) {
x.erase(x.begin() + i);
}
}
y = factorize(yy);
}
std::string ArffFiles::trim(const std::string& source)
{
std::string s(source);
s.erase(0, s.find_first_not_of(" '\n\r\t"));
s.erase(s.find_last_not_of(" '\n\r\t") + 1);
return s;
}
std::vector<int> ArffFiles::factorize(const std::vector<std::string>& labels_t)
{
std::vector<int> yy;
yy.reserve(labels_t.size());
std::map<std::string, int> labelMap;
int i = 0;
for (const std::string& label : labels_t) {
if (labelMap.find(label) == labelMap.end()) {
labelMap[label] = i++;
}
yy.push_back(labelMap[label]);
}
return yy;
}

lib/Files/ArffFiles.h Normal file

@@ -0,0 +1,32 @@
#ifndef ARFFFILES_H
#define ARFFFILES_H
#include <string>
#include <vector>
class ArffFiles {
private:
std::vector<std::string> lines;
std::vector<std::pair<std::string, std::string>> attributes;
std::string className;
std::string classType;
std::vector<std::vector<float>> X;
std::vector<int> y;
void generateDataset(int);
void loadCommon(std::string);
public:
ArffFiles();
void load(const std::string&, bool = true);
void load(const std::string&, const std::string&);
std::vector<std::string> getLines() const;
unsigned long int getSize() const;
std::string getClassName() const;
std::string getClassType() const;
static std::string trim(const std::string&);
std::vector<std::vector<float>>& getX();
std::vector<int>& getY();
std::vector<std::pair<std::string, std::string>> getAttributes() const;
static std::vector<int> factorize(const std::vector<std::string>& labels_t);
};
#endif
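
A minimal usage sketch of the vendored ArffFiles class added above; the file name "iris.arff" and the printed summary are illustrative assumptions, not part of the patch:

#include "ArffFiles.h"
#include <iostream>

int main()
{
    ArffFiles arff;
    arff.load("iris.arff", true);      // true: the class attribute is the last column
    auto& X = arff.getX();             // features, one vector<float> per attribute (column-major)
    auto& y = arff.getY();             // class labels factorized to 0, 1, 2, ...
    std::cout << arff.getClassName() << ": "
              << arff.getAttributes().size() << " attributes, "
              << arff.getSize() << " data lines, "
              << y.size() << " samples after dropping rows with missing values" << std::endl;
    return 0;
}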

lib/Files/CMakeLists.txt Normal file

@@ -0,0 +1 @@
add_library(ArffFiles ArffFiles.cc)

pyclfs/CMakeLists.txt

@@ -4,5 +4,5 @@ include_directories(
     ${PyClassifiers_SOURCE_DIR}/lib/json/include
     ${Bayesnet_INCLUDE_DIRS}
 )
-add_library(PyClassifiers ODTE.cc STree.cc SVC.cc RandomForest.cc XGBoost.cc PyClassifier.cc PyWrap.cc PBC4cip.cc)
+add_library(PyClassifiers ODTE.cc STree.cc SVC.cc RandomForest.cc XGBoost.cc PyClassifier.cc PyWrap.cc)
 target_link_libraries(PyClassifiers ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::boost Boost::python Boost::numpy)

pyclfs/PBC4cip.cc deleted

@@ -1,8 +0,0 @@
#include "PBC4cip.h"
namespace pywrap {
PBC4cip::PBC4cip() : PyClassifier("core.PBC4cip", "PBC4cip", true)
{
validHyperparameters = { "random_state" };
}
} /* namespace pywrap */

pyclfs/PBC4cip.h deleted

@@ -1,13 +0,0 @@
#ifndef PBC4CIP_H
#define PBC4CIP_H
#include "PyClassifier.h"
namespace pywrap {
class PBC4cip : public PyClassifier {
public:
PBC4cip();
~PBC4cip() = default;
};
} /* namespace pywrap */
#endif /* PBC4CIP_H */

pyclfs/PyClassifier.cc

@@ -93,11 +93,19 @@ namespace pywrap {
         PyErr_Print();
         throw std::runtime_error("Error creating object for predict in " + module + " and class " + className);
     }
-    int* data = reinterpret_cast<int*>(prediction.get_data());
-    std::vector<int> vPrediction(data, data + prediction.shape(0));
-    auto resultTensor = torch::tensor(vPrediction, torch::kInt32);
-    Py_XDECREF(incoming);
-    return resultTensor;
+    if (xgboost) {
+        long* data = reinterpret_cast<long*>(prediction.get_data());
+        std::vector<int> vPrediction(data, data + prediction.shape(0));
+        auto resultTensor = torch::tensor(vPrediction, torch::kInt32);
+        Py_XDECREF(incoming);
+        return resultTensor;
+    } else {
+        int* data = reinterpret_cast<int*>(prediction.get_data());
+        std::vector<int> vPrediction(data, data + prediction.shape(0));
+        auto resultTensor = torch::tensor(vPrediction, torch::kInt32);
+        Py_XDECREF(incoming);
+        return resultTensor;
+    }
 }
 torch::Tensor PyClassifier::predict_proba(torch::Tensor& X)
 {
@@ -118,11 +126,19 @@ namespace pywrap {
         PyErr_Print();
         throw std::runtime_error("Error creating object for predict_proba in " + module + " and class " + className);
     }
-    double* data = reinterpret_cast<double*>(prediction.get_data());
-    std::vector<double> vPrediction(data, data + prediction.shape(0) * prediction.shape(1));
-    auto resultTensor = torch::tensor(vPrediction, torch::kFloat64).reshape({ prediction.shape(0), prediction.shape(1) });
-    Py_XDECREF(incoming);
-    return resultTensor;
+    if (xgboost) {
+        float* data = reinterpret_cast<float*>(prediction.get_data());
+        std::vector<float> vPrediction(data, data + prediction.shape(0) * prediction.shape(1));
+        auto resultTensor = torch::tensor(vPrediction, torch::kFloat64).reshape({ prediction.shape(0), prediction.shape(1) });
+        Py_XDECREF(incoming);
+        return resultTensor;
+    } else {
+        double* data = reinterpret_cast<double*>(prediction.get_data());
+        std::vector<double> vPrediction(data, data + prediction.shape(0) * prediction.shape(1));
+        auto resultTensor = torch::tensor(vPrediction, torch::kFloat64).reshape({ prediction.shape(0), prediction.shape(1) });
+        Py_XDECREF(incoming);
+        return resultTensor;
+    }
 }
 float PyClassifier::score(torch::Tensor& X, torch::Tensor& y)
 {
@@ -135,4 +151,4 @@ namespace pywrap {
     {
         this->hyperparameters = hyperparameters;
     }
 } /* namespace pywrap */
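
For context on the change above: the xgboost wrapper apparently hands back its predictions as 64-bit integers and its probabilities as 32-bit floats (that is what the new long* and float* casts imply), so reading those numpy buffers with the int*/double* element widths used for the scikit-learn wrappers corrupts the values. A self-contained sketch of the predict case, assuming a little-endian machine (illustrative only, not project code):

#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    // Stand-in for the int64 buffer numpy hands back for xgboost predictions.
    std::vector<int64_t> preds = { 0, 1, 2, 1 };
    // Reading 8-byte elements as 4-byte ones interleaves the labels with their zero high words.
    auto* as_int32 = reinterpret_cast<int32_t*>(preds.data());
    auto* as_int64 = preds.data();
    for (std::size_t i = 0; i < preds.size(); ++i)
        std::cout << as_int32[i] << " vs " << as_int64[i] << std::endl;   // prints 0/0, 0/1, 1/2, 0/1
    return 0;
}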

pyclfs/PyClassifier.h

@@ -49,6 +49,7 @@ namespace pywrap {
         nlohmann::json hyperparameters;
         void trainModel(const torch::Tensor& weights, const bayesnet::Smoothing_t smoothing = bayesnet::Smoothing_t::NONE) override {};
         std::vector<std::string> notes;
+        bool xgboost = false;
     private:
         PyWrap* pyWrap;
         std::string module;

pyclfs/XGBoost.cc

@@ -5,5 +5,6 @@ namespace pywrap {
     XGBoost::XGBoost() : PyClassifier("xgboost", "XGBClassifier", true)
     {
         validHyperparameters = { "tree_method", "early_stopping_rounds", "n_jobs" };
+        xgboost = true;
     }
 } /* namespace pywrap */

tests/CMakeLists.txt

@@ -3,14 +3,15 @@ if(ENABLE_TESTING)
     include_directories(
         ${PyClassifiers_SOURCE_DIR}
         ${PyClassifiers_SOURCE_DIR}/lib/Files
-        ${PyClassifiers_SOURCE_DIR}/lib/mdlp
+        ${PyClassifiers_SOURCE_DIR}/lib/mdlp/src
         ${PyClassifiers_SOURCE_DIR}/lib/json/include
         ${Python3_INCLUDE_DIRS}
         ${TORCH_INCLUDE_DIRS}
+        ${CMAKE_BINARY_DIR}/configured_files/include
         /usr/local/include
     )
     file(GLOB_RECURSE PyClassifiers_SOURCES "${PyClassifiers_SOURCE_DIR}/pyclfs/*.cc")
     set(TEST_SOURCES_PYCLASSIFIERS TestPythonClassifiers.cc TestUtils.cc ${PyClassifiers_SOURCES})
     add_executable(${TEST_PYCLASSIFIERS} ${TEST_SOURCES_PYCLASSIFIERS})
-    target_link_libraries(${TEST_PYCLASSIFIERS} PUBLIC "${TORCH_LIBRARIES}" ${Python3_LIBRARIES} ${LIBTORCH_PYTHON} Boost::boost Boost::python Boost::numpy ArffFiles mdlp Catch2::Catch2WithMain)
+    target_link_libraries(${TEST_PYCLASSIFIERS} PUBLIC "${TORCH_LIBRARIES}" ${Python3_LIBRARIES} ${LIBTORCH_PYTHON} Boost::boost Boost::python Boost::numpy ArffFiles fimdlp Catch2::Catch2WithMain)
 endif(ENABLE_TESTING)

tests/TestPythonClassifiers.cc

@@ -33,8 +33,8 @@ TEST_CASE("Test Python Classifiers score", "[PyClassifiers]")
{"RandomForest", new pywrap::RandomForest()} {"RandomForest", new pywrap::RandomForest()}
}; };
map<std::string, std::string> versions = { map<std::string, std::string> versions = {
{"ODTE", "1.0.0"}, {"ODTE", "1.0.0-1"},
{"STree", "1.3.2"}, {"STree", "1.4.0"},
{"SVC", "1.5.1"}, {"SVC", "1.5.1"},
{"RandomForest", "1.5.1"} {"RandomForest", "1.5.1"}
}; };
@@ -116,33 +116,30 @@ TEST_CASE("XGBoost", "[PyClassifiers]")
     clf.setHyperparameters(hyperparameters);
     auto score = clf.score(raw.Xt, raw.yt);
     REQUIRE(score == Catch::Approx(0.98).epsilon(raw.epsilon));
+    std::cout << "XGBoost score: " << score << std::endl;
 }
-// TEST_CASE("XGBoost predict proba", "[PyClassifiers]")
-// {
-//     auto raw = RawDatasets("iris", true);
-//     auto clf = pywrap::XGBoost();
-//     clf.fit(raw.Xt, raw.yt, raw.featurest, raw.classNamet, raw.statest);
-//     // nlohmann::json hyperparameters = { "n_jobs=1" };
-//     // clf.setHyperparameters(hyperparameters);
-//     auto predict = clf.predict(raw.Xt);
-//     for (int row = 0; row < predict.size(0); row++) {
-//         auto sum = 0.0;
-//         for (int col = 0; col < predict.size(1); col++) {
-//             std::cout << std::setw(12) << std::setprecision(10) << predict[row][col].item<double>() << " ";
-//             sum += predict[row][col].item<int>();
-//         }
-//         std::cout << std::endl;
-//         // REQUIRE(sum == Catch::Approx(1.0).epsilon(raw.epsilon));
-//     }
-//     std::cout << predict << std::endl;
-// }
-TEST_CASE("PBC4cip", "[PyClassifiers]")
+TEST_CASE("XGBoost predict proba", "[PyClassifiers]")
 {
     auto raw = RawDatasets("iris", true);
-    auto clf = pywrap::PBC4cip();
+    auto clf = pywrap::XGBoost();
     clf.fit(raw.Xt, raw.yt, raw.featurest, raw.classNamet, raw.statest);
-    nlohmann::json hyperparameters = { };
-    clf.setHyperparameters(hyperparameters);
-    auto score = clf.score(raw.Xt, raw.yt);
-    REQUIRE(score == Catch::Approx(0.98).epsilon(raw.epsilon));
+    // nlohmann::json hyperparameters = { "n_jobs=1" };
+    // clf.setHyperparameters(hyperparameters);
+    auto predict_proba = clf.predict_proba(raw.Xt);
+    auto predict = clf.predict(raw.Xt);
+    // std::cout << "Predict proba: " << predict_proba << std::endl;
+    // std::cout << "Predict proba size: " << predict_proba.sizes() << std::endl;
+    // assert(predict.size(0) == predict_proba.size(0));
+    for (int row = 0; row < predict_proba.size(0); row++) {
+        // auto sum = 0.0;
+        // std::cout << "Row " << std::setw(3) << row << ": ";
+        // for (int col = 0; col < predict_proba.size(1); col++) {
+        //     std::cout << std::setw(9) << std::fixed << std::setprecision(7) << predict_proba[row][col].item<double>() << " ";
+        //     sum += predict_proba[row][col].item<double>();
+        // }
+        // std::cout << " -> " << std::setw(9) << std::fixed << std::setprecision(7) << sum << " -> " << torch::argmax(predict_proba[row]).item<int>() << " = " << predict[row].item<int>() << std::endl;
+        // // REQUIRE(sum == Catch::Approx(1.0).epsilon(raw.epsilon));
+        REQUIRE(torch::argmax(predict_proba[row]).item<int>() == predict[row].item<int>());
+        REQUIRE(torch::sum(predict_proba[row]).item<double>() == Catch::Approx(1.0).epsilon(raw.epsilon));
+    }
 }
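
The two REQUIREs in the re-enabled test assert, for every row, that the class probabilities sum to one and that their argmax agrees with predict(). The same checks can be illustrated in isolation with libtorch and made-up values (a sketch, not project code):

#include <torch/torch.h>
#include <cassert>
#include <cmath>

int main()
{
    // Fake probabilities for two samples over three classes, plus matching hard predictions.
    auto proba = torch::tensor({ 0.1, 0.7, 0.2, 0.8, 0.1, 0.1 }).reshape({ 2, 3 });
    auto predict = torch::tensor({ 1, 0 });
    for (int row = 0; row < proba.size(0); row++) {
        assert(std::abs(torch::sum(proba[row]).item<double>() - 1.0) < 1e-6);   // rows sum to 1
        assert(torch::argmax(proba[row]).item<int>() == predict[row].item<int>());  // argmax matches predict
    }
    return 0;
}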

Submodule tests/lib/Files deleted from a4329f5f9d