Fix memory management vulnerabilities

2025-07-03 19:53:00 +02:00
parent 2fcef1a0de
commit 91225207f2
7 changed files with 731 additions and 130 deletions
--- a/pyclfs/PyClassifier.cc
+++ b/pyclfs/PyClassifier.cc
@@ -15,25 +15,91 @@ namespace pywrap {
    }
    np::ndarray tensor2numpy(torch::Tensor& X)
    {
-        int m = X.size(0);
-        int n = X.size(1);
-        auto Xn = np::from_data(X.data_ptr(), np::dtype::get_builtin<float>(), bp::make_tuple(m, n), bp::make_tuple(sizeof(X.dtype()) * 2 * n, sizeof(X.dtype()) * 2), bp::object());
-        Xn = Xn.transpose();
+        // Wraps a 2D float32 tensor as a NumPy array over the tensor's buffer; throws std::runtime_error on wrong rank or dtype
+        if (X.dim() != 2) {
+            throw std::runtime_error("tensor2numpy: Expected 2D tensor, got " + std::to_string(X.dim()) + "D");
+        }
+        
+        // Check the dtype before touching X, so a rejected tensor is neither copied nor rebound
+        if (X.dtype() != torch::kFloat32) {
+            throw std::runtime_error("tensor2numpy: Expected float32 tensor");
+        }
+        // X is a non-const reference: this rebinds the caller's tensor to its contiguous form
+        X = X.contiguous();
+        
+        int64_t m = X.size(0);
+        int64_t n = X.size(1);
+        
+        // NumPy wants strides in bytes; no owner object is passed below, so the array only borrows X's buffer
+        int64_t element_size = X.element_size();
+        int64_t stride0 = X.stride(0) * element_size;
+        int64_t stride1 = X.stride(1) * element_size;
+        
+        auto Xn = np::from_data(X.data_ptr(), np::dtype::get_builtin<float>(), 
+                               bp::make_tuple(m, n), 
+                               bp::make_tuple(stride0, stride1), 
+                               bp::object());
+        // No transpose (unlike the removed code): the array keeps X's shape; assumes X is [samples, features] - TODO confirm
        return Xn;
    }
    np::ndarray tensorInt2numpy(torch::Tensor& X)
    {
-        int m = X.size(0);
-        int n = X.size(1);
-        auto Xn = np::from_data(X.data_ptr(), np::dtype::get_builtin<int>(), bp::make_tuple(m, n), bp::make_tuple(sizeof(X.dtype()) * 2 * n, sizeof(X.dtype()) * 2), bp::object());
-        Xn = Xn.transpose();
-        //std::cout << "Transposed array:\n" << boost::python::extract<char const*>(boost::python::str(Xn)) << std::endl;
+        // Wraps a 2D int32 tensor as a NumPy array over the tensor's buffer; throws std::runtime_error on wrong rank or dtype
+        if (X.dim() != 2) {
+            throw std::runtime_error("tensorInt2numpy: Expected 2D tensor, got " + std::to_string(X.dim()) + "D");
+        }
+        
+        // Check the dtype before touching X, so a rejected tensor is neither copied nor rebound
+        if (X.dtype() != torch::kInt32) {
+            throw std::runtime_error("tensorInt2numpy: Expected int32 tensor");
+        }
+        // X is a non-const reference: this rebinds the caller's tensor to its contiguous form
+        X = X.contiguous();
+        
+        int64_t m = X.size(0);
+        int64_t n = X.size(1);
+        
+        // NumPy wants strides in bytes; no owner object is passed below, so the array only borrows X's buffer
+        int64_t element_size = X.element_size();
+        int64_t stride0 = X.stride(0) * element_size;
+        int64_t stride1 = X.stride(1) * element_size;
+        
+        auto Xn = np::from_data(X.data_ptr(), np::dtype::get_builtin<int>(), 
+                               bp::make_tuple(m, n), 
+                               bp::make_tuple(stride0, stride1), 
+                               bp::object());
+        // No transpose (unlike the removed code): the array keeps X's shape; assumes X is [samples, features] - TODO confirm
        return Xn;
    }
    std::pair<np::ndarray, np::ndarray> tensors2numpy(torch::Tensor& X, torch::Tensor& y)
    {
-        int n = X.size(1);
-        auto yn = np::from_data(y.data_ptr(), np::dtype::get_builtin<int32_t>(), bp::make_tuple(n), bp::make_tuple(sizeof(y.dtype()) * 2), bp::object());
+        // Validate y tensor dimensions
+        if (y.dim() != 1) {
+            throw std::runtime_error("tensors2numpy: Expected 1D y tensor, got " + std::to_string(y.dim()) + "D");
+        }
+        
+        // Validate dimensions match
+        if (X.size(0) != y.size(0)) {
+            throw std::runtime_error("tensors2numpy: X and y dimension mismatch: X[" + 
+                                   std::to_string(X.size(0)) + "], y[" + std::to_string(y.size(0)) + "]");
+        }
+        
+        // Ensure y tensor is contiguous
+        y = y.contiguous();
+        
+        if (y.dtype() != torch::kInt32) {
+            throw std::runtime_error("tensors2numpy: Expected int32 y tensor");
+        }
+        
+        int64_t n = y.size(0);
+        int64_t element_size = y.element_size();
+        int64_t stride = y.stride(0) * element_size;
+        
+        auto yn = np::from_data(y.data_ptr(), np::dtype::get_builtin<int32_t>(), 
+                               bp::make_tuple(n), 
+                               bp::make_tuple(stride), 
+                               bp::object());
+        
        if (X.dtype() == torch::kInt32) {
            return { tensorInt2numpy(X), yn };
        }
@@ -63,12 +129,21 @@ namespace pywrap {
        if (!fitted && hyperparameters.size() > 0) {
            pyWrap->setHyperparameters(id, hyperparameters);
        }
-        auto [Xn, yn] = tensors2numpy(X, y);
-        CPyObject Xp = bp::incref(bp::object(Xn).ptr());
-        CPyObject yp = bp::incref(bp::object(yn).ptr());
-        pyWrap->fit(id, Xp, yp);
-        fitted = true;
-        return *this;
+        try {
+            auto [Xn, yn] = tensors2numpy(X, y);
+            CPyObject Xp = bp::incref(bp::object(Xn).ptr());
+            CPyObject yp = bp::incref(bp::object(yn).ptr());
+            pyWrap->fit(id, Xp, yp);
+            fitted = true;
+            return *this;
+        }
+        catch (const std::exception& e) {
+            // Clear any Python errors before re-throwing
+            if (PyErr_Occurred()) {
+                PyErr_Clear();
+            }
+            throw;
+        }
    }
    PyClassifier& PyClassifier::fit(torch::Tensor& X, torch::Tensor& y, const std::vector<std::string>& features, const std::string& className, std::map<std::string, std::vector<int>>& states, const bayesnet::Smoothing_t smoothing)
    {
@@ -76,76 +151,148 @@ namespace pywrap {
    }
    torch::Tensor PyClassifier::predict(torch::Tensor& X)
    {
-        int dimension = X.size(1);
-        CPyObject Xp;
-        if (X.dtype() == torch::kInt32) {
-            auto Xn = tensorInt2numpy(X);
-            Xp = bp::incref(bp::object(Xn).ptr());
-        } else {
-            auto Xn = tensor2numpy(X);
-            Xp = bp::incref(bp::object(Xn).ptr());
+        try {
+            CPyObject Xp;
+            if (X.dtype() == torch::kInt32) {
+                auto Xn = tensorInt2numpy(X);
+                Xp = bp::incref(bp::object(Xn).ptr());
+            } else {
+                auto Xn = tensor2numpy(X);
+                Xp = bp::incref(bp::object(Xn).ptr());
+            }
+            
+            // Hold the PyObject* returned by pyWrap->predict() in a guard until it is handed to boost::python below
+            PyObjectGuard incoming(pyWrap->predict(id, Xp));
+            if (!incoming) {
+                throw std::runtime_error("predict() returned NULL for " + module + ":" + className);
+            }
+            
+            bp::handle<> handle(incoming.release());  // The guard gives up its reference; the handle owns it from here
+            bp::object object(handle);
+            np::ndarray prediction = np::from_object(object);
+            
+            if (PyErr_Occurred()) {
+                PyErr_Clear();
+                throw std::runtime_error("Error creating numpy object for predict in " + module + ":" + className);
+            }
+            
+            // Only a 1D array is accepted as a prediction result
+            if (prediction.get_nd() != 1) {
+                throw std::runtime_error("Expected 1D prediction array, got " + std::to_string(prediction.get_nd()) + "D");
+            }
+            
+            // Copy the values into a std::vector<int> after a dtype check; get_data() is read linearly, which assumes a contiguous array - TODO confirm
+            std::vector<int> vPrediction;
+            if (xgboost) {
+                // xgboost flag set: only the C long dtype is accepted, and each value is narrowed to int
+                if (prediction.get_dtype() == np::dtype::get_builtin<long>()) {
+                    long* data = reinterpret_cast<long*>(prediction.get_data());
+                    vPrediction.reserve(prediction.shape(0));
+                    for (int i = 0; i < prediction.shape(0); ++i) {
+                        vPrediction.push_back(static_cast<int>(data[i]));
+                    }
+                } else {
+                    throw std::runtime_error("XGBoost prediction: unexpected data type");
+                }
+            } else {
+                // Otherwise only the C int dtype is accepted and the values are copied unchanged
+                if (prediction.get_dtype() == np::dtype::get_builtin<int>()) {
+                    int* data = reinterpret_cast<int*>(prediction.get_data());
+                    vPrediction.assign(data, data + prediction.shape(0));
+                } else {
+                    throw std::runtime_error("Prediction: unexpected data type");
+                }
+            }
+            
+            return torch::tensor(vPrediction, torch::kInt32);
        }
-        PyObject* incoming = pyWrap->predict(id, Xp);
-        bp::handle<> handle(incoming);
-        bp::object object(handle);
-        np::ndarray prediction = np::from_object(object);
-        if (PyErr_Occurred()) {
-            PyErr_Print();
-            throw std::runtime_error("Error creating object for predict in " + module + " and class " + className);
-        }
-        if (xgboost) {
-            long* data = reinterpret_cast<long*>(prediction.get_data());
-            std::vector<int> vPrediction(data, data + prediction.shape(0));
-            auto resultTensor = torch::tensor(vPrediction, torch::kInt32);
-            Py_XDECREF(incoming);
-            return resultTensor;
-        } else {
-            int* data = reinterpret_cast<int*>(prediction.get_data());
-            std::vector<int> vPrediction(data, data + prediction.shape(0));
-            auto resultTensor = torch::tensor(vPrediction, torch::kInt32);
-            Py_XDECREF(incoming);
-            return resultTensor;
+        catch (const std::exception& e) {
+            // Clear any pending Python error, then rethrow. NOTE(review): bp::error_already_set is not a std::exception and bypasses this handler - confirm intended
+            if (PyErr_Occurred()) {
+                PyErr_Clear();
+            }
+            throw;
        }
    }
    torch::Tensor PyClassifier::predict_proba(torch::Tensor& X)
    {
-        int dimension = X.size(1);
-        CPyObject Xp;
-        if (X.dtype() == torch::kInt32) {
-            auto Xn = tensorInt2numpy(X);
-            Xp = bp::incref(bp::object(Xn).ptr());
-        } else {
-            auto Xn = tensor2numpy(X);
-            Xp = bp::incref(bp::object(Xn).ptr());
+        try {
+            CPyObject Xp;
+            if (X.dtype() == torch::kInt32) {
+                auto Xn = tensorInt2numpy(X);
+                Xp = bp::incref(bp::object(Xn).ptr());
+            } else {
+                auto Xn = tensor2numpy(X);
+                Xp = bp::incref(bp::object(Xn).ptr());
+            }
+            
+            // Hold the PyObject* returned by pyWrap->predict_proba() in a guard until it is handed to boost::python below
+            PyObjectGuard incoming(pyWrap->predict_proba(id, Xp));
+            if (!incoming) {
+                throw std::runtime_error("predict_proba() returned NULL for " + module + ":" + className);
+            }
+            
+            bp::handle<> handle(incoming.release());  // The guard gives up its reference; the handle owns it from here
+            bp::object object(handle);
+            np::ndarray prediction = np::from_object(object);
+            
+            if (PyErr_Occurred()) {
+                PyErr_Clear();
+                throw std::runtime_error("Error creating numpy object for predict_proba in " + module + ":" + className);
+            }
+            
+            // Only a 2D array is accepted; it is read below as rows x cols
+            if (prediction.get_nd() != 2) {
+                throw std::runtime_error("Expected 2D probability array, got " + std::to_string(prediction.get_nd()) + "D");
+            }
+            
+            int64_t rows = prediction.shape(0);
+            int64_t cols = prediction.shape(1);
+            
+            // Copy rows*cols values after a dtype check; get_data() is read linearly, which assumes a C-contiguous array - TODO confirm
+            if (xgboost) {
+                // xgboost flag set: only the C float dtype is accepted; the result is kFloat32 (the removed code built kFloat64)
+                if (prediction.get_dtype() == np::dtype::get_builtin<float>()) {
+                    float* data = reinterpret_cast<float*>(prediction.get_data());
+                    std::vector<float> vPrediction(data, data + rows * cols);
+                    return torch::tensor(vPrediction, torch::kFloat32).reshape({rows, cols});
+                } else {
+                    throw std::runtime_error("XGBoost predict_proba: unexpected data type");
+                }
+            } else {
+                // Otherwise only the C double dtype is accepted; the result is kFloat64
+                if (prediction.get_dtype() == np::dtype::get_builtin<double>()) {
+                    double* data = reinterpret_cast<double*>(prediction.get_data());
+                    std::vector<double> vPrediction(data, data + rows * cols);
+                    return torch::tensor(vPrediction, torch::kFloat64).reshape({rows, cols});
+                } else {
+                    throw std::runtime_error("predict_proba: unexpected data type");
+                }
+            }
        }
-        PyObject* incoming = pyWrap->predict_proba(id, Xp);
-        bp::handle<> handle(incoming);
-        bp::object object(handle);
-        np::ndarray prediction = np::from_object(object);
-        if (PyErr_Occurred()) {
-            PyErr_Print();
-            throw std::runtime_error("Error creating object for predict_proba in " + module + " and class " + className);
-        }
-        if (xgboost) {
-            float* data = reinterpret_cast<float*>(prediction.get_data());
-            std::vector<float> vPrediction(data, data + prediction.shape(0) * prediction.shape(1));
-            auto resultTensor = torch::tensor(vPrediction, torch::kFloat64).reshape({ prediction.shape(0), prediction.shape(1) });
-            Py_XDECREF(incoming);
-            return resultTensor;
-        } else {
-            double* data = reinterpret_cast<double*>(prediction.get_data());
-            std::vector<double> vPrediction(data, data + prediction.shape(0) * prediction.shape(1));
-            auto resultTensor = torch::tensor(vPrediction, torch::kFloat64).reshape({ prediction.shape(0), prediction.shape(1) });
-            Py_XDECREF(incoming);
-            return resultTensor;
+        catch (const std::exception& e) {
+            // Clear any pending Python error, then rethrow. NOTE(review): bp::error_already_set is not a std::exception and bypasses this handler - confirm intended
+            if (PyErr_Occurred()) {
+                PyErr_Clear();
+            }
+            throw;
        }
    }
    float PyClassifier::score(torch::Tensor& X, torch::Tensor& y)
    {
-        auto [Xn, yn] = tensors2numpy(X, y);
-        CPyObject Xp = bp::incref(bp::object(Xn).ptr());
-        CPyObject yp = bp::incref(bp::object(yn).ptr());
-        return pyWrap->score(id, Xp, yp);
+        try {
+            auto [Xn, yn] = tensors2numpy(X, y);
+            CPyObject Xp = bp::incref(bp::object(Xn).ptr());
+            CPyObject yp = bp::incref(bp::object(yn).ptr());
+            return pyWrap->score(id, Xp, yp);
+        }
+        catch (const std::exception& e) {
+            // Clear any pending Python error, then rethrow. NOTE(review): bp::error_already_set is not a std::exception and bypasses this handler - confirm intended
+            if (PyErr_Occurred()) {
+                PyErr_Clear();
+            }
+            throw;
+        }
    }
    void PyClassifier::setHyperparameters(const nlohmann::json& hyperparameters)
    {
--- a/pyclfs/PyHelper.hpp
+++ b/pyclfs/PyHelper.hpp
@@ -27,13 +27,28 @@ namespace pywrap {
    private:
        PyObject* p;
    public:
-        CPyObject() : p(NULL)
+        CPyObject() : p(nullptr)
        {
        }

        CPyObject(PyObject* _p) : p(_p)
        {
        }
+
+        // Copy constructor: shares other's pointer and takes an extra reference on it when it is non-null
+        CPyObject(const CPyObject& other) : p(other.p)
+        {
+            if (p) {
+                Py_INCREF(p);
+            }
+        }
+
+        // Move constructor: takes over other's pointer without touching the refcount and leaves other holding nullptr
+        CPyObject(CPyObject&& other) noexcept : p(other.p)
+        {
+            other.p = nullptr;
+        }
+
        ~CPyObject()
        {
            Release();
@@ -44,7 +59,11 @@ namespace pywrap {
        }
        PyObject* setObject(PyObject* _p)
        {
-            return (p = _p);
+            if (p != _p) {
+                Release();  // Drop our reference to the old object; _p is then stored as-is, without Py_INCREF
+                p = _p;
+            }
+            return p;
        }
        PyObject* AddRef()
        {
@@ -57,31 +76,157 @@ namespace pywrap {
        {
            if (p) {
                Py_XDECREF(p);
+                p = nullptr;
            }
-
-            p = NULL;
        }
        PyObject* operator ->()
        {
            return p;
        }
-        bool is()
+        bool is() const
        {
-            return p ? true : false;
+            return p != nullptr;
+        }
+
+        // True when a non-null PyObject* is held (same test as is())
+        bool isValid() const
+        {
+            return p != nullptr;
        }
        operator PyObject* ()
        {
            return p;
        }
-        PyObject* operator = (PyObject* pp)
+        // Copy assignment: releases the current object, then shares other's pointer with an extra reference
+        CPyObject& operator=(const CPyObject& other)
        {
-            p = pp;
+            if (this != &other) {
+                Release();  // Drop the reference held on the current object
+                p = other.p;
+                if (p) {
+                    Py_INCREF(p);  // Take our own reference on the shared object
+                }
+            }
+            return *this;
+        }
+
+        // Move assignment: releases the current object, then takes over other's pointer and leaves other holding nullptr
+        CPyObject& operator=(CPyObject&& other) noexcept
+        {
+            if (this != &other) {
+                Release();  // Drop the reference held on the current object
+                p = other.p;
+                other.p = nullptr;
+            }
+            return *this;
+        }
+
+        // Assignment from a raw PyObject*: forwards to setObject(). DEPRECATED, call setObject() directly instead
+        PyObject* operator=(PyObject* pp)
+        {
+            setObject(pp);
            return p;
        }
-        operator bool()
+        explicit operator bool() const
        {
-            return p ? true : false;
+            return p != nullptr;
        }
    };
+
+    // Move-only holder of one PyObject reference: the destructor calls reset(), which Py_DECREFs the object if it is owned
+    class PyObjectGuard {
+    private:
+        PyObject* obj_;
+        bool owns_reference_;
+
+    public:
+        // Adopts obj without Py_INCREF (obj should be a new reference); owns_reference_ is set even when obj is nullptr
+        explicit PyObjectGuard(PyObject* obj = nullptr) : obj_(obj), owns_reference_(true) {}
+        
+        // borrow == true: obj is borrowed, so take our own reference with Py_INCREF; borrow == false: adopt obj as above
+        PyObjectGuard(PyObject* obj, bool borrow) : obj_(obj), owns_reference_(!borrow) {
+            if (borrow && obj_) {
+                Py_INCREF(obj_);
+                owns_reference_ = true;
+            }
+        }
+
+        // Non-copyable, so a held reference is never released twice
+        PyObjectGuard(const PyObjectGuard&) = delete;
+        PyObjectGuard& operator=(const PyObjectGuard&) = delete;
+
+        // Movable: the source guard is left empty and non-owning
+        PyObjectGuard(PyObjectGuard&& other) noexcept 
+            : obj_(other.obj_), owns_reference_(other.owns_reference_) {
+            other.obj_ = nullptr;
+            other.owns_reference_ = false;
+        }
+
+        PyObjectGuard& operator=(PyObjectGuard&& other) noexcept {
+            if (this != &other) {
+                reset();
+                obj_ = other.obj_;
+                owns_reference_ = other.owns_reference_;
+                other.obj_ = nullptr;
+                other.owns_reference_ = false;
+            }
+            return *this;
+        }
+
+        ~PyObjectGuard() {
+            reset();
+        }
+
+        // Release the current reference if owned, then adopt new_obj (nullptr by default) without incrementing it
+        void reset(PyObject* new_obj = nullptr) {
+            if (owns_reference_ && obj_) {
+                Py_DECREF(obj_);
+            }
+            obj_ = new_obj;
+            owns_reference_ = (new_obj != nullptr);
+        }
+
+        // Give up ownership without Py_DECREF: returns the pointer and leaves the guard empty
+        PyObject* release() {
+            PyObject* result = obj_;
+            obj_ = nullptr;
+            owns_reference_ = false;
+            return result;
+        }
+
+        // Get the raw pointer (does not transfer ownership)
+        PyObject* get() const {
+            return obj_;
+        }
+
+        // True when a non-null pointer is held
+        bool isValid() const {
+            return obj_ != nullptr;
+        }
+
+        explicit operator bool() const {
+            return obj_ != nullptr;
+        }
+
+        // Member access on the raw pointer (no null check)
+        PyObject* operator->() const {
+            return obj_;
+        }
+
+        // Implicit conversion to PyObject* for API calls (does not transfer ownership)
+        operator PyObject*() const {
+            return obj_;
+        }
+    };
+
+    // Wrap a borrowed reference: the guard takes its own reference (Py_INCREF) when obj is non-null
+    inline PyObjectGuard borrowReference(PyObject* obj) {
+        return PyObjectGuard(obj, true);
+    }
+
+    // Wrap a new reference: the guard adopts obj as-is, with no Py_INCREF
+    inline PyObjectGuard newReference(PyObject* obj) {
+        return PyObjectGuard(obj);
+    }
 } /* namespace pywrap */
 #endif
--- a/pyclfs/PyWrap.cc
+++ b/pyclfs/PyWrap.cc
@@ -237,12 +237,12 @@ namespace pywrap {
        CPyObject method = PyUnicode_FromString(name.c_str());
        try {
            if (!(result = PyObject_CallMethodObjArgs(instance, method.getObject(), X.getObject(), NULL)))
-                errorAbort("Couldn't call method predict");
+                errorAbort("Couldn't call method " + name);
        }
        catch (const std::exception& e) {
            errorAbort(e.what());
        }
-        Py_INCREF(result);
+        // PyObject_CallMethodObjArgs already returns a new reference, no need for Py_INCREF
        return result; // Caller must free this object
    }
    double PyWrap::score(const clfId_t id, CPyObject& X, CPyObject& y)