Implement hyperparameters

2023-11-08 10:35:38 +01:00
parent 1f46fc6c24
commit 331381930a
17 changed files with 913 additions and 325 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,9 +1,8 @@
 include_directories(${PyWrap_SOURCE_DIR}/lib/Files)
+include_directories(${PyWrap_SOURCE_DIR}/lib/json/include)
 include_directories(${Python3_INCLUDE_DIRS})
 include_directories(${TORCH_INCLUDE_DIRS})

 add_executable(main main.cc STree.cc SVC.cc RandomForest.cc PyClassifier.cc PyWrap.cc)
-add_executable(example example.cpp PyWrap.cc)

 target_link_libraries(main ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::boost Boost::python Boost::numpy ArffFiles)
-target_link_libraries(example ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::boost Boost::python Boost::numpy ArffFiles)
--- a/src/Classifier.h
+++ b/src/Classifier.h
@@ -0,0 +1,13 @@
+#ifndef CLASSIFER_H
+#define CLASSIFER_H
+#include <nlohmann/json.hpp>
+
+namespace pywrap {
+    class Classifier { // Abstract interface: its only operation is the pure virtual setHyperparameters
+    public:
+        Classifier() = default;
+        virtual ~Classifier() = default; // virtual so destruction through a Classifier pointer reaches the derived destructor
+        virtual void setHyperparameters(const nlohmann::json& hyperparameters) = 0; // receives the hyperparameters as a JSON value; validation and storage are left to implementers
+    };
+} /* namespace pywrap */
+#endif /* CLASSIFER_H */
--- a/src/PyClassifier.cc
+++ b/src/PyClassifier.cc
@@ -1,9 +1,10 @@
 #include "PyClassifier.h"
+#include <iostream>

 namespace pywrap {
    namespace bp = boost::python;
    namespace np = boost::python::numpy;
-    PyClassifier::PyClassifier(const std::string& module, const std::string& className) : module(module), className(className)
+    PyClassifier::PyClassifier(const std::string& module, const std::string& className) : module(module), className(className), fitted(false)
    {
        pyWrap = PyWrap::GetInstance();
        pyWrap->importClass(module, className);
@@ -36,10 +37,14 @@ namespace pywrap {
    }
    PyClassifier& PyClassifier::fit(torch::Tensor& X, torch::Tensor& y, const std::vector<std::string>& features, const std::string& className, std::map<std::string, std::vector<int>>& states)
    { // Passes X and y through tensors2numpy and hands the results to pyWrap->fit; features, the className parameter and states are not read in this body
+        if (!fitted && hyperparameters.size() > 0) { // only before the first successful fit, and only when hyperparameters were stored
+            std::cout << "Setting hyperparameters" << std::endl; // NOTE(review): this branch only logs; nothing here forwards hyperparameters to pyWrap
+        }
        auto [Xn, yn] = tensors2numpy(X, y);
        CPyObject Xp = bp::incref(bp::object(Xn).ptr());
        CPyObject yp = bp::incref(bp::object(yn).ptr());
        pyWrap->fit(module, this->className, Xp, yp); // this->className: the className parameter shadows the member
+        fitted = true; // set after pyWrap->fit has been called, which disables the hyperparameter branch above on later calls
        return *this;
    }
    torch::Tensor PyClassifier::predict(torch::Tensor& X)
@@ -69,4 +74,19 @@ namespace pywrap {
        auto result = pyWrap->score(module, className, Xp, yp);
        return result;
    }
+    void PyClassifier::setHyperparameters(const nlohmann::json& hyperparameters)
+    { // Base implementation: validKeys is empty, so any key found in hyperparameters makes checkHyperparameters throw std::invalid_argument
+        // Check if hyperparameters are valid, default is no hyperparameters
+        const std::vector<std::string> validKeys = { };
+        checkHyperparameters(validKeys, hyperparameters);
+        this->hyperparameters = hyperparameters; // reached only when validation did not throw; this-> is needed because the parameter shadows the member
+    }
+    void PyClassifier::checkHyperparameters(const std::vector<std::string>& validKeys, const nlohmann::json& hyperparameters)
+    { // Throws std::invalid_argument naming the first key of hyperparameters that is not listed in validKeys; returns normally otherwise
+        for (const auto& item : hyperparameters.items()) {
+            if (std::find(validKeys.begin(), validKeys.end(), item.key()) == validKeys.end()) { // std::-qualified so the call does not depend on argument-dependent lookup
+                throw std::invalid_argument("Hyperparameter " + item.key() + " is not valid");
+            }
+        }
+    }
 } /* namespace pywrap */
--- a/src/PyClassifier.h
+++ b/src/PyClassifier.h
@@ -2,15 +2,17 @@
 #define PYCLASSIFER_H
 #include "boost/python/detail/wrap_python.hpp"
 #include <boost/python/numpy.hpp>
+#include <nlohmann/json.hpp>
 #include <string>
 #include <map>
 #include <vector>
 #include <utility>
 #include <torch/torch.h>
 #include "PyWrap.h"
+#include "Classifier.h"

 namespace pywrap {
-    class PyClassifier {
+    class PyClassifier : public Classifier {
    public:
        PyClassifier(const std::string& module, const std::string& className);
        virtual ~PyClassifier();
@@ -19,11 +21,15 @@ namespace pywrap {
        double score(torch::Tensor& X, torch::Tensor& y);
        std::string version();
        std::string callMethodString(const std::string& method);
+        void setHyperparameters(const nlohmann::json& hyperparameters) override;
+    protected:
+        void checkHyperparameters(const std::vector<std::string>& validKeys, const nlohmann::json& hyperparameters);
+        nlohmann::json hyperparameters;
    private:
        PyWrap* pyWrap;
        std::string module;
        std::string className;
+        bool fitted;
    };
-
 } /* namespace pywrap */
 #endif /* PYCLASSIFER_H */
--- a/src/STree.cc
+++ b/src/STree.cc
@@ -5,4 +5,11 @@ namespace pywrap {
    {
        return callMethodString("graph");
    }
+    void STree::setHyperparameters(const nlohmann::json& hyperparameters)
+    { // Accepts only the keys listed in validKeys below; validation runs before the value is stored
+        // Check if hyperparameters are valid
+        const std::vector<std::string> validKeys = { "C", "n_jobs", "kernel", "max_iter", "max_depth", "random_state", "multiclass_strategy" };
+        checkHyperparameters(validKeys, hyperparameters); // throws std::invalid_argument on a key outside validKeys
+        this->hyperparameters = hyperparameters; // this-> is needed because the parameter shadows the inherited member
+    }
 } /* namespace pywrap */
--- a/src/STree.h
+++ b/src/STree.h
@@ -1,5 +1,6 @@
 #ifndef STREE_H
 #define STREE_H
+#include "nlohmann/json.hpp"
 #include "PyClassifier.h"

 namespace pywrap {
@@ -8,6 +9,7 @@ namespace pywrap {
        STree() : PyClassifier("stree", "Stree") {};
        ~STree() = default;
        std::string graph();
+        void setHyperparameters(const nlohmann::json& hyperparameters) override;
    };
 } /* namespace pywrap */
 #endif /* STREE_H */
--- a/src/example.cpp
+++ b/src/example.cpp
@@ -1,257 +0,0 @@
-#include "boost/python/detail/wrap_python.hpp"
-#include <boost/python/numpy.hpp>
-#include <torch/torch.h>
-#include <torch/csrc/utils/tensor_numpy.h>
-#include <string>
-#include <iostream>
-#include "ArffFiles.h"
-#include "PyHelper.hpp"
-#include "PyWrap.h"
-
-
-void errorAbort(const std::string& message)
-{
-    std::cerr << message << std::endl;
-    PyErr_Print();
-    exit(1);
-}
-void print_array(pywrap::np::ndarray& array)
-{
-    std::cout << "Array: " << std::endl;
-    std::cout << pywrap::p::extract<char const*>(pywrap::p::str(array)) << std::endl;
-}
-// np::ndarray to_numpy_matrix(torch::Tensor& input_data, np::dtype numpy_dtype)
-// {
-//     p::tuple shape = p::make_tuple(input_data.size(0), input_data.size(1));
-//     auto tensor_dtype = input_data.dtype();
-//     p::tuple stride = p::make_tuple(sizeof(tensor_dtype) * input_data.size(1), sizeof(tensor_dtype));
-//     auto dito = input_data.transpose(1, 0);
-//     np::ndarray result = np::from_data(dito.data_ptr(), numpy_dtype, shape, stride, p::object());
-//     return result;
-// }
-// np::ndarray to_numpy_vector(torch::Tensor& input_data, np::dtype numpy_dtype)
-// {
-//     p::tuple shape = p::make_tuple(input_data.size(0));
-//     auto tensor_dtype = input_data.dtype();
-//     p::tuple stride = p::make_tuple(sizeof(tensor_dtype), sizeof(tensor_dtype));
-//     np::ndarray result = np::from_data(input_data.data_ptr(), numpy_dtype, shape, stride, p::object());
-//     return result;
-// }
-
-class Paths {
-public:
-    static string datasets()
-    {
-        return "../discretizbench/datasets/";
-    }
-};
-
-tuple<torch::Tensor, torch::Tensor, vector<string>, string, map<string, vector<int>>> loadDataset(const string& name, bool class_last)
-{
-    auto handler = ArffFiles();
-    handler.load(Paths::datasets() + static_cast<string>(name) + ".arff", class_last);
-    // Get Dataset X, y
-    vector<vector<float>> X = handler.getX();
-    vector<int> y = handler.getY();
-    // Get className & Features
-    auto className = handler.getClassName();
-    vector<string> features;
-    auto attributes = handler.getAttributes();
-    transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
-    torch::Tensor Xd;
-    auto states = map<string, vector<int>>();
-    Xd = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kFloat32);
-    for (int i = 0; i < features.size(); ++i) {
-        Xd.index_put_({ i, "..." }, torch::tensor(X[i], torch::kFloat32));
-    }
-    return { Xd, torch::tensor(y, torch::kInt32), features, className, states };
-}
-
-using namespace pywrap;
-np::ndarray tensor2numpy(torch::Tensor& X)
-{
-    int m = X.size(0);
-    int n = X.size(1);
-    auto Xn = np::from_data(X.data_ptr(), np::dtype::get_builtin<float>(), p::make_tuple(m, n), p::make_tuple(sizeof(X.dtype()) * 2 * n, sizeof(X.dtype()) * 2), p::object());
-    Xn = Xn.transpose();
-    return Xn;
-}
-pair<np::ndarray, np::ndarray> tensors2numpy(torch::Tensor& X, torch::Tensor& y)
-{
-    int n = X.size(1);
-    auto yn = np::from_data(y.data_ptr(), np::dtype::get_builtin<int32_t>(), p::make_tuple(n), p::make_tuple(sizeof(y.dtype()) * 2), p::object());
-    return { tensor2numpy(X), yn };
-}
-pair<np::ndarray, np::ndarray> getData(const string& dataset)
-{
-    auto [X, y, featuresx, classNamex, statesx] = loadDataset(dataset, true);
-    auto [Xn, yn] = tensors2numpy(X, y);
-    auto Xn_shapes = Xn.get_shape();
-    auto yn_shapes = yn.get_shape();
-    cout << "Xn_shapes: " << Xn_shapes[0] << ", " << Xn_shapes[1] << endl;
-    cout << "yn_shapes: " << yn_shapes[0] << endl;
-    cout << "X shapes: " << X.sizes() << endl;
-    cout << "y shapes: " << y.sizes() << endl;
-    assert(Xn_shapes[0] == X.sizes()[0]);
-    assert(Xn_shapes[1] == X.sizes()[1]);
-    assert(yn_shapes[0] == y.sizes()[0]);
-
-    return { Xn, yn };
-}
-int main(int argc, char** argv)
-{
-    cout << "* Begin." << endl;
-    {
-        PyWrap* wrapper = PyWrap::GetInstance();
-        string dataset = "iris";
-        // Convert Tensor to numpy array
-        // auto [Xn, yn] = tensors2numpy(X, y);
-        // cout << "Numpy array data: " << endl;
-        // print_array(Xn);
-        // cout << "Numpy array labels: " << endl;
-        // print_array(yn);
-        // Import module
-        string moduleName = "stree";
-        string className = "Stree";
-        // Import
-        {
-            cout << "--Import Phase--" << endl;
-            wrapper->importClass(moduleName, className);
-            cout << "--Import Phase end--" << endl;
-        }
-        // Version
-        {
-            cout << "--Version Phase--" << endl;
-            auto version = wrapper->version(moduleName, className);
-            cout << "Version: " << version << endl;
-            cout << "--Version Phase end--" << endl;
-        }
-        // Fit
-        {
-            cout << "--Fit Phase--" << endl;
-            auto [Xn, yn] = getData(dataset);
-            auto Xn_shapes = Xn.get_shape();
-            auto yn_shapes = yn.get_shape();
-            CPyObject Xp = boost::python::incref(boost::python::object(Xn).ptr());
-            CPyObject yp = boost::python::incref(boost::python::object(yn).ptr());
-            //print_array(yn);
-            // Call fit
-            cout << "Calling fit" << endl;
-            wrapper->fit(moduleName, className, Xp, yp);
-            cout << "--Fit Phase end--" << endl;
-        }
-        // Score
-        {
-            cout << "--Score Phase--" << endl;
-            auto [Xn, yn] = getData(dataset);
-            auto Xn_shapes = Xn.get_shape();
-            auto yn_shapes = yn.get_shape();
-            CPyObject Xp = boost::python::incref(boost::python::object(Xn).ptr());
-            CPyObject yp = boost::python::incref(boost::python::object(yn).ptr());
-            //print_array(yn);
-            // Call score
-            cout << "Calling score" << endl;
-            auto result = wrapper->score(moduleName, className, Xp, yp);
-            cout << "Score: " << result << endl;
-            cout << "--Score Phase end--" << endl;
-        }
-        // Call score
-        // {
-        //     np::initialize();
-        //     cout << "--Score Phase--" << endl;
-        //     auto [X, y, featuresx, classNamex, statesx] = loadDataset(dataset, true);
-        //     auto [Xn, yn] = tensors2numpy(X, y);
-        //     auto Xn_shapes = Xn.get_shape();
-        //     auto yn_shapes = yn.get_shape();
-        //     cout << "Xn_shapes: " << Xn_shapes[0] << ", " << Xn_shapes[1] << endl;
-        //     cout << "yn_shapes: " << yn_shapes[0] << endl;
-        //     cout << "X shapes: " << X.sizes() << endl;
-        //     cout << "y shapes: " << y.sizes() << endl;
-        //     assert(Xn_shapes[0] == X.sizes()[0]);
-        //     assert(Xn_shapes[1] == X.sizes()[1]);
-        //     assert(yn_shapes[0] == y.sizes()[0]);
-        //     CPyObject Xp = Xn.ptr();
-        //     CPyObject yp = yn.ptr();
-        //     print_array(yn);
-        //     cout << "Calling score" << endl;
-        //     auto instance = wrapper->getClass(moduleName, className);
-        //     CPyObject result;
-        //     if (!(result = PyObject_CallMethod(instance, "score", "OO", Xp.getObject(), yp.getObject())))
-        //         errorAbort("Couldn't call method score");
-        //     auto score = PyFloat_AsDouble(result);
-        //     //auto score = wrapper->score(moduleName, className, Xp, yp);
-        //     cout << "Score: " << score << endl;
-        //     cout << "--Score Phase end--" << endl;
-        // }
-        // Clean module
-        {
-            cout << "--Clean Phase--" << endl;
-            wrapper->clean(moduleName, className);
-            cout << "--Clean Phase end--" << endl;
-        }
-    }
-    cout << "* End." << endl;
-}
-// int main(int argc, char** argv)
-// {
-//     auto [data_tensor, y_label, featuresx, classNamex, statesx] = loadDataset("iris", true);
-//     // CPyInstance pInstance;
-//     // auto wrapper = PyWrap();
-//     PyWrap* wrapper = PyWrap::GetInstance();
-//     // PyWrap* wrapper = PyWrap::GetInstance();
-//     int m = data_tensor.size(0);
-//     int n = data_tensor.size(1);
-//     auto data_numpy = np::from_data(data_tensor.data_ptr(), np::dtype::get_builtin<float>(), p::make_tuple(m, n), p::make_tuple(sizeof(data_tensor.dtype()) * 2 * n, sizeof(data_tensor.dtype()) * 2), p::object());
-//     data_numpy = data_numpy.transpose();
-//     auto y_numpy = np::from_data(y_label.data_ptr(), np::dtype::get_builtin<int32_t>(), p::make_tuple(n), p::make_tuple(sizeof(y_label.dtype()) * 2), p::object());
-//     cout << "Numpy array data: " << endl;
-//     print_array(data_numpy);
-//     cout << "Numpy array labels: " << endl;
-//     print_array(y_numpy);
-//     cout << "primero" << endl;
-//     CPyObject p = data_numpy.ptr();
-//     CPyObject yp = y_numpy.ptr();
-//     string moduleName = "sklearn.svm";
-//     string className = "SVC";
-//     string method = "_repr_html_";
-//     // CPyObject module = PyImport_ImportModule(moduleName.c_str());
-//     // if (PyErr_Occurred()) {
-//     //     errorAbort("Could't import module " + moduleName);
-//     // }
-//     // CPyObject classObject = PyObject_GetAttrString(module, className.c_str());
-//     // if (PyErr_Occurred()) {
-//     //     errorAbort("Couldn't find class " + className);
-//     // }
-//     // CPyObject instance = PyObject_CallObject(classObject, NULL);
-//     // if (PyErr_Occurred()) {
-//     //     errorAbort("Couldn't create instance of class " + className);
-//     // }
-//     // wrapper.moduleClassMap.insert({ { moduleName, className }, { module, classObject, instance } });
-//     wrapper->importClass(moduleName, className);
-//     PyObject* instance = wrapper->getClass(moduleName, className);
-//     CPyObject result;
-//     if (!(result = PyObject_CallMethod(instance, method.c_str(), NULL)))
-//         errorAbort("Couldn't call method " + method);
-//     std::string value = PyUnicode_AsUTF8(result);
-//     cout << "Version: " << value << endl;
-//     cout << "Calling fit" << endl;
-//     p.AddRef();
-//     yp.AddRef();
-//     method = "fit";
-//     wrapper->fit(moduleName, className, p, yp);
-//     // PyObject* instance2 = wrapper->getClass(moduleName, className);
-//     // if (!(result = PyObject_CallMethodObjArgs(instance2, PyUnicode_FromString(method.c_str()), p.getObject(), yp.getObject(), NULL)))
-//         // errorAbort("Couldn't call method fit");
-//     // method = "fit";
-//     // if (!(result = PyObject_CallMethodObjArgs(instance, PyUnicode_FromString(method.c_str()), p.getObject(), yp.getObject(), NULL)))
-//     //     errorAbort("Couldn't call method fit");
-//     cout << "Calling score" << endl;
-//     // method = "score";
-//     // if (!(result = PyObject_CallMethodObjArgs(instance, PyUnicode_FromString(method.c_str()), p.getObject(), yp.getObject(), NULL)))
-//     //     errorAbort("Couldn't call method score");
-//     // float score = PyFloat_AsDouble(result);
-//     auto score = wrapper->score(moduleName, className, p, yp);
-//     cout << "Score: " << score << endl;
-//     wrapper->clean(moduleName, className);
-//     return 0;
-// }
--- a/src/main.cc
+++ b/src/main.cc
@@ -52,6 +52,8 @@ int main(int argc, char* argv[])
        cout << "X: " << X.sizes() << endl;
        cout << "y: " << y.sizes() << endl;
        auto clf = pywrap::STree();
+        nlohmann::json hyperparameters = { { "max_depth", 3 }, { "C", 0.7 } }; // C++ has no {"key": value} literal; nlohmann::json builds an object from a list of {key, value} pairs
+        clf.setHyperparameters(hyperparameters);
        cout << "STree Version: " << clf.version() << endl;
        auto svc = pywrap::SVC();
        svc.fit(X, y, features, className, states);