#ifndef DATASET_H #define DATASET_H #include #include #include #include #include #include #include "Utils.h" #include "SourceData.h" namespace platform { class Dataset { public: Dataset(const std::string& path, const std::string& name, const std::string& className, bool discretize, fileType_t fileType, std::vector numericFeaturesIdx, std::string discretizer_algo = "none") : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType), numericFeaturesIdx(numericFeaturesIdx), discretizer_algorithm(discretizer_algo) { }; explicit Dataset(const Dataset&); std::string getName() const; std::string getClassName() const; int getNClasses() const; std::vector getLabels() const; // return the labels factorization result std::vector getClassesCounts() const; std::vector getFeatures() const; std::map> getStates() const; std::pair>&, std::vector&> getVectors(); std::pair getTensors(); std::tuple getTrainTestTensors(std::vector& train, std::vector& test); int getNFeatures() const; int getNSamples() const; std::vector& getNumericFeatures() { return numericFeatures; } void load(); const bool inline isLoaded() const { return loaded; }; private: std::string path; std::string name; fileType_t fileType; std::string className; int n_samples{ 0 }, n_features{ 0 }; std::vector numericFeaturesIdx; std::string discretizer_algorithm; std::vector numericFeatures; // true if feature is numeric std::vector features; std::vector labels; std::map> states; bool loaded; bool discretize; torch::Tensor X, y; torch::Tensor X_train, X_test, y_train, y_test; std::vector> Xv; std::vector yv; void load_csv(); void load_arff(); void load_rdata(); void computeStates(); std::vector discretizeDataset(std::vector& X, mdlp::labels_t& y); }; }; #endif