From e3a06264a91d454fe63db9d138c0a12620c90f6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Sun, 26 May 2024 17:25:36 +0200 Subject: [PATCH] Remove old Files library --- CMakeLists.txt | 1 - lib/Files/ArffFiles.cc | 176 ---------------------------- lib/Files/ArffFiles.h | 34 ------ lib/Files/CMakeLists.txt | 1 - sample/CMakeLists.txt | 2 +- sample/sample.cpp | 244 +++++++++++++++++++-------------------- src/CMakeLists.txt | 10 +- src/common/Dataset.cpp | 2 +- 8 files changed, 129 insertions(+), 341 deletions(-) delete mode 100644 lib/Files/ArffFiles.cc delete mode 100644 lib/Files/ArffFiles.h delete mode 100644 lib/Files/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index c23943f..72010b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,7 +88,6 @@ message(STATUS "Bayesnet_INCLUDE_DIRS=${Bayesnet_INCLUDE_DIRS}") ## Configure test data path cmake_path(SET TEST_DATA_PATH "${CMAKE_CURRENT_SOURCE_DIR}/tests/data") configure_file(src/common/SourceData.h.in "${CMAKE_BINARY_DIR}/configured_files/include/SourceData.h") -add_subdirectory(lib/Files) add_subdirectory(config) add_subdirectory(src) add_subdirectory(sample) diff --git a/lib/Files/ArffFiles.cc b/lib/Files/ArffFiles.cc deleted file mode 100644 index 826bd86..0000000 --- a/lib/Files/ArffFiles.cc +++ /dev/null @@ -1,176 +0,0 @@ -#include "ArffFiles.h" -#include -#include -#include -#include // std::isdigit -#include // std::all_of -#include - -ArffFiles::ArffFiles() = default; - -std::vector ArffFiles::getLines() const -{ - return lines; -} - -unsigned long int ArffFiles::getSize() const -{ - return lines.size(); -} - -std::vector> ArffFiles::getAttributes() const -{ - return attributes; -} - -std::string ArffFiles::getClassName() const -{ - return className; -} - -std::string ArffFiles::getClassType() const -{ - return classType; -} - -std::vector>& ArffFiles::getX() -{ - return X; -} - -std::vector& ArffFiles::getY() -{ - return y; -} - -void ArffFiles::loadCommon(std::string fileName) -{ - std::ifstream file(fileName); - if (!file.is_open()) { - throw std::invalid_argument("Unable to open file"); - } - std::string line; - std::string keyword; - std::string attribute; - std::string type; - std::string type_w; - while (getline(file, line)) { - if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { - continue; - } - if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) { - std::stringstream ss(line); - ss >> keyword >> attribute; - type = ""; - while (ss >> type_w) - type += type_w + " "; - attributes.emplace_back(trim(attribute), trim(type)); - continue; - } - if (line[0] == '@') { - continue; - } - lines.push_back(line); - } - file.close(); - if (attributes.empty()) - throw std::invalid_argument("No attributes found"); -} - -void ArffFiles::load(const std::string& fileName, bool classLast) -{ - int labelIndex; - loadCommon(fileName); - if (classLast) { - className = std::get<0>(attributes.back()); - classType = std::get<1>(attributes.back()); - attributes.pop_back(); - labelIndex = static_cast(attributes.size()); - } else { - className = std::get<0>(attributes.front()); - classType = std::get<1>(attributes.front()); - attributes.erase(attributes.begin()); - labelIndex = 0; - } - generateDataset(labelIndex); -} -void ArffFiles::load(const std::string& fileName, const std::string& name) -{ - int labelIndex; - loadCommon(fileName); - bool found = false; - for (int i = 0; i < attributes.size(); ++i) { - if (attributes[i].first == name) { - className = std::get<0>(attributes[i]); - classType = std::get<1>(attributes[i]); - attributes.erase(attributes.begin() + i); - labelIndex = i; - found = true; - break; - } - } - if (!found) { - throw std::invalid_argument("Class name not found"); - } - generateDataset(labelIndex); -} - -void ArffFiles::generateDataset(int labelIndex) -{ - X = std::vector>(attributes.size(), std::vector(lines.size())); - auto yy = std::vector(lines.size(), ""); - auto removeLines = std::vector(); // Lines with missing values - for (size_t i = 0; i < lines.size(); i++) { - std::stringstream ss(lines[i]); - std::string value; - int pos = 0; - int xIndex = 0; - while (getline(ss, value, ',')) { - if (pos++ == labelIndex) { - yy[i] = value; - } else { - if (value == "?") { - X[xIndex++][i] = -1; - removeLines.push_back(i); - } else - X[xIndex++][i] = stof(value); - } - } - } - for (auto i : removeLines) { - yy.erase(yy.begin() + i); - for (auto& x : X) { - x.erase(x.begin() + i); - } - } - y = factorize(yy); -} - -std::string ArffFiles::trim(const std::string& source) -{ - std::string s(source); - s.erase(0, s.find_first_not_of(" '\n\r\t")); - s.erase(s.find_last_not_of(" '\n\r\t") + 1); - return s; -} - -std::vector ArffFiles::factorize(const std::vector& labels_t) -{ - std::vector yy; - labels.clear(); - yy.reserve(labels_t.size()); - std::map labelMap; - int i = 0; - for (const std::string& label : labels_t) { - if (labelMap.find(label) == labelMap.end()) { - labelMap[label] = i++; - bool allDigits = std::all_of(label.begin(), label.end(), isdigit); - if (allDigits) - labels.push_back("Class " + label); - else - labels.push_back(label); - } - yy.push_back(labelMap[label]); - } - return yy; -} \ No newline at end of file diff --git a/lib/Files/ArffFiles.h b/lib/Files/ArffFiles.h deleted file mode 100644 index 21caa05..0000000 --- a/lib/Files/ArffFiles.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef ARFFFILES_H -#define ARFFFILES_H - -#include -#include - -class ArffFiles { -public: - ArffFiles(); - void load(const std::string&, bool = true); - void load(const std::string&, const std::string&); - std::vector getLines() const; - unsigned long int getSize() const; - std::string getClassName() const; - std::string getClassType() const; - std::vector getLabels() const { return labels; } - static std::string trim(const std::string&); - std::vector>& getX(); - std::vector& getY(); - std::vector> getAttributes() const; - std::vector factorize(const std::vector& labels_t); -private: - std::vector lines; - std::vector> attributes; - std::string className; - std::string classType; - std::vector> X; - std::vector y; - std::vector labels; - void generateDataset(int); - void loadCommon(std::string); -}; - -#endif \ No newline at end of file diff --git a/lib/Files/CMakeLists.txt b/lib/Files/CMakeLists.txt deleted file mode 100644 index fce5b8f..0000000 --- a/lib/Files/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_library(ArffFiles ArffFiles.cc) \ No newline at end of file diff --git a/sample/CMakeLists.txt b/sample/CMakeLists.txt index 670f833..ca0245a 100644 --- a/sample/CMakeLists.txt +++ b/sample/CMakeLists.txt @@ -12,4 +12,4 @@ include_directories( ${Bayesnet_INCLUDE_DIRS} ) add_executable(PlatformSample sample.cpp ${Platform_SOURCE_DIR}/src/main/Models.cpp) -target_link_libraries(PlatformSample "${PyClassifiers}" "${BayesNet}" ArffFiles mdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy) \ No newline at end of file +target_link_libraries(PlatformSample "${PyClassifiers}" "${BayesNet}" mdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy) \ No newline at end of file diff --git a/sample/sample.cpp b/sample/sample.cpp index 491f82e..5842c53 100644 --- a/sample/sample.cpp +++ b/sample/sample.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include #include @@ -79,11 +79,11 @@ int main(int argc, char** argv) } throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}"); } - ); + ); program.add_argument("-p", "--path") .help(" folder where the data files are located, default") .default_value(std::string{ PATH } - ); + ); program.add_argument("-m", "--model") .help("Model to use " + platform::Models::instance()->toString()) .action([](const std::string& value) { @@ -93,7 +93,7 @@ int main(int argc, char** argv) } throw runtime_error("Model must be one of " + platform::Models::instance()->toString()); } - ); + ); program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true); program.add_argument("--dumpcpt").help("Dump CPT Tables").default_value(false).implicit_value(true); program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true); @@ -112,129 +112,129 @@ int main(int argc, char** argv) catch (...) { throw runtime_error("Number of folds must be an integer"); }}); - program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>(); - bool class_last, stratified, tensors, dump_cpt; - std::string model_name, file_name, path, complete_file_name; - int nFolds, seed; - try { - program.parse_args(argc, argv); - file_name = program.get("dataset"); - path = program.get("path"); - model_name = program.get("model"); - complete_file_name = path + file_name + ".arff"; - stratified = program.get("stratified"); - tensors = program.get("tensors"); - nFolds = program.get("folds"); - seed = program.get("seed"); - dump_cpt = program.get("dumpcpt"); - class_last = datasets[file_name]; - if (!file_exists(complete_file_name)) { - throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist"); + program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>(); + bool class_last, stratified, tensors, dump_cpt; + std::string model_name, file_name, path, complete_file_name; + int nFolds, seed; + try { + program.parse_args(argc, argv); + file_name = program.get("dataset"); + path = program.get("path"); + model_name = program.get("model"); + complete_file_name = path + file_name + ".arff"; + stratified = program.get("stratified"); + tensors = program.get("tensors"); + nFolds = program.get("folds"); + seed = program.get("seed"); + dump_cpt = program.get("dumpcpt"); + class_last = datasets[file_name]; + if (!file_exists(complete_file_name)) { + throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist"); + } + } + catch (const exception& err) { + cerr << err.what() << std::endl; + cerr << program; + exit(1); } - } - catch (const exception& err) { - cerr << err.what() << std::endl; - cerr << program; - exit(1); - } - /* - * Begin Processing - */ - auto handler = ArffFiles(); - handler.load(complete_file_name, class_last); - // Get Dataset X, y - std::vector& X = handler.getX(); - mdlp::labels_t& y = handler.getY(); - // Get className & Features - auto className = handler.getClassName(); - std::vector features; - auto attributes = handler.getAttributes(); - transform(attributes.begin(), attributes.end(), back_inserter(features), - [](const pair& item) { return item.first; }); - // Discretize Dataset - auto [Xd, maxes] = discretize(X, y, features); - maxes[className] = *max_element(y.begin(), y.end()) + 1; - map> states; - for (auto feature : features) { - states[feature] = std::vector(maxes[feature]); - } - states[className] = std::vector(maxes[className]); - auto clf = platform::Models::instance()->create(model_name); - clf->fit(Xd, y, features, className, states); - if (dump_cpt) { - std::cout << "--- CPT Tables ---" << std::endl; - clf->dump_cpt(); - } - auto lines = clf->show(); - for (auto line : lines) { - std::cout << line << std::endl; - } - std::cout << "--- Topological Order ---" << std::endl; - auto order = clf->topological_order(); - for (auto name : order) { - std::cout << name << ", "; - } - std::cout << "end." << std::endl; - auto score = clf->score(Xd, y); - std::cout << "Score: " << score << std::endl; - auto graph = clf->graph(); - auto dot_file = model_name + "_" + file_name; - ofstream file(dot_file + ".dot"); - file << graph; - file.close(); - std::cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << std::endl; - std::cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << std::endl; - std::string stratified_string = stratified ? " Stratified" : ""; - std::cout << nFolds << " Folds" << stratified_string << " Cross validation" << std::endl; - std::cout << "==========================================" << std::endl; - torch::Tensor Xt = torch::zeros({ static_cast(Xd.size()), static_cast(Xd[0].size()) }, torch::kInt32); - torch::Tensor yt = torch::tensor(y, torch::kInt32); - for (int i = 0; i < features.size(); ++i) { - Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32)); - } - float total_score = 0, total_score_train = 0, score_train, score_test; - folding::Fold* fold; - double nodes = 0.0; - if (stratified) - fold = new folding::StratifiedKFold(nFolds, y, seed); - else - fold = new folding::KFold(nFolds, y.size(), seed); - for (auto i = 0; i < nFolds; ++i) { - auto [train, test] = fold->getFold(i); - std::cout << "Fold: " << i + 1 << std::endl; - if (tensors) { - auto ttrain = torch::tensor(train, torch::kInt64); - auto ttest = torch::tensor(test, torch::kInt64); - torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain); - torch::Tensor ytraint = yt.index({ ttrain }); - torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest); - torch::Tensor ytestt = yt.index({ ttest }); - clf->fit(Xtraint, ytraint, features, className, states); - auto temp = clf->predict(Xtraint); - score_train = clf->score(Xtraint, ytraint); - score_test = clf->score(Xtestt, ytestt); - } else { - auto [Xtrain, ytrain] = extract_indices(train, Xd, y); - auto [Xtest, ytest] = extract_indices(test, Xd, y); - clf->fit(Xtrain, ytrain, features, className, states); - std::cout << "Nodes: " << clf->getNumberOfNodes() << std::endl; - nodes += clf->getNumberOfNodes(); - score_train = clf->score(Xtrain, ytrain); - score_test = clf->score(Xtest, ytest); + /* + * Begin Processing + */ + auto handler = ArffFiles(); + handler.load(complete_file_name, class_last); + // Get Dataset X, y + std::vector& X = handler.getX(); + mdlp::labels_t& y = handler.getY(); + // Get className & Features + auto className = handler.getClassName(); + std::vector features; + auto attributes = handler.getAttributes(); + transform(attributes.begin(), attributes.end(), back_inserter(features), + [](const pair& item) { return item.first; }); + // Discretize Dataset + auto [Xd, maxes] = discretize(X, y, features); + maxes[className] = *max_element(y.begin(), y.end()) + 1; + map> states; + for (auto feature : features) { + states[feature] = std::vector(maxes[feature]); } + states[className] = std::vector(maxes[className]); + auto clf = platform::Models::instance()->create(model_name); + clf->fit(Xd, y, features, className, states); if (dump_cpt) { std::cout << "--- CPT Tables ---" << std::endl; clf->dump_cpt(); } - total_score_train += score_train; - total_score += score_test; - std::cout << "Score Train: " << score_train << std::endl; - std::cout << "Score Test : " << score_test << std::endl; - std::cout << "-------------------------------------------------------------------------------" << std::endl; - } - std::cout << "Nodes: " << nodes / nFolds << std::endl; - std::cout << "**********************************************************************************" << std::endl; - std::cout << "Average Score Train: " << total_score_train / nFolds << std::endl; - std::cout << "Average Score Test : " << total_score / nFolds << std::endl;return 0; + auto lines = clf->show(); + for (auto line : lines) { + std::cout << line << std::endl; + } + std::cout << "--- Topological Order ---" << std::endl; + auto order = clf->topological_order(); + for (auto name : order) { + std::cout << name << ", "; + } + std::cout << "end." << std::endl; + auto score = clf->score(Xd, y); + std::cout << "Score: " << score << std::endl; + auto graph = clf->graph(); + auto dot_file = model_name + "_" + file_name; + ofstream file(dot_file + ".dot"); + file << graph; + file.close(); + std::cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << std::endl; + std::cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << std::endl; + std::string stratified_string = stratified ? " Stratified" : ""; + std::cout << nFolds << " Folds" << stratified_string << " Cross validation" << std::endl; + std::cout << "==========================================" << std::endl; + torch::Tensor Xt = torch::zeros({ static_cast(Xd.size()), static_cast(Xd[0].size()) }, torch::kInt32); + torch::Tensor yt = torch::tensor(y, torch::kInt32); + for (int i = 0; i < features.size(); ++i) { + Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32)); + } + float total_score = 0, total_score_train = 0, score_train, score_test; + folding::Fold* fold; + double nodes = 0.0; + if (stratified) + fold = new folding::StratifiedKFold(nFolds, y, seed); + else + fold = new folding::KFold(nFolds, y.size(), seed); + for (auto i = 0; i < nFolds; ++i) { + auto [train, test] = fold->getFold(i); + std::cout << "Fold: " << i + 1 << std::endl; + if (tensors) { + auto ttrain = torch::tensor(train, torch::kInt64); + auto ttest = torch::tensor(test, torch::kInt64); + torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain); + torch::Tensor ytraint = yt.index({ ttrain }); + torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest); + torch::Tensor ytestt = yt.index({ ttest }); + clf->fit(Xtraint, ytraint, features, className, states); + auto temp = clf->predict(Xtraint); + score_train = clf->score(Xtraint, ytraint); + score_test = clf->score(Xtestt, ytestt); + } else { + auto [Xtrain, ytrain] = extract_indices(train, Xd, y); + auto [Xtest, ytest] = extract_indices(test, Xd, y); + clf->fit(Xtrain, ytrain, features, className, states); + std::cout << "Nodes: " << clf->getNumberOfNodes() << std::endl; + nodes += clf->getNumberOfNodes(); + score_train = clf->score(Xtrain, ytrain); + score_test = clf->score(Xtest, ytest); + } + if (dump_cpt) { + std::cout << "--- CPT Tables ---" << std::endl; + clf->dump_cpt(); + } + total_score_train += score_train; + total_score += score_test; + std::cout << "Score Train: " << score_train << std::endl; + std::cout << "Score Test : " << score_test << std::endl; + std::cout << "-------------------------------------------------------------------------------" << std::endl; + } + std::cout << "Nodes: " << nodes / nFolds << std::endl; + std::cout << "**********************************************************************************" << std::endl; + std::cout << "Average Score Train: " << total_score_train / nFolds << std::endl; + std::cout << "Average Score Test : " << total_score / nFolds << std::endl;return 0; } \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6313e48..7049d65 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -26,7 +26,7 @@ add_executable( reports/ReportExcel.cpp reports/ReportBase.cpp reports/ExcelFile.cpp results/Result.cpp ) -target_link_libraries(b_best Boost::boost "${PyClassifiers}" "${BayesNet}" ArffFiles mdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy "${XLSXWRITER_LIB}") +target_link_libraries(b_best Boost::boost "${PyClassifiers}" "${BayesNet}" mdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy "${XLSXWRITER_LIB}") # b_grid set(grid_sources GridSearch.cpp GridData.cpp) @@ -35,7 +35,7 @@ add_executable(b_grid commands/b_grid.cpp ${grid_sources} common/Datasets.cpp common/Dataset.cpp main/HyperParameters.cpp main/Models.cpp ) -target_link_libraries(b_grid ${MPI_CXX_LIBRARIES} "${PyClassifiers}" "${BayesNet}" ArffFiles mdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy) +target_link_libraries(b_grid ${MPI_CXX_LIBRARIES} "${PyClassifiers}" "${BayesNet}" mdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy) # b_list add_executable(b_list commands/b_list.cpp @@ -44,7 +44,7 @@ add_executable(b_list commands/b_list.cpp reports/ReportExcel.cpp reports/ExcelFile.cpp reports/ReportBase.cpp reports/DatasetsExcel.cpp reports/DatasetsConsole.cpp reports/ReportsPaged.cpp results/Result.cpp results/ResultsDatasetExcel.cpp results/ResultsDataset.cpp results/ResultsDatasetConsole.cpp ) -target_link_libraries(b_list "${PyClassifiers}" "${BayesNet}" ArffFiles mdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy "${XLSXWRITER_LIB}") +target_link_libraries(b_list "${PyClassifiers}" "${BayesNet}" mdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy "${XLSXWRITER_LIB}") # b_main set(main_sources Experiment.cpp Models.cpp HyperParameters.cpp Scores.cpp) @@ -54,7 +54,7 @@ add_executable(b_main commands/b_main.cpp ${main_sources} reports/ReportConsole.cpp reports/ReportBase.cpp results/Result.cpp ) -target_link_libraries(b_main "${PyClassifiers}" "${BayesNet}" ArffFiles mdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy) +target_link_libraries(b_main "${PyClassifiers}" "${BayesNet}" mdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy) # b_manage set(manage_sources ManageScreen.cpp CommandParser.cpp ResultsManager.cpp) @@ -66,4 +66,4 @@ add_executable( results/Result.cpp results/ResultsDataset.cpp results/ResultsDatasetConsole.cpp main/Scores.cpp ) -target_link_libraries(b_manage "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" ArffFiles mdlp "${BayesNet}") +target_link_libraries(b_manage "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" mdlp "${BayesNet}") diff --git a/src/common/Dataset.cpp b/src/common/Dataset.cpp index bb1bf94..7f9a26f 100644 --- a/src/common/Dataset.cpp +++ b/src/common/Dataset.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include "Dataset.h" namespace platform {