From 030ed6bc1c84cc77cefd3e954b80307caaaff892 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?= Date: Sat, 11 May 2024 17:42:36 +0200 Subject: [PATCH] Remove submodules to change its location --- .gitmodules | 6 - .vscode/c_cpp_properties.json | 12 +- .vscode/launch.json | 4 +- CHANGELOG.md | 12 ++ CMakeLists.txt | 12 +- Makefile | 4 +- folding copy.hpp | 143 ------------------------ folding.hpp | 10 +- lib/catch2 | 1 - lib/mdlp | 1 - tests/CMakeLists.txt | 4 +- {lib => tests/lib}/Files/ArffFiles.cc | 0 {lib => tests/lib}/Files/ArffFiles.h | 0 {lib => tests/lib}/Files/CMakeLists.txt | 0 14 files changed, 38 insertions(+), 171 deletions(-) delete mode 100644 .gitmodules delete mode 100644 folding copy.hpp delete mode 160000 lib/catch2 delete mode 160000 lib/mdlp rename {lib => tests/lib}/Files/ArffFiles.cc (100%) rename {lib => tests/lib}/Files/ArffFiles.h (100%) rename {lib => tests/lib}/Files/CMakeLists.txt (100%) diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 5801187..0000000 --- a/.gitmodules +++ /dev/null @@ -1,6 +0,0 @@ -[submodule "lib/catch2"] - path = lib/catch2 - url = https://github.com/catchorg/Catch2.git -[submodule "lib/mdlp"] - path = lib/mdlp - url = https://github.com/rmontanana/mdlp diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json index 6faaf51..57de37e 100644 --- a/.vscode/c_cpp_properties.json +++ b/.vscode/c_cpp_properties.json @@ -10,8 +10,16 @@ "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks" ], "cStandard": "c17", - "cppStandard": "c++17", - "compileCommands": "${workspaceFolder}/cmake-build-release/compile_commands.json" + "cppStandard": "c++17" + }, + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/**" + ], + "defines": [], + "cStandard": "c17", + "cppStandard": "c++17" } ], "version": 4 diff --git a/.vscode/launch.json b/.vscode/launch.json index 1d3ed53..38b06cf 100644 --- 
a/.vscode/launch.json +++ b/.vscode/launch.json @@ -5,9 +5,9 @@ "type": "lldb", "request": "launch", "name": "test", - "program": "${workspaceFolder}/build_debug/tests/unit_tests_folding", + "program": "${workspaceFolder}/build_Debug/tests/unit_tests_folding", "args": [], - "cwd": "${workspaceFolder}/build_debug/tests", + "cwd": "${workspaceFolder}/build_Debug/tests", }, ] } \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a0d7d9..c899645 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,18 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.1.0] 2024-05-11 + +### Fixed + +- Fixed the issue in stratified K-fold when the number of samples of a class is less than the number of folds. Now the algorithm will split the samples evenly among the folds. + +### Added + +- Refactor stratified build method to remove unneeded structures and optimize loops. +- Refactor the code to improve the readability and maintainability of the code, changing the order of the private, public and protected methods. +- More tests to enhance the robustness of the code. 
+ ## [1.0.1] 2024-04-03 ### Added diff --git a/CMakeLists.txt b/CMakeLists.txt index 4630e20..9c2107e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,10 +7,6 @@ project(Folding LANGUAGES CXX ) -if (CODE_COVERAGE AND NOT ENABLE_TESTING) - MESSAGE(FATAL_ERROR "Code coverage requires testing enabled") -endif (CODE_COVERAGE AND NOT ENABLE_TESTING) - find_package(Torch REQUIRED) if (POLICY CMP0135) @@ -25,6 +21,7 @@ set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") + # Options # ------- option(ENABLE_TESTING "Unit testing build" OFF) @@ -40,12 +37,11 @@ add_subdirectory(config) # Testing # ------- - if (ENABLE_TESTING) MESSAGE("Testing enabled") - add_git_submodule("lib/catch2") - add_git_submodule("lib/Files") - add_git_submodule("lib/mdlp") + add_git_submodule("tests/lib/catch2") + add_git_submodule("tests/lib/Files") + add_git_submodule("tests/lib/mdlp") include(CTest) add_subdirectory(tests) endif (ENABLE_TESTING) diff --git a/Makefile b/Makefile index ac35ab2..23fde4e 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ SHELL := /bin/bash .DEFAULT_GOAL := help .PHONY: help build test clean -f_debug = build_debug +f_debug = build_Debug test_targets = unit_tests_folding n_procs = -j 16 @@ -28,7 +28,7 @@ build: ## Build a debug version of the project @echo ">>> Building Debug Folding..."; @if [ -d ./$(f_debug) ]; then rm -rf ./$(f_debug); fi @mkdir $(f_debug); - @cmake -S . -B $(f_debug) -D CMAKE_BUILD_TYPE=Debug -D ENABLE_TESTING=ON -D CODE_COVERAGE=ON + @cmake -S . 
-B $(f_debug) -D CMAKE_BUILD_TYPE=Debug -D ENABLE_TESTING=ON @echo ">>> Done"; opt = "" diff --git a/folding copy.hpp b/folding copy.hpp deleted file mode 100644 index ee38d84..0000000 --- a/folding copy.hpp +++ /dev/null @@ -1,143 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include -namespace folding { - const std::string FOLDING_VERSION = "1.1.0"; - class Fold { - public: - inline Fold(int k, int m, int seed = -1) : k(k), m(m), seed(seed) - { - std::random_device rd; - random_seed = std::mt19937(seed == -1 ? rd() : seed); - std::srand(seed == -1 ? time(0) : seed); - } - virtual std::pair, std::vector> getFold(int nFold) = 0; - virtual ~Fold() = default; - std::string version() { return FOLDING_VERSION; } - int getNumberOfFolds() { return k; } - protected: - int k; - int m; - int seed; - std::mt19937 random_seed; - }; - class KFold : public Fold { - public: - inline KFold(int k, int m, int seed = -1) : Fold(k, m, seed), indices(std::vector(m)) - { - std::iota(begin(indices), end(indices), 0); // fill with 0, 1, ..., n - 1 - std::shuffle(indices.begin(), indices.end(), random_seed); - } - inline std::pair, std::vector> getFold(int nFold) override - { - if (nFold >= k || nFold < 0) { - throw std::out_of_range("nFold (" + std::to_string(nFold) + ") must be less than k (" + std::to_string(k) + ")"); - } - int nTest = m / k; - auto train = std::vector(); - auto test = std::vector(); - for (int i = 0; i < m; i++) { - if (i >= nTest * nFold && i < nTest * (nFold + 1)) { - test.push_back(indices[i]); - } else { - train.push_back(indices[i]); - } - } - return { train, test }; - } - private: - std::vector indices; - }; - class StratifiedKFold : public Fold { - public: - inline StratifiedKFold(int k, const std::vector& y, int seed = -1) : Fold(k, y.size(), seed) - { - m = y.size(); - this->y = y; - build(); - } - inline StratifiedKFold(int k, torch::Tensor& y, int seed = -1) : Fold(k, y.numel(), seed) - { - m = y.numel(); - this->y = 
std::vector(y.data_ptr(), y.data_ptr() + m); - build(); - } - - inline std::pair, std::vector> getFold(int nFold) override - { - if (nFold >= k || nFold < 0) { - throw std::out_of_range("nFold (" + std::to_string(nFold) + ") must be less than k (" + std::to_string(k) + ")"); - } - std::vector test_indices = stratified_indices[nFold]; - std::vector train_indices; - for (int i = 0; i < k; ++i) { - if (i == nFold) continue; - train_indices.insert(train_indices.end(), stratified_indices[i].begin(), stratified_indices[i].end()); - } - return { train_indices, test_indices }; - } - inline bool isFaulty() { return faulty; } - private: - std::vector y; - std::vector> stratified_indices; - bool faulty = false; // Only true if the number of samples of any class is less than the number of folds. - void build() - { - stratified_indices = std::vector>(k); - // Compute class counts and indices - auto class_indices = std::map>(); - std::vector class_counts(*max_element(y.begin(), y.end()) + 1, 0); - for (auto i = 0; i < m; ++i) { - class_counts[y[i]]++; - class_indices[y[i]].push_back(i); - } - // Assign indices to folds - for (auto [label, indices] : class_indices) { - shuffle(indices.begin(), indices.end(), random_seed); - int num_samples = indices.size(); - int samples_per_fold = num_samples / k; - int remainder_samples_to_take = num_samples % k; - if (samples_per_fold == 0) { - std::cerr << "Warning! The number of samples in class " << label << " (" << num_samples - << ") is less than the number of folds (" << k << ")." 
<< std::endl; - faulty = true; - } - int start = 0; - // auto chosen2 = std::vector(k); - // if (remainder_samples_to_take > 0) { - // iota(chosen2.begin(), chosen2.end(), 0); - // shuffle(chosen2.begin(), chosen2.end(), random_seed); - - - // } - if (samples_per_fold != 0) { - for (auto fold = 0; fold < k; ++fold) { - // auto it = next(indices.begin() + start, samples_per_fold); - // move(indices.begin() + start, it, back_inserter(stratified_indices[fold])); - auto it = next(class_indices[label].begin(), samples_per_fold); - move(class_indices[label].begin(), it, back_inserter(stratified_indices[fold])); - start += samples_per_fold; - class_indices[label].erase(class_indices[label].begin(), it); - } - } - auto chosen = std::vector(k, false); - while (remainder_samples_to_take > 0) { - int fold = (rand() % static_cast(k)); - if (chosen.at(fold)) { - continue; - } - chosen[fold] = true; - // auto it = next(indices.begin() + start, 1); - auto it = next(indices.begin(), 1); - stratified_indices[fold].push_back(class_indices[label][0]); - start++; - class_indices[label].erase(class_indices[label].begin(), it); - remainder_samples_to_take--; - } - } - } - }; -} \ No newline at end of file diff --git a/folding.hpp b/folding.hpp index 24611db..4595da2 100644 --- a/folding.hpp +++ b/folding.hpp @@ -104,10 +104,12 @@ namespace folding { faulty = true; } int start = 0; - for (auto fold = 0; fold < k; ++fold) { - auto it = next(class_indices[label].begin() + start, num_samples_to_take); - move(indices.begin() + start, it, back_inserter(stratified_indices[fold])); - start += num_samples_to_take; + if (num_samples_to_take > 0) { + for (auto fold = 0; fold < k; ++fold) { + auto it = next(class_indices[label].begin() + start, num_samples_to_take); + move(indices.begin() + start, it, back_inserter(stratified_indices[fold])); + start += num_samples_to_take; + } } if (remainder_samples_to_take > 0) { auto chosen = std::vector(k); diff --git a/lib/catch2 b/lib/catch2 deleted file 
mode 160000 index 863c662..0000000 --- a/lib/catch2 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 863c662c0eff026300f4d729a7054e90d6d12cdd diff --git a/lib/mdlp b/lib/mdlp deleted file mode 160000 index 5708dc3..0000000 --- a/lib/mdlp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 5708dc3de944fc22d61a2dd071b63aa338e04db3 diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 5a00026..7a72841 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,8 +1,8 @@ if(ENABLE_TESTING) include_directories( ${Folding_SOURCE_DIR} - ${Folding_SOURCE_DIR}/lib/Files - ${Folding_SOURCE_DIR}/lib/mdlp + lib/Files + lib/mdlp ${CMAKE_BINARY_DIR}/configured_files/include ) set(TEST_FOLDING "unit_tests_folding") diff --git a/lib/Files/ArffFiles.cc b/tests/lib/Files/ArffFiles.cc similarity index 100% rename from lib/Files/ArffFiles.cc rename to tests/lib/Files/ArffFiles.cc diff --git a/lib/Files/ArffFiles.h b/tests/lib/Files/ArffFiles.h similarity index 100% rename from lib/Files/ArffFiles.h rename to tests/lib/Files/ArffFiles.h diff --git a/lib/Files/CMakeLists.txt b/tests/lib/Files/CMakeLists.txt similarity index 100% rename from lib/Files/CMakeLists.txt rename to tests/lib/Files/CMakeLists.txt