diff --git a/.vscode/settings.json b/.vscode/settings.json index 6ca95f6..ca5a4e3 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -98,7 +98,8 @@ "queue": "cpp", "typeindex": "cpp", "shared_mutex": "cpp", - "*.ipp": "cpp" + "*.ipp": "cpp", + "cassert": "cpp" }, "cmake.configureOnOpen": false, "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools" diff --git a/src/Platform/Folding.cc b/src/Platform/Folding.cc index a4455f7..7186a14 100644 --- a/src/Platform/Folding.cc +++ b/src/Platform/Folding.cc @@ -34,44 +34,44 @@ pair, vector> KFold::getFold(int nFold) StratifiedKFold::StratifiedKFold(int k, const vector& y, int seed) : k(k), seed(seed) { - // n = y.size(); - // map> class_to_indices; - // for (int i = 0; i < n; ++i) { - // class_to_indices[y[i]].push_back(i); - // } - // random_device rd; - // default_random_engine random_seed(seed == -1 ? rd() : seed); - // for (auto& [cls, indices] : class_to_indices) { - // shuffle(indices.begin(), indices.end(), random_seed); - // int fold_size = n / k; - // for (int i = 0; i < k; ++i) { - // int start = i * fold_size; - // int end = (i == k - 1) ? indices.size() : (i + 1) * fold_size; - // stratified_indices.emplace_back(indices.begin() + start, indices.begin() + end); - // } - // } n = y.size(); - stratified_indices.resize(k); + stratified_indices = vector>(k); + int fold_size = n / k; + int remainder = n % k; + // Compute class counts and indices + auto class_indices = map>(); vector class_counts(*max_element(y.begin(), y.end()) + 1, 0); for (auto i = 0; i < n; ++i) { class_counts[y[i]]++; + class_indices[y[i]].push_back(i); } - vector class_starts(class_counts.size()); - partial_sum(class_counts.begin(), class_counts.end() - 1, class_starts.begin() + 1); - vector indices(n); - for (auto i = 0; i < n; ++i) { - int label = y[i]; - stratified_indices[class_starts[label]] = i; - class_starts[label]++; + // Shuffle class indices + random_device rd; + default_random_engine random_seed(seed == -1 ? rd() : seed); + for (auto& [cls, indices] : class_indices) { + shuffle(indices.begin(), indices.end(), random_seed); } - int fold_size = n / k; - int remainder = n % k; - int start = 0; - for (auto i = 0; i < k; ++i) { - int fold_length = fold_size + (i < remainder ? 1 : 0); - stratified_indices[i].resize(fold_length); - copy(indices.begin() + start, indices.begin() + start + fold_length, stratified_indices[i].begin()); - start += fold_length; + // Assign indices to folds + for (auto label = 0; label < class_counts.size(); ++label) { + auto num_samples_to_take = class_counts[label] / k; + if (num_samples_to_take == 0) + continue; + auto remainder_samples_to_take = class_counts[label] % k; + for (auto fold = 0; fold < k; ++fold) { + auto it = next(class_indices[label].begin(), num_samples_to_take); + move(class_indices[label].begin(), it, back_inserter(stratified_indices[fold])); // ## + class_indices[label].erase(class_indices[label].begin(), it); + } + while (remainder_samples_to_take > 0) { + int fold = (rand() % static_cast(k)); + if (stratified_indices[fold].size() == fold_size) { + continue; + } + auto it = next(class_indices[label].begin(), 1); + stratified_indices[fold].push_back(*class_indices[label].begin()); + class_indices[label].erase(class_indices[label].begin(), it); + remainder_samples_to_take--; + } } } pair, vector> StratifiedKFold::getFold(int nFold) diff --git a/src/Platform/Folding.h b/src/Platform/Folding.h index 76b693a..526d3bc 100644 --- a/src/Platform/Folding.h +++ b/src/Platform/Folding.h @@ -6,8 +6,8 @@ class KFold { private: int k; int n; - vector indices; int seed; + vector indices; public: KFold(int k, int n, int seed = -1); pair, vector> getFold(int nFold); @@ -16,8 +16,8 @@ class StratifiedKFold { private: int k; int n; + int seed; vector> stratified_indices; - unsigned seed; public: StratifiedKFold(int k, const vector& y, int seed = -1); pair, vector> getFold(int nFold); diff --git a/src/Platform/m b/src/Platform/m new file mode 100755 index 0000000..0323306 Binary files /dev/null and b/src/Platform/m differ diff --git a/src/Platform/testx.cpp b/src/Platform/testx.cpp index 754c06b..0ad3017 100644 --- a/src/Platform/testx.cpp +++ b/src/Platform/testx.cpp @@ -4,71 +4,72 @@ using namespace std; class A { private: - int a; + int a; public: - A(int a) : a(a) {} - int getA() { return a; } + A(int a) : a(a) {} + int getA() { return a; } }; class B : public A { private: - int b; + int b; public: - B(int a, int b) : A(a), b(b) {} - int getB() { return b; } + B(int a, int b) : A(a), b(b) {} + int getB() { return b; } }; class C : public A { private: - int c; + int c; public: - C(int a, int c) : A(a), c(c) {} - int getC() { return c; } + C(int a, int c) : A(a), c(c) {} + int getC() { return c; } }; + string counts(vector y, vector indices) { - auto result = map(); - for (auto i = 0; i < indices.size(); ++i) { - result[y[indices[i]]]++; - } - string final_result = ""; - for (auto i = 0; i < result.size(); ++i) - final_result += to_string(i) + " -> " + to_string(result[i]) + " // "; - final_result += "\n"; - return final_result; + auto result = map(); + for (auto i = 0; i < indices.size(); ++i) { + result[y[indices[i]]]++; + } + string final_result = ""; + for (auto i = 0; i < result.size(); ++i) + final_result += to_string(i) + " -> " + to_string(result[i]) + " // "; + final_result += "\n"; + return final_result; } int main() { - auto y = vector(150); - fill(y.begin(), y.begin() + 50, 0); - fill(y.begin() + 50, y.begin() + 100, 1); - fill(y.begin() + 100, y.end(), 2); - //auto fold = KFold(5, 150); - auto fold = StratifiedKFold(5, y, 0); - for (int i = 0; i < 5; ++i) { - cout << "Fold: " << i << endl; - auto [train, test] = fold.getFold(i); - cout << "Train: "; - cout << "(" << train.size() << "): "; - for (auto j = 0; j < static_cast(train.size()); j++) - cout << train[j] << ", "; - cout << endl; - cout << "Train Statistics : " << counts(y, train); - cout << "-------------------------------------------------------------------------------" << endl; - cout << "Test: "; - cout << "(" << test.size() << "): "; - for (auto j = 0; j < static_cast(test.size()); j++) - cout << test[j] << ", "; - cout << endl; - cout << "Test Statistics: " << counts(y, test); - cout << "==============================================================================" << endl; - // cout << "Vector poly" << endl; - // auto some = vector(); - // auto cx = C(5, 4); - // auto bx = B(7, 6); - // some.push_back(cx); - // some.push_back(bx); - // for (auto& obj : some) { - // cout << "Obj :" << obj.getA() << endl; - // } - } + auto y = vector(153); + fill(y.begin(), y.begin() + 50, 0); + fill(y.begin() + 50, y.begin() + 103, 1); + fill(y.begin() + 103, y.end(), 2); + //auto fold = KFold(5, 150); + auto fold = StratifiedKFold(5, y, -1); + for (int i = 0; i < 5; ++i) { + cout << "Fold: " << i << endl; + auto [train, test] = fold.getFold(i); + cout << "Train: "; + cout << "(" << train.size() << "): "; + for (auto j = 0; j < static_cast(train.size()); j++) + cout << train[j] << ", "; + cout << endl; + cout << "Train Statistics : " << counts(y, train); + cout << "-------------------------------------------------------------------------------" << endl; + cout << "Test: "; + cout << "(" << test.size() << "): "; + for (auto j = 0; j < static_cast(test.size()); j++) + cout << test[j] << ", "; + cout << endl; + cout << "Test Statistics: " << counts(y, test); + cout << "==============================================================================" << endl; + // cout << "Vector poly" << endl; + // auto some = vector(); + // auto cx = C(5, 4); + // auto bx = B(7, 6); + // some.push_back(cx); + // some.push_back(bx); + // for (auto& obj : some) { + // cout << "Obj :" << obj.getA() << endl; + // } + } }