diff --git a/src/Platform/CMakeLists.txt b/src/Platform/CMakeLists.txt index 426efea..6b4632e 100644 --- a/src/Platform/CMakeLists.txt +++ b/src/Platform/CMakeLists.txt @@ -4,4 +4,5 @@ include_directories(${BayesNet_SOURCE_DIR}/lib/Files) include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp) include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include) add_executable(main Experiment.cc Folding.cc platformUtils.cc) +add_executable(testx testx.cpp Folding.cc) target_link_libraries(main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/Platform/Folding.cc b/src/Platform/Folding.cc index 11fca29..a4455f7 100644 --- a/src/Platform/Folding.cc +++ b/src/Platform/Folding.cc @@ -1,21 +1,23 @@ #include "Folding.h" #include +#include #include using namespace std; -KFold::KFold(int k, int n, int seed) +KFold::KFold(int k, int n, int seed) : k(k), n(n), seed(seed) { - this->k = k; - this->n = n; indices = vector(n); iota(begin(indices), end(indices), 0); // fill with 0, 1, ..., n - 1 - shuffle(indices.begin(), indices.end(), default_random_engine(seed)); + random_device rd; + default_random_engine random_seed(seed == -1 ? rd() : seed); + shuffle(indices.begin(), indices.end(), random_seed); } pair, vector> KFold::getFold(int nFold) { + if (nFold >= k || nFold < 0) { - throw invalid_argument("nFold (" + to_string(nFold) + ") must be less than k (" + to_string(k) + ")"); + throw out_of_range("nFold (" + to_string(nFold) + ") must be less than k (" + to_string(k) + ")"); } int nTest = n / k; auto train = vector(); @@ -28,4 +30,60 @@ pair, vector> KFold::getFold(int nFold) } } return { train, test }; +} +StratifiedKFold::StratifiedKFold(int k, const vector& y, int seed) : + k(k), seed(seed) +{ + // n = y.size(); + // map> class_to_indices; + // for (int i = 0; i < n; ++i) { + // class_to_indices[y[i]].push_back(i); + // } + // random_device rd; + // default_random_engine random_seed(seed == -1 ? rd() : seed); + // for (auto& [cls, indices] : class_to_indices) { + // shuffle(indices.begin(), indices.end(), random_seed); + // int fold_size = n / k; + // for (int i = 0; i < k; ++i) { + // int start = i * fold_size; + // int end = (i == k - 1) ? indices.size() : (i + 1) * fold_size; + // stratified_indices.emplace_back(indices.begin() + start, indices.begin() + end); + // } + // } + n = y.size(); + stratified_indices.resize(k); + vector class_counts(*max_element(y.begin(), y.end()) + 1, 0); + for (auto i = 0; i < n; ++i) { + class_counts[y[i]]++; + } + vector class_starts(class_counts.size()); + partial_sum(class_counts.begin(), class_counts.end() - 1, class_starts.begin() + 1); + vector indices(n); + for (auto i = 0; i < n; ++i) { + int label = y[i]; + stratified_indices[class_starts[label]] = i; + class_starts[label]++; + } + int fold_size = n / k; + int remainder = n % k; + int start = 0; + for (auto i = 0; i < k; ++i) { + int fold_length = fold_size + (i < remainder ? 1 : 0); + stratified_indices[i].resize(fold_length); + copy(indices.begin() + start, indices.begin() + start + fold_length, stratified_indices[i].begin()); + start += fold_length; + } +} +pair, vector> StratifiedKFold::getFold(int nFold) +{ + if (nFold >= k || nFold < 0) { + throw out_of_range("nFold (" + to_string(nFold) + ") must be less than k (" + to_string(k) + ")"); + } + vector test_indices = stratified_indices[nFold]; + vector train_indices; + for (int i = 0; i < k; ++i) { + if (i == nFold) continue; + train_indices.insert(train_indices.end(), stratified_indices[i].begin(), stratified_indices[i].end()); + } + return { train_indices, test_indices }; } \ No newline at end of file diff --git a/src/Platform/Folding.h b/src/Platform/Folding.h index f851b2f..76b693a 100644 --- a/src/Platform/Folding.h +++ b/src/Platform/Folding.h @@ -7,12 +7,19 @@ private: int k; int n; vector indices; - + int seed; public: - KFold(int k, int n, int seed); - pair, vector> getFold(int); + KFold(int k, int n, int seed = -1); + pair, vector> getFold(int nFold); }; -class KStratifiedFold { - +class StratifiedKFold { +private: + int k; + int n; + vector> stratified_indices; + unsigned seed; +public: + StratifiedKFold(int k, const vector& y, int seed = -1); + pair, vector> getFold(int nFold); }; #endif \ No newline at end of file diff --git a/src/Platform/m b/src/Platform/m deleted file mode 100755 index 8b81161..0000000 Binary files a/src/Platform/m and /dev/null differ diff --git a/src/Platform/main.cpp b/src/Platform/main.cpp deleted file mode 100644 index 5adcf48..0000000 --- a/src/Platform/main.cpp +++ /dev/null @@ -1,51 +0,0 @@ -#include "Folding.h" -#include -using namespace std; -class A { -private: - int a; -public: - A(int a) : a(a) {} - int getA() { return a; } -}; -class B : public A { -private: - int b; -public: - B(int a, int b) : A(a), b(b) {} - int getB() { return b; } -}; -class C : public A { -private: - int c; -public: - C(int a, int c) : A(a), c(c) {} - int getC() { return c; } -}; -int main() -{ - auto fold = KFold(5, 100, 1); - for (int i = 0; i < 5; ++i) { - cout << "Fold: " << i << endl; - auto [train, test] = fold.getFold(i); - cout << "Train: "; - cout << "(" << train.size() << "): "; - for (auto j = 0; j < static_cast(train.size()); j++) - cout << train[j] << ", "; - cout << endl; - cout << "Test: "; - cout << "(" << train.size() << "): "; - for (auto j = 0; j < static_cast(test.size()); j++) - cout << test[j] << ", "; - cout << endl; - cout << "Vector poly" << endl; - auto some = vector(); - auto cx = C(5, 4); - auto bx = B(7, 6); - some.push_back(cx); - some.push_back(bx); - for (auto& obj : some) { - cout << "Obj :" << obj.getA() << endl; - } - } -} diff --git a/src/Platform/testx.cpp b/src/Platform/testx.cpp new file mode 100644 index 0000000..754c06b --- /dev/null +++ b/src/Platform/testx.cpp @@ -0,0 +1,74 @@ +#include "Folding.h" +#include +#include +using namespace std; +class A { +private: + int a; +public: + A(int a) : a(a) {} + int getA() { return a; } +}; +class B : public A { +private: + int b; +public: + B(int a, int b) : A(a), b(b) {} + int getB() { return b; } +}; +class C : public A { +private: + int c; +public: + C(int a, int c) : A(a), c(c) {} + int getC() { return c; } +}; +string counts(vector y, vector indices) +{ + auto result = map(); + for (auto i = 0; i < indices.size(); ++i) { + result[y[indices[i]]]++; + } + string final_result = ""; + for (auto i = 0; i < result.size(); ++i) + final_result += to_string(i) + " -> " + to_string(result[i]) + " // "; + final_result += "\n"; + return final_result; +} + +int main() +{ + auto y = vector(150); + fill(y.begin(), y.begin() + 50, 0); + fill(y.begin() + 50, y.begin() + 100, 1); + fill(y.begin() + 100, y.end(), 2); + //auto fold = KFold(5, 150); + auto fold = StratifiedKFold(5, y, 0); + for (int i = 0; i < 5; ++i) { + cout << "Fold: " << i << endl; + auto [train, test] = fold.getFold(i); + cout << "Train: "; + cout << "(" << train.size() << "): "; + for (auto j = 0; j < static_cast(train.size()); j++) + cout << train[j] << ", "; + cout << endl; + cout << "Train Statistics : " << counts(y, train); + cout << "-------------------------------------------------------------------------------" << endl; + cout << "Test: "; + cout << "(" << test.size() << "): "; + for (auto j = 0; j < static_cast(test.size()); j++) + cout << test[j] << ", "; + cout << endl; + cout << "Test Statistics: " << counts(y, test); + cout << "==============================================================================" << endl; + // cout << "Vector poly" << endl; + // auto some = vector(); + // auto cx = C(5, 4); + // auto bx = B(7, 6); + // some.push_back(cx); + // some.push_back(bx); + // for (auto& obj : some) { + // cout << "Obj :" << obj.getA() << endl; + // } + } +}