Compare commits

...

47 Commits

SHA1 Message Date
fa7fe081ad Fix xlsx library finding 2023-10-15 11:19:58 +02:00
660e783517 Update validation for feature selection 2023-10-14 13:32:09 +02:00
b35532dd9e Implement IWSS and FCBF too for BoostAODE 2023-10-14 13:12:04 +02:00
6ef49385ea Remove unneeded method declaration FeatureSelect 2023-10-14 11:30:32 +02:00
6d5a25cdc8 Refactor CFS class creating abstract base class 2023-10-14 11:27:46 +02:00
d00b08cbe8 Fix Header for Linux 2023-10-13 14:26:47 +02:00
977ff6fddb Update CMakeLists for Linux 2023-10-13 14:01:52 +02:00
54b8939f35 Prepare BoostAODE first try 2023-10-13 13:46:22 +02:00
5022a4dc90 Complete CFS tested with Python mufs 2023-10-13 12:29:25 +02:00
40d1dad5d8 Begin CFS implementation 2023-10-11 21:17:26 +02:00
47e2b138c5 Complete first working cfs 2023-10-11 11:33:29 +02:00
e7ded68267 First cfs working version 2023-10-10 23:00:38 +02:00
ca833a34f5 try openssl sha256 2023-10-10 18:16:43 +02:00
df9b4c48d2 Begin CFS initialization 2023-10-10 13:39:11 +02:00
f288bbd6fa Begin adding cfs to BoostAODE 2023-10-10 11:52:39 +02:00
7d8aca4f59 Add Locale shared config to reports 2023-10-09 19:41:29 +02:00
8fdad78a8c Continue Test Network 2023-10-09 11:25:30 +02:00
e3ae073333 Continue test Network 2023-10-08 15:54:58 +02:00
4b732e76c2 MST change unordered_set to list 2023-10-07 19:08:13 +02:00
fe5fead27e Begin Fix Test MST 2023-10-07 01:43:26 +02:00
8c3864f3c8 Complete Folding Test 2023-10-07 01:23:36 +02:00
1287160c47 Refactor makefile to use variables 2023-10-07 00:16:25 +02:00
2f58807322 Begin refactor CMakeLists debug/release paths 2023-10-06 19:32:29 +02:00
17e079edd5 Begin Test Folding 2023-10-06 17:08:54 +02:00
b9e0028e9d Refactor Makefile 2023-10-06 01:28:27 +02:00
e0d39fe631 Fix BayesMetrics Test 2023-10-06 01:14:55 +02:00
36b0277576 Add Maximum Spanning Tree test 2023-10-05 15:45:36 +02:00
da8d018ec4 Refactor Makefile 2023-10-05 11:45:00 +02:00
5f0676691c Add First BayesMetrics Tests 2023-10-05 01:14:16 +02:00
3448fb1299 Refactor Tests and add BayesMetrics test 2023-10-04 23:19:23 +02:00
5e938d5cca Add ranks sheet to excel best results 2023-10-04 16:26:57 +02:00
55e742438f Add constant references to Statistics 2023-10-04 13:40:45 +02:00
c4ae3fe429 Add Control model rank info to report 2023-10-04 12:42:35 +02:00
93e4ff94db Add significance level as parameter in best 2023-10-02 15:46:40 +02:00
57c27f739c Remove unused code in BestResults 2023-10-02 15:31:02 +02:00
a434d7f1ae Add a Linux config in launch.json 2023-09-30 18:44:21 +02:00
294666c516 Fix a Linux problem in Datasets 2023-09-30 18:43:47 +02:00
fd04e78ad9 Restore sample.cc 2023-09-29 18:50:25 +02:00
66ec1b343b Remove platformUtils and split Datasets & Dataset 2023-09-29 18:20:46 +02:00
bb423da42f Add csv and R_dat files to platform 2023-09-29 13:52:50 +02:00
db17c14042 Change names of executables to b_... 2023-09-29 09:17:50 +02:00
a4401cb78f Linux CMakeLists.txt adjustment 2023-09-29 00:30:47 +02:00
9d3d9cc6c6 Complete Excel output for bestResults with Friedman test 2023-09-28 18:52:37 +02:00
cfcf3c16df Add best results Excel 2023-09-28 17:12:04 +02:00
85202260f3 Separate specific Excel methods to ExcelFile 2023-09-28 13:07:11 +02:00
82acb3cab5 Enhance output of Best results reports 2023-09-28 12:08:56 +02:00
623ceed396 Merge pull request 'Add Friedman Test & post hoc tests to BestResults' (#10) from boost into main
Reviewed-on: #10
2023-09-28 07:44:55 +00:00
68 changed files with 2578 additions and 873 deletions

4
.gitignore vendored

@@ -31,7 +31,9 @@
*.exe
*.out
*.app
build/
build/**
build_debug/**
build_release/**
*.dSYM/**
cmake-build*/**
.idea

18
.vscode/c_cpp_properties.json vendored Normal file

@@ -0,0 +1,18 @@
{
"configurations": [
{
"name": "Mac",
"includePath": [
"${workspaceFolder}/**"
],
"defines": [],
"macFrameworkPath": [
"/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks"
],
"cStandard": "c17",
"cppStandard": "c++17",
"compileCommands": "${workspaceFolder}/cmake-build-release/compile_commands.json"
}
],
"version": 4
}

30
.vscode/launch.json vendored

@@ -22,26 +22,24 @@
"type": "lldb",
"request": "launch",
"name": "experiment",
"program": "${workspaceFolder}/build/src/Platform/main",
"program": "${workspaceFolder}/build/src/Platform/b_main",
"args": [
"-m",
"BoostAODE",
"-p",
"/Users/rmontanana/Code/discretizbench/datasets",
"TAN",
"--stratified",
"-d",
"mfeat-morphological",
"zoo",
"--discretize"
// "--hyperparameters",
// "{\"repeatSparent\": true, \"maxModels\": 12}"
],
"cwd": "/Users/rmontanana/Code/discretizbench",
"cwd": "/Users/rmontanana/Code/odtebench",
},
{
"type": "lldb",
"request": "launch",
"name": "best",
"program": "${workspaceFolder}/build/src/Platform/best",
"program": "${workspaceFolder}/build/src/Platform/b_best",
"args": [
"-m",
"BoostAODE",
@@ -55,7 +53,7 @@
"type": "lldb",
"request": "launch",
"name": "manage",
"program": "${workspaceFolder}/build/src/Platform/manage",
"program": "${workspaceFolder}/build/src/Platform/b_manage",
"args": [
"-n",
"20"
@@ -66,9 +64,21 @@
"type": "lldb",
"request": "launch",
"name": "list",
"program": "${workspaceFolder}/build/src/Platform/list",
"program": "${workspaceFolder}/build/src/Platform/b_list",
"args": [],
"cwd": "/Users/rmontanana/Code/discretizbench",
//"cwd": "/Users/rmontanana/Code/discretizbench",
"cwd": "/home/rmontanana/Code/covbench",
},
{
"type": "lldb",
"request": "launch",
"name": "test",
"program": "${workspaceFolder}/build/tests/unit_tests",
"args": [
"-c=\"Metrics Test\"",
// "-s",
],
"cwd": "${workspaceFolder}/build/tests",
},
{
"name": "Build & debug active file",

CMakeLists.txt

@@ -35,7 +35,7 @@ option(CODE_COVERAGE "Collect coverage from test library" OFF)
set(Boost_USE_STATIC_LIBS OFF)
set(Boost_USE_MULTITHREADED ON)
set(Boost_USE_STATIC_RUNTIME OFF)
find_package(Boost 1.78.0 REQUIRED)
find_package(Boost 1.66.0 REQUIRED)
if(Boost_FOUND)
message("Boost_INCLUDE_DIRS=${Boost_INCLUDE_DIRS}")
include_directories(${Boost_INCLUDE_DIRS})
@@ -45,7 +45,6 @@ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
# CMakes modules
# --------------
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH})
include(AddGitSubmodule)
if (CODE_COVERAGE)
enable_testing()
@@ -65,7 +64,11 @@ endif (ENABLE_CLANG_TIDY)
add_git_submodule("lib/mdlp")
add_git_submodule("lib/argparse")
add_git_submodule("lib/json")
find_library(XLSXWRITER_LIB libxlsxwriter.dylib PATHS /usr/local/lib)
find_library(XLSXWRITER_LIB NAMES libxlsxwriter.dylib libxlsxwriter.so PATHS ${BayesNet_SOURCE_DIR}/lib/libxlsxwriter/lib)
message("XLSXWRITER_LIB=${XLSXWRITER_LIB}")
# Subdirectories
# --------------
@@ -75,7 +78,7 @@ add_subdirectory(src/BayesNet)
add_subdirectory(src/Platform)
add_subdirectory(sample)
file(GLOB BayesNet_HEADERS CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/BayesNet/*.h ${BayesNet_SOURCE_DIR}/BayesNet/*.hpp)
file(GLOB BayesNet_HEADERS CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/BayesNet/*.h ${BayesNet_SOURCE_DIR}/BayesNet/*.h)
file(GLOB BayesNet_SOURCES CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/BayesNet/*.cc ${BayesNet_SOURCE_DIR}/src/BayesNet/*.cpp)
file(GLOB Platform_SOURCES CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/Platform/*.cc ${BayesNet_SOURCE_DIR}/src/Platform/*.cpp)

120
Makefile

@@ -1,6 +1,26 @@
SHELL := /bin/bash
.DEFAULT_GOAL := help
.PHONY: coverage setup help build test
.PHONY: coverage setup help build test clean debug release
f_release = build_release
f_debug = build_debug
app_targets = b_best b_list b_main b_manage
test_targets = unit_tests_bayesnet unit_tests_platform
n_procs = -j 16
define ClearTests
@for t in $(test_targets); do \
if [ -f $(f_debug)/tests/$$t ]; then \
echo ">>> Cleaning $$t..." ; \
rm -f $(f_debug)/tests/$$t ; \
fi ; \
done
@nfiles="$(find . -name "*.gcda" -print0)" ; \
if test "${nfiles}" != "" ; then \
find . -name "*.gcda" -print0 | xargs -0 rm 2>/dev/null ;\
fi ;
endef
setup: ## Install dependencies for tests and coverage
@if [ "$(shell uname)" = "Darwin" ]; then \
@@ -11,63 +31,85 @@ setup: ## Install dependencies for tests and coverage
pip install gcovr; \
fi
dest ?= ../discretizbench
copy: ## Copy binary files to selected folder
dest ?= ${HOME}/bin
install: ## Copy binary files to bin folder
@echo "Destination folder: $(dest)"
make build
make buildr
@echo ">>> Copying files to $(dest)"
@cp build/src/Platform/main $(dest)
@cp build/src/Platform/list $(dest)
@cp build/src/Platform/manage $(dest)
@cp build/src/Platform/best $(dest)
@echo ">>> Done"
@cp $(f_release)/src/Platform/b_main $(dest)
@cp $(f_release)/src/Platform/b_list $(dest)
@cp $(f_release)/src/Platform/b_manage $(dest)
@cp $(f_release)/src/Platform/b_best $(dest)
dependency: ## Create a dependency graph diagram of the project (build/dependency.png)
cd build && cmake .. --graphviz=dependency.dot && dot -Tpng dependency.dot -o dependency.png
@echo ">>> Creating dependency graph diagram of the project...";
$(MAKE) debug
cd $(f_debug) && cmake .. --graphviz=dependency.dot && dot -Tpng dependency.dot -o dependency.png
build: ## Build the main and BayesNetSample
cmake --build build -t main -t BayesNetSample -t manage -t list -t best -j 32
buildd: ## Build the debug targets
cmake --build $(f_debug) -t $(app_targets) $(n_procs)
clean: ## Clean the debug info
@echo ">>> Cleaning Debug BayesNet ...";
find . -name "*.gcda" -print0 | xargs -0 rm
buildr: ## Build the release targets
cmake --build $(f_release) -t $(app_targets) $(n_procs)
clean: ## Clean the tests info
@echo ">>> Cleaning Debug BayesNet tests...";
$(call ClearTests)
@echo ">>> Done";
clang-uml: ## Create uml class and sequence diagrams
clang-uml -p --add-compile-flag -I /usr/lib/gcc/x86_64-redhat-linux/8/include/
debug: ## Build a debug version of the project
@echo ">>> Building Debug BayesNet ...";
@if [ -d ./build ]; then rm -rf ./build; fi
@mkdir build;
cmake -S . -B build -D CMAKE_BUILD_TYPE=Debug -D ENABLE_TESTING=ON -D CODE_COVERAGE=ON; \
cmake --build build -t main -t BayesNetSample -t manage -t list -t best -t unit_tests -j 32;
@echo ">>> Building Debug BayesNet...";
@if [ -d ./$(f_debug) ]; then rm -rf ./$(f_debug); fi
@mkdir $(f_debug);
@cmake -S . -B $(f_debug) -D CMAKE_BUILD_TYPE=Debug -D ENABLE_TESTING=ON -D CODE_COVERAGE=ON
@echo ">>> Done";
release: ## Build a Release version of the project
@echo ">>> Building Release BayesNet ...";
@if [ -d ./build ]; then rm -rf ./build; fi
@mkdir build;
cmake -S . -B build -D CMAKE_BUILD_TYPE=Release; \
cmake --build build -t main -t BayesNetSample -t manage -t list -t best -j 32;
@echo ">>> Building Release BayesNet...";
@if [ -d ./$(f_release) ]; then rm -rf ./$(f_release); fi
@mkdir $(f_release);
@cmake -S . -B $(f_release) -D CMAKE_BUILD_TYPE=Release
@echo ">>> Done";
test: ## Run tests
@echo "* Running tests...";
find . -name "*.gcda" -print0 | xargs -0 rm
@cd build; \
cmake --build . --target unit_tests ;
@cd build/tests; \
./unit_tests;
opt = ""
test: ## Run tests (opt="-s") to verbose output the tests, (opt="-c='Test Maximum Spanning Tree'") to run only that section
@echo ">>> Running BayesNet & Platform tests...";
@$(MAKE) clean
@cmake --build $(f_debug) -t $(test_targets) $(n_procs)
@for t in $(test_targets); do \
if [ -f $(f_debug)/tests/$$t ]; then \
cd $(f_debug)/tests ; \
./$$t $(opt) ; \
fi ; \
done
@echo ">>> Done";
opt = ""
testp: ## Run platform tests (opt="-s") to verbose output the tests, (opt="-c='Stratified Fold Test'") to run only that section
@echo ">>> Running Platform tests...";
@$(MAKE) clean
@cmake --build $(f_debug) --target unit_tests_platform $(n_procs)
@if [ -f $(f_debug)/tests/unit_tests_platform ]; then cd $(f_debug)/tests ; ./unit_tests_platform $(opt) ; fi ;
@echo ">>> Done";
opt = ""
testb: ## Run BayesNet tests (opt="-s") to verbose output the tests, (opt="-c='Test Maximum Spanning Tree'") to run only that section
@echo ">>> Running BayesNet tests...";
@$(MAKE) clean
@cmake --build $(f_debug) --target unit_tests_bayesnet $(n_procs)
@if [ -f $(f_debug)/tests/unit_tests_bayesnet ]; then cd $(f_debug)/tests ; ./unit_tests_bayesnet $(opt) ; fi ;
@echo ">>> Done";
coverage: ## Run tests and generate coverage report (build/index.html)
@echo "*Building tests...";
find . -name "*.gcda" -print0 | xargs -0 rm
@cd build; \
cmake --build . --target unit_tests ;
@cd build/tests; \
./unit_tests;
gcovr ;
@echo ">>> Building tests with coverage...";
@$(MAKE) test
@cd $(f_debug) ; \
gcovr --config ../gcovr.cfg tests ;
@echo ">>> Done";
help: ## Show help message
@IFS=$$'\n' ; \

README.md

@@ -10,16 +10,26 @@ Before compiling BayesNet.
[Getting Started](<https://www.boost.org/doc/libs/1_83_0/more/getting_started/index.html>)
The best option is to install the packages that the Linux distribution provides in its repository. If this is the case:
```bash
sudo dnf install boost-devel
```
If this is not possible and the compressed package is installed, the following environment variable has to be set:
```bash
export BOOST_ROOT=/path/to/library/
```
### libxlsxwriter
```bash
cd lib/libxlsxwriter
make
sudo make install
make install DESTDIR=/home/rmontanana/Code PREFIX=
```
It has to be installed in /usr/local/lib; otherwise, CMakeLists.txt has to be modified accordingly.
The following environment variable has to be set:
```bash

33
mac_mst.txt Normal file

@@ -0,0 +1,33 @@
Weights matrix:
0.0000000, 0.0384968, 0.0795434, 0.1546867, -0.0000000, 0.1788104, 0.2214721, 0.0323837, 0.0366549,
0.0384968, 0.0000000, 0.0200662, 0.0200937, -0.0000000, 0.0637224, 0.0183005, 0.0127657, 0.0136054,
0.0795434, 0.0200662, 0.0000000, 0.0605489, -0.0000000, 0.0894469, 0.1689408, 0.0321602, 0.0223184,
0.1546867, 0.0200937, 0.0605489, 0.0000000, -0.0000000, 0.1150757, 0.1332292, 0.0422865, 0.0191138,
-0.0000000, -0.0000000, -0.0000000, -0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000,
0.1788104, 0.0637224, 0.0894469, 0.1150757, 0.0000000, 0.0000000, 0.1407102, 0.0406590, 0.0366986,
0.2214721, 0.0183005, 0.1689408, 0.1332292, 0.0000000, 0.1407102, 0.0000000, 0.0427515, 0.0349965,
0.0323837, 0.0127657, 0.0321602, 0.0422865, 0.0000000, 0.0406590, 0.0427515, 0.0000000, 0.0343376,
0.0366549, 0.0136054, 0.0223184, 0.0191138, 0.0000000, 0.0366986, 0.0349965, 0.0343376, 0.0000000,
Edge : Weight
0 - 6 : 0.2214721
0 - 5 : 0.1788104
2 - 6 : 0.1689408
0 - 3 : 0.1546867
1 - 5 : 0.0637224
6 - 7 : 0.0427515
5 - 8 : 0.0366986
4 - 5 : 0.0000000
-------------------------------------------------------------------------------
Metrics Test
Test Maximum Spanning Tree
-------------------------------------------------------------------------------
/Users/rmontanana/Code/BayesNet/tests/TestBayesMetrics.cc:58
...............................................................................
/Users/rmontanana/Code/BayesNet/tests/TestBayesMetrics.cc:69: PASSED:
REQUIRE( result == resultsMST.at(file_name) )
with expansion:
(0, 6) (0, 5) (0, 3) (5, 1) (5, 8) (5, 4) (6, 2) (6, 7)
==
(0, 6) (0, 5) (0, 3) (5, 1) (5, 8) (5, 4) (6, 2) (6, 7)

sample/sample.cc

@@ -4,7 +4,7 @@
#include <map>
#include <argparse/argparse.hpp>
#include <nlohmann/json.hpp>
#include "ArffFiles.h"
#include "ArffFiles.h"v
#include "BayesMetrics.h"
#include "CPPFImdlp.h"
#include "Folding.h"
@@ -58,52 +58,6 @@ pair<vector<vector<int>>, vector<int>> extract_indices(vector<int> indices, vect
int main(int argc, char** argv)
{
torch::Tensor weights_ = torch::full({ 10 }, 1.0 / 10, torch::kFloat64);
torch::Tensor y_ = torch::tensor({ 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 }, torch::kInt32);
torch::Tensor ypred = torch::tensor({ 1, 1, 1, 0, 0, 1, 1, 1, 1, 0 }, torch::kInt32);
cout << "Initial weights_: " << endl;
for (int i = 0; i < 10; i++) {
cout << weights_.index({ i }).item<double>() << ", ";
}
cout << "end." << endl;
cout << "y_: " << endl;
for (int i = 0; i < 10; i++) {
cout << y_.index({ i }).item<int>() << ", ";
}
cout << "end." << endl;
cout << "ypred: " << endl;
for (int i = 0; i < 10; i++) {
cout << ypred.index({ i }).item<int>() << ", ";
}
cout << "end." << endl;
auto mask_wrong = ypred != y_;
auto mask_right = ypred == y_;
auto masked_weights = weights_ * mask_wrong.to(weights_.dtype());
double epsilon_t = masked_weights.sum().item<double>();
cout << "epsilon_t: " << epsilon_t << endl;
double wt = (1 - epsilon_t) / epsilon_t;
cout << "wt: " << wt << endl;
double alpha_t = epsilon_t == 0 ? 1 : 0.5 * log(wt);
cout << "alpha_t: " << alpha_t << endl;
// Step 3.2: Update weights for next classifier
// Step 3.2.1: Update weights of wrong samples
cout << "exp(alpha_t): " << exp(alpha_t) << endl;
cout << "exp(-alpha_t): " << exp(-alpha_t) << endl;
weights_ += mask_wrong.to(weights_.dtype()) * exp(alpha_t) * weights_;
// Step 3.2.2: Update weights of right samples
weights_ += mask_right.to(weights_.dtype()) * exp(-alpha_t) * weights_;
// Step 3.3: Normalise the weights
double totalWeights = torch::sum(weights_).item<double>();
cout << "totalWeights: " << totalWeights << endl;
cout << "Before normalization: " << endl;
for (int i = 0; i < 10; i++) {
cout << weights_.index({ i }).item<double>() << endl;
}
weights_ = weights_ / totalWeights;
cout << "After normalization: " << endl;
for (int i = 0; i < 10; i++) {
cout << weights_.index({ i }).item<double>() << endl;
}
map<string, bool> datasets = {
{"diabetes", true},
{"ecoli", true},

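The block deleted above was scratch code walking through one AdaBoost-style weight update, the same update BoostAODE performs during training. For reference, what it printed corresponds to (standard notation, not taken from this diff):

```latex
\varepsilon_t = \sum_{i:\, \hat{y}_i \neq y_i} w_i, \qquad
\alpha_t = \frac{1}{2} \ln \frac{1 - \varepsilon_t}{\varepsilon_t}
```

followed by scaling each misclassified sample's weight by $(1 + e^{\alpha_t})$, each correctly classified one by $(1 + e^{-\alpha_t})$, and renormalizing so the weights sum to 1.
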
src/BayesNet/BayesMetrics.cc

@@ -1,7 +1,7 @@
#include "BayesMetrics.h"
#include "Mst.h"
namespace bayesnet {
//samples is nxm tensor used to fit the model
//samples is n+1xm tensor used to fit the model
Metrics::Metrics(const torch::Tensor& samples, const vector<string>& features, const string& className, const int classNumStates)
: samples(samples)
, features(features)
@@ -60,17 +60,7 @@ namespace bayesnet {
{
return scoresKBest;
}
vector<pair<string, string>> Metrics::doCombinations(const vector<string>& source)
{
vector<pair<string, string>> result;
for (int i = 0; i < source.size(); ++i) {
string temp = source[i];
for (int j = i + 1; j < source.size(); ++j) {
result.push_back({ temp, source[j] });
}
}
return result;
}
torch::Tensor Metrics::conditionalEdge(const torch::Tensor& weights)
{
auto result = vector<double>();

src/BayesNet/BayesMetrics.h

@@ -8,20 +8,39 @@ namespace bayesnet {
using namespace torch;
class Metrics {
private:
Tensor samples; // nxm tensor used to fit the model
vector<string> features;
string className;
int classNumStates = 0;
vector<double> scoresKBest;
vector<int> featuresKBest; // sorted indices of the features
double entropy(const Tensor& feature, const Tensor& weights);
double conditionalEntropy(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights);
vector<pair<string, string>> doCombinations(const vector<string>&);
protected:
Tensor samples; // n+1xm tensor used to fit the model where samples[-1] is the y vector
string className;
double entropy(const Tensor& feature, const Tensor& weights);
vector<string> features;
template <class T>
vector<pair<T, T>> doCombinations(const vector<T>& source)
{
vector<pair<T, T>> result;
for (int i = 0; i < source.size(); ++i) {
T temp = source[i];
for (int j = i + 1; j < source.size(); ++j) {
result.push_back({ temp, source[j] });
}
}
return result;
}
template <class T>
T pop_first(vector<T>& v)
{
T temp = v[0];
v.erase(v.begin());
return temp;
}
public:
Metrics() = default;
Metrics(const torch::Tensor& samples, const vector<string>& features, const string& className, const int classNumStates);
Metrics(const vector<vector<int>>& vsamples, const vector<int>& labels, const vector<string>& features, const string& className, const int classNumStates);
vector<int> SelectKBestWeighted(const torch::Tensor& weights, bool ascending=false, unsigned k = 0);
vector<int> SelectKBestWeighted(const torch::Tensor& weights, bool ascending = false, unsigned k = 0);
vector<double> getScoresKBest() const;
double mutualInformation(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights);
vector<float> conditionalEdgeWeights(vector<float>& weights); // To use in Python

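The doCombinations template above generalizes the old string-only helper that BayesMetrics.cc just dropped: it yields every unordered pair (i < j) of its input. A minimal standalone sketch of the same idea, runnable outside the library (the free function here is illustrative, not part of the API):

```cpp
#include <iostream>
#include <utility>
#include <vector>

// Same shape as Metrics::doCombinations: all unordered pairs with i < j.
template <class T>
std::vector<std::pair<T, T>> doCombinations(const std::vector<T>& source)
{
    std::vector<std::pair<T, T>> result;
    for (size_t i = 0; i < source.size(); ++i)
        for (size_t j = i + 1; j < source.size(); ++j)
            result.push_back({ source[i], source[j] });
    return result;
}

int main()
{
    for (const auto& [a, b] : doCombinations<int>({ 0, 1, 2 }))
        std::cout << "(" << a << ", " << b << ") ";  // (0, 1) (0, 2) (1, 2)
}
```

FeatureSelect relies on exactly this shape when it sums pairwise correlations in computeMeritCFS.
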
src/BayesNet/BoostAODE.cc

@@ -1,36 +1,22 @@
#include "BoostAODE.h"
#include <set>
#include "BayesMetrics.h"
#include <functional>
#include <limits.h>
#include "BoostAODE.h"
#include "Colors.h"
#include "Folding.h"
#include <limits.h>
#include "Paths.h"
#include "CFS.h"
#include "FCBF.h"
#include "IWSS.h"
namespace bayesnet {
BoostAODE::BoostAODE() : Ensemble() {}
void BoostAODE::buildModel(const torch::Tensor& weights)
{
// Models shall be built in trainModel
}
void BoostAODE::setHyperparameters(nlohmann::json& hyperparameters)
{
// Check if hyperparameters are valid
const vector<string> validKeys = { "repeatSparent", "maxModels", "ascending", "convergence" };
checkHyperparameters(validKeys, hyperparameters);
if (hyperparameters.contains("repeatSparent")) {
repeatSparent = hyperparameters["repeatSparent"];
}
if (hyperparameters.contains("maxModels")) {
maxModels = hyperparameters["maxModels"];
}
if (hyperparameters.contains("ascending")) {
ascending = hyperparameters["ascending"];
}
if (hyperparameters.contains("convergence")) {
convergence = hyperparameters["convergence"];
}
}
void BoostAODE::validationInit()
{
models.clear();
n_models = 0;
// Prepare the validation dataset
auto y_ = dataset.index({ -1, "..." });
if (convergence) {
// Prepare train & validation sets from train data
@@ -56,18 +42,79 @@ namespace bayesnet {
X_train = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." });
y_train = y_;
}
}
void BoostAODE::setHyperparameters(nlohmann::json& hyperparameters)
{
// Check if hyperparameters are valid
const vector<string> validKeys = { "repeatSparent", "maxModels", "ascending", "convergence", "threshold", "select_features" };
checkHyperparameters(validKeys, hyperparameters);
if (hyperparameters.contains("repeatSparent")) {
repeatSparent = hyperparameters["repeatSparent"];
}
if (hyperparameters.contains("maxModels")) {
maxModels = hyperparameters["maxModels"];
}
if (hyperparameters.contains("ascending")) {
ascending = hyperparameters["ascending"];
}
if (hyperparameters.contains("convergence")) {
convergence = hyperparameters["convergence"];
}
if (hyperparameters.contains("threshold")) {
threshold = hyperparameters["threshold"];
}
if (hyperparameters.contains("select_features")) {
auto selectedAlgorithm = hyperparameters["select_features"];
vector<string> algos = { "IWSS", "FCBF", "CFS" };
selectFeatures = true;
algorithm = selectedAlgorithm;
if (find(algos.begin(), algos.end(), selectedAlgorithm) == algos.end()) {
throw invalid_argument("Invalid selectFeatures value [IWSS, FCBF, CFS]");
}
}
}
unordered_set<int> BoostAODE::initializeModels()
{
unordered_set<int> featuresUsed;
Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64);
int maxFeatures = 0;
if (algorithm == "CFS") {
featureSelector = new CFS(dataset, features, className, maxFeatures, states.at(className).size(), weights_);
} else if (algorithm == "IWSS") {
if (threshold < 0 || threshold > 0.5) {
throw invalid_argument("Invalid threshold value for IWSS [0, 0.5]");
}
featureSelector = new IWSS(dataset, features, className, maxFeatures, states.at(className).size(), weights_, threshold);
} else if (algorithm == "FCBF") {
if (threshold < 1e-7 || threshold > 1) {
throw invalid_argument("Invalid threshold value [1e-7, 1]");
}
featureSelector = new FCBF(dataset, features, className, maxFeatures, states.at(className).size(), weights_, threshold);
}
featureSelector->fit();
auto cfsFeatures = featureSelector->getFeatures();
for (const int& feature : cfsFeatures) {
// cout << "Feature: [" << feature << "] " << feature << " " << features.at(feature) << endl;
featuresUsed.insert(feature);
unique_ptr<Classifier> model = std::make_unique<SPODE>(feature);
model->fit(dataset, features, className, states, weights_);
models.push_back(std::move(model));
significanceModels.push_back(1.0);
n_models++;
}
delete featureSelector;
return featuresUsed;
}
void BoostAODE::trainModel(const torch::Tensor& weights)
{
models.clear();
n_models = 0;
unordered_set<int> featuresUsed;
if (selectFeatures) {
featuresUsed = initializeModels();
}
if (maxModels == 0)
maxModels = .1 * n > 10 ? .1 * n : n;
validationInit();
Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64);
bool exitCondition = false;
unordered_set<int> featuresUsed;
// Variables to control the accuracy finish condition
double priorAccuracy = 0.0;
double delta = 1.0;
@@ -86,16 +133,16 @@ namespace bayesnet {
unique_ptr<Classifier> model;
auto feature = featureSelection[0];
if (!repeatSparent || featuresUsed.size() < featureSelection.size()) {
bool found = false;
for (auto feat : featureSelection) {
bool used = true;
for (const auto& feat : featureSelection) {
if (find(featuresUsed.begin(), featuresUsed.end(), feat) != featuresUsed.end()) {
continue;
}
found = true;
used = false;
feature = feat;
break;
}
if (!found) {
if (used) {
exitCondition = true;
continue;
}
@@ -135,7 +182,7 @@ namespace bayesnet {
count++;
}
}
exitCondition = n_models == maxModels && repeatSparent || epsilon_t > 0.5 || count > tolerance;
exitCondition = n_models >= maxModels && repeatSparent || epsilon_t > 0.5 || count > tolerance;
}
if (featuresUsed.size() != features.size()) {
status = WARNING;

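setHyperparameters and initializeModels together let a caller turn on feature selection before any boosting happens. A hedged usage sketch (data loading and fit omitted; the keys are the ones in validKeys above, the helper function is hypothetical):

```cpp
#include <nlohmann/json.hpp>
#include "BoostAODE.h"

// Configure feature selection for a BoostAODE ensemble before fitting.
void configure(bayesnet::BoostAODE& clf)
{
    nlohmann::json hyperparameters = {
        { "repeatSparent", true },
        { "maxModels", 12 },
        { "select_features", "IWSS" },  // one of IWSS, FCBF, CFS
        { "threshold", 0.1 }            // IWSS accepts [0, 0.5]; FCBF needs [1e-7, 1]
    };
    clf.setHyperparameters(hyperparameters);  // invalid keys or values throw invalid_argument
}
```
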
src/BayesNet/BoostAODE.h

@@ -1,7 +1,9 @@
#ifndef BOOSTAODE_H
#define BOOSTAODE_H
#include "Ensemble.h"
#include <map>
#include "SPODE.h"
#include "FeatureSelect.h"
namespace bayesnet {
class BoostAODE : public Ensemble {
public:
@@ -15,11 +17,16 @@ namespace bayesnet {
private:
torch::Tensor dataset_;
torch::Tensor X_train, y_train, X_test, y_test;
void validationInit();
bool repeatSparent = false;
unordered_set<int> initializeModels();
// Hyperparameters
bool repeatSparent = false; // if true, a feature can be selected more than once
int maxModels = 0;
bool ascending = false; //Process KBest features ascending or descending order
bool convergence = false; //if true, stop when the model does not improve
bool selectFeatures = false; // if true, use feature selection
string algorithm = ""; // Selected feature selection algorithm
FeatureSelect* featureSelector = nullptr;
double threshold = -1;
};
}
#endif

72
src/BayesNet/CFS.cc Normal file

@@ -0,0 +1,72 @@
#include "CFS.h"
#include <limits>
#include "bayesnetUtils.h"
namespace bayesnet {
void CFS::fit()
{
initialize();
computeSuLabels();
auto featureOrder = argsort(suLabels); // sort descending order
auto continueCondition = true;
auto feature = featureOrder[0];
selectedFeatures.push_back(feature);
selectedScores.push_back(suLabels[feature]);
selectedFeatures.erase(selectedFeatures.begin());
while (continueCondition) {
double merit = numeric_limits<double>::lowest();
int bestFeature = -1;
for (auto feature : featureOrder) {
selectedFeatures.push_back(feature);
// Compute merit with selectedFeatures
auto meritNew = computeMeritCFS();
if (meritNew > merit) {
merit = meritNew;
bestFeature = feature;
}
selectedFeatures.pop_back();
}
if (bestFeature == -1) {
// meritNew has to be NaN due to constant features
break;
}
selectedFeatures.push_back(bestFeature);
selectedScores.push_back(merit);
featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), bestFeature), featureOrder.end());
continueCondition = computeContinueCondition(featureOrder);
}
fitted = true;
}
bool CFS::computeContinueCondition(const vector<int>& featureOrder)
{
if (selectedFeatures.size() == maxFeatures || featureOrder.size() == 0) {
return false;
}
if (selectedScores.size() >= 5) {
/*
"To prevent the best first search from exploring the entire
feature subset search space, a stopping criterion is imposed.
The search will terminate if five consecutive fully expanded
subsets show no improvement over the current best subset."
as stated in Mark A.Hall Thesis
*/
double item_ant = numeric_limits<double>::lowest();
int num = 0;
vector<double> lastFive(selectedScores.end() - 5, selectedScores.end());
for (auto item : lastFive) {
if (item_ant == numeric_limits<double>::lowest()) {
item_ant = item;
}
if (item > item_ant) {
break;
} else {
num++;
item_ant = item;
}
}
if (num == 5) {
return false;
}
}
return true;
}
}

21
src/BayesNet/CFS.h Normal file

@@ -0,0 +1,21 @@
#ifndef CFS_H
#define CFS_H
#include <torch/torch.h>
#include <vector>
#include "FeatureSelect.h"
using namespace std;
namespace bayesnet {
class CFS : public FeatureSelect {
public:
// dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights)
{
}
virtual ~CFS() {};
void fit() override;
private:
bool computeContinueCondition(const vector<int>& featureOrder);
};
}
#endif

src/BayesNet/CMakeLists.txt

@@ -5,5 +5,5 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc
KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc
Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
Mst.cc Proposal.cc CFS.cc FCBF.cc IWSS.cc FeatureSelect.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}")

44
src/BayesNet/FCBF.cc Normal file

@@ -0,0 +1,44 @@
#include "bayesnetUtils.h"
#include "FCBF.h"
namespace bayesnet {
FCBF::FCBF(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold) :
FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights), threshold(threshold)
{
if (threshold < 1e-7) {
throw std::invalid_argument("Threshold cannot be less than 1e-7");
}
}
void FCBF::fit()
{
initialize();
computeSuLabels();
auto featureOrder = argsort(suLabels); // sort descending order
auto featureOrderCopy = featureOrder;
for (const auto& feature : featureOrder) {
// Don't self compare
featureOrderCopy.erase(featureOrderCopy.begin());
if (suLabels.at(feature) == 0.0) {
// The feature has been removed from the list
continue;
}
if (suLabels.at(feature) < threshold) {
break;
}
// Remove redundant features
for (const auto& featureCopy : featureOrderCopy) {
double value = computeSuFeatures(feature, featureCopy);
if (value >= suLabels.at(featureCopy)) {
// Remove feature from list
suLabels[featureCopy] = 0.0;
}
}
selectedFeatures.push_back(feature);
selectedScores.push_back(suLabels[feature]);
if (selectedFeatures.size() == maxFeatures) {
break;
}
}
fitted = true;
}
}

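FCBF's loop can be stated compactly: features are visited in descending SU(f, y); a feature below the threshold δ stops the scan, and an accepted feature erases any lower-ranked feature it dominates. As a predicate (a restatement of the code, not from the source):

```latex
f \text{ is selected} \iff SU(f, y) \ge \delta \ \wedge\ \nexists\, f' \text{ selected earlier with } SU(f', f) \ge SU(f, y)
```
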
18
src/BayesNet/FCBF.h Normal file

@@ -0,0 +1,18 @@
#ifndef FCBF_H
#define FCBF_H
#include <torch/torch.h>
#include <vector>
#include "FeatureSelect.h"
using namespace std;
namespace bayesnet {
class FCBF : public FeatureSelect {
public:
// dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
FCBF(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold);
virtual ~FCBF() {};
void fit() override;
private:
double threshold = -1;
};
}
#endif

src/BayesNet/FeatureSelect.cc Normal file

@@ -0,0 +1,79 @@
#include "FeatureSelect.h"
#include <limits>
#include "bayesnetUtils.h"
namespace bayesnet {
FeatureSelect::FeatureSelect(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights)
{
}
void FeatureSelect::initialize()
{
selectedFeatures.clear();
selectedScores.clear();
}
double FeatureSelect::symmetricalUncertainty(int a, int b)
{
/*
Compute symmetrical uncertainty. Normalize* information gain (mutual
information) with the entropies of the features in order to compensate
for the bias due to high-cardinality features. *Range [0, 1]
(https://www.sciencedirect.com/science/article/pii/S0020025519303603)
*/
auto x = samples.index({ a, "..." });
auto y = samples.index({ b, "..." });
auto mu = mutualInformation(x, y, weights);
auto hx = entropy(x, weights);
auto hy = entropy(y, weights);
return 2.0 * mu / (hx + hy);
}
void FeatureSelect::computeSuLabels()
{
// Compute Symmetrical Uncertainty between features and labels
// https://en.wikipedia.org/wiki/Symmetric_uncertainty
for (int i = 0; i < features.size(); ++i) {
suLabels.push_back(symmetricalUncertainty(i, -1));
}
}
double FeatureSelect::computeSuFeatures(const int firstFeature, const int secondFeature)
{
// Compute Symmetrical Uncertainty between features
// https://en.wikipedia.org/wiki/Symmetric_uncertainty
try {
return suFeatures.at({ firstFeature, secondFeature });
}
catch (const out_of_range& e) {
double result = symmetricalUncertainty(firstFeature, secondFeature);
suFeatures[{firstFeature, secondFeature}] = result;
return result;
}
}
double FeatureSelect::computeMeritCFS()
{
double result;
double rcf = 0;
for (auto feature : selectedFeatures) {
rcf += suLabels[feature];
}
double rff = 0;
int n = selectedFeatures.size();
for (const auto& item : doCombinations(selectedFeatures)) {
rff += computeSuFeatures(item.first, item.second);
}
return rcf / sqrt(n + (n * n - n) * rff);
}
vector<int> FeatureSelect::getFeatures() const
{
if (!fitted) {
throw runtime_error("FeatureSelect not fitted");
}
return selectedFeatures;
}
vector<double> FeatureSelect::getScores() const
{
if (!fitted) {
throw runtime_error("FeatureSelect not fitted");
}
return selectedScores;
}
}

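Both quantities FeatureSelect computes are standard. Symmetrical uncertainty normalizes mutual information by the two entropies, and the CFS merit of a subset S of k features trades feature–class correlation against feature–feature redundancy; in Hall's usual formulation (the code's rcf and rff accumulate the corresponding sums):

```latex
SU(X, Y) = \frac{2\, I(X; Y)}{H(X) + H(Y)} \in [0, 1],
\qquad
M_S = \frac{k\, \overline{r_{cf}}}{\sqrt{k + k(k - 1)\, \overline{r_{ff}}}}
```

where $\overline{r_{cf}}$ is the mean SU between each feature and the class and $\overline{r_{ff}}$ the mean SU over feature pairs.
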
src/BayesNet/FeatureSelect.h Normal file

@@ -0,0 +1,31 @@
#ifndef FEATURE_SELECT_H
#define FEATURE_SELECT_H
#include <torch/torch.h>
#include <vector>
#include "BayesMetrics.h"
using namespace std;
namespace bayesnet {
class FeatureSelect : public Metrics {
public:
// dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
FeatureSelect(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights);
virtual ~FeatureSelect() {};
virtual void fit() = 0;
vector<int> getFeatures() const;
vector<double> getScores() const;
protected:
void initialize();
void computeSuLabels();
double computeSuFeatures(const int a, const int b);
double symmetricalUncertainty(int a, int b);
double computeMeritCFS();
const torch::Tensor& weights;
int maxFeatures;
vector<int> selectedFeatures;
vector<double> selectedScores;
vector<double> suLabels;
map<pair<int, int>, double> suFeatures;
bool fitted = false;
};
}
#endif

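Because CFS, FCBF, and IWSS all derive from FeatureSelect, driving any of them looks the same. A minimal sketch, assuming a dataset tensor laid out as the comment above describes ((n+1) x m with the class in the last row); the helper function is hypothetical:

```cpp
#include <string>
#include <vector>
#include "CFS.h"

// Run CFS on an (n+1) x m integer tensor whose last row is the class vector.
std::vector<int> selectWithCFS(const torch::Tensor& samples,
                               const std::vector<std::string>& features,
                               const std::string& className,
                               int classNumStates)
{
    auto m = samples.size(1);
    auto weights = torch::full({ m }, 1.0 / m, torch::kFloat64);
    bayesnet::CFS selector(samples, features, className,
                           /*maxFeatures=*/0,  // 0 means "cap at all features"
                           classNumStates, weights);
    selector.fit();                 // getFeatures()/getScores() throw before fit()
    return selector.getFeatures();  // feature indices in selection order
}
```
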
47
src/BayesNet/IWSS.cc Normal file

@@ -0,0 +1,47 @@
#include "IWSS.h"
#include <limits>
#include "bayesnetUtils.h"
namespace bayesnet {
IWSS::IWSS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold) :
FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights), threshold(threshold)
{
if (threshold < 0 || threshold > .5) {
throw std::invalid_argument("Threshold has to be in [0, 0.5]");
}
}
void IWSS::fit()
{
initialize();
computeSuLabels();
auto featureOrder = argsort(suLabels); // sort descending order
auto featureOrderCopy = featureOrder;
// Add first and second features to result
// First with its own score
auto first_feature = pop_first(featureOrderCopy);
selectedFeatures.push_back(first_feature);
selectedScores.push_back(suLabels.at(first_feature));
// Second with the score of the candidates
selectedFeatures.push_back(pop_first(featureOrderCopy));
auto merit = computeMeritCFS();
selectedScores.push_back(merit);
for (const auto feature : featureOrderCopy) {
selectedFeatures.push_back(feature);
// Compute merit with selectedFeatures
auto meritNew = computeMeritCFS();
double delta = merit != 0.0 ? abs(merit - meritNew) / merit : 0.0;
if (meritNew > merit || delta < threshold) {
if (meritNew > merit) {
merit = meritNew;
}
selectedScores.push_back(meritNew);
} else {
selectedFeatures.pop_back();
break;
}
if (selectedFeatures.size() == maxFeatures) {
break;
}
}
fitted = true;
}
}

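IWSS admits the next ranked feature either on strict merit improvement or when the relative change stays inside the threshold, and stops at the first rejection (a restatement of the loop above):

```latex
\text{accept } f \iff M_{S \cup \{f\}} > M_S \ \vee\ \frac{\lvert M_S - M_{S \cup \{f\}} \rvert}{M_S} < \theta
```
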
18
src/BayesNet/IWSS.h Normal file

@@ -0,0 +1,18 @@
#ifndef IWSS_H
#define IWSS_H
#include <torch/torch.h>
#include <vector>
#include "FeatureSelect.h"
using namespace std;
namespace bayesnet {
class IWSS : public FeatureSelect {
public:
// dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
IWSS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold);
virtual ~IWSS() {};
void fit() override;
private:
double threshold = -1;
};
}
#endif

src/BayesNet/Mst.cc

@@ -1,5 +1,6 @@
#include "Mst.h"
#include <vector>
#include <list>
/*
Based on the code from https://www.softwaretestinghelp.com/minimum-spanning-tree-tutorial/
@@ -34,7 +35,7 @@ namespace bayesnet {
void Graph::kruskal_algorithm()
{
// sort the edges ordered on decreasing weight
sort(G.begin(), G.end(), [](const auto& left, const auto& right) {return left.first > right.first;});
stable_sort(G.begin(), G.end(), [](const auto& left, const auto& right) {return left.first > right.first;});
for (int i = 0; i < G.size(); i++) {
int uSt, vEd;
uSt = find_set(G[i].second.first);
@@ -55,15 +56,24 @@ namespace bayesnet {
}
}
void insertElement(list<int>& variables, int variable)
{
if (find(variables.begin(), variables.end(), variable) == variables.end()) {
variables.push_front(variable);
}
}
vector<pair<int, int>> reorder(vector<pair<float, pair<int, int>>> T, int root_original)
{
// Create the edges of a DAG from the MST
// replacing unordered_set with list because unordered_set cannot guarantee the order of the elements inserted
auto result = vector<pair<int, int>>();
auto visited = vector<int>();
auto nextVariables = unordered_set<int>();
nextVariables.emplace(root_original);
auto nextVariables = list<int>();
nextVariables.push_front(root_original);
while (nextVariables.size() > 0) {
int root = *nextVariables.begin();
nextVariables.erase(nextVariables.begin());
int root = nextVariables.front();
nextVariables.pop_front();
for (int i = 0; i < T.size(); ++i) {
auto [weight, edge] = T[i];
auto [from, to] = edge;
@@ -71,10 +81,10 @@ namespace bayesnet {
visited.insert(visited.begin(), i);
if (from == root) {
result.push_back({ from, to });
nextVariables.emplace(to);
insertElement(nextVariables, to);
} else {
result.push_back({ to, from });
nextVariables.emplace(from);
insertElement(nextVariables, from);
}
}
}
@@ -99,7 +109,6 @@ namespace bayesnet {
{
auto num_features = features.size();
Graph g(num_features);
// Make a complete graph
for (int i = 0; i < num_features - 1; ++i) {
for (int j = i + 1; j < num_features; ++j) {

src/BayesNet/Network.cc

@@ -157,7 +157,7 @@ namespace bayesnet {
completeFit(states, weights);
}
// input_data comes in nxm, where n is the number of features and m the number of samples
void Network::fit(const vector<vector<int>>& input_data, const vector<int>& labels, const vector<float>& weights_, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states)
void Network::fit(const vector<vector<int>>& input_data, const vector<int>& labels, const vector<double>& weights_, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states)
{
const torch::Tensor weights = torch::tensor(weights_, torch::kFloat64);
checkFitData(input_data[0].size(), input_data.size(), labels.size(), featureNames, className, states, weights);
@@ -201,7 +201,6 @@ namespace bayesnet {
}
if (proba)
return result;
else
return result.argmax(1);
}
// Return mxn tensor of probabilities

src/BayesNet/Network.h

@@ -39,7 +39,10 @@ namespace bayesnet {
int getNumEdges() const;
int getClassNumStates() const;
string getClassName() const;
void fit(const vector<vector<int>>& input_data, const vector<int>& labels, const vector<float>& weights, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states);
/*
Notice: Nodes have to be inserted in the same order as they are in the dataset, i.e., first node is first column and so on.
*/
void fit(const vector<vector<int>>& input_data, const vector<int>& labels, const vector<double>& weights, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states);
void fit(const torch::Tensor& X, const torch::Tensor& y, const torch::Tensor& weights, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states);
void fit(const torch::Tensor& samples, const torch::Tensor& weights, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states);
vector<int> predict(const vector<vector<int>>&); // Return mx1 vector of predictions

src/BayesNet/Node.h

@@ -14,8 +14,8 @@ namespace bayesnet {
int numStates; // number of states of the variable
torch::Tensor cpTable; // Order of indices is 0-> node variable, 1-> 1st parent, 2-> 2nd parent, ...
vector<int64_t> dimensions; // dimensions of the cpTable
public:
vector<pair<string, string>> combinations(const vector<string>&);
public:
explicit Node(const string&);
void clear();
void addParent(Node*);

src/Platform/BestResults.cc

@@ -1,4 +1,5 @@
#include <filesystem>
#include <set>
#include <fstream>
#include <iostream>
#include <sstream>
@@ -6,7 +7,8 @@
#include "Result.h"
#include "Colors.h"
#include "Statistics.h"
#include "BestResultsExcel.h"
#include "CLocale.h"
namespace fs = std::filesystem;
@@ -39,15 +41,17 @@ namespace platform {
auto data = result.load();
for (auto const& item : data.at("results")) {
bool update = false;
if (bests.contains(item.at("dataset").get<string>())) {
if (item.at("score").get<double>() > bests[item.at("dataset").get<string>()].at(0).get<double>()) {
// Check if results file contains only one dataset
auto datasetName = item.at("dataset").get<string>();
if (bests.contains(datasetName)) {
if (item.at("score").get<double>() > bests[datasetName].at(0).get<double>()) {
update = true;
}
} else {
update = true;
}
if (update) {
bests[item.at("dataset").get<string>()] = { item.at("score").get<double>(), item.at("hyperparameters"), file };
bests[datasetName] = { item.at("score").get<double>(), item.at("hyperparameters"), file };
}
}
}
@@ -124,6 +128,14 @@ namespace platform {
result = vector<string>(models.begin(), models.end());
return result;
}
vector<string> BestResults::getDatasets(json table)
{
vector<string> datasets;
for (const auto& dataset : table.items()) {
datasets.push_back(dataset.key());
}
return datasets;
}
void BestResults::buildAll()
{
@@ -145,18 +157,21 @@ namespace platform {
cerr << Colors::MAGENTA() << "File " << bestFileName << " doesn't exist." << Colors::RESET() << endl;
exit(1);
}
auto temp = ConfigLocale();
auto date = ftime_to_string(filesystem::last_write_time(bestFileName));
auto data = loadFile(bestFileName);
auto datasets = getDatasets(data);
int maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const string& a, const string& b) { return a.size() < b.size(); })).size();
cout << Colors::GREEN() << "Best results for " << model << " and " << score << " as of " << date << endl;
cout << "--------------------------------------------------------" << endl;
cout << Colors::GREEN() << " # Dataset Score File Hyperparameters" << endl;
cout << "=== ========================= =========== ================================================================== ================================================= " << endl;
cout << Colors::GREEN() << " # " << setw(maxDatasetName + 1) << left << string("Dataset") << "Score File Hyperparameters" << endl;
cout << "=== " << string(maxDatasetName, '=') << " =========== ================================================================== ================================================= " << endl;
auto i = 0;
bool odd = true;
for (auto const& item : data.items()) {
auto color = odd ? Colors::BLUE() : Colors::CYAN();
cout << color << setw(3) << fixed << right << i++ << " ";
cout << setw(25) << left << item.key() << " ";
cout << setw(maxDatasetName) << left << item.key() << " ";
cout << setw(11) << setprecision(9) << fixed << item.value().at(0).get<double>() << " ";
cout << setw(66) << item.value().at(2).get<string>() << " ";
cout << item.value().at(1) << " ";
@@ -166,9 +181,6 @@ namespace platform {
}
json BestResults::buildTableResults(vector<string> models)
{
int numberOfDatasets = 0;
bool first = true;
json origin;
json table;
auto maxDate = filesystem::file_time_type::max();
for (const auto& model : models) {
@@ -185,17 +197,6 @@ namespace platform {
maxDate = dateWrite;
}
auto data = loadFile(bestFileName);
if (first) {
// Get the number of datasets of the first file and check that is the same for all the models
first = false;
numberOfDatasets = data.size();
origin = data;
} else {
if (numberOfDatasets != data.size()) {
cerr << Colors::MAGENTA() << "The number of datasets in the best results files is not the same for all the models." << Colors::RESET() << endl;
exit(1);
}
}
table[model] = data;
}
table["dateTable"] = ftime_to_string(maxDate);
@@ -206,14 +207,14 @@ namespace platform {
{
cout << Colors::GREEN() << "Best results for " << score << " as of " << table.at("dateTable").get<string>() << endl;
cout << "------------------------------------------------" << endl;
cout << Colors::GREEN() << " # Dataset ";
cout << Colors::GREEN() << " # " << setw(maxDatasetName + 1) << left << string("Dataset");
for (const auto& model : models) {
cout << setw(12) << left << model << " ";
cout << setw(maxModelName) << left << model << " ";
}
cout << endl;
cout << "=== ========================= ";
cout << "=== " << string(maxDatasetName, '=') << " ";
for (const auto& model : models) {
cout << "============ ";
cout << string(maxModelName, '=') << " ";
}
cout << endl;
auto i = 0;
@@ -223,15 +224,15 @@ namespace platform {
for (const auto& model : models) {
totals[model] = 0.0;
}
json origin = table.begin().value();
for (auto const& item : origin.items()) {
auto datasets = getDatasets(table.begin().value());
for (auto const& dataset : datasets) {
auto color = odd ? Colors::BLUE() : Colors::CYAN();
cout << color << setw(3) << fixed << right << i++ << " ";
cout << setw(25) << left << item.key() << " ";
cout << setw(maxDatasetName) << left << dataset << " ";
double maxValue = 0;
// Find out the max value for this dataset
for (const auto& model : models) {
double value = table[model].at(item.key()).at(0).get<double>();
double value = table[model].at(dataset).at(0).get<double>();
if (value > maxValue) {
maxValue = value;
}
@@ -239,22 +240,22 @@ namespace platform {
// Print the row with red colors on max values
for (const auto& model : models) {
string efectiveColor = color;
double value = table[model].at(item.key()).at(0).get<double>();
double value = table[model].at(dataset).at(0).get<double>();
if (value == maxValue) {
efectiveColor = Colors::RED();
}
totals[model] += value;
cout << efectiveColor << setw(12) << setprecision(10) << fixed << value << " ";
cout << efectiveColor << setw(maxModelName) << setprecision(maxModelName - 2) << fixed << value << " ";
}
cout << endl;
odd = !odd;
}
cout << Colors::GREEN() << "=== ========================= ";
cout << Colors::GREEN() << "=== " << string(maxDatasetName, '=') << " ";
for (const auto& model : models) {
cout << "============ ";
cout << string(maxModelName, '=') << " ";
}
cout << endl;
cout << Colors::GREEN() << setw(30) << " Totals...................";
cout << Colors::GREEN() << setw(5 + maxDatasetName) << " Totals...................";
double max = 0.0;
for (const auto& total : totals) {
if (total.second > max) {
@@ -266,27 +267,34 @@ namespace platform {
if (totals[model] == max) {
efectiveColor = Colors::RED();
}
cout << efectiveColor << setw(12) << setprecision(9) << fixed << totals[model] << " ";
cout << efectiveColor << right << setw(maxModelName) << setprecision(maxModelName - 4) << fixed << totals[model] << " ";
}
cout << endl;
}
void BestResults::reportAll()
void BestResults::reportAll(bool excel)
{
auto models = getModels();
// Build the table of results
json table = buildTableResults(models);
vector<string> datasets = getDatasets(table.begin().value());
maxModelName = (*max_element(models.begin(), models.end(), [](const string& a, const string& b) { return a.size() < b.size(); })).size();
maxModelName = max(12, maxModelName);
maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const string& a, const string& b) { return a.size() < b.size(); })).size();
maxDatasetName = max(25, maxDatasetName);
// Print the table of results
printTableResults(models, table);
// Compute the Friedman test
map<string, map<string, float>> ranksModels;
if (friedman) {
vector<string> datasets;
for (const auto& dataset : table.begin().value().items()) {
datasets.push_back(dataset.key());
}
double significance = 0.05;
Statistics stats(models, datasets, table, significance);
auto result = stats.friedmanTest();
stats.postHocHolmTest(result);
ranksModels = stats.getRanks();
}
if (excel) {
BestResultsExcel excel(score, models, datasets, table, ranksModels, friedman, significance);
excel.build();
cout << Colors::YELLOW() << "** Excel file generated: " << excel.getFileName() << Colors::RESET() << endl;
}
}
}

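reportAll delegates the statistics to the Statistics class. For reference, the Friedman test it runs compares k models over N datasets through their mean ranks $\bar{R}_j$ (standard formulation, not taken from this diff):

```latex
\chi_F^2 = \frac{12N}{k(k + 1)} \left( \sum_{j=1}^{k} \bar{R}_j^2 - \frac{k(k + 1)^2}{4} \right)
```

and the Holm post hoc test then compares each model against the control (best-ranked) model at the given significance level.
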
src/Platform/BestResults.h

@@ -1,20 +1,20 @@
#ifndef BESTRESULTS_H
#define BESTRESULTS_H
#include <string>
#include <set>
#include <nlohmann/json.hpp>
using namespace std;
using json = nlohmann::json;
namespace platform {
class BestResults {
public:
explicit BestResults(const string& path, const string& score, const string& model, bool friedman) : path(path), score(score), model(model), friedman(friedman) {}
explicit BestResults(const string& path, const string& score, const string& model, bool friedman, double significance = 0.05) : path(path), score(score), model(model), friedman(friedman), significance(significance) {}
string build();
void reportSingle();
void reportAll();
void reportAll(bool excel);
void buildAll();
private:
vector<string> getModels();
vector<string> getDatasets(json table);
vector<string> loadResultFiles();
json buildTableResults(vector<string> models);
void printTableResults(vector<string> models, json table);
@@ -24,6 +24,9 @@ namespace platform {
string score;
string model;
bool friedman;
double significance;
int maxModelName = 0;
int maxDatasetName = 0;
};
}
#endif //BESTRESULTS_H

src/Platform/BestResultsExcel.cc Normal file

@@ -0,0 +1,177 @@
#include <sstream>
#include "BestResultsExcel.h"
#include "Paths.h"
#include "Statistics.h"
namespace platform {
BestResultsExcel::BestResultsExcel(const string& score, const vector<string>& models, const vector<string>& datasets, const json& table, const map<string, map<string, float>>& ranksModels, bool friedman, double significance) :
score(score), models(models), datasets(datasets), table(table), ranksModels(ranksModels), friedman(friedman), significance(significance)
{
workbook = workbook_new((Paths::excel() + fileName).c_str());
worksheet = workbook_add_worksheet(workbook, "Best Results");
setProperties("Best Results");
createFormats();
int maxModelName = (*max_element(models.begin(), models.end(), [](const string& a, const string& b) { return a.size() < b.size(); })).size();
modelNameSize = max(modelNameSize, maxModelName);
int maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const string& a, const string& b) { return a.size() < b.size(); })).size();
datasetNameSize = max(datasetNameSize, maxDatasetName);
formatColumns();
}
BestResultsExcel::~BestResultsExcel()
{
workbook_close(workbook);
}
void BestResultsExcel::formatColumns()
{
worksheet_freeze_panes(worksheet, 4, 2);
vector<int> columns_sizes = { 5, datasetNameSize };
for (int i = 0; i < models.size(); ++i) {
columns_sizes.push_back(modelNameSize);
}
for (int i = 0; i < columns_sizes.size(); ++i) {
worksheet_set_column(worksheet, i, i, columns_sizes.at(i), NULL);
}
}
void BestResultsExcel::build()
{
// Create Sheet with scores
header(false);
body(false);
footer(false);
if (friedman) {
// Create Sheet with ranks
worksheet = workbook_add_worksheet(workbook, "Ranks");
formatColumns();
header(true);
body(true);
footer(true);
// Create Sheet with Friedman Test
doFriedman();
}
}
string BestResultsExcel::getFileName()
{
return Paths::excel() + fileName;
}
void BestResultsExcel::header(bool ranks)
{
row = 0;
string message = ranks ? "Ranks for score " + score : "Best results for " + score;
worksheet_merge_range(worksheet, 0, 0, 0, 1 + models.size(), message.c_str(), styles["headerFirst"]);
// Body header
row = 3;
int col = 1;
writeString(row, 0, "", "bodyHeader");
writeString(row, 1, "Dataset", "bodyHeader");
for (const auto& model : models) {
writeString(row, ++col, model.c_str(), "bodyHeader");
}
}
void BestResultsExcel::body(bool ranks)
{
row = 4;
int i = 0;
json origin = table.begin().value();
for (auto const& item : origin.items()) {
writeInt(row, 0, i++, "ints");
writeString(row, 1, item.key().c_str(), "text");
int col = 1;
for (const auto& model : models) {
double value = ranks ? ranksModels[item.key()][model] : table[model].at(item.key()).at(0).get<double>();
writeDouble(row, ++col, value, "result");
}
++row;
}
}
void BestResultsExcel::footer(bool ranks)
{
// Set Totals
writeString(row, 1, "Total", "bodyHeader");
int col = 1;
for (const auto& model : models) {
stringstream oss;
oss << "=sum(indirect(address(" << 5 << "," << col + 2 << ")):indirect(address(" << row << "," << col + 2 << ")))";
worksheet_write_formula(worksheet, row, ++col, oss.str().c_str(), styles["bodyHeader_odd"]);
}
if (ranks) {
row++;
writeString(row, 1, "Average ranks", "bodyHeader");
int col = 1;
for (const auto& model : models) {
stringstream oss;
oss << "=sum(indirect(address(" << 5 << "," << col + 2 << ")):indirect(address(" << row - 1 << "," << col + 2 << ")))/" << datasets.size();
worksheet_write_formula(worksheet, row, ++col, oss.str().c_str(), styles["bodyHeader_odd"]);
}
}
}
void BestResultsExcel::doFriedman()
{
worksheet = workbook_add_worksheet(workbook, "Friedman");
vector<int> columns_sizes = { 5, datasetNameSize };
for (int i = 0; i < models.size(); ++i) {
columns_sizes.push_back(modelNameSize);
}
for (int i = 0; i < columns_sizes.size(); ++i) {
worksheet_set_column(worksheet, i, i, columns_sizes.at(i), NULL);
}
worksheet_merge_range(worksheet, 0, 0, 0, 1 + models.size(), "Friedman Test", styles["headerFirst"]);
row = 2;
Statistics stats(models, datasets, table, significance, false);
auto result = stats.friedmanTest();
stats.postHocHolmTest(result);
auto friedmanResult = stats.getFriedmanResult();
auto holmResult = stats.getHolmResult();
worksheet_merge_range(worksheet, row, 0, row, 1 + models.size(), "Null hypothesis: H0 'There is no significant differences between all the classifiers.'", styles["headerSmall"]);
row += 2;
writeString(row, 1, "Friedman Q", "bodyHeader");
writeDouble(row, 2, friedmanResult.statistic, "bodyHeader");
row++;
writeString(row, 1, "Critical χ2 value", "bodyHeader");
writeDouble(row, 2, friedmanResult.criticalValue, "bodyHeader");
row++;
writeString(row, 1, "p-value", "bodyHeader");
writeDouble(row, 2, friedmanResult.pvalue, "bodyHeader");
writeString(row, 3, friedmanResult.reject ? "<" : ">", "bodyHeader");
writeDouble(row, 4, significance, "bodyHeader");
writeString(row, 5, friedmanResult.reject ? "Reject H0" : "Accept H0", "bodyHeader");
row += 3;
worksheet_merge_range(worksheet, row, 0, row, 1 + models.size(), "Holm Test", styles["headerFirst"]);
row += 2;
worksheet_merge_range(worksheet, row, 0, row, 1 + models.size(), "Null hypothesis: H0 'There is no significant differences between the control model and the other models.'", styles["headerSmall"]);
row += 2;
string controlModel = "Control Model: " + holmResult.model;
worksheet_merge_range(worksheet, row, 1, row, 7, controlModel.c_str(), styles["bodyHeader_odd"]);
row++;
writeString(row, 1, "Model", "bodyHeader");
writeString(row, 2, "p-value", "bodyHeader");
writeString(row, 3, "Rank", "bodyHeader");
writeString(row, 4, "Win", "bodyHeader");
writeString(row, 5, "Tie", "bodyHeader");
writeString(row, 6, "Loss", "bodyHeader");
writeString(row, 7, "Reject H0", "bodyHeader");
row++;
bool first = true;
for (const auto& item : holmResult.holmLines) {
writeString(row, 1, item.model, "text");
if (first) {
// Control model info
first = false;
writeString(row, 2, "", "text");
writeDouble(row, 3, item.rank, "result");
writeString(row, 4, "", "text");
writeString(row, 5, "", "text");
writeString(row, 6, "", "text");
writeString(row, 7, "", "textCentered");
} else {
// Rest of the models info
writeDouble(row, 2, item.pvalue, "result");
writeDouble(row, 3, item.rank, "result");
writeInt(row, 4, item.wtl.win, "ints");
writeInt(row, 5, item.wtl.tie, "ints");
writeInt(row, 6, item.wtl.loss, "ints");
writeString(row, 7, item.reject ? "Yes" : "No", "textCentered");
}
row++;
}
}
}
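For orientation, a minimal sketch of a call site (assumed; only the API shown in this diff is used): the ranks computed by Statistics are handed to the writer, which builds the sheets and reports its file name:

platform::Statistics stats(models, datasets, table, 0.05, false);
stats.friedmanTest();
platform::BestResultsExcel excel(score, models, datasets, table, stats.getRanks(), true, 0.05);
excel.build(); // results sheet plus, with friedman == true, the Friedman sheet
cout << "Output saved in " << excel.getFileName() << endl;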

src/Platform/BestResultsExcel.h Normal file

@@ -0,0 +1,37 @@
#ifndef BESTRESULTS_EXCEL_H
#define BESTRESULTS_EXCEL_H
#include "ExcelFile.h"
#include <vector>
#include <map>
#include <nlohmann/json.hpp>
using namespace std;
using json = nlohmann::json;
namespace platform {
class BestResultsExcel : ExcelFile {
public:
BestResultsExcel(const string& score, const vector<string>& models, const vector<string>& datasets, const json& table, const map<string, map<string, float>>& ranks, bool friedman, double significance);
~BestResultsExcel();
void build();
string getFileName();
private:
void header(bool ranks);
void body(bool ranks);
void footer(bool ranks);
void formatColumns();
void doFriedman();
const string fileName = "BestResults.xlsx";
string score;
vector<string> models;
vector<string> datasets;
json table;
map<string, map<string, float>> ranksModels;
bool friedman;
double significance;
int modelNameSize = 12; // Min size of the column
int datasetNameSize = 25; // Min size of the column
};
}
#endif //BESTRESULTS_EXCEL_H

src/Platform/CLocale.h Normal file

@@ -0,0 +1,24 @@
#ifndef LOCALE_H
#define LOCALE_H
#include <locale>
#include <iostream>
#include <sstream>
#include <string>
using namespace std;
namespace platform {
struct separation : numpunct<char> {
char do_decimal_point() const { return ','; }
char do_thousands_sep() const { return '.'; }
string do_grouping() const { return "\03"; }
};
class ConfigLocale {
public:
explicit ConfigLocale()
{
locale mylocale(cout.getloc(), new separation);
locale::global(mylocale);
cout.imbue(mylocale);
}
};
}
#endif
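A usage sketch (illustrative, not part of the commit): constructing ConfigLocale installs the facet globally, so any later numeric output on cout picks up comma decimals and dotted thousands groups:

#include <iomanip>
#include "CLocale.h"
int main()
{
    platform::ConfigLocale config;                  // imbues cout with the separation facet
    std::cout << std::fixed << std::setprecision(2)
        << 1234567.89 << std::endl;                 // prints 1.234.567,89
}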

src/Platform/CMakeLists.txt

@@ -5,16 +5,13 @@ include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include)
include_directories(${BayesNet_SOURCE_DIR}/lib/json/include)
include_directories(${BayesNet_SOURCE_DIR}/lib/libxlsxwriter/include)
add_executable(main main.cc Folding.cc platformUtils.cc Experiment.cc Datasets.cc Models.cc ReportConsole.cc ReportBase.cc)
add_executable(manage manage.cc Results.cc Result.cc ReportConsole.cc ReportExcel.cc ReportBase.cc Datasets.cc platformUtils.cc)
add_executable(list list.cc platformUtils Datasets.cc)
add_executable(best best.cc BestResults.cc Result.cc Statistics.cc)
target_link_libraries(main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}")
if (${CMAKE_HOST_SYSTEM_NAME} MATCHES "Linux")
target_link_libraries(manage "${TORCH_LIBRARIES}" libxlsxwriter.so ArffFiles mdlp stdc++fs)
target_link_libraries(best Boost::boost stdc++fs)
else()
target_link_libraries(manage "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" ArffFiles mdlp)
target_link_libraries(best Boost::boost)
endif()
target_link_libraries(list ArffFiles mdlp "${TORCH_LIBRARIES}")
add_executable(b_main main.cc Folding.cc Experiment.cc Datasets.cc Dataset.cc Models.cc ReportConsole.cc ReportBase.cc)
add_executable(b_manage manage.cc Results.cc Result.cc ReportConsole.cc ReportExcel.cc ReportBase.cc Datasets.cc Dataset.cc ExcelFile.cc)
add_executable(b_list list.cc Datasets.cc Dataset.cc)
add_executable(b_best best.cc BestResults.cc Result.cc Statistics.cc BestResultsExcel.cc ExcelFile.cc)
add_executable(testx testx.cpp Datasets.cc Dataset.cc Folding.cc )
target_link_libraries(b_main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}")
target_link_libraries(b_manage "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" ArffFiles mdlp)
target_link_libraries(b_best Boost::boost "${XLSXWRITER_LIB}")
target_link_libraries(b_list ArffFiles mdlp "${TORCH_LIBRARIES}")
target_link_libraries(testx ArffFiles BayesNet "${TORCH_LIBRARIES}")

src/Platform/Dataset.cc Normal file

@@ -0,0 +1,215 @@
#include "Dataset.h"
#include "ArffFiles.h"
#include <fstream>
namespace platform {
Dataset::Dataset(const Dataset& dataset) : path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples), n_features(dataset.n_features), features(dataset.features), states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv), fileType(dataset.fileType)
{
}
string Dataset::getName() const
{
return name;
}
string Dataset::getClassName() const
{
return className;
}
vector<string> Dataset::getFeatures() const
{
if (loaded) {
return features;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
int Dataset::getNFeatures() const
{
if (loaded) {
return n_features;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
int Dataset::getNSamples() const
{
if (loaded) {
return n_samples;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
map<string, vector<int>> Dataset::getStates() const
{
if (loaded) {
return states;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<vector<vector<float>>&, vector<int>&> Dataset::getVectors()
{
if (loaded) {
return { Xv, yv };
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<vector<vector<int>>&, vector<int>&> Dataset::getVectorsDiscretized()
{
if (loaded) {
return { Xd, yv };
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
{
if (loaded) {
buildTensors();
return { X, y };
} else {
throw invalid_argument("Dataset not loaded.");
}
}
void Dataset::load_csv()
{
ifstream file(path + "/" + name + ".csv");
if (file.is_open()) {
string line;
getline(file, line);
vector<string> tokens = split(line, ',');
features = vector<string>(tokens.begin(), tokens.end() - 1);
if (className == "-1") {
className = tokens.back();
}
for (auto i = 0; i < features.size(); ++i) {
Xv.push_back(vector<float>());
}
while (getline(file, line)) {
tokens = split(line, ',');
for (auto i = 0; i < features.size(); ++i) {
Xv[i].push_back(stof(tokens[i]));
}
yv.push_back(stoi(tokens.back()));
}
file.close();
} else {
throw invalid_argument("Unable to open dataset file.");
}
}
void Dataset::computeStates()
{
for (int i = 0; i < features.size(); ++i) {
states[features[i]] = vector<int>(*max_element(Xd[i].begin(), Xd[i].end()) + 1);
auto& item = states.at(features[i]); // must be a reference so iota fills the stored vector, not a copy
iota(begin(item), end(item), 0);
}
states[className] = vector<int>(*max_element(yv.begin(), yv.end()) + 1);
iota(begin(states.at(className)), end(states.at(className)), 0);
}
void Dataset::load_arff()
{
auto arff = ArffFiles();
arff.load(path + "/" + name + ".arff", className);
// Get Dataset X, y
Xv = arff.getX();
yv = arff.getY();
// Get className & Features
className = arff.getClassName();
auto attributes = arff.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& attribute) { return attribute.first; });
}
vector<string> tokenize(string line)
{
vector<string> tokens;
for (auto i = 0; i < line.size(); ++i) {
if (line[i] == ' ' || line[i] == '\t' || line[i] == '\n') {
string token = line.substr(0, i);
tokens.push_back(token);
line.erase(line.begin(), line.begin() + i + 1);
i = 0;
while (line[i] == ' ' || line[i] == '\t' || line[i] == '\n')
line.erase(line.begin(), line.begin() + i + 1);
}
}
if (line.size() > 0) {
tokens.push_back(line);
}
return tokens;
}
void Dataset::load_rdata()
{
ifstream file(path + "/" + name + "_R.dat");
if (file.is_open()) {
string line;
getline(file, line);
line = ArffFiles::trim(line);
vector<string> tokens = tokenize(line);
transform(tokens.begin(), tokens.end() - 1, back_inserter(features), [](const auto& attribute) { return ArffFiles::trim(attribute); });
if (className == "-1") {
className = ArffFiles::trim(tokens.back());
}
for (auto i = 0; i < features.size(); ++i) {
Xv.push_back(vector<float>());
}
while (getline(file, line)) {
tokens = tokenize(line);
// We have to skip the first token, which is the instance number.
for (auto i = 1; i < features.size() + 1; ++i) {
const float value = stof(tokens[i]);
Xv[i - 1].push_back(value);
}
yv.push_back(stoi(tokens.back()));
}
file.close();
} else {
throw invalid_argument("Unable to open dataset file.");
}
}
void Dataset::load()
{
if (loaded) {
return;
}
if (fileType == CSV) {
load_csv();
} else if (fileType == ARFF) {
load_arff();
} else if (fileType == RDATA) {
load_rdata();
}
if (discretize) {
Xd = discretizeDataset(Xv, yv);
computeStates();
}
n_samples = Xv[0].size();
n_features = Xv.size();
loaded = true;
}
void Dataset::buildTensors()
{
if (discretize) {
X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kInt32);
} else {
X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kFloat32);
}
for (int i = 0; i < features.size(); ++i) {
if (discretize) {
X.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
} else {
X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
}
}
y = torch::tensor(yv, torch::kInt32);
}
vector<mdlp::labels_t> Dataset::discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y)
{
vector<mdlp::labels_t> Xd;
auto fimdlp = mdlp::CPPFImdlp();
for (int i = 0; i < X.size(); i++) {
fimdlp.fit(X[i], y);
mdlp::labels_t& xd = fimdlp.transform(X[i]);
Xd.push_back(xd);
}
return Xd;
}
}
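As a usage sketch (the dataset name is hypothetical), the intended flow is: construct, load(), then pick a representation:

platform::Dataset ds("datasets/", "iris", "class", true, platform::ARFF);
ds.load();                     // parses the file, discretizes and computes states
auto [X, y] = ds.getTensors(); // X is n_features x n_samples, kInt32 when discretized
auto states = ds.getStates();  // value range of every feature and of the class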

src/Platform/Dataset.h Normal file

@@ -0,0 +1,80 @@
#ifndef DATASET_H
#define DATASET_H
#include <torch/torch.h>
#include <map>
#include <vector>
#include <string>
#include "CPPFImdlp.h"
#include "Utils.h"
namespace platform {
using namespace std;
enum fileType_t { CSV, ARFF, RDATA };
class SourceData {
public:
SourceData(string source)
{
if (source == "Surcov") {
path = "datasets/";
fileType = CSV;
} else if (source == "Arff") {
path = "datasets/";
fileType = ARFF;
} else if (source == "Tanveer") {
path = "data/";
fileType = RDATA;
} else {
throw invalid_argument("Unknown source.");
}
}
string getPath()
{
return path;
}
fileType_t getFileType()
{
return fileType;
}
private:
string path;
fileType_t fileType;
};
class Dataset {
private:
string path;
string name;
fileType_t fileType;
string className;
int n_samples{ 0 }, n_features{ 0 };
vector<string> features;
map<string, vector<int>> states;
bool loaded;
bool discretize;
torch::Tensor X, y;
vector<vector<float>> Xv;
vector<vector<int>> Xd;
vector<int> yv;
void buildTensors();
void load_csv();
void load_arff();
void load_rdata();
void computeStates();
vector<mdlp::labels_t> discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y);
public:
Dataset(const string& path, const string& name, const string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {};
explicit Dataset(const Dataset&);
string getName() const;
string getClassName() const;
vector<string> getFeatures() const;
map<string, vector<int>> getStates() const;
pair<vector<vector<float>>&, vector<int>&> getVectors();
pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized();
pair<torch::Tensor&, torch::Tensor&> getTensors();
int getNFeatures() const;
int getNSamples() const;
void load();
const bool inline isLoaded() const { return loaded; };
};
};
#endif

src/Platform/Datasets.cc

@@ -1,22 +1,31 @@
#include "Datasets.h"
#include "platformUtils.h"
#include "ArffFiles.h"
#include <fstream>
namespace platform {
void Datasets::load()
{
ifstream catalog(path + "/all.txt");
auto sd = SourceData(sfileType);
fileType = sd.getFileType();
path = sd.getPath();
ifstream catalog(path + "all.txt");
if (catalog.is_open()) {
string line;
while (getline(catalog, line)) {
if (line.empty() || line[0] == '#') {
continue;
}
vector<string> tokens = split(line, ',');
string name = tokens[0];
string className = tokens[1];
string className;
if (tokens.size() == 1) {
className = "-1";
} else {
className = tokens[1];
}
datasets[name] = make_unique<Dataset>(path, name, className, discretize, fileType);
}
catalog.close();
} else {
throw invalid_argument("Unable to open catalog file. [" + path + "/all.txt" + "]");
throw invalid_argument("Unable to open catalog file. [" + path + "all.txt" + "]");
}
}
vector<string> Datasets::getNames()
@@ -117,152 +126,4 @@ namespace platform {
{
return datasets.find(name) != datasets.end();
}
Dataset::Dataset(const Dataset& dataset) : path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples), n_features(dataset.n_features), features(dataset.features), states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv), fileType(dataset.fileType)
{
}
string Dataset::getName() const
{
return name;
}
string Dataset::getClassName() const
{
return className;
}
vector<string> Dataset::getFeatures() const
{
if (loaded) {
return features;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
int Dataset::getNFeatures() const
{
if (loaded) {
return n_features;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
int Dataset::getNSamples() const
{
if (loaded) {
return n_samples;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
map<string, vector<int>> Dataset::getStates() const
{
if (loaded) {
return states;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<vector<vector<float>>&, vector<int>&> Dataset::getVectors()
{
if (loaded) {
return { Xv, yv };
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<vector<vector<int>>&, vector<int>&> Dataset::getVectorsDiscretized()
{
if (loaded) {
return { Xd, yv };
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
{
if (loaded) {
buildTensors();
return { X, y };
} else {
throw invalid_argument("Dataset not loaded.");
}
}
void Dataset::load_csv()
{
ifstream file(path + "/" + name + ".csv");
if (file.is_open()) {
string line;
getline(file, line);
vector<string> tokens = split(line, ',');
features = vector<string>(tokens.begin(), tokens.end() - 1);
className = tokens.back();
for (auto i = 0; i < features.size(); ++i) {
Xv.push_back(vector<float>());
}
while (getline(file, line)) {
tokens = split(line, ',');
for (auto i = 0; i < features.size(); ++i) {
Xv[i].push_back(stof(tokens[i]));
}
yv.push_back(stoi(tokens.back()));
}
file.close();
} else {
throw invalid_argument("Unable to open dataset file.");
}
}
void Dataset::computeStates()
{
for (int i = 0; i < features.size(); ++i) {
states[features[i]] = vector<int>(*max_element(Xd[i].begin(), Xd[i].end()) + 1);
auto item = states.at(features[i]);
iota(begin(item), end(item), 0);
}
states[className] = vector<int>(*max_element(yv.begin(), yv.end()) + 1);
iota(begin(states.at(className)), end(states.at(className)), 0);
}
void Dataset::load_arff()
{
auto arff = ArffFiles();
arff.load(path + "/" + name + ".arff", className);
// Get Dataset X, y
Xv = arff.getX();
yv = arff.getY();
// Get className & Features
className = arff.getClassName();
auto attributes = arff.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& attribute) { return attribute.first; });
}
void Dataset::load()
{
if (loaded) {
return;
}
if (fileType == CSV) {
load_csv();
} else if (fileType == ARFF) {
load_arff();
}
if (discretize) {
Xd = discretizeDataset(Xv, yv);
computeStates();
}
n_samples = Xv[0].size();
n_features = Xv.size();
loaded = true;
}
void Dataset::buildTensors()
{
if (discretize) {
X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kInt32);
} else {
X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kFloat32);
}
for (int i = 0; i < features.size(); ++i) {
if (discretize) {
X.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
} else {
X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
}
}
y = torch::tensor(yv, torch::kInt32);
}
}

src/Platform/Datasets.h

@@ -1,55 +1,18 @@
#ifndef DATASETS_H
#define DATASETS_H
#include <torch/torch.h>
#include <map>
#include <vector>
#include <string>
#include "Dataset.h"
namespace platform {
using namespace std;
enum fileType_t { CSV, ARFF };
class Dataset {
private:
string path;
string name;
fileType_t fileType;
string className;
int n_samples{ 0 }, n_features{ 0 };
vector<string> features;
map<string, vector<int>> states;
bool loaded;
bool discretize;
torch::Tensor X, y;
vector<vector<float>> Xv;
vector<vector<int>> Xd;
vector<int> yv;
void buildTensors();
void load_csv();
void load_arff();
void computeStates();
public:
Dataset(const string& path, const string& name, const string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {};
explicit Dataset(const Dataset&);
string getName() const;
string getClassName() const;
vector<string> getFeatures() const;
map<string, vector<int>> getStates() const;
pair<vector<vector<float>>&, vector<int>&> getVectors();
pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized();
pair<torch::Tensor&, torch::Tensor&> getTensors();
int getNFeatures() const;
int getNSamples() const;
void load();
const bool inline isLoaded() const { return loaded; };
};
class Datasets {
private:
string path;
fileType_t fileType;
string sfileType;
map<string, unique_ptr<Dataset>> datasets;
bool discretize;
void load(); // Loads the list of datasets
public:
explicit Datasets(const string& path, bool discretize = false, fileType_t fileType = ARFF) : path(path), discretize(discretize), fileType(fileType) { load(); };
explicit Datasets(bool discretize, string sfileType) : discretize(discretize), sfileType(sfileType) { load(); };
vector<string> getNames();
vector<string> getFeatures(const string& name) const;
int getNSamples(const string& name) const;

src/Platform/DotEnv.h

@@ -4,7 +4,11 @@
#include <map>
#include <fstream>
#include <sstream>
#include "platformUtils.h"
#include <algorithm>
#include <iostream>
#include "Utils.h"
//#include "Dataset.h"
namespace platform {
class DotEnv {
private:
@@ -43,7 +47,7 @@ namespace platform {
}
std::string get(const std::string& key)
{
return env[key];
return env.at(key);
}
std::vector<int> getSeeds()
{

src/Platform/ExcelFile.cc Normal file

@@ -0,0 +1,164 @@
#include "ExcelFile.h"
namespace platform {
ExcelFile::ExcelFile()
{
setDefault();
}
ExcelFile::ExcelFile(lxw_workbook* workbook) : workbook(workbook)
{
setDefault();
}
void ExcelFile::setDefault()
{
normalSize = 14; //font size for report body
row = 0;
colorTitle = 0xB1A0C7;
colorOdd = 0xDCE6F1;
colorEven = 0xFDE9D9;
}
lxw_workbook* ExcelFile::getWorkbook()
{
return workbook;
}
void ExcelFile::setProperties(string title)
{
char line[title.size() + 1];
strcpy(line, title.c_str());
lxw_doc_properties properties = {
.title = line,
.subject = (char*)"Machine learning results",
.author = (char*)"Ricardo Montañana Gómez",
.manager = (char*)"Dr. J. A. Gámez, Dr. J. M. Puerta",
.company = (char*)"UCLM",
.comments = (char*)"Created with libxlsxwriter and c++",
};
workbook_set_properties(workbook, &properties);
}
lxw_format* ExcelFile::efectiveStyle(const string& style)
{
lxw_format* efectiveStyle = NULL;
if (style != "") {
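// Striped styles are registered by createFormats() as "<name>_odd"/"<name>_even";
// fixed styles (headerFirst, headerRest, ...) use their bare name, hence the fallback below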
string suffix = row % 2 ? "_odd" : "_even";
try {
efectiveStyle = styles.at(style + suffix);
}
catch (const out_of_range& oor) {
try {
efectiveStyle = styles.at(style);
}
catch (const out_of_range& oor) {
throw invalid_argument("Style " + style + " not found");
}
}
}
return efectiveStyle;
}
void ExcelFile::writeString(int row, int col, const string& text, const string& style)
{
worksheet_write_string(worksheet, row, col, text.c_str(), efectiveStyle(style));
}
void ExcelFile::writeInt(int row, int col, const int number, const string& style)
{
worksheet_write_number(worksheet, row, col, number, efectiveStyle(style));
}
void ExcelFile::writeDouble(int row, int col, const double number, const string& style)
{
worksheet_write_number(worksheet, row, col, number, efectiveStyle(style));
}
void ExcelFile::addColor(lxw_format* style, bool odd)
{
uint32_t efectiveColor = odd ? colorEven : colorOdd;
format_set_bg_color(style, lxw_color_t(efectiveColor));
}
void ExcelFile::createStyle(const string& name, lxw_format* style, bool odd)
{
addColor(style, odd);
if (name == "textCentered") {
format_set_align(style, LXW_ALIGN_CENTER);
format_set_font_size(style, normalSize);
format_set_border(style, LXW_BORDER_THIN);
} else if (name == "text") {
format_set_font_size(style, normalSize);
format_set_border(style, LXW_BORDER_THIN);
} else if (name == "bodyHeader") {
format_set_bold(style);
format_set_font_size(style, normalSize);
format_set_align(style, LXW_ALIGN_CENTER);
format_set_align(style, LXW_ALIGN_VERTICAL_CENTER);
format_set_border(style, LXW_BORDER_THIN);
format_set_bg_color(style, lxw_color_t(colorTitle));
} else if (name == "result") {
format_set_font_size(style, normalSize);
format_set_border(style, LXW_BORDER_THIN);
format_set_num_format(style, "0.0000000");
} else if (name == "time") {
format_set_font_size(style, normalSize);
format_set_border(style, LXW_BORDER_THIN);
format_set_num_format(style, "#,##0.000000");
} else if (name == "ints") {
format_set_font_size(style, normalSize);
format_set_num_format(style, "###,##0");
format_set_border(style, LXW_BORDER_THIN);
} else if (name == "floats") {
format_set_border(style, LXW_BORDER_THIN);
format_set_font_size(style, normalSize);
format_set_num_format(style, "#,##0.00");
}
}
void ExcelFile::createFormats()
{
auto styleNames = { "text", "textCentered", "bodyHeader", "result", "time", "ints", "floats" };
lxw_format* style;
for (string name : styleNames) {
style = workbook_add_format(workbook);
createStyle(name, style, true);
styles[name + "_odd"] = style;
style = workbook_add_format(workbook);
createStyle(name, style, false);
styles[name + "_even"] = style;
}
// Header 1st line
lxw_format* headerFirst = workbook_add_format(workbook);
format_set_bold(headerFirst);
format_set_font_size(headerFirst, 18);
format_set_align(headerFirst, LXW_ALIGN_CENTER);
format_set_align(headerFirst, LXW_ALIGN_VERTICAL_CENTER);
format_set_border(headerFirst, LXW_BORDER_THIN);
format_set_bg_color(headerFirst, lxw_color_t(colorTitle));
// Header rest
lxw_format* headerRest = workbook_add_format(workbook);
format_set_bold(headerRest);
format_set_align(headerRest, LXW_ALIGN_CENTER);
format_set_font_size(headerRest, 16);
format_set_align(headerRest, LXW_ALIGN_VERTICAL_CENTER);
format_set_border(headerRest, LXW_BORDER_THIN);
format_set_bg_color(headerRest, lxw_color_t(colorOdd));
// Header small
lxw_format* headerSmall = workbook_add_format(workbook);
format_set_bold(headerSmall);
format_set_align(headerSmall, LXW_ALIGN_LEFT);
format_set_font_size(headerSmall, 12);
format_set_border(headerSmall, LXW_BORDER_THIN);
format_set_align(headerSmall, LXW_ALIGN_VERTICAL_CENTER);
format_set_bg_color(headerSmall, lxw_color_t(colorOdd));
// Summary style
lxw_format* summaryStyle = workbook_add_format(workbook);
format_set_bold(summaryStyle);
format_set_font_size(summaryStyle, 16);
format_set_border(summaryStyle, LXW_BORDER_THIN);
format_set_align(summaryStyle, LXW_ALIGN_VERTICAL_CENTER);
styles["headerFirst"] = headerFirst;
styles["headerRest"] = headerRest;
styles["headerSmall"] = headerSmall;
styles["summaryStyle"] = summaryStyle;
}
}

src/Platform/ExcelFile.h Normal file

@@ -0,0 +1,43 @@
#ifndef EXCELFILE_H
#define EXCELFILE_H
#include <locale>
#include <string>
#include <map>
#include "xlsxwriter.h"
using namespace std;
namespace platform {
struct separated : numpunct<char> {
char do_decimal_point() const { return ','; }
char do_thousands_sep() const { return '.'; }
string do_grouping() const { return "\03"; }
};
class ExcelFile {
public:
ExcelFile();
ExcelFile(lxw_workbook* workbook);
lxw_workbook* getWorkbook();
protected:
void setProperties(string title);
void writeString(int row, int col, const string& text, const string& style = "");
void writeInt(int row, int col, const int number, const string& style = "");
void writeDouble(int row, int col, const double number, const string& style = "");
void createFormats();
void createStyle(const string& name, lxw_format* style, bool odd);
void addColor(lxw_format* style, bool odd);
lxw_format* efectiveStyle(const string& name);
lxw_workbook* workbook;
lxw_worksheet* worksheet;
map<string, lxw_format*> styles;
int row;
int normalSize; //font size for report body
uint32_t colorTitle;
uint32_t colorOdd;
uint32_t colorEven;
private:
void setDefault();
};
}
#endif // !EXCELFILE_H

src/Platform/Experiment.cc

@@ -1,8 +1,9 @@
#include <fstream>
#include "Experiment.h"
#include "Datasets.h"
#include "Models.h"
#include "ReportConsole.h"
#include <fstream>
#include "Paths.h"
namespace platform {
using json = nlohmann::json;
string get_date()
@@ -101,12 +102,12 @@ namespace platform {
cout << data.dump(4) << endl;
}
void Experiment::go(vector<string> filesToProcess, const string& path)
void Experiment::go(vector<string> filesToProcess)
{
cout << "*** Starting experiment: " << title << " ***" << endl;
for (auto fileName : filesToProcess) {
cout << "- " << setw(20) << left << fileName << " " << right << flush;
cross_validation(path, fileName);
cross_validation(fileName);
cout << endl;
}
}
@@ -131,9 +132,9 @@ namespace platform {
cout << prefix << color << fold << Colors::RESET() << "(" << color << phase << Colors::RESET() << ")" << flush;
}
void Experiment::cross_validation(const string& path, const string& fileName)
void Experiment::cross_validation(const string& fileName)
{
auto datasets = platform::Datasets(path, discretized, platform::ARFF);
auto datasets = platform::Datasets(discretized, Paths::datasets());
// Get dataset
auto [X, y] = datasets.getTensors(fileName);
auto states = datasets.getStates(fileName);

src/Platform/Experiment.h

@@ -108,8 +108,8 @@ namespace platform {
Experiment& setHyperparameters(const json& hyperparameters) { this->hyperparameters = hyperparameters; return *this; }
string get_file_name();
void save(const string& path);
void cross_validation(const string& path, const string& fileName);
void go(vector<string> filesToProcess, const string& path);
void cross_validation(const string& fileName);
void go(vector<string> filesToProcess);
void show();
void report();
};

src/Platform/Folding.cc

@@ -47,6 +47,7 @@ namespace platform {
{
stratified_indices = vector<vector<int>>(k);
int fold_size = n / k;
// Compute class counts and indices
auto class_indices = map<int, vector<int>>();
vector<int> class_counts(*max_element(y.begin(), y.end()) + 1, 0);
@@ -60,20 +61,26 @@ namespace platform {
}
// Assign indices to folds
for (auto label = 0; label < class_counts.size(); ++label) {
auto num_samples_to_take = class_counts[label] / k;
if (num_samples_to_take == 0)
auto num_samples_to_take = class_counts.at(label) / k;
if (num_samples_to_take == 0) {
cerr << "Warning! The number of samples in class " << label << " (" << class_counts.at(label)
<< ") is less than the number of folds (" << k << ")." << endl;
faulty = true;
continue;
}
auto remainder_samples_to_take = class_counts[label] % k;
for (auto fold = 0; fold < k; ++fold) {
auto it = next(class_indices[label].begin(), num_samples_to_take);
move(class_indices[label].begin(), it, back_inserter(stratified_indices[fold])); // ##
class_indices[label].erase(class_indices[label].begin(), it);
}
auto chosen = vector<bool>(k, false);
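// Deal out the leftover samples (class count % k) one at a time to randomly
// picked folds; 'chosen' ensures a fold gets at most one extra sample per class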
while (remainder_samples_to_take > 0) {
int fold = (rand() % static_cast<int>(k));
if (stratified_indices[fold].size() == fold_size + 1) {
if (chosen.at(fold)) {
continue;
}
chosen[fold] = true;
auto it = next(class_indices[label].begin(), 1);
stratified_indices[fold].push_back(*class_indices[label].begin());
class_indices[label].erase(class_indices[label].begin(), it);

src/Platform/Folding.h

@@ -29,10 +29,12 @@ namespace platform {
vector<int> y;
vector<vector<int>> stratified_indices;
void build();
bool faulty = false; // Only true if the number of samples of any class is less than the number of folds.
public:
StratifiedKFold(int k, const vector<int>& y, int seed = -1);
StratifiedKFold(int k, torch::Tensor& y, int seed = -1);
pair<vector<int>, vector<int>> getFold(int nFold) override;
bool isFaulty() { return faulty; }
};
}
#endif

src/Platform/Paths.h

@@ -1,12 +1,18 @@
#ifndef PATHS_H
#define PATHS_H
#include <string>
#include "DotEnv.h"
namespace platform {
class Paths {
public:
static std::string datasets() { return "datasets/"; }
static std::string results() { return "results/"; }
static std::string excel() { return "excel/"; }
static std::string cfs() { return "cfs/"; }
static std::string datasets()
{
auto env = platform::DotEnv();
return env.get("source_data");
}
};
}
#endif
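Since Paths::datasets() now delegates to DotEnv, the data source is chosen in the project's .env file; a minimal sketch (the key comes from the code above, the value must be one of the sources recognized by SourceData in Dataset.h):

source_data=Tanveer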

src/Platform/ReportBase.cc

@@ -3,7 +3,7 @@
#include "Datasets.h"
#include "ReportBase.h"
#include "BestScore.h"
#include "DotEnv.h"
namespace platform {
ReportBase::ReportBase(json data_, bool compare) : data(data_), compare(compare), margin(0.1)
@@ -58,7 +58,7 @@ namespace platform {
}
} else {
if (data["score_name"].get<string>() == "accuracy") {
auto dt = Datasets(Paths::datasets(), false);
auto dt = Datasets(false, Paths::datasets());
dt.loadDataset(dataset);
auto numClasses = dt.getNClasses(dataset);
if (numClasses == 2) {

src/Platform/ReportConsole.cc

@@ -2,15 +2,9 @@
#include <locale>
#include "ReportConsole.h"
#include "BestScore.h"
#include "CLocale.h"
namespace platform {
struct separated : numpunct<char> {
char do_decimal_point() const { return ','; }
char do_thousands_sep() const { return '.'; }
string do_grouping() const { return "\03"; }
};
string ReportConsole::headerLine(const string& text, int utf = 0)
{
int n = MAXL - text.length() - 3;
@@ -20,9 +14,6 @@ namespace platform {
void ReportConsole::header()
{
locale mylocale(cout.getloc(), new separated);
locale::global(mylocale);
cout.imbue(mylocale);
stringstream oss;
cout << Colors::MAGENTA() << string(MAXL, '*') << endl;
cout << headerLine("Report " + data["model"].get<string>() + " ver. " + data["version"].get<string>() + " with " + to_string(data["folds"].get<int>()) + " Folds cross validation and " + to_string(data["seeds"].size()) + " random seeds. " + data["date"].get<string>() + " " + data["time"].get<string>());
@@ -36,6 +27,7 @@ namespace platform {
}
void ReportConsole::body()
{
auto tmp = ConfigLocale();
cout << Colors::GREEN() << " # Dataset Sampl. Feat. Cls Nodes Edges States Score Time Hyperparameters" << endl;
cout << "=== ========================= ====== ===== === ========= ========= ========= =============== =================== ====================" << endl;
json lastResult;
@@ -61,13 +53,9 @@ namespace platform {
const string status = compareResult(r["dataset"].get<string>(), r["score"].get<double>());
cout << status;
cout << setw(12) << right << setprecision(6) << fixed << r["time"].get<double>() << "±" << setw(6) << setprecision(4) << fixed << r["time_std"].get<double>() << " ";
try {
cout << r["hyperparameters"].get<string>();
}
catch (const exception& err) {
cout << r["hyperparameters"];
}
cout << r["hyperparameters"].dump();
cout << endl;
cout << flush;
lastResult = r;
totalScore += r["score"].get<double>();
odd = !odd;

src/Platform/ReportExcel.cc

@@ -5,53 +5,12 @@
namespace platform {
struct separated : numpunct<char> {
char do_decimal_point() const { return ','; }
char do_thousands_sep() const { return '.'; }
string do_grouping() const { return "\03"; }
};
ReportExcel::ReportExcel(json data_, bool compare, lxw_workbook* workbook) : ReportBase(data_, compare), row(0), workbook(workbook)
ReportExcel::ReportExcel(json data_, bool compare, lxw_workbook* workbook) : ReportBase(data_, compare), ExcelFile(workbook)
{
normalSize = 14; //font size for report body
colorTitle = 0xB1A0C7;
colorOdd = 0xDCE6F1;
colorEven = 0xFDE9D9;
createFile();
}
lxw_workbook* ReportExcel::getWorkbook()
{
return workbook;
}
lxw_format* ReportExcel::efectiveStyle(const string& style)
{
lxw_format* efectiveStyle;
if (style == "") {
efectiveStyle = NULL;
} else {
string suffix = row % 2 ? "_odd" : "_even";
efectiveStyle = styles.at(style + suffix);
}
return efectiveStyle;
}
void ReportExcel::writeString(int row, int col, const string& text, const string& style)
{
worksheet_write_string(worksheet, row, col, text.c_str(), efectiveStyle(style));
}
void ReportExcel::writeInt(int row, int col, const int number, const string& style)
{
worksheet_write_number(worksheet, row, col, number, efectiveStyle(style));
}
void ReportExcel::writeDouble(int row, int col, const double number, const string& style)
{
worksheet_write_number(worksheet, row, col, number, efectiveStyle(style));
}
void ReportExcel::formatColumns()
{
worksheet_freeze_panes(worksheet, 6, 1);
@@ -61,116 +20,6 @@ namespace platform {
}
}
void ReportExcel::addColor(lxw_format* style, bool odd)
{
uint32_t efectiveColor = odd ? colorEven : colorOdd;
format_set_bg_color(style, lxw_color_t(efectiveColor));
}
void ReportExcel::createStyle(const string& name, lxw_format* style, bool odd)
{
addColor(style, odd);
if (name == "textCentered") {
format_set_align(style, LXW_ALIGN_CENTER);
format_set_font_size(style, normalSize);
format_set_border(style, LXW_BORDER_THIN);
} else if (name == "text") {
format_set_font_size(style, normalSize);
format_set_border(style, LXW_BORDER_THIN);
} else if (name == "bodyHeader") {
format_set_bold(style);
format_set_font_size(style, normalSize);
format_set_align(style, LXW_ALIGN_CENTER);
format_set_align(style, LXW_ALIGN_VERTICAL_CENTER);
format_set_border(style, LXW_BORDER_THIN);
format_set_bg_color(style, lxw_color_t(colorTitle));
} else if (name == "result") {
format_set_font_size(style, normalSize);
format_set_border(style, LXW_BORDER_THIN);
format_set_num_format(style, "0.0000000");
} else if (name == "time") {
format_set_font_size(style, normalSize);
format_set_border(style, LXW_BORDER_THIN);
format_set_num_format(style, "#,##0.000000");
} else if (name == "ints") {
format_set_font_size(style, normalSize);
format_set_num_format(style, "###,##0");
format_set_border(style, LXW_BORDER_THIN);
} else if (name == "floats") {
format_set_border(style, LXW_BORDER_THIN);
format_set_font_size(style, normalSize);
format_set_num_format(style, "#,##0.00");
}
}
void ReportExcel::createFormats()
{
auto styleNames = { "text", "textCentered", "bodyHeader", "result", "time", "ints", "floats" };
lxw_format* style;
for (string name : styleNames) {
lxw_format* style = workbook_add_format(workbook);
style = workbook_add_format(workbook);
createStyle(name, style, true);
styles[name + "_odd"] = style;
style = workbook_add_format(workbook);
createStyle(name, style, false);
styles[name + "_even"] = style;
}
// Header 1st line
lxw_format* headerFirst = workbook_add_format(workbook);
format_set_bold(headerFirst);
format_set_font_size(headerFirst, 18);
format_set_align(headerFirst, LXW_ALIGN_CENTER);
format_set_align(headerFirst, LXW_ALIGN_VERTICAL_CENTER);
format_set_border(headerFirst, LXW_BORDER_THIN);
format_set_bg_color(headerFirst, lxw_color_t(colorTitle));
// Header rest
lxw_format* headerRest = workbook_add_format(workbook);
format_set_bold(headerRest);
format_set_align(headerRest, LXW_ALIGN_CENTER);
format_set_font_size(headerRest, 16);
format_set_align(headerRest, LXW_ALIGN_VERTICAL_CENTER);
format_set_border(headerRest, LXW_BORDER_THIN);
format_set_bg_color(headerRest, lxw_color_t(colorOdd));
// Header small
lxw_format* headerSmall = workbook_add_format(workbook);
format_set_bold(headerSmall);
format_set_align(headerSmall, LXW_ALIGN_LEFT);
format_set_font_size(headerSmall, 12);
format_set_border(headerSmall, LXW_BORDER_THIN);
format_set_align(headerSmall, LXW_ALIGN_VERTICAL_CENTER);
format_set_bg_color(headerSmall, lxw_color_t(colorOdd));
// Summary style
lxw_format* summaryStyle = workbook_add_format(workbook);
format_set_bold(summaryStyle);
format_set_font_size(summaryStyle, 16);
format_set_border(summaryStyle, LXW_BORDER_THIN);
format_set_align(summaryStyle, LXW_ALIGN_VERTICAL_CENTER);
styles["headerFirst"] = headerFirst;
styles["headerRest"] = headerRest;
styles["headerSmall"] = headerSmall;
styles["summaryStyle"] = summaryStyle;
}
void ReportExcel::setProperties()
{
char line[data["title"].get<string>().size() + 1];
strcpy(line, data["title"].get<string>().c_str());
lxw_doc_properties properties = {
.title = line,
.subject = (char*)"Machine learning results",
.author = (char*)"Ricardo Montañana Gómez",
.manager = (char*)"Dr. J. A. Gámez, Dr. J. M. Puerta",
.company = (char*)"UCLM",
.comments = (char*)"Created with libxlsxwriter and c++",
};
workbook_set_properties(workbook, &properties);
}
void ReportExcel::createFile()
{
if (workbook == NULL) {
@@ -194,7 +43,7 @@ namespace platform {
}
}
cout << "Adding sheet " << efectiveName << " to " << Paths::excel() + fileName << endl;
setProperties();
setProperties(data["title"].get<string>());
createFormats();
formatColumns();
}

src/Platform/ReportExcel.h

@@ -3,40 +3,23 @@
#include<map>
#include "xlsxwriter.h"
#include "ReportBase.h"
#include "ExcelFile.h"
#include "Colors.h"
namespace platform {
using namespace std;
const int MAXLL = 128;
class ReportExcel : public ReportBase {
class ReportExcel : public ReportBase, public ExcelFile {
public:
explicit ReportExcel(json data_, bool compare, lxw_workbook* workbook);
lxw_workbook* getWorkbook();
private:
void writeString(int row, int col, const string& text, const string& style = "");
void writeInt(int row, int col, const int number, const string& style = "");
void writeDouble(int row, int col, const double number, const string& style = "");
const string fileName = "some_results.xlsx";
void formatColumns();
void createFormats();
void setProperties();
void createFile();
void closeFile();
lxw_workbook* workbook;
lxw_worksheet* worksheet;
map<string, lxw_format*> styles;
int row;
int normalSize; //font size for report body
uint32_t colorTitle;
uint32_t colorOdd;
uint32_t colorEven;
const string fileName = "some_results.xlsx";
void header() override;
void body() override;
void showSummary() override;
void footer(double totalScore, int row);
void createStyle(const string& name, lxw_format* style, bool odd);
void addColor(lxw_format* style, bool odd);
lxw_format* efectiveStyle(const string& name);
};
};
#endif // !REPORTEXCEL_H

src/Platform/Result.cc

@@ -4,6 +4,8 @@
#include "Result.h"
#include "Colors.h"
#include "BestScore.h"
#include "CLocale.h"
namespace platform {
Result::Result(const string& path, const string& filename)
: path(path)
@@ -37,14 +39,17 @@ namespace platform {
string Result::to_string() const
{
auto tmp = ConfigLocale();
stringstream oss;
double durationShow = duration > 3600 ? duration / 3600 : duration > 60 ? duration / 60 : duration;
string durationUnit = duration > 3600 ? "h" : duration > 60 ? "m" : "s";
oss << date << " ";
oss << setw(12) << left << model << " ";
oss << setw(11) << left << scoreName << " ";
oss << right << setw(11) << setprecision(7) << fixed << score << " ";
auto completeString = isComplete() ? "C" : "P";
oss << setw(1) << " " << completeString << " ";
oss << setw(9) << setprecision(3) << fixed << duration << " ";
oss << setw(7) << setprecision(2) << fixed << durationShow << " " << durationUnit << " ";
oss << setw(50) << left << title << " ";
return oss.str();
}

src/Platform/Results.cc

@@ -1,10 +1,10 @@
#include <filesystem>
#include "platformUtils.h"
#include "Results.h"
#include "ReportConsole.h"
#include "ReportExcel.h"
#include "BestScore.h"
#include "Colors.h"
#include "CLocale.h"
namespace platform {
void Results::load()
{
@@ -26,6 +26,7 @@ namespace platform {
}
void Results::show() const
{
auto temp = ConfigLocale();
cout << Colors::GREEN() << "Results found: " << files.size() << endl;
cout << "-------------------" << endl;
if (complete) {

src/Platform/Statistics.cc

@@ -1,15 +1,20 @@
#include <sstream>
#include "Statistics.h"
#include "Colors.h"
#include "Symbols.h"
#include <boost/math/distributions/chi_squared.hpp>
#include <boost/math/distributions/normal.hpp>
#include "CLocale.h"
namespace platform {
Statistics::Statistics(vector<string>& models, vector<string>& datasets, json data, double significance) : models(models), datasets(datasets), data(data), significance(significance)
Statistics::Statistics(const vector<string>& models, const vector<string>& datasets, const json& data, double significance, bool output) :
models(models), datasets(datasets), data(data), significance(significance), output(output)
{
nModels = models.size();
nDatasets = datasets.size();
auto temp = ConfigLocale();
};
void Statistics::fit()
@@ -19,10 +24,13 @@ namespace platform {
cerr << "nDatasets: " << nDatasets << endl;
throw runtime_error("Can't make the Friedman test with less than 3 models and/or less than 3 datasets.");
}
ranksModels.clear();
computeRanks();
// Set the control model as the one with the lowest average rank
controlIdx = distance(ranks.begin(), min_element(ranks.begin(), ranks.end(), [](const auto& l, const auto& r) { return l.second < r.second; }));
computeWTL();
maxModelName = (*max_element(models.begin(), models.end(), [](const string& a, const string& b) { return a.size() < b.size(); })).size();
maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const string& a, const string& b) { return a.size() < b.size(); })).size();
fitted = true;
}
map<string, float> assignRanks(vector<pair<string, double>>& ranksOrder)
@@ -64,6 +72,8 @@ namespace platform {
}
// Assign the ranks
ranksLine = assignRanks(ranksOrder);
// Store the ranks of the dataset
ranksModels[dataset] = ranksLine;
if (ranks.size() == 0) {
ranks = ranksLine;
} else {
@@ -108,6 +118,7 @@ namespace platform {
if (!fitted) {
fit();
}
stringstream oss;
// Reference https://link.springer.com/article/10.1007/s44196-022-00083-8
// Post-hoc Holm test
// Calculate the p-value for the models paired with the control model
@@ -140,13 +151,14 @@ namespace platform {
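// Holm step-down: keep the adjusted p-values monotonically non-decreasing
// by taking the max with the previously adjusted value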
p_value = max(before, p_value);
statsOrder[i] = { item.first, p_value };
}
holmResult.model = models.at(controlIdx);
auto color = friedmanResult ? Colors::CYAN() : Colors::YELLOW();
cout << color;
cout << " *************************************************************************************************************" << endl;
cout << " Post-hoc Holm test: H0: 'There is no significant differences between the control model and the other models.'" << endl;
cout << " Control model: " << models[controlIdx] << endl;
cout << " Model p-value rank win tie loss Status" << endl;
cout << " ============ ============ ========= === === ==== =============" << endl;
oss << color;
oss << " *************************************************************************************************************" << endl;
oss << " Post-hoc Holm test: H0: 'There is no significant differences between the control model and the other models.'" << endl;
oss << " Control model: " << models.at(controlIdx) << endl;
oss << " " << left << setw(maxModelName) << string("Model") << " p-value rank win tie loss Status" << endl;
oss << " " << string(maxModelName, '=') << " ============ ========= === === ==== =============" << endl;
// sort ranks from lowest to highest
vector<pair<string, float>> ranksOrder;
for (const auto& rank : ranks) {
@@ -155,10 +167,10 @@ namespace platform {
sort(ranksOrder.begin(), ranksOrder.end(), [](const pair<string, float>& a, const pair<string, float>& b) {
return a.second < b.second;
});
// Show the control model info.
oss << " " << Colors::BLUE() << left << setw(maxModelName) << ranksOrder.at(0).first << " ";
oss << setw(12) << " " << setprecision(7) << fixed << " " << ranksOrder.at(0).second << endl;
for (const auto& item : ranksOrder) {
if (item.first == models.at(controlIdx)) {
continue;
}
auto idx = distance(models.begin(), find(models.begin(), models.end(), item.first));
double pvalue = 0.0;
for (const auto& stat : statsOrder) {
@@ -166,26 +178,35 @@ namespace platform {
pvalue = stat.second;
}
}
holmResult.holmLines.push_back({ item.first, pvalue, item.second, wtl.at(idx), pvalue < significance });
if (item.first == models.at(controlIdx)) {
continue;
}
auto colorStatus = pvalue > significance ? Colors::GREEN() : Colors::MAGENTA();
auto status = pvalue > significance ? Symbols::check_mark : Symbols::cross;
auto textStatus = pvalue > significance ? " accepted H0" : " rejected H0";
cout << " " << colorStatus << left << setw(12) << item.first << " " << setprecision(6) << scientific << pvalue << setprecision(7) << fixed << " " << item.second;
cout << " " << right << setw(3) << wtl.at(idx).win << " " << setw(3) << wtl.at(idx).tie << " " << setw(4) << wtl.at(idx).loss;
cout << " " << status << textStatus << endl;
oss << " " << colorStatus << left << setw(maxModelName) << item.first << " ";
oss << setprecision(6) << scientific << pvalue << setprecision(7) << fixed << " " << item.second;
oss << " " << right << setw(3) << wtl.at(idx).win << " " << setw(3) << wtl.at(idx).tie << " " << setw(4) << wtl.at(idx).loss;
oss << " " << status << textStatus << endl;
}
oss << color << " *************************************************************************************************************" << endl;
oss << Colors::RESET();
if (output) {
cout << oss.str();
}
cout << color << " *************************************************************************************************************" << endl;
cout << Colors::RESET();
}
bool Statistics::friedmanTest()
{
if (!fitted) {
fit();
}
stringstream oss;
// Friedman test
// Calculate the Friedman statistic
cout << Colors::BLUE() << endl;
cout << "***************************************************************************************************************" << endl;
cout << Colors::GREEN() << "Friedman test: H0: 'There are no significant differences between all the classifiers.'" << Colors::BLUE() << endl;
oss << Colors::BLUE() << endl;
oss << "***************************************************************************************************************" << endl;
oss << Colors::GREEN() << "Friedman test: H0: 'There are no significant differences between all the classifiers.'" << Colors::BLUE() << endl;
double degreesOfFreedom = nModels - 1.0;
double sumSquared = 0;
for (const auto& rank : ranks) {
@@ -193,23 +214,39 @@ namespace platform {
}
// Compute the Friedman statistic as in https://link.springer.com/article/10.1007/s44196-022-00083-8
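// Q = 12 N / (k (k+1)) * ( sum_j R_j^2 - k (k+1)^2 / 4 ),
// with N = nDatasets, k = nModels and R_j the average rank of model j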
double friedmanQ = 12.0 * nDatasets / (nModels * (nModels + 1)) * (sumSquared - (nModels * pow(nModels + 1, 2)) / 4);
cout << "Friedman statistic: " << friedmanQ << endl;
// Calculate the critical value
boost::math::chi_squared chiSquared(degreesOfFreedom);
long double p_value = (long double)1.0 - cdf(chiSquared, friedmanQ);
double criticalValue = quantile(chiSquared, 1 - significance);
std::cout << "Critical Chi-Square Value for df=" << fixed << (int)degreesOfFreedom
oss << "Friedman statistic: " << friedmanQ << endl;
oss << "Critical χ2 Value for df=" << fixed << (int)degreesOfFreedom
<< " and alpha=" << setprecision(2) << fixed << significance << ": " << setprecision(7) << scientific << criticalValue << std::endl;
cout << "p-value: " << scientific << p_value << " is " << (p_value < significance ? "less" : "greater") << " than " << setprecision(2) << fixed << significance << endl;
oss << "p-value: " << scientific << p_value << " is " << (p_value < significance ? "less" : "greater") << " than " << setprecision(2) << fixed << significance << endl;
bool result;
if (p_value < significance) {
cout << Colors::GREEN() << "The null hypothesis H0 is rejected." << endl;
oss << Colors::GREEN() << "The null hypothesis H0 is rejected." << endl;
result = true;
} else {
cout << Colors::YELLOW() << "The null hypothesis H0 is accepted. Computed p-values will not be significant." << endl;
oss << Colors::YELLOW() << "The null hypothesis H0 is accepted. Computed p-values will not be significant." << endl;
result = false;
}
cout << Colors::BLUE() << "***************************************************************************************************************" << Colors::RESET() << endl;
oss << Colors::BLUE() << "***************************************************************************************************************" << Colors::RESET() << endl;
if (output) {
cout << oss.str();
}
friedmanResult = { friedmanQ, criticalValue, p_value, result };
return result;
}
FriedmanResult& Statistics::getFriedmanResult()
{
return friedmanResult;
}
HolmResult& Statistics::getHolmResult()
{
return holmResult;
}
map<string, map<string, float>>& Statistics::getRanks()
{
return ranksModels;
}
} // namespace platform

src/Platform/Statistics.h

@@ -2,6 +2,7 @@
#define STATISTICS_H
#include <iostream>
#include <vector>
#include <map>
#include <nlohmann/json.hpp>
using namespace std;
@@ -13,25 +14,51 @@ namespace platform {
int tie;
int loss;
};
struct FriedmanResult {
double statistic;
double criticalValue;
long double pvalue;
bool reject;
};
struct HolmLine {
string model;
long double pvalue;
double rank;
WTL wtl;
bool reject;
};
struct HolmResult {
string model;
vector<HolmLine> holmLines;
};
class Statistics {
public:
Statistics(vector<string>& models, vector<string>& datasets, json data, double significance = 0.05);
Statistics(const vector<string>& models, const vector<string>& datasets, const json& data, double significance = 0.05, bool output = true);
bool friedmanTest();
void postHocHolmTest(bool friedmanResult);
FriedmanResult& getFriedmanResult();
HolmResult& getHolmResult();
map<string, map<string, float>>& getRanks();
private:
void fit();
void computeRanks();
void computeWTL();
vector<string> models;
vector<string> datasets;
json data;
const vector<string>& models;
const vector<string>& datasets;
const json& data;
double significance;
bool output;
bool fitted = false;
int nModels = 0;
int nDatasets = 0;
int controlIdx = 0;
map<int, WTL> wtl;
map<string, float> ranks;
int maxModelName = 0;
int maxDatasetName = 0;
FriedmanResult friedmanResult;
HolmResult holmResult;
map<string, map<string, float>> ranksModels;
};
}
#endif // !STATISTICS_H

src/Platform/Utils.h Normal file

@@ -0,0 +1,19 @@
#ifndef UTILS_H
#define UTILS_H
#include <sstream>
#include <string>
#include <vector>
namespace platform {
//static vector<string> split(const string& text, char delimiter);
static std::vector<std::string> split(const std::string& text, char delimiter)
{
std::vector<std::string> result;
std::stringstream ss(text);
std::string token;
while (std::getline(ss, token, delimiter)) {
result.push_back(token);
}
return result;
}
}
#endif
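For example (illustrative only):

auto fields = platform::split("sepal_length,sepal_width,class", ',');
// fields == { "sepal_length", "sepal_width", "class" }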

src/Platform/best.cc

@@ -14,6 +14,21 @@ argparse::ArgumentParser manageArguments(int argc, char** argv)
program.add_argument("--build").help("build best score results file").default_value(false).implicit_value(true);
program.add_argument("--report").help("report of best score results file").default_value(false).implicit_value(true);
program.add_argument("--friedman").help("Friedman test").default_value(false).implicit_value(true);
program.add_argument("--excel").help("Output to excel").default_value(false).implicit_value(true);
program.add_argument("--level").help("significance level").default_value(0.05).scan<'g', double>().action([](const string& value) {
try {
auto k = stod(value);
if (k < 0.01 || k > 0.15) {
throw runtime_error("Significance level hast to be a number in [0.01, 0.15]");
}
return k;
}
catch (const runtime_error& err) {
throw runtime_error(err.what());
}
catch (...) {
throw runtime_error("Number of folds must be an decimal number");
}});
try {
program.parse_args(argc, argv);
auto model = program.get<string>("model");
@@ -21,9 +36,26 @@ argparse::ArgumentParser manageArguments(int argc, char** argv)
auto build = program.get<bool>("build");
auto report = program.get<bool>("report");
auto friedman = program.get<bool>("friedman");
auto excel = program.get<bool>("excel");
auto level = program.get<double>("level");
if (model == "" || score == "") {
throw runtime_error("Model and score name must be supplied");
}
if (friedman && model != "any") {
cerr << "Friedman test can only be used with all models" << endl;
cerr << program;
exit(1);
}
if (excel && model != "any") {
cerr << "Excel ourput can only be used with all models" << endl;
cerr << program;
exit(1);
}
if (!report && !build) {
cerr << "Either build, report or both, have to be selected to do anything!" << endl;
cerr << program;
exit(1);
}
}
catch (const exception& err) {
cerr << err.what() << endl;
@@ -41,17 +73,9 @@ int main(int argc, char** argv)
auto build = program.get<bool>("build");
auto report = program.get<bool>("report");
auto friedman = program.get<bool>("friedman");
if (friedman && model != "any") {
cerr << "Friedman test can only be used with all models" << endl;
cerr << program;
exit(1);
}
if (!report && !build) {
cerr << "Either build, report or both, have to be selected to do anything!" << endl;
cerr << program;
exit(1);
}
auto results = platform::BestResults(platform::Paths::results(), score, model, friedman);
auto excel = program.get<bool>("excel");
auto level = program.get<double>("level");
auto results = platform::BestResults(platform::Paths::results(), score, model, friedman, level);
if (build) {
if (model == "any") {
results.buildAll();
@@ -62,7 +86,7 @@ int main(int argc, char** argv)
}
if (report) {
if (model == "any") {
results.reportAll();
results.reportAll(excel);
} else {
results.reportSingle();
}
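Putting the new options together, a typical invocation of the resulting binary could look like this (hypothetical example):

b_best --model any --score accuracy --build --report --friedman --excel --level 0.05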

src/Platform/list.cc

@@ -27,7 +27,7 @@ void outputBalance(const string& balance)
int main(int argc, char** argv)
{
auto data = platform::Datasets(platform::Paths().datasets(), false);
auto data = platform::Datasets(false, platform::Paths::datasets());
locale mylocale(cout.getloc(), new separated);
locale::global(mylocale);
cout.imbue(mylocale);

src/Platform/main.cc

@@ -1,7 +1,6 @@
#include <iostream>
#include <argparse/argparse.hpp>
#include <nlohmann/json.hpp>
#include "platformUtils.h"
#include "Experiment.h"
#include "Datasets.h"
#include "DotEnv.h"
@@ -13,15 +12,12 @@
using namespace std;
using json = nlohmann::json;
argparse::ArgumentParser manageArguments(int argc, char** argv)
argparse::ArgumentParser manageArguments()
{
auto env = platform::DotEnv();
argparse::ArgumentParser program("main");
program.add_argument("-d", "--dataset").default_value("").help("Dataset file name");
program.add_argument("--hyperparameters").default_value("{}").help("Hyperparamters passed to the model in Experiment");
program.add_argument("-p", "--path")
.help("folder where the data files are located, default")
.default_value(string{ platform::Paths::datasets() });
program.add_argument("-m", "--model")
.help("Model to use " + platform::Models::instance()->toString())
.action([](const std::string& value) {
@@ -52,46 +48,40 @@ argparse::ArgumentParser manageArguments(int argc, char** argv)
}});
auto seed_values = env.getSeeds();
program.add_argument("-s", "--seeds").nargs(1, 10).help("Random seeds. Set to -1 to have pseudo random").scan<'i', int>().default_value(seed_values);
return program;
}
int main(int argc, char** argv)
{
string file_name, model_name, title;
json hyperparameters_json;
bool discretize_dataset, stratified, saveResults;
vector<int> seeds;
vector<string> filesToTest;
int n_folds;
auto program = manageArguments();
try {
program.parse_args(argc, argv);
auto file_name = program.get<string>("dataset");
auto path = program.get<string>("path");
auto model_name = program.get<string>("model");
auto discretize_dataset = program.get<bool>("discretize");
auto stratified = program.get<bool>("stratified");
auto n_folds = program.get<int>("folds");
auto seeds = program.get<vector<int>>("seeds");
auto complete_file_name = path + file_name + ".arff";
auto title = program.get<string>("title");
file_name = program.get<string>("dataset");
model_name = program.get<string>("model");
discretize_dataset = program.get<bool>("discretize");
stratified = program.get<bool>("stratified");
n_folds = program.get<int>("folds");
seeds = program.get<vector<int>>("seeds");
auto hyperparameters = program.get<string>("hyperparameters");
auto saveResults = program.get<bool>("save");
hyperparameters_json = json::parse(hyperparameters);
title = program.get<string>("title");
if (title == "" && file_name == "") {
throw runtime_error("title is mandatory if dataset is not provided");
}
saveResults = program.get<bool>("save");
}
catch (const exception& err) {
cerr << err.what() << endl;
cerr << program;
exit(1);
}
return program;
}
int main(int argc, char** argv)
{
auto program = manageArguments(argc, argv);
auto file_name = program.get<string>("dataset");
auto path = program.get<string>("path");
auto model_name = program.get<string>("model");
auto discretize_dataset = program.get<bool>("discretize");
auto stratified = program.get<bool>("stratified");
auto n_folds = program.get<int>("folds");
auto seeds = program.get<vector<int>>("seeds");
auto hyperparameters = program.get<string>("hyperparameters");
vector<string> filesToTest;
auto datasets = platform::Datasets(path, true, platform::ARFF);
auto title = program.get<string>("title");
auto saveResults = program.get<bool>("save");
auto datasets = platform::Datasets(discretize_dataset, platform::Paths::datasets());
if (file_name != "") {
if (!datasets.isDataset(file_name)) {
cerr << "Dataset " << file_name << " not found" << endl;
@@ -113,13 +103,13 @@ int main(int argc, char** argv)
experiment.setTitle(title).setLanguage("cpp").setLanguageVersion("14.0.3");
experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform(env.get("platform"));
experiment.setStratified(stratified).setNFolds(n_folds).setScoreName("accuracy");
experiment.setHyperparameters(json::parse(hyperparameters));
experiment.setHyperparameters(hyperparameters_json);
for (auto seed : seeds) {
experiment.addRandomSeed(seed);
}
platform::Timer timer;
timer.start();
experiment.go(filesToTest, path);
experiment.go(filesToTest);
experiment.setDuration(timer.getDuration());
if (saveResults) {
experiment.save(platform::Paths::results());


@@ -1,6 +1,5 @@
#include <iostream>
#include <argparse/argparse.hpp>
#include "platformUtils.h"
#include "Paths.h"
#include "Results.h"


@@ -1,20 +0,0 @@
#ifndef PLATFORM_UTILS_H
#define PLATFORM_UTILS_H
#include <torch/torch.h>
#include <string>
#include <vector>
#include <map>
#include <tuple>
#include "ArffFiles.h"
#include "CPPFImdlp.h"
using namespace std;
bool file_exists(const std::string& name);
vector<string> split(const string& text, char delimiter);
pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t>& X, mdlp::labels_t& y, vector<string> features);
vector<mdlp::labels_t> discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y);
pair<torch::Tensor, map<string, vector<int>>> discretizeTorch(torch::Tensor& X, torch::Tensor& y, vector<string>& features, const string& className);
tuple<vector<vector<int>>, vector<int>, vector<string>, string, map<string, vector<int>>> loadFile(const string& name);
tuple<torch::Tensor, torch::Tensor, vector<string>, string, map<string, vector<int>>> loadDataset(const string& path, const string& name, bool class_last, bool discretize_dataset);
map<string, vector<int>> get_states(vector<string>& features, string className, map<string, int>& maxes);
#endif //PLATFORM_UTILS_H

src/Platform/testx.cpp Normal file

@@ -0,0 +1,248 @@
#include "Folding.h"
#include <torch/torch.h>
#include "nlohmann/json.hpp"
#include "map"
#include <iostream>
#include <sstream>
#include "Datasets.h"
#include "Network.h"
#include "ArffFiles.h"
#include "CPPFImdlp.h"
#include "CFS.h"
#include "IWSS.h"
#include "FCBF.h"
using namespace std;
using namespace platform;
using namespace torch;
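// Returns a printable per-class distribution (percentage and count) of the
// samples selected by indices.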
string counts(vector<int> y, vector<int> indices)
{
auto result = map<int, int>();
stringstream oss;
for (auto i = 0; i < indices.size(); ++i) {
result[y[indices[i]]]++;
}
string final_result = "";
for (auto i = 0; i < result.size(); ++i)
oss << i << " -> " << setprecision(2) << fixed
<< (double)result[i] * 100 / indices.size() << "% (" << result[i] << ") //";
oss << endl;
return oss.str();
}
class Paths {
public:
static string datasets()
{
return "datasets/";
}
};
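// Discretizes every feature column with MDLP against the class labels and
// records the number of discrete states of each feature in maxes.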
pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t>& X, mdlp::labels_t& y, vector<string> features)
{
vector<mdlp::labels_t> Xd;
map<string, int> maxes;
auto fimdlp = mdlp::CPPFImdlp();
for (int i = 0; i < X.size(); i++) {
fimdlp.fit(X[i], y);
mdlp::labels_t& xd = fimdlp.transform(X[i]);
maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
Xd.push_back(xd);
}
return { Xd, maxes };
}
vector<mdlp::labels_t> discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y)
{
vector<mdlp::labels_t> Xd;
auto fimdlp = mdlp::CPPFImdlp();
for (int i = 0; i < X.size(); i++) {
fimdlp.fit(X[i], y);
mdlp::labels_t& xd = fimdlp.transform(X[i]);
Xd.push_back(xd);
}
return Xd;
}
bool file_exists(const string& name)
{
if (FILE* file = fopen(name.c_str(), "r")) {
fclose(file);
return true;
} else {
return false;
}
}
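// Loads an ARFF file and returns it as feature-major tensors (one row per
// feature), optionally discretized, plus features, class name and states.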
tuple<Tensor, Tensor, vector<string>, string, map<string, vector<int>>> loadDataset(const string& name, bool class_last, bool discretize_dataset)
{
auto handler = ArffFiles();
handler.load(Paths::datasets() + static_cast<string>(name) + ".arff", class_last);
// Get Dataset X, y
vector<mdlp::samples_t>& X = handler.getX();
mdlp::labels_t& y = handler.getY();
// Get className & Features
auto className = handler.getClassName();
vector<string> features;
auto attributes = handler.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
Tensor Xd;
auto states = map<string, vector<int>>();
if (discretize_dataset) {
auto Xr = discretizeDataset(X, y);
Xd = torch::zeros({ static_cast<int>(Xr.size()), static_cast<int>(Xr[0].size()) }, torch::kInt32);
for (int i = 0; i < features.size(); ++i) {
states[features[i]] = vector<int>(*max_element(Xr[i].begin(), Xr[i].end()) + 1);
auto& item = states.at(features[i]); // reference, so iota fills the stored vector in place
iota(begin(item), end(item), 0);
Xd.index_put_({ i, "..." }, torch::tensor(Xr[i], torch::kInt32));
}
states[className] = vector<int>(*max_element(y.begin(), y.end()) + 1);
iota(begin(states.at(className)), end(states.at(className)), 0);
} else {
Xd = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kFloat32);
for (int i = 0; i < features.size(); ++i) {
Xd.index_put_({ i, "..." }, torch::tensor(X[i]));
}
}
return { Xd, torch::tensor(y, torch::kInt32), features, className, states };
}
tuple<vector<vector<int>>, vector<int>, vector<string>, string, map<string, vector<int>>> loadFile(const string& name)
{
auto handler = ArffFiles();
handler.load(Paths::datasets() + static_cast<string>(name) + ".arff");
// Get Dataset X, y
vector<mdlp::samples_t>& X = handler.getX();
mdlp::labels_t& y = handler.getY();
// Get className & Features
auto className = handler.getClassName();
vector<string> features;
auto attributes = handler.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
// Discretize Dataset
vector<mdlp::labels_t> Xd;
map<string, int> maxes;
tie(Xd, maxes) = discretize(X, y, features);
maxes[className] = *max_element(y.begin(), y.end()) + 1;
map<string, vector<int>> states;
for (auto feature : features) {
states[feature] = vector<int>(maxes[feature]);
}
states[className] = vector<int>(maxes[className]);
return { Xd, y, features, className, states };
}
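// Test fixture exposing the same dataset in two layouts: tensors (Xt/yt) and
// plain vectors (Xv/yv), together with uniform sample weights in both forms.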
class RawDatasets {
public:
RawDatasets(const string& file_name, bool discretize)
{
// Xt can be either discretized or not
tie(Xt, yt, featurest, classNamet, statest) = loadDataset(file_name, true, discretize);
// Xv is always discretized
tie(Xv, yv, featuresv, classNamev, statesv) = loadFile(file_name);
auto yresized = torch::transpose(yt.view({ yt.size(0), 1 }), 0, 1);
dataset = torch::cat({ Xt, yresized }, 0);
nSamples = dataset.size(1);
weights = torch::full({ nSamples }, 1.0 / nSamples, torch::kDouble);
weightsv = vector<double>(nSamples, 1.0 / nSamples);
classNumStates = discretize ? statest.at(classNamet).size() : 0;
}
torch::Tensor Xt, yt, dataset, weights;
vector<vector<int>> Xv;
vector<double> weightsv;
vector<int> yv;
vector<string> featurest, featuresv;
map<string, vector<int>> statest, statesv;
string classNamet, classNamev;
int nSamples, classNumStates;
double epsilon = 1e-5;
};
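// Ad-hoc driver: runs the CFS, FCBF and IWSS feature selectors on every
// dataset and dumps the selected feature indices to features_cpp.json.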
int main()
{
// map<string, string> balance = {
// {"iris", "33,33% (50) / 33,33% (50) / 33,33% (50)"},
// {"diabetes", "34,90% (268) / 65,10% (500)"},
// {"ecoli", "42,56% (143) / 22,92% (77) / 0,60% (2) / 0,60% (2) / 10,42% (35) / 5,95% (20) / 1,49% (5) / 15,48% (52)"},
// {"glass", "32,71% (70) / 7,94% (17) / 4,21% (9) / 35,51% (76) / 13,55% (29) / 6,07% (13)"}
// };
// for (const auto& file_name : { "iris", "glass", "ecoli", "diabetes" }) {
// auto dt = Datasets(true, "Arff");
// auto [X, y] = dt.getVectors(file_name);
// //auto fold = KFold(5, 150);
// auto fold = StratifiedKFold(5, y, -1);
// cout << "***********************************************************************************************" << endl;
// cout << "Dataset: " << file_name << endl;
// cout << "Nº Samples: " << dt.getNSamples(file_name) << endl;
// cout << "Class states: " << dt.getNClasses(file_name) << endl;
// cout << "Balance: " << balance.at(file_name) << endl;
// for (int i = 0; i < 5; ++i) {
// cout << "Fold: " << i << endl;
// auto [train, test] = fold.getFold(i);
// cout << "Train: ";
// cout << "(" << train.size() << "): ";
// // for (auto j = 0; j < static_cast<int>(train.size()); j++)
// // cout << train[j] << ", ";
// cout << endl;
// cout << "Train Statistics : " << counts(y, train);
// cout << "-------------------------------------------------------------------------------" << endl;
// cout << "Test: ";
// cout << "(" << test.size() << "): ";
// // for (auto j = 0; j < static_cast<int>(test.size()); j++)
// // cout << test[j] << ", ";
// cout << endl;
// cout << "Test Statistics: " << counts(y, test);
// cout << "==============================================================================" << endl;
// }
// cout << "***********************************************************************************************" << endl;
// }
// const string file_name = "iris";
// auto net = bayesnet::Network();
// auto dt = Datasets(true, "Arff");
// auto raw = RawDatasets("iris", true);
// auto [X, y] = dt.getVectors(file_name);
// cout << "Dataset dims " << raw.dataset.sizes() << endl;
// cout << "weights dims " << raw.weights.sizes() << endl;
// cout << "States dims " << raw.statest.size() << endl;
// cout << "features: ";
// for (const auto& feature : raw.featurest) {
// cout << feature << ", ";
// net.addNode(feature);
// }
// net.addNode(raw.classNamet);
// cout << endl;
// net.fit(raw.dataset, raw.weights, raw.featurest, raw.classNamet, raw.statest);
auto dt = Datasets(true, "Arff");
nlohmann::json output;
for (const auto& name : dt.getNames()) {
// for (const auto& name : { "iris" }) {
auto [X, y] = dt.getTensors(name);
auto features = dt.getFeatures(name);
auto states = dt.getStates(name);
auto className = dt.getClassName(name);
int maxFeatures = 0;
auto classNumStates = states.at(className).size();
torch::Tensor weights = torch::full({ X.size(1) }, 1.0 / X.size(1), torch::kDouble);
auto dataset = X;
auto yresized = torch::transpose(y.view({ y.size(0), 1 }), 0, 1);
dataset = torch::cat({ dataset, yresized }, 0);
auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, classNumStates, weights);
auto fcbf = bayesnet::FCBF(dataset, features, className, maxFeatures, classNumStates, weights, 1e-7);
auto iwss = bayesnet::IWSS(dataset, features, className, maxFeatures, classNumStates, weights, 0.5);
cout << "Dataset: " << setw(20) << name << flush;
cfs.fit();
cout << " CFS: " << setw(4) << cfs.getFeatures().size() << flush;
fcbf.fit();
cout << " FCBF: " << setw(4) << fcbf.getFeatures().size() << flush;
iwss.fit();
cout << " IWSS: " << setw(4) << iwss.getFeatures().size() << flush;
cout << endl;
output[name]["CFS"] = cfs.getFeatures();
output[name]["FCBF"] = fcbf.getFeatures();
output[name]["IWSS"] = iwss.getFeatures();
}
ofstream file("features_cpp.json");
file << output;
file.close();
}


@@ -1,88 +0,0 @@
#define CATCH_CONFIG_MAIN // This tells Catch to provide a main() - only do
#include <catch2/catch_test_macros.hpp>
#include <catch2/catch_approx.hpp>
#include <catch2/generators/catch_generators.hpp>
#include <vector>
#include <map>
#include <string>
#include "KDB.h"
#include "TAN.h"
#include "SPODE.h"
#include "AODE.h"
#include "platformUtils.h"
TEST_CASE("Test Bayesian Classifiers score", "[BayesNet]")
{
map <pair<string, string>, float> scores = {
{{"diabetes", "AODE"}, 0.811198}, {{"diabetes", "KDB"}, 0.852865}, {{"diabetes", "SPODE"}, 0.802083}, {{"diabetes", "TAN"}, 0.821615},
{{"ecoli", "AODE"}, 0.889881}, {{"ecoli", "KDB"}, 0.889881}, {{"ecoli", "SPODE"}, 0.880952}, {{"ecoli", "TAN"}, 0.892857},
{{"glass", "AODE"}, 0.78972}, {{"glass", "KDB"}, 0.827103}, {{"glass", "SPODE"}, 0.775701}, {{"glass", "TAN"}, 0.827103},
{{"iris", "AODE"}, 0.973333}, {{"iris", "KDB"}, 0.973333}, {{"iris", "SPODE"}, 0.973333}, {{"iris", "TAN"}, 0.973333}
};
string file_name = GENERATE("glass", "iris", "ecoli", "diabetes");
auto [Xd, y, features, className, states] = loadFile(file_name);
SECTION("Test TAN classifier (" + file_name + ")")
{
auto clf = bayesnet::TAN();
clf.fit(Xd, y, features, className, states);
auto score = clf.score(Xd, y);
//scores[{file_name, "TAN"}] = score;
REQUIRE(score == Catch::Approx(scores[{file_name, "TAN"}]).epsilon(1e-6));
}
SECTION("Test KDB classifier (" + file_name + ")")
{
auto clf = bayesnet::KDB(2);
clf.fit(Xd, y, features, className, states);
auto score = clf.score(Xd, y);
//scores[{file_name, "KDB"}] = score;
REQUIRE(score == Catch::Approx(scores[{file_name, "KDB"
}]).epsilon(1e-6));
}
SECTION("Test SPODE classifier (" + file_name + ")")
{
auto clf = bayesnet::SPODE(1);
clf.fit(Xd, y, features, className, states);
auto score = clf.score(Xd, y);
// scores[{file_name, "SPODE"}] = score;
REQUIRE(score == Catch::Approx(scores[{file_name, "SPODE"}]).epsilon(1e-6));
}
SECTION("Test AODE classifier (" + file_name + ")")
{
auto clf = bayesnet::AODE();
clf.fit(Xd, y, features, className, states);
auto score = clf.score(Xd, y);
// scores[{file_name, "AODE"}] = score;
REQUIRE(score == Catch::Approx(scores[{file_name, "AODE"}]).epsilon(1e-6));
}
// for (auto scores : scores) {
// cout << "{{\"" << scores.first.first << "\", \"" << scores.first.second << "\"}, " << scores.second << "}, ";
// }
}
TEST_CASE("Models features")
{
auto graph = vector<string>({ "digraph BayesNet {\nlabel=<BayesNet Test>\nfontsize=30\nfontcolor=blue\nlabelloc=t\nlayout=circo\n",
"class [shape=circle, fontcolor=red, fillcolor=lightblue, style=filled ] \n",
"class -> sepallength", "class -> sepalwidth", "class -> petallength", "class -> petalwidth", "petallength [shape=circle] \n",
"petallength -> sepallength", "petalwidth [shape=circle] \n", "sepallength [shape=circle] \n",
"sepallength -> sepalwidth", "sepalwidth [shape=circle] \n", "sepalwidth -> petalwidth", "}\n"
}
);
auto clf = bayesnet::TAN();
auto [Xd, y, features, className, states] = loadFile("iris");
clf.fit(Xd, y, features, className, states);
REQUIRE(clf.getNumberOfNodes() == 5);
REQUIRE(clf.getNumberOfEdges() == 7);
REQUIRE(clf.show() == vector<string>{"class -> sepallength, sepalwidth, petallength, petalwidth, ", "petallength -> sepallength, ", "petalwidth -> ", "sepallength -> sepalwidth, ", "sepalwidth -> petalwidth, "});
REQUIRE(clf.graph("Test") == graph);
}
TEST_CASE("Get num features & num edges")
{
auto [Xd, y, features, className, states] = loadFile("iris");
auto clf = bayesnet::KDB(2);
clf.fit(Xd, y, features, className, states);
REQUIRE(clf.getNumberOfNodes() == 5);
REQUIRE(clf.getNumberOfEdges() == 8);
}


@@ -1,33 +0,0 @@
#include <catch2/catch_test_macros.hpp>
#include <catch2/catch_approx.hpp>
#include <catch2/generators/catch_generators.hpp>
#include <string>
#include "KDB.h"
#include "platformUtils.h"
TEST_CASE("Test Bayesian Network")
{
auto [Xd, y, features, className, states] = loadFile("iris");
SECTION("Test get features")
{
auto net = bayesnet::Network();
net.addNode("A");
net.addNode("B");
REQUIRE(net.getFeatures() == vector<string>{"A", "B"});
net.addNode("C");
REQUIRE(net.getFeatures() == vector<string>{"A", "B", "C"});
}
SECTION("Test get edges")
{
auto net = bayesnet::Network();
net.addNode("A");
net.addNode("B");
net.addNode("C");
net.addEdge("A", "B");
net.addEdge("B", "C");
REQUIRE(net.getEdges() == vector<pair<string, string>>{ {"A", "B"}, { "B", "C" } });
net.addEdge("A", "C");
REQUIRE(net.getEdges() == vector<pair<string, string>>{ {"A", "B"}, { "A", "C" }, { "B", "C" } });
}
}


@@ -1,12 +1,18 @@
if(ENABLE_TESTING)
set(TEST_MAIN "unit_tests")
set(TEST_BAYESNET "unit_tests_bayesnet")
set(TEST_PLATFORM "unit_tests_platform")
include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
include_directories(${BayesNet_SOURCE_DIR}/lib/json/include)
set(TEST_SOURCES BayesModels.cc BayesNetwork.cc ${BayesNet_SOURCE_DIR}/src/Platform/platformUtils.cc ${BayesNet_SOURCES})
add_executable(${TEST_MAIN} ${TEST_SOURCES})
target_link_libraries(${TEST_MAIN} PUBLIC "${TORCH_LIBRARIES}" ArffFiles mdlp Catch2::Catch2WithMain)
add_test(NAME ${TEST_MAIN} COMMAND ${TEST_MAIN})
include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include)
set(TEST_SOURCES_BAYESNET TestBayesModels.cc TestBayesNetwork.cc TestBayesMetrics.cc TestUtils.cc ${BayesNet_SOURCE_DIR}/src/Platform/Folding.cc ${BayesNet_SOURCES})
set(TEST_SOURCES_PLATFORM TestFolding.cc TestUtils.cc ${BayesNet_SOURCE_DIR}/src/Platform/Folding.cc)
add_executable(${TEST_BAYESNET} ${TEST_SOURCES_BAYESNET})
add_executable(${TEST_PLATFORM} ${TEST_SOURCES_PLATFORM})
target_link_libraries(${TEST_BAYESNET} PUBLIC "${TORCH_LIBRARIES}" ArffFiles mdlp Catch2::Catch2WithMain)
target_link_libraries(${TEST_PLATFORM} PUBLIC "${TORCH_LIBRARIES}" ArffFiles mdlp Catch2::Catch2WithMain)
add_test(NAME ${TEST_BAYESNET} COMMAND ${TEST_BAYESNET})
add_test(NAME ${TEST_PLATFORM} COMMAND ${TEST_PLATFORM})
endif(ENABLE_TESTING)

tests/TestBayesMetrics.cc Normal file

@@ -0,0 +1,63 @@
#include <catch2/catch_test_macros.hpp>
#include <catch2/catch_approx.hpp>
#include <catch2/generators/catch_generators.hpp>
#include "BayesMetrics.h"
#include "TestUtils.h"
using namespace std;
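// Checks Metrics against precomputed expectations: the k best features per
// dataset, a reference mutual information value between two columns, and the
// maximum spanning tree obtained from each of two start indices (0 and 1).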
TEST_CASE("Metrics Test", "[BayesNet]")
{
string file_name = GENERATE("glass", "iris", "ecoli", "diabetes");
map<string, pair<int, vector<int>>> resultsKBest = {
{"glass", {7, { 0, 1, 7, 6, 3, 5, 2 }}},
{"iris", {3, { 0, 3, 2 }} },
{"ecoli", {6, { 2, 4, 1, 0, 6, 5 }}},
{"diabetes", {2, { 7, 1 }}}
};
map<string, double> resultsMI = {
{"glass", 0.12805398},
{"iris", 0.3158139948},
{"ecoli", 0.0089431099},
{"diabetes", 0.0345470614}
};
map<pair<string, int>, vector<pair<int, int>>> resultsMST = {
{ {"glass", 0}, { {0, 6}, {0, 5}, {0, 3}, {5, 1}, {5, 8}, {5, 4}, {6, 2}, {6, 7} } },
{ {"glass", 1}, { {1, 5}, {5, 0}, {5, 8}, {5, 4}, {0, 6}, {0, 3}, {6, 2}, {6, 7} } },
{ {"iris", 0}, { {0, 1}, {0, 2}, {1, 3} } },
{ {"iris", 1}, { {1, 0}, {1, 3}, {0, 2} } },
{ {"ecoli", 0}, { {0, 1}, {0, 2}, {1, 5}, {1, 3}, {5, 6}, {5, 4} } },
{ {"ecoli", 1}, { {1, 0}, {1, 5}, {1, 3}, {5, 6}, {5, 4}, {0, 2} } },
{ {"diabetes", 0}, { {0, 7}, {0, 2}, {0, 6}, {2, 3}, {3, 4}, {3, 5}, {4, 1} } },
{ {"diabetes", 1}, { {1, 4}, {4, 3}, {3, 2}, {3, 5}, {2, 0}, {0, 7}, {0, 6} } }
};
auto raw = RawDatasets(file_name, true);
bayesnet::Metrics metrics(raw.dataset, raw.featurest, raw.classNamet, raw.classNumStates);
SECTION("Test Constructor")
{
REQUIRE(metrics.getScoresKBest().size() == 0);
}
SECTION("Test SelectKBestWeighted")
{
vector<int> kBest = metrics.SelectKBestWeighted(raw.weights, true, resultsKBest.at(file_name).first);
REQUIRE(kBest.size() == resultsKBest.at(file_name).first);
REQUIRE(kBest == resultsKBest.at(file_name).second);
}
SECTION("Test Mutual Information")
{
auto result = metrics.mutualInformation(raw.dataset.index({ 1, "..." }), raw.dataset.index({ 2, "..." }), raw.weights);
REQUIRE(result == Catch::Approx(resultsMI.at(file_name)).epsilon(raw.epsilon));
}
SECTION("Test Maximum Spanning Tree")
{
auto weights_matrix = metrics.conditionalEdge(raw.weights);
for (int i = 0; i < 2; ++i) {
auto result = metrics.maximumSpanningTree(raw.featurest, weights_matrix, i);
REQUIRE(result == resultsMST.at({ file_name, i }));
}
}
}

tests/TestBayesModels.cc Normal file

@@ -0,0 +1,141 @@
#define CATCH_CONFIG_MAIN // This tells Catch to provide a main() - only do
#include <catch2/catch_test_macros.hpp>
#include <catch2/catch_approx.hpp>
#include <catch2/generators/catch_generators.hpp>
#include <vector>
#include <map>
#include <string>
#include "KDB.h"
#include "TAN.h"
#include "SPODE.h"
#include "AODE.h"
#include "BoostAODE.h"
#include "TANLd.h"
#include "KDBLd.h"
#include "SPODELd.h"
#include "AODELd.h"
#include "TestUtils.h"
TEST_CASE("Test Bayesian Classifiers score", "[BayesNet]")
{
map <pair<string, string>, float> scores = {
// Diabetes
{{"diabetes", "AODE"}, 0.811198}, {{"diabetes", "KDB"}, 0.852865}, {{"diabetes", "SPODE"}, 0.802083}, {{"diabetes", "TAN"}, 0.821615},
{{"diabetes", "AODELd"}, 0.8138f}, {{"diabetes", "KDBLd"}, 0.80208f}, {{"diabetes", "SPODELd"}, 0.78646f}, {{"diabetes", "TANLd"}, 0.8099f}, {{"diabetes", "BoostAODE"}, 0.83984f},
// Ecoli
{{"ecoli", "AODE"}, 0.889881}, {{"ecoli", "KDB"}, 0.889881}, {{"ecoli", "SPODE"}, 0.880952}, {{"ecoli", "TAN"}, 0.892857},
{{"ecoli", "AODELd"}, 0.8869f}, {{"ecoli", "KDBLd"}, 0.875f}, {{"ecoli", "SPODELd"}, 0.84226f}, {{"ecoli", "TANLd"}, 0.86905f}, {{"ecoli", "BoostAODE"}, 0.89583f},
// Glass
{{"glass", "AODE"}, 0.78972}, {{"glass", "KDB"}, 0.827103}, {{"glass", "SPODE"}, 0.775701}, {{"glass", "TAN"}, 0.827103},
{{"glass", "AODELd"}, 0.79439f}, {{"glass", "KDBLd"}, 0.85047f}, {{"glass", "SPODELd"}, 0.79439f}, {{"glass", "TANLd"}, 0.86449f}, {{"glass", "BoostAODE"}, 0.84579f},
// Iris
{{"iris", "AODE"}, 0.973333}, {{"iris", "KDB"}, 0.973333}, {{"iris", "SPODE"}, 0.973333}, {{"iris", "TAN"}, 0.973333},
{{"iris", "AODELd"}, 0.973333}, {{"iris", "KDBLd"}, 0.973333}, {{"iris", "SPODELd"}, 0.96f}, {{"iris", "TANLd"}, 0.97333f}, {{"iris", "BoostAODE"}, 0.98f}
};
string file_name = GENERATE("glass", "iris", "ecoli", "diabetes");
auto raw = RawDatasets(file_name, false);
SECTION("Test TAN classifier (" + file_name + ")")
{
auto clf = bayesnet::TAN();
clf.fit(raw.Xv, raw.yv, raw.featuresv, raw.classNamev, raw.statesv);
auto score = clf.score(raw.Xv, raw.yv);
//scores[{file_name, "TAN"}] = score;
REQUIRE(score == Catch::Approx(scores[{file_name, "TAN"}]).epsilon(raw.epsilon));
}
SECTION("Test TANLd classifier (" + file_name + ")")
{
auto clf = bayesnet::TANLd();
clf.fit(raw.Xt, raw.yt, raw.featurest, raw.classNamet, raw.statest);
auto score = clf.score(raw.Xt, raw.yt);
//scores[{file_name, "TANLd"}] = score;
REQUIRE(score == Catch::Approx(scores[{file_name, "TANLd"}]).epsilon(raw.epsilon));
}
SECTION("Test KDB classifier (" + file_name + ")")
{
auto clf = bayesnet::KDB(2);
clf.fit(raw.Xv, raw.yv, raw.featuresv, raw.classNamev, raw.statesv);
auto score = clf.score(raw.Xv, raw.yv);
//scores[{file_name, "KDB"}] = score;
REQUIRE(score == Catch::Approx(scores[{file_name, "KDB"
}]).epsilon(raw.epsilon));
}
SECTION("Test KDBLd classifier (" + file_name + ")")
{
auto clf = bayesnet::KDBLd(2);
clf.fit(raw.Xt, raw.yt, raw.featurest, raw.classNamet, raw.statest);
auto score = clf.score(raw.Xt, raw.yt);
//scores[{file_name, "KDBLd"}] = score;
REQUIRE(score == Catch::Approx(scores[{file_name, "KDBLd"
}]).epsilon(raw.epsilon));
}
SECTION("Test SPODE classifier (" + file_name + ")")
{
auto clf = bayesnet::SPODE(1);
clf.fit(raw.Xv, raw.yv, raw.featuresv, raw.classNamev, raw.statesv);
auto score = clf.score(raw.Xv, raw.yv);
// scores[{file_name, "SPODE"}] = score;
REQUIRE(score == Catch::Approx(scores[{file_name, "SPODE"}]).epsilon(raw.epsilon));
}
SECTION("Test SPODELd classifier (" + file_name + ")")
{
auto clf = bayesnet::SPODELd(1);
clf.fit(raw.Xt, raw.yt, raw.featurest, raw.classNamet, raw.statest);
auto score = clf.score(raw.Xt, raw.yt);
// scores[{file_name, "SPODELd"}] = score;
REQUIRE(score == Catch::Approx(scores[{file_name, "SPODELd"}]).epsilon(raw.epsilon));
}
SECTION("Test AODE classifier (" + file_name + ")")
{
auto clf = bayesnet::AODE();
clf.fit(raw.Xv, raw.yv, raw.featuresv, raw.classNamev, raw.statesv);
auto score = clf.score(raw.Xv, raw.yv);
// scores[{file_name, "AODE"}] = score;
REQUIRE(score == Catch::Approx(scores[{file_name, "AODE"}]).epsilon(raw.epsilon));
}
SECTION("Test AODELd classifier (" + file_name + ")")
{
auto clf = bayesnet::AODELd();
clf.fit(raw.Xt, raw.yt, raw.featurest, raw.classNamet, raw.statest);
auto score = clf.score(raw.Xt, raw.yt);
// scores[{file_name, "AODELd"}] = score;
REQUIRE(score == Catch::Approx(scores[{file_name, "AODELd"}]).epsilon(raw.epsilon));
}
SECTION("Test BoostAODE classifier (" + file_name + ")")
{
auto clf = bayesnet::BoostAODE();
clf.fit(raw.Xv, raw.yv, raw.featuresv, raw.classNamev, raw.statesv);
auto score = clf.score(raw.Xv, raw.yv);
// scores[{file_name, "BoostAODE"}] = score;
REQUIRE(score == Catch::Approx(scores[{file_name, "BoostAODE"}]).epsilon(raw.epsilon));
}
// for (auto scores : scores) {
// cout << "{{\"" << scores.first.first << "\", \"" << scores.first.second << "\"}, " << scores.second << "}, ";
// }
}
TEST_CASE("Models features", "[BayesNet]")
{
auto graph = vector<string>({ "digraph BayesNet {\nlabel=<BayesNet Test>\nfontsize=30\nfontcolor=blue\nlabelloc=t\nlayout=circo\n",
"class [shape=circle, fontcolor=red, fillcolor=lightblue, style=filled ] \n",
"class -> sepallength", "class -> sepalwidth", "class -> petallength", "class -> petalwidth", "petallength [shape=circle] \n",
"petallength -> sepallength", "petalwidth [shape=circle] \n", "sepallength [shape=circle] \n",
"sepallength -> sepalwidth", "sepalwidth [shape=circle] \n", "sepalwidth -> petalwidth", "}\n"
}
);
auto raw = RawDatasets("iris", true);
auto clf = bayesnet::TAN();
clf.fit(raw.Xv, raw.yv, raw.featuresv, raw.classNamev, raw.statesv);
REQUIRE(clf.getNumberOfNodes() == 6);
REQUIRE(clf.getNumberOfEdges() == 7);
REQUIRE(clf.show() == vector<string>{"class -> sepallength, sepalwidth, petallength, petalwidth, ", "petallength -> sepallength, ", "petalwidth -> ", "sepallength -> sepalwidth, ", "sepalwidth -> petalwidth, "});
REQUIRE(clf.graph("Test") == graph);
}
TEST_CASE("Get num features & num edges", "[BayesNet]")
{
auto raw = RawDatasets("iris", true);
auto clf = bayesnet::KDB(2);
clf.fit(raw.Xv, raw.yv, raw.featuresv, raw.classNamev, raw.statesv);
REQUIRE(clf.getNumberOfNodes() == 6);
REQUIRE(clf.getNumberOfEdges() == 8);
}

tests/TestBayesNetwork.cc Normal file

@@ -0,0 +1,263 @@
#include <catch2/catch_test_macros.hpp>
#include <catch2/catch_approx.hpp>
#include <catch2/generators/catch_generators.hpp>
#include <string>
#include "TestUtils.h"
#include "Network.h"
void buildModel(bayesnet::Network& net, const vector<string>& features, const string& className)
{
vector<pair<int, int>> network = { {0, 1}, {0, 2}, {1, 3} };
for (const auto& feature : features) {
net.addNode(feature);
}
net.addNode(className);
for (const auto& edge : network) {
net.addEdge(features.at(edge.first), features.at(edge.second));
}
for (const auto& feature : features) {
net.addEdge(className, feature);
}
}
TEST_CASE("Test Bayesian Network", "[BayesNet]")
{
auto raw = RawDatasets("iris", true);
auto net = bayesnet::Network();
SECTION("Test get features")
{
net.addNode("A");
net.addNode("B");
REQUIRE(net.getFeatures() == vector<string>{"A", "B"});
net.addNode("C");
REQUIRE(net.getFeatures() == vector<string>{"A", "B", "C"});
}
SECTION("Test get edges")
{
net.addNode("A");
net.addNode("B");
net.addNode("C");
net.addEdge("A", "B");
net.addEdge("B", "C");
REQUIRE(net.getEdges() == vector<pair<string, string>>{ {"A", "B"}, { "B", "C" } });
REQUIRE(net.getNumEdges() == 2);
net.addEdge("A", "C");
REQUIRE(net.getEdges() == vector<pair<string, string>>{ {"A", "B"}, { "A", "C" }, { "B", "C" } });
REQUIRE(net.getNumEdges() == 3);
}
SECTION("Test getNodes")
{
net.addNode("A");
net.addNode("B");
auto& nodes = net.getNodes();
REQUIRE(nodes.count("A") == 1);
REQUIRE(nodes.count("B") == 1);
}
SECTION("Test fit Network")
{
auto net2 = bayesnet::Network();
auto net3 = bayesnet::Network();
net3.initialize();
net2.initialize();
net.initialize();
buildModel(net, raw.featuresv, raw.classNamev);
buildModel(net2, raw.featurest, raw.classNamet);
buildModel(net3, raw.featurest, raw.classNamet);
vector<pair<string, string>> edges = {
{"class", "sepallength"}, {"class", "sepalwidth"}, {"class", "petallength"},
{"class", "petalwidth" }, {"sepallength", "sepalwidth"}, {"sepallength", "petallength"},
{"sepalwidth", "petalwidth"}
};
REQUIRE(net.getEdges() == edges);
REQUIRE(net2.getEdges() == edges);
REQUIRE(net3.getEdges() == edges);
vector<string> features = { "sepallength", "sepalwidth", "petallength", "petalwidth", "class" };
REQUIRE(net.getFeatures() == features);
REQUIRE(net2.getFeatures() == features);
REQUIRE(net3.getFeatures() == features);
auto& nodes = net.getNodes();
auto& nodes2 = net2.getNodes();
auto& nodes3 = net3.getNodes();
// Check Nodes parents & children
for (const auto& feature : features) {
// Parents
vector<string> parents, parents2, parents3, children, children2, children3;
auto nodeParents = nodes[feature]->getParents();
auto nodeParents2 = nodes2[feature]->getParents();
auto nodeParents3 = nodes3[feature]->getParents();
transform(nodeParents.begin(), nodeParents.end(), back_inserter(parents), [](const auto& p) { return p->getName(); });
transform(nodeParents2.begin(), nodeParents2.end(), back_inserter(parents2), [](const auto& p) { return p->getName(); });
transform(nodeParents3.begin(), nodeParents3.end(), back_inserter(parents3), [](const auto& p) { return p->getName(); });
REQUIRE(parents == parents2);
REQUIRE(parents == parents3);
// Children
auto nodeChildren = nodes[feature]->getChildren();
auto nodeChildren2 = nodes2[feature]->getChildren();
auto nodeChildren3 = nodes3[feature]->getChildren();
transform(nodeChildren.begin(), nodeChildren.end(), back_inserter(children), [](const auto& p) { return p->getName(); });
transform(nodeChildren2.begin(), nodeChildren2.end(), back_inserter(children2), [](const auto& p) { return p->getName(); });
transform(nodeChildren3.begin(), nodeChildren3.end(), back_inserter(children3), [](const auto& p) { return p->getName(); });
REQUIRE(children == children2);
REQUIRE(children == children3);
}
// Fit networks
net.fit(raw.Xv, raw.yv, raw.weightsv, raw.featuresv, raw.classNamev, raw.statesv);
net2.fit(raw.dataset, raw.weights, raw.featurest, raw.classNamet, raw.statest);
net3.fit(raw.Xt, raw.yt, raw.weights, raw.featurest, raw.classNamet, raw.statest);
REQUIRE(net.getStates() == net2.getStates());
REQUIRE(net.getStates() == net3.getStates());
// Check the conditional probability tables
for (const auto& feature : features) {
auto cpt = nodes[feature]->getCPT();
auto cpt2 = nodes2[feature]->getCPT();
auto cpt3 = nodes3[feature]->getCPT();
REQUIRE(cpt.equal(cpt2));
REQUIRE(cpt.equal(cpt3));
}
}
SECTION("Test show")
{
auto net = bayesnet::Network();
net.addNode("A");
net.addNode("B");
net.addNode("C");
net.addEdge("A", "B");
net.addEdge("A", "C");
auto str = net.show();
REQUIRE(str.size() == 3);
REQUIRE(str[0] == "A -> B, C, ");
REQUIRE(str[1] == "B -> ");
REQUIRE(str[2] == "C -> ");
}
SECTION("Test topological_sort")
{
auto net = bayesnet::Network();
net.addNode("A");
net.addNode("B");
net.addNode("C");
net.addEdge("A", "B");
net.addEdge("A", "C");
auto sorted = net.topological_sort();
REQUIRE(sorted.size() == 3);
REQUIRE(sorted[0] == "A");
bool result = sorted[1] == "B" && sorted[2] == "C";
REQUIRE(result);
}
SECTION("Test graph")
{
auto net = bayesnet::Network();
net.addNode("A");
net.addNode("B");
net.addNode("C");
net.addEdge("A", "B");
net.addEdge("A", "C");
auto str = net.graph("Test Graph");
REQUIRE(str.size() == 7);
REQUIRE(str[0] == "digraph BayesNet {\nlabel=<BayesNet Test Graph>\nfontsize=30\nfontcolor=blue\nlabelloc=t\nlayout=circo\n");
REQUIRE(str[1] == "A [shape=circle] \n");
REQUIRE(str[2] == "A -> B");
REQUIRE(str[3] == "A -> C");
REQUIRE(str[4] == "B [shape=circle] \n");
REQUIRE(str[5] == "C [shape=circle] \n");
REQUIRE(str[6] == "}\n");
}
// SECTION("Test predict")
// {
// auto net = bayesnet::Network();
// net.fit(raw.Xv, raw.yv, raw.weightsv, raw.featuresv, raw.classNamev, raw.statesv);
// vector<vector<int>> test = { {1, 2, 0, 1}, {0, 1, 2, 0}, {1, 1, 1, 1}, {0, 0, 0, 0}, {2, 2, 2, 2} };
// vector<int> y_test = { 0, 1, 1, 0, 2 };
// auto y_pred = net.predict(test);
// REQUIRE(y_pred == y_test);
// }
// SECTION("Test predict_proba")
// {
// auto net = bayesnet::Network();
// net.fit(raw.Xv, raw.yv, raw.weightsv, raw.featuresv, raw.classNamev, raw.statesv);
// vector<vector<int>> test = { {1, 2, 0, 1}, {0, 1, 2, 0}, {1, 1, 1, 1}, {0, 0, 0, 0}, {2, 2, 2, 2} };
// auto y_test = { 0, 1, 1, 0, 2 };
// auto y_pred = net.predict(test);
// REQUIRE(y_pred == y_test);
// }
}
// SECTION("Test score")
// {
// auto net = bayesnet::Network();
// net.fit(Xd, y, weights, features, className, states);
// auto test = { {1, 2, 0, 1}, {0, 1, 2, 0}, {1, 1, 1, 1}, {0, 0, 0, 0}, {2, 2, 2, 2} };
// auto score = net.score(X, y);
// REQUIRE(score == Catch::Approx();
// }
//
//
// SECTION("Test graph")
// {
// auto net = bayesnet::Network();
// net.addNode("A");
// net.addNode("B");
// net.addNode("C");
// net.addEdge("A", "B");
// net.addEdge("A", "C");
// auto str = net.graph("Test Graph");
// REQUIRE(str.size() == 6);
// REQUIRE(str[0] == "digraph \"Test Graph\" {");
// REQUIRE(str[1] == " A -> B;");
// REQUIRE(str[2] == " A -> C;");
// REQUIRE(str[3] == " B [shape=ellipse];");
// REQUIRE(str[4] == " C [shape=ellipse];");
// REQUIRE(str[5] == "}");
// }
// SECTION("Test initialize")
// {
// auto net = bayesnet::Network();
// net.addNode("A");
// net.addNode("B");
// net.addNode("C");
// net.addEdge("A", "B");
// net.addEdge("A", "C");
// net.initialize();
// REQUIRE(net.getNodes().size() == 0);
// REQUIRE(net.getEdges().size() == 0);
// REQUIRE(net.getFeatures().size() == 0);
// REQUIRE(net.getClassNumStates() == 0);
// REQUIRE(net.getClassName().empty());
// REQUIRE(net.getStates() == 0);
// REQUIRE(net.getSamples().numel() == 0);
// }
// SECTION("Test dump_cpt")
// {
// auto net = bayesnet::Network();
// net.addNode("A");
// net.addNode("B");
// net.addNode("C");
// net.addEdge("A", "B");
// net.addEdge("A", "C");
// net.setClassName("C");
// net.setStates({ {"A", {0, 1}}, {"B", {0, 1}}, {"C", {0, 1, 2}} });
// net.fit({ {0, 0}, {0, 1}, {1, 0}, {1, 1} }, { 0, 1, 1, 2 }, {}, { "A", "B" }, "C", { {"A", {0, 1}}, {"B", {0, 1}}, {"C", {0, 1, 2}} });
// net.dump_cpt();
// // TODO: Check that the file was created and contains the expected data
// }
// SECTION("Test version")
// {
// auto net = bayesnet::Network();
// REQUIRE(net.version() == "0.2.0");
// }
// }
// }

tests/TestFolding.cc Normal file

@@ -0,0 +1,95 @@
#include <catch2/catch_test_macros.hpp>
#include <catch2/catch_approx.hpp>
#include <catch2/generators/catch_generators.hpp>
#include "TestUtils.h"
#include "Folding.h"
TEST_CASE("KFold Test", "[Platform][KFold]")
{
// Initialize a KFold object with k=5 and a seed of 19.
string file_name = GENERATE("glass", "iris", "ecoli", "diabetes");
auto raw = RawDatasets(file_name, true);
int nFolds = 5;
platform::KFold kfold(nFolds, raw.nSamples, 19);
int number = raw.nSamples * (kfold.getNumberOfFolds() - 1) / kfold.getNumberOfFolds();
SECTION("Number of Folds")
{
REQUIRE(kfold.getNumberOfFolds() == nFolds);
}
SECTION("Fold Test")
{
// Test each fold's size and contents.
for (int i = 0; i < nFolds; ++i) {
auto [train_indices, test_indices] = kfold.getFold(i);
bool result = train_indices.size() == number || train_indices.size() == number + 1;
REQUIRE(result);
REQUIRE(train_indices.size() + test_indices.size() == raw.nSamples);
}
}
}
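// Counts how many samples of each class label fall among the given indices.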
map<int, int> counts(vector<int> y, vector<int> indices)
{
map<int, int> result;
for (auto i = 0; i < indices.size(); ++i) {
result[y[indices[i]]]++;
}
return result;
}
TEST_CASE("StratifiedKFold Test", "[Platform][StratifiedKFold]")
{
// Initialize StratifiedKFold objects with k in {3, 5, 10}, using the y vector and a seed of 17.
string file_name = GENERATE("glass", "iris", "ecoli", "diabetes");
int nFolds = GENERATE(3, 5, 10);
auto raw = RawDatasets(file_name, true);
platform::StratifiedKFold stratified_kfoldt(nFolds, raw.yt, 17);
platform::StratifiedKFold stratified_kfoldv(nFolds, raw.yv, 17);
int number = raw.nSamples * (stratified_kfoldt.getNumberOfFolds() - 1) / stratified_kfoldt.getNumberOfFolds();
SECTION("Stratified Number of Folds")
{
REQUIRE(stratified_kfoldt.getNumberOfFolds() == nFolds);
}
SECTION("Stratified Fold Test")
{
// Test each fold's size and contents.
auto counts = map<int, vector<int>>();
// Initialize the counts per Fold
for (int i = 0; i < nFolds; ++i) {
counts[i] = vector<int>(raw.classNumStates, 0);
}
// Check fold and compute counts of each fold
for (int fold = 0; fold < nFolds; ++fold) {
auto [train_indicest, test_indicest] = stratified_kfoldt.getFold(fold);
auto [train_indicesv, test_indicesv] = stratified_kfoldv.getFold(fold);
REQUIRE(train_indicest == train_indicesv);
REQUIRE(test_indicest == test_indicesv);
// In the worst case the training set holds number + raw.classNumStates samples,
// because that fold can receive one remainder sample from each class.
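// e.g. iris: 150 samples, 5 folds and 3 classes give number = 120, so a
// training split may hold at most 123 indices.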
REQUIRE(train_indicest.size() <= number + raw.classNumStates);
// If any class has fewer samples than the number of folds, the fold is faulty
// and the training set plus the test set will hold fewer than nSamples samples.
if (!stratified_kfoldt.isFaulty()) {
REQUIRE(train_indicest.size() + test_indicest.size() == raw.nSamples);
} else {
REQUIRE(train_indicest.size() + test_indicest.size() <= raw.nSamples);
}
auto train_t = torch::tensor(train_indicest);
auto ytrain = raw.yt.index({ train_t });
// Check that the class labels have been assigned evenly to each fold
for (const auto& idx : train_indicest) {
counts[fold][raw.yt[idx].item<int>()]++;
}
}
// Check that every class is spread almost evenly (within one sample) across folds
for (int fold = 0; fold < nFolds; ++fold) {
for (int j = 1; j < nFolds - 1; ++j) {
for (int k = 0; k < raw.classNumStates; ++k) {
REQUIRE(abs(counts.at(fold).at(k) - counts.at(j).at(k)) <= 1);
}
}
}
}
}


@@ -1,18 +1,14 @@
#include "platformUtils.h"
#include "Paths.h"
#include "TestUtils.h"
using namespace std;
using namespace torch;
vector<string> split(const string& text, char delimiter)
{
vector<string> result;
stringstream ss(text);
string token;
while (getline(ss, token, delimiter)) {
result.push_back(token);
class Paths {
public:
static string datasets()
{
return "../../data/";
}
return result;
}
};
pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t>& X, mdlp::labels_t& y, vector<string> features)
{
@@ -50,10 +46,10 @@ bool file_exists(const string& name)
}
}
tuple<Tensor, Tensor, vector<string>, string, map<string, vector<int>>> loadDataset(const string& path, const string& name, bool class_last, bool discretize_dataset)
tuple<Tensor, Tensor, vector<string>, string, map<string, vector<int>>> loadDataset(const string& name, bool class_last, bool discretize_dataset)
{
auto handler = ArffFiles();
handler.load(path + static_cast<string>(name) + ".arff", class_last);
handler.load(Paths::datasets() + static_cast<string>(name) + ".arff", class_last);
// Get Dataset X, y
vector<mdlp::samples_t>& X = handler.getX();
mdlp::labels_t& y = handler.getY();
@@ -66,19 +62,19 @@ tuple<Tensor, Tensor, vector<string>, string, map<string, vector<int>>> loadData
auto states = map<string, vector<int>>();
if (discretize_dataset) {
auto Xr = discretizeDataset(X, y);
Xd = torch::zeros({ static_cast<int>(Xr[0].size()), static_cast<int>(Xr.size()) }, torch::kInt32);
Xd = torch::zeros({ static_cast<int>(Xr.size()), static_cast<int>(Xr[0].size()) }, torch::kInt32);
for (int i = 0; i < features.size(); ++i) {
states[features[i]] = vector<int>(*max_element(Xr[i].begin(), Xr[i].end()) + 1);
auto& item = states.at(features[i]);
iota(begin(item), end(item), 0);
Xd.index_put_({ "...", i }, torch::tensor(Xr[i], torch::kInt32));
Xd.index_put_({ i, "..." }, torch::tensor(Xr[i], torch::kInt32));
}
states[className] = vector<int>(*max_element(y.begin(), y.end()) + 1);
iota(begin(states.at(className)), end(states.at(className)), 0);
} else {
Xd = torch::zeros({ static_cast<int>(X[0].size()), static_cast<int>(X.size()) }, torch::kFloat32);
Xd = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kFloat32);
for (int i = 0; i < features.size(); ++i) {
Xd.index_put_({ "...", i }, torch::tensor(X[i]));
Xd.index_put_({ i, "..." }, torch::tensor(X[i]));
}
}
return { Xd, torch::tensor(y, torch::kInt32), features, className, states };
@@ -87,7 +83,7 @@ tuple<Tensor, Tensor, vector<string>, string, map<string, vector<int>>> loadData
tuple<vector<vector<int>>, vector<int>, vector<string>, string, map<string, vector<int>>> loadFile(const string& name)
{
auto handler = ArffFiles();
handler.load(platform::Paths::datasets() + static_cast<string>(name) + ".arff");
handler.load(Paths::datasets() + static_cast<string>(name) + ".arff");
// Get Dataset X, y
vector<mdlp::samples_t>& X = handler.getX();
mdlp::labels_t& y = handler.getY();

tests/TestUtils.h Normal file

@@ -0,0 +1,44 @@
#ifndef TEST_UTILS_H
#define TEST_UTILS_H
#include <torch/torch.h>
#include <string>
#include <vector>
#include <map>
#include <tuple>
#include "ArffFiles.h"
#include "CPPFImdlp.h"
using namespace std;
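// Shared test helpers: ARFF loading, MDLP discretization and the RawDatasets
// fixture used by both the BayesNet and Platform test suites.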
bool file_exists(const std::string& name);
pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t>& X, mdlp::labels_t& y, vector<string> features);
vector<mdlp::labels_t> discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y);
tuple<vector<vector<int>>, vector<int>, vector<string>, string, map<string, vector<int>>> loadFile(const string& name);
tuple<torch::Tensor, torch::Tensor, vector<string>, string, map<string, vector<int>>> loadDataset(const string& name, bool class_last, bool discretize_dataset);
class RawDatasets {
public:
RawDatasets(const string& file_name, bool discretize)
{
// Xt can be either discretized or not
tie(Xt, yt, featurest, classNamet, statest) = loadDataset(file_name, true, discretize);
// Xv is always discretized
tie(Xv, yv, featuresv, classNamev, statesv) = loadFile(file_name);
auto yresized = torch::transpose(yt.view({ yt.size(0), 1 }), 0, 1);
dataset = torch::cat({ Xt, yresized }, 0);
nSamples = dataset.size(1);
weights = torch::full({ nSamples }, 1.0 / nSamples, torch::kDouble);
weightsv = vector<double>(nSamples, 1.0 / nSamples);
classNumStates = discretize ? statest.at(classNamet).size() : 0;
}
torch::Tensor Xt, yt, dataset, weights;
vector<vector<int>> Xv;
vector<double> weightsv;
vector<int> yv;
vector<string> featurest, featuresv;
map<string, vector<int>> statest, statesv;
string classNamet, classNamev;
int nSamples, classNumStates;
double epsilon = 1e-5;
};
#endif //TEST_UTILS_H