Compare commits

...

144 Commits

Author SHA1 Message Date
fa7fe081ad Fix xlsx library finding 2023-10-15 11:19:58 +02:00
660e783517 Update validation for feature selection 2023-10-14 13:32:09 +02:00
b35532dd9e Implement IWSS and FCBF too for BoostAODE 2023-10-14 13:12:04 +02:00
6ef49385ea Remove unneeded method declaration FeatureSelect 2023-10-14 11:30:32 +02:00
6d5a25cdc8 Refactor CFS class creating abstract base class 2023-10-14 11:27:46 +02:00
d00b08cbe8 Fix Header for Linux 2023-10-13 14:26:47 +02:00
977ff6fddb Update CMakeLists for Linux 2023-10-13 14:01:52 +02:00
54b8939f35 Prepare BoostAODE first try 2023-10-13 13:46:22 +02:00
5022a4dc90 Complete CFS tested with Python mufs 2023-10-13 12:29:25 +02:00
40d1dad5d8 Begin CFS implementation 2023-10-11 21:17:26 +02:00
47e2b138c5 Complete first working cfs 2023-10-11 11:33:29 +02:00
e7ded68267 First cfs working version 2023-10-10 23:00:38 +02:00
ca833a34f5 try openssl sha256 2023-10-10 18:16:43 +02:00
df9b4c48d2 Begin CFS initialization 2023-10-10 13:39:11 +02:00
f288bbd6fa Begin adding cfs to BoostAODE 2023-10-10 11:52:39 +02:00
7d8aca4f59 Add Locale shared config to reports 2023-10-09 19:41:29 +02:00
8fdad78a8c Continue Test Network 2023-10-09 11:25:30 +02:00
e3ae073333 Continue test Network 2023-10-08 15:54:58 +02:00
4b732e76c2 MST change unordered_set to list 2023-10-07 19:08:13 +02:00
fe5fead27e Begin Fix Test MST 2023-10-07 01:43:26 +02:00
8c3864f3c8 Complete Folding Test 2023-10-07 01:23:36 +02:00
1287160c47 Refactor makefile to use variables 2023-10-07 00:16:25 +02:00
2f58807322 Begin refactor CMakeLists debug/release paths 2023-10-06 19:32:29 +02:00
17e079edd5 Begin Test Folding 2023-10-06 17:08:54 +02:00
b9e0028e9d Refactor Makefile 2023-10-06 01:28:27 +02:00
e0d39fe631 Fix BayesMetrics Test 2023-10-06 01:14:55 +02:00
36b0277576 Add Maximum Spanning Tree test 2023-10-05 15:45:36 +02:00
da8d018ec4 Refactor Makefile 2023-10-05 11:45:00 +02:00
5f0676691c Add First BayesMetrics Tests 2023-10-05 01:14:16 +02:00
3448fb1299 Refactor Tests and add BayesMetrics test 2023-10-04 23:19:23 +02:00
5e938d5cca Add ranks sheet to excel best results 2023-10-04 16:26:57 +02:00
55e742438f Add constant references to Statistics 2023-10-04 13:40:45 +02:00
c4ae3fe429 Add Control model rank info to report 2023-10-04 12:42:35 +02:00
93e4ff94db Add significance level as parameter in best 2023-10-02 15:46:40 +02:00
57c27f739c Remove unused code in BestResults 2023-10-02 15:31:02 +02:00
a434d7f1ae Add a Linux config in launch.json 2023-09-30 18:44:21 +02:00
294666c516 Fix a Linux problem in Datasets 2023-09-30 18:43:47 +02:00
fd04e78ad9 Restore sample.cc 2023-09-29 18:50:25 +02:00
66ec1b343b Remove platformUtils and split Datasets & Dataset 2023-09-29 18:20:46 +02:00
bb423da42f Add csv and R_dat files to platform 2023-09-29 13:52:50 +02:00
db17c14042 Change names of executables to b_... 2023-09-29 09:17:50 +02:00
a4401cb78f Linux CMakeLists.txt adjustment 2023-09-29 00:30:47 +02:00
9d3d9cc6c6 Complete Excel output for bestResults with Friedman test 2023-09-28 18:52:37 +02:00
cfcf3c16df Add best results Excel 2023-09-28 17:12:04 +02:00
85202260f3 Separate specific Excel methods to ExcelFile 2023-09-28 13:07:11 +02:00
82acb3cab5 Enhance output of Best results reports 2023-09-28 12:08:56 +02:00
623ceed396 Merge pull request 'Add Friedman Test & post hoc tests to BestResults' (#10) from boost into main
Reviewed-on: #10
2023-09-28 07:44:55 +00:00
926de2bebd Add boost info to README 2023-09-28 09:44:33 +02:00
71704e3547 Enhance output info in Statistics 2023-09-28 01:27:18 +02:00
3b06534327 Remove duplicated code in BestResults 2023-09-28 00:59:34 +02:00
ac89a451e3 Duplicate statistics tests in class 2023-09-28 00:45:15 +02:00
00c6cf663b Fix order of output in posthoc 2023-09-27 19:11:47 +02:00
5043c12be8 Complete posthoc with Holm adjust 2023-09-27 18:34:16 +02:00
11320e2cc7 Complete friedman test as in exreport 2023-09-27 12:36:03 +02:00
ce66483b65 Update boost version requirement for Linux 2023-09-26 14:12:53 +02:00
cab8e14b2d Add friedman hyperparameter 2023-09-26 11:26:59 +02:00
f0d0abe891 Add boost library link to linux build 2023-09-26 01:07:50 +02:00
dcba146e12 Begin adding Friedman test to BestResults 2023-09-26 01:04:59 +02:00
3ea0285119 Fix ranks to match friedman test ranks 2023-09-25 18:38:12 +02:00
e3888e1503 Merge pull request 'bestResults' (#9) from bestResults into main
Reviewed-on: https://gitea.rmontanana.es:3000/rmontanana/BayesNet/pulls/9

Add best results management, build, report, build all & report all
2023-09-25 12:02:17 +00:00
06de13df98 Add date/time to header of report best 2023-09-25 10:04:53 +02:00
de4fa6a04f Add color to totals 2023-09-23 10:30:39 +02:00
3a7bf4e672 Fix ranking order mistake 2023-09-23 01:33:23 +02:00
cd0bc02a74 Add report/build all with totals and ranks 2023-09-23 01:14:02 +02:00
c8597a794e Begin report all models 2023-09-22 18:13:32 +02:00
b30416364d Fix mistake in best results file name 2023-09-22 14:14:39 +02:00
3a16589220 Add best config for debug in vscode 2023-09-22 01:04:36 +02:00
c4f9187e2a Complete best build and report 2023-09-22 01:03:55 +02:00
c4d0a5b4e6 Split Result from Results 2023-09-21 23:30:17 +02:00
7bfafe555f Begin BestResults build 2023-09-21 23:04:11 +02:00
337b6f7e79 Rename BestResult to BestScore 2023-09-21 19:30:07 +02:00
5fa0b957dd Fix mistake in idx range in manage 2023-09-20 19:12:07 +02:00
67252fc41d Fix CMakeLists libxlsxwriter for Linux 2023-09-20 19:02:53 +02:00
94ae9456a0 Fix libxslxwriter linking problem 2023-09-20 18:50:11 +02:00
781993e326 Resolve some warnings 2023-09-20 17:54:15 +02:00
8257a6ae39 Add message of not exist Best Results 2023-09-20 13:50:34 +02:00
fc81730dfc Merge pull request 'Exchange OpenXLSX to libxlsxwriter' (#8) from libxlsxwriter into main
Add multiple sheets to excel file
Add format and color to sheets
Add comparison with ZeroR
Add comparison with Best Results
Separate contextual menu from general in manage
2023-09-20 11:17:16 +00:00
d8734ff082 Separate contextual menu from general 2023-09-20 13:15:33 +02:00
03533461c8 Add compare to best results in manage 2023-09-20 12:51:19 +02:00
68f22a673d Add comparison to report console 2023-09-20 11:40:01 +02:00
b9bc0088f3 Add format to unique dataset results summary 2023-09-20 10:30:45 +02:00
c280e254ca Remove OpenXLSX submodule 2023-09-20 01:09:58 +02:00
3d0f29fda3 Remove .vscode/settings.json from repository 2023-09-20 01:01:40 +02:00
20a6ebab7c Support to add any number of sheets to excel 2023-09-20 00:58:01 +02:00
925f71166c Fix mistake in comparison 2023-09-19 23:46:49 +02:00
f69f415b92 Complete comparison with ZeroR 2023-09-19 17:55:03 +02:00
1bdfbd1620 Complete adding color to format 2023-09-19 14:07:41 +02:00
06fb135526 First approach 2023-09-18 23:26:22 +02:00
501ea0ab4e Fix CMakeList manage build with Linux 2023-09-18 19:27:40 +02:00
847c6761d7 Add Linux specific link library to cmake 2023-09-17 10:42:19 +02:00
6030885fc3 Add partial result filter to manage 2023-09-16 17:27:18 +02:00
89df7f4db0 Add library to manage link 2023-09-14 01:41:49 +02:00
41257ed566 If ! convergence don't predict test 2023-09-10 19:50:36 +02:00
506369e46b Add Convergence hyperparameter 2023-09-07 11:27:35 +02:00
d908f389f5 Begin using validation as finish condition 2023-09-06 10:51:07 +02:00
5a7c8f1818 Add status to classifier and Experiment 2023-09-05 13:39:43 +02:00
64fc7bd9dd Add show dataset detail in report 2023-09-05 09:26:49 +02:00
0b7beda78c Add threads without limit to network fit 2023-09-04 21:24:11 +02:00
05b670dfc0 Add detail to fold progress in main 2023-09-03 16:33:48 +02:00
de62d42b74 Fix make debug command 2023-09-03 14:13:10 +02:00
edb957d22e Add filter complete results to manage 2023-09-03 14:07:11 +02:00
4de5cb4c6c Merge pull request 'Solve Ensemble models exceptions on certain datasets' (#7) from solveexceptions into main
Reviewed-on: #7
2023-09-02 15:29:33 +00:00
c35030f137 Upgrade models version and Add class diagram 2023-09-02 14:39:43 +02:00
182b07ed90 Solve voting vector error 2023-09-02 13:58:12 +02:00
7806f961e2 Remove threads 2023-08-31 20:30:28 +02:00
7c3e315ae7 Add Linux specific options to compile 2023-08-29 18:20:55 +02:00
284ef6dfd1 Add significanceModels to AODELd 2023-08-24 12:58:53 +02:00
1c6af619b5 Exception if hyperparameters not valid 2023-08-24 12:09:35 +02:00
86ffdfd6f3 Add const feature and className to fit models 2023-08-23 23:15:39 +02:00
d82148079d Add KDB hyperparameters K and theta 2023-08-23 00:44:10 +02:00
067430fd1b Add xlsxopen submodule 2023-08-22 23:45:11 +02:00
f5d0d16365 Merge pull request 'Add excel report to manage results' (#6) from xlsx into main
Reviewed-on: https://gitea.rmontanana.es:11000/rmontanana/BayesNet/pulls/6
2023-08-22 21:40:11 +00:00
97ca8ac084 Move check valid hyperparameters to Classifier 2023-08-22 22:12:20 +02:00
1c1385b768 Fix maxModels mistake in BoostAODE if !repeatSp
Throw exception if wrong hyperparmeter is supplied
2023-08-22 21:55:17 +02:00
35432b6294 Fix time std was not saved in experiment 2023-08-22 12:30:27 +02:00
c59dd30e53 Complete Excel Report with data 2023-08-22 11:55:15 +02:00
d2da0ddb88 Create ReportExcel eq to ReportConsole 2023-08-21 17:51:49 +02:00
8066701c3c Refactor Report class into ReportBase & ReportCons 2023-08-21 17:16:29 +02:00
0f66ac73d0 Revert "Refactor Report into ReportBase & ReportConsole"
This reverts commit 4370bf51d7.
2023-08-21 17:15:14 +02:00
4370bf51d7 Refactor Report into ReportBase & ReportConsole 2023-08-21 17:14:23 +02:00
2b7353b9e0 Add default sorting by date in manage 2023-08-21 16:30:10 +02:00
b686b3c9c3 Enhance copy in Makefile 2023-08-21 12:18:23 +02:00
2dd04a6c44 enhance saving results and add Makefile copy 2023-08-21 11:57:45 +02:00
1da83662d0 Always save results 2023-08-21 10:55:20 +02:00
3ac9593c65 Fix mistake in sample 2023-08-20 20:36:46 +02:00
6b317accf1 Add hyperparameters and processing order to Boost 2023-08-20 20:31:23 +02:00
4964aab722 Add hyperparameters management in experiments 2023-08-20 17:57:38 +02:00
7a6ec73d63 Merge pull request 'boostAode' (#5) from boostAode into main
Reviewed-on: https://gitea.rmontanana.es:11000/rmontanana/BayesNet/pulls/5
Implement boostAODE
add list datasets
add manage results
2023-08-20 09:02:07 +00:00
1a534888d6 Fix report format 2023-08-19 23:30:44 +02:00
59ffd179f4 Fix report format 2023-08-19 21:26:48 +02:00
9972738deb Add list datasets and add locale format 2023-08-19 19:05:16 +02:00
bafcb26bb6 Add manage to build target 2023-08-18 13:43:53 +02:00
2d7999d5f2 Add manage to release targets 2023-08-18 13:43:13 +02:00
a6bb22dfb5 Complete first BoostAODE 2023-08-18 11:50:34 +02:00
704dc937be Remove FeatureSel, add SelectKBest to BayesMetrics 2023-08-16 19:05:18 +02:00
a3e665eed6 make weights double 2023-08-16 12:46:09 +02:00
918a7b4180 Remove unneeded output 2023-08-16 12:36:38 +02:00
80b20f35b4 Fix weights mistakes in computation 2023-08-16 12:32:51 +02:00
4d4780c1d5 Add BoostAODE model based on AODE 2023-08-15 16:16:04 +02:00
fa612c531e Complete Adding weights to Models 2023-08-15 15:59:56 +02:00
24b68f9ae2 Add weigths as parameter 2023-08-15 15:04:56 +02:00
a062ebf445 Merge pull request 'reports' (#4) from reports into boostAode
Reviewed-on: https://gitea.rmontanana.es:11000/rmontanana/BayesNet/pulls/4
2023-08-14 16:58:48 +00:00
f26ea1f0ac Add weights to BayesMetrics 2023-08-13 12:56:06 +02:00
af0419c9da First approx with const 1 weights 2023-08-13 00:59:02 +02:00
115 changed files with 4700 additions and 1184 deletions

31
.clang-uml Normal file
View File

@@ -0,0 +1,31 @@
compilation_database_dir: build
output_directory: puml
diagrams:
BayesNet:
type: class
glob:
- src/BayesNet/*.cc
- src/Platform/*.cc
using_namespace: bayesnet
include:
namespaces:
- bayesnet
- platform
plantuml:
after:
- "note left of {{ alias(\"MyProjectMain\") }}: Main class of myproject library."
sequence:
type: sequence
glob:
- src/Platform/main.cc
combine_free_functions_into_file_participants: true
using_namespace:
- std
- bayesnet
- platform
include:
paths:
- src/BayesNet
- src/Platform
start_from:
- function: main(int,const char **)

6
.gitignore vendored
View File

@@ -31,7 +31,11 @@
*.exe
*.out
*.app
build/
build/**
build_debug/**
build_release/**
*.dSYM/**
cmake-build*/**
.idea
puml/**
.vscode/settings.json

3
.gitmodules vendored
View File

@@ -10,3 +10,6 @@
[submodule "lib/json"]
path = lib/json
url = https://github.com/nlohmann/json.git
[submodule "lib/libxlsxwriter"]
path = lib/libxlsxwriter
url = https://github.com/jmcnamara/libxlsxwriter.git

18
.vscode/c_cpp_properties.json vendored Normal file
View File

@@ -0,0 +1,18 @@
{
"configurations": [
{
"name": "Mac",
"includePath": [
"${workspaceFolder}/**"
],
"defines": [],
"macFrameworkPath": [
"/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks"
],
"cStandard": "c17",
"cppStandard": "c++17",
"compileCommands": "${workspaceFolder}/cmake-build-release/compile_commands.json"
}
],
"version": 4
}

49
.vscode/launch.json vendored
View File

@@ -10,7 +10,7 @@
"-d",
"iris",
"-m",
"KDB",
"TANLd",
"-s",
"271",
"-p",
@@ -22,15 +22,30 @@
"type": "lldb",
"request": "launch",
"name": "experiment",
"program": "${workspaceFolder}/build/src/Platform/main",
"program": "${workspaceFolder}/build/src/Platform/b_main",
"args": [
"-m",
"SPODELd",
"-p",
"/Users/rmontanana/Code/discretizbench/datasets",
"TAN",
"--stratified",
"-d",
"iris"
"zoo",
"--discretize"
// "--hyperparameters",
// "{\"repeatSparent\": true, \"maxModels\": 12}"
],
"cwd": "/Users/rmontanana/Code/odtebench",
},
{
"type": "lldb",
"request": "launch",
"name": "best",
"program": "${workspaceFolder}/build/src/Platform/b_best",
"args": [
"-m",
"BoostAODE",
"-s",
"accuracy",
"--build",
],
"cwd": "/Users/rmontanana/Code/discretizbench",
},
@@ -38,13 +53,33 @@
"type": "lldb",
"request": "launch",
"name": "manage",
"program": "${workspaceFolder}/build/src/Platform/manage",
"program": "${workspaceFolder}/build/src/Platform/b_manage",
"args": [
"-n",
"20"
],
"cwd": "/Users/rmontanana/Code/discretizbench",
},
{
"type": "lldb",
"request": "launch",
"name": "list",
"program": "${workspaceFolder}/build/src/Platform/b_list",
"args": [],
//"cwd": "/Users/rmontanana/Code/discretizbench",
"cwd": "/home/rmontanana/Code/covbench",
},
{
"type": "lldb",
"request": "launch",
"name": "test",
"program": "${workspaceFolder}/build/tests/unit_tests",
"args": [
"-c=\"Metrics Test\"",
// "-s",
],
"cwd": "${workspaceFolder}/build/tests",
},
{
"name": "Build & debug active file",
"type": "cppdbg",

109
.vscode/settings.json vendored
View File

@@ -1,109 +0,0 @@
{
"files.associations": {
"*.rmd": "markdown",
"*.py": "python",
"vector": "cpp",
"__bit_reference": "cpp",
"__bits": "cpp",
"__config": "cpp",
"__debug": "cpp",
"__errc": "cpp",
"__hash_table": "cpp",
"__locale": "cpp",
"__mutex_base": "cpp",
"__node_handle": "cpp",
"__nullptr": "cpp",
"__split_buffer": "cpp",
"__string": "cpp",
"__threading_support": "cpp",
"__tuple": "cpp",
"array": "cpp",
"atomic": "cpp",
"bitset": "cpp",
"cctype": "cpp",
"chrono": "cpp",
"clocale": "cpp",
"cmath": "cpp",
"compare": "cpp",
"complex": "cpp",
"concepts": "cpp",
"cstdarg": "cpp",
"cstddef": "cpp",
"cstdint": "cpp",
"cstdio": "cpp",
"cstdlib": "cpp",
"cstring": "cpp",
"ctime": "cpp",
"cwchar": "cpp",
"cwctype": "cpp",
"exception": "cpp",
"initializer_list": "cpp",
"ios": "cpp",
"iosfwd": "cpp",
"istream": "cpp",
"limits": "cpp",
"locale": "cpp",
"memory": "cpp",
"mutex": "cpp",
"new": "cpp",
"optional": "cpp",
"ostream": "cpp",
"ratio": "cpp",
"sstream": "cpp",
"stdexcept": "cpp",
"streambuf": "cpp",
"string": "cpp",
"string_view": "cpp",
"system_error": "cpp",
"tuple": "cpp",
"type_traits": "cpp",
"typeinfo": "cpp",
"unordered_map": "cpp",
"variant": "cpp",
"algorithm": "cpp",
"iostream": "cpp",
"iomanip": "cpp",
"numeric": "cpp",
"set": "cpp",
"__tree": "cpp",
"deque": "cpp",
"list": "cpp",
"map": "cpp",
"unordered_set": "cpp",
"any": "cpp",
"condition_variable": "cpp",
"forward_list": "cpp",
"fstream": "cpp",
"stack": "cpp",
"thread": "cpp",
"__memory": "cpp",
"filesystem": "cpp",
"*.toml": "toml",
"utility": "cpp",
"__verbose_abort": "cpp",
"bit": "cpp",
"random": "cpp",
"*.tcc": "cpp",
"functional": "cpp",
"iterator": "cpp",
"memory_resource": "cpp",
"format": "cpp",
"valarray": "cpp",
"regex": "cpp",
"span": "cpp",
"cfenv": "cpp",
"cinttypes": "cpp",
"csetjmp": "cpp",
"future": "cpp",
"queue": "cpp",
"typeindex": "cpp",
"shared_mutex": "cpp",
"*.ipp": "cpp",
"cassert": "cpp",
"charconv": "cpp",
"source_location": "cpp",
"ranges": "cpp"
},
"cmake.configureOnOpen": false,
"C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools"
}

23
.vscode/tasks.json vendored
View File

@@ -32,6 +32,29 @@
],
"group": "build",
"detail": "Task generated by Debugger."
},
{
"type": "cppbuild",
"label": "C/C++: g++ build active file",
"command": "/usr/bin/g++",
"args": [
"-fdiagnostics-color=always",
"-g",
"${file}",
"-o",
"${fileDirname}/${fileBasenameNoExtension}"
],
"options": {
"cwd": "${fileDirname}"
},
"problemMatcher": [
"$gcc"
],
"group": {
"kind": "build",
"isDefault": true
},
"detail": "Task generated by Debugger."
}
]
}

View File

@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.20)
project(BayesNet
VERSION 0.1.0
VERSION 0.2.0
DESCRIPTION "Bayesian Network and basic classifiers Library."
HOMEPAGE_URL "https://github.com/rmontanana/bayesnet"
LANGUAGES CXX
@@ -31,17 +31,26 @@ option(ENABLE_CLANG_TIDY "Enable to add clang tidy." OFF)
option(ENABLE_TESTING "Unit testing build" OFF)
option(CODE_COVERAGE "Collect coverage from test library" OFF)
# Boost Library
set(Boost_USE_STATIC_LIBS OFF)
set(Boost_USE_MULTITHREADED ON)
set(Boost_USE_STATIC_RUNTIME OFF)
find_package(Boost 1.66.0 REQUIRED)
if(Boost_FOUND)
message("Boost_INCLUDE_DIRS=${Boost_INCLUDE_DIRS}")
include_directories(${Boost_INCLUDE_DIRS})
endif()
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
# CMakes modules
# --------------
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH})
include(AddGitSubmodule)
if (CODE_COVERAGE)
enable_testing()
include(CodeCoverage)
MESSAGE("Code coverage enabled")
set(CMAKE_C_FLAGS " ${CMAKE_C_FLAGS} -fprofile-arcs -ftest-coverage")
set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage")
set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage -O0 -g")
SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage")
endif (CODE_COVERAGE)
@@ -56,6 +65,11 @@ add_git_submodule("lib/mdlp")
add_git_submodule("lib/argparse")
add_git_submodule("lib/json")
find_library(XLSXWRITER_LIB NAMES libxlsxwriter.dylib libxlsxwriter.so PATHS ${BayesNet_SOURCE_DIR}/lib/libxlsxwriter/lib)
message("XLSXWRITER_LIB=${XLSXWRITER_LIB}")
# Subdirectories
# --------------
add_subdirectory(config)
@@ -64,7 +78,7 @@ add_subdirectory(src/BayesNet)
add_subdirectory(src/Platform)
add_subdirectory(sample)
file(GLOB BayesNet_HEADERS CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/BayesNet/*.h ${BayesNet_SOURCE_DIR}/BayesNet/*.hpp)
file(GLOB BayesNet_HEADERS CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/BayesNet/*.h ${BayesNet_SOURCE_DIR}/BayesNet/*.h)
file(GLOB BayesNet_SOURCES CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/BayesNet/*.cc ${BayesNet_SOURCE_DIR}/src/BayesNet/*.cpp)
file(GLOB Platform_SOURCES CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/Platform/*.cc ${BayesNet_SOURCE_DIR}/src/Platform/*.cpp)
@@ -74,7 +88,6 @@ file(GLOB Platform_SOURCES CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/Platform
if (ENABLE_TESTING)
MESSAGE("Testing enabled")
add_git_submodule("lib/catch2")
include(CTest)
add_subdirectory(tests)
endif (ENABLE_TESTING)

118
Makefile
View File

@@ -1,6 +1,26 @@
SHELL := /bin/bash
.DEFAULT_GOAL := help
.PHONY: coverage setup help build test
.PHONY: coverage setup help build test clean debug release
f_release = build_release
f_debug = build_debug
app_targets = b_best b_list b_main b_manage
test_targets = unit_tests_bayesnet unit_tests_platform
n_procs = -j 16
define ClearTests
@for t in $(test_targets); do \
if [ -f $(f_debug)/tests/$$t ]; then \
echo ">>> Cleaning $$t..." ; \
rm -f $(f_debug)/tests/$$t ; \
fi ; \
done
@nfiles="$(find . -name "*.gcda" -print0)" ; \
if test "${nfiles}" != "" ; then \
find . -name "*.gcda" -print0 | xargs -0 rm 2>/dev/null ;\
fi ;
endef
setup: ## Install dependencies for tests and coverage
@if [ "$(shell uname)" = "Darwin" ]; then \
@@ -11,49 +31,85 @@ setup: ## Install dependencies for tests and coverage
pip install gcovr; \
fi
dest ?= ${HOME}/bin
install: ## Copy binary files to bin folder
@echo "Destination folder: $(dest)"
make buildr
@echo ">>> Copying files to $(dest)"
@cp $(f_release)/src/Platform/b_main $(dest)
@cp $(f_release)/src/Platform/b_list $(dest)
@cp $(f_release)/src/Platform/b_manage $(dest)
@cp $(f_release)/src/Platform/b_best $(dest)
dependency: ## Create a dependency graph diagram of the project (build/dependency.png)
cd build && cmake .. --graphviz=dependency.dot && dot -Tpng dependency.dot -o dependency.png
@echo ">>> Creating dependency graph diagram of the project...";
$(MAKE) debug
cd $(f_debug) && cmake .. --graphviz=dependency.dot && dot -Tpng dependency.dot -o dependency.png
build: ## Build the main and BayesNetSample
cmake --build build -t main -t BayesNetSample -j 32
buildd: ## Build the debug targets
cmake --build $(f_debug) -t $(app_targets) $(n_procs)
clean: ## Clean the debug info
@echo ">>> Cleaning Debug BayesNet ...";
find . -name "*.gcda" -print0 | xargs -0 rm
buildr: ## Build the release targets
cmake --build $(f_release) -t $(app_targets) $(n_procs)
clean: ## Clean the tests info
@echo ">>> Cleaning Debug BayesNet tests...";
$(call ClearTests)
@echo ">>> Done";
clang-uml: ## Create uml class and sequence diagrams
clang-uml -p --add-compile-flag -I /usr/lib/gcc/x86_64-redhat-linux/8/include/
debug: ## Build a debug version of the project
@echo ">>> Building Debug BayesNet ...";
@if [ -d ./build ]; then rm -rf ./build; fi
@mkdir build;
cmake -S . -B build -D CMAKE_BUILD_TYPE=Debug -D ENABLE_TESTING=ON -D CODE_COVERAGE=ON; \
cmake --build build -j 32;
@echo ">>> Building Debug BayesNet...";
@if [ -d ./$(f_debug) ]; then rm -rf ./$(f_debug); fi
@mkdir $(f_debug);
@cmake -S . -B $(f_debug) -D CMAKE_BUILD_TYPE=Debug -D ENABLE_TESTING=ON -D CODE_COVERAGE=ON
@echo ">>> Done";
release: ## Build a Release version of the project
@echo ">>> Building Release BayesNet ...";
@if [ -d ./build ]; then rm -rf ./build; fi
@mkdir build;
cmake -S . -B build -D CMAKE_BUILD_TYPE=Release; \
cmake --build build -t main -t BayesNetSample -j 32;
@echo ">>> Building Release BayesNet...";
@if [ -d ./$(f_release) ]; then rm -rf ./$(f_release); fi
@mkdir $(f_release);
@cmake -S . -B $(f_release) -D CMAKE_BUILD_TYPE=Release
@echo ">>> Done";
test: ## Run tests
@echo "* Running tests...";
find . -name "*.gcda" -print0 | xargs -0 rm
@cd build; \
cmake --build . --target unit_tests ;
@cd build/tests; \
./unit_tests;
opt = ""
test: ## Run tests (opt="-s") to verbose output the tests, (opt="-c='Test Maximum Spanning Tree'") to run only that section
@echo ">>> Running BayesNet & Platform tests...";
@$(MAKE) clean
@cmake --build $(f_debug) -t $(test_targets) $(n_procs)
@for t in $(test_targets); do \
if [ -f $(f_debug)/tests/$$t ]; then \
cd $(f_debug)/tests ; \
./$$t $(opt) ; \
fi ; \
done
@echo ">>> Done";
opt = ""
testp: ## Run platform tests (opt="-s") to verbose output the tests, (opt="-c='Stratified Fold Test'") to run only that section
@echo ">>> Running Platform tests...";
@$(MAKE) clean
@cmake --build $(f_debug) --target unit_tests_platform $(n_procs)
@if [ -f $(f_debug)/tests/unit_tests_platform ]; then cd $(f_debug)/tests ; ./unit_tests_platform $(opt) ; fi ;
@echo ">>> Done";
opt = ""
testb: ## Run BayesNet tests (opt="-s") to verbose output the tests, (opt="-c='Test Maximum Spanning Tree'") to run only that section
@echo ">>> Running BayesNet tests...";
@$(MAKE) clean
@cmake --build $(f_debug) --target unit_tests_bayesnet $(n_procs)
@if [ -f $(f_debug)/tests/unit_tests_bayesnet ]; then cd $(f_debug)/tests ; ./unit_tests_bayesnet $(opt) ; fi ;
@echo ">>> Done";
coverage: ## Run tests and generate coverage report (build/index.html)
@echo "*Building tests...";
find . -name "*.gcda" -print0 | xargs -0 rm
@cd build; \
cmake --build . --target unit_tests ;
@cd build/tests; \
./unit_tests;
gcovr ;
@echo ">>> Building tests with coverage...";
@$(MAKE) test
@cd $(f_debug) ; \
gcovr --config ../gcovr.cfg tests ;
@echo ">>> Done";
help: ## Show help message
@IFS=$$'\n' ; \

View File

@@ -2,4 +2,50 @@
Bayesian Network Classifier with libtorch from scratch
## 0. Setup
Before compiling BayesNet.
### boost library
[Getting Started](<https://www.boost.org/doc/libs/1_83_0/more/getting_started/index.html>)
The best option is install the packages that the Linux distribution have in its repository. If this is the case:
```bash
sudo dnf install boost-devel
```
If this is not possible and the compressed packaged is installed, the following environment variable has to be set:
```bash
export BOOST_ROOT=/path/to/library/
```
### libxlswriter
```bash
cd lib/libxlsxwriter
make
make install DESTDIR=/home/rmontanana/Code PREFIX=
```
Environment variable has to be set:
```bash
export LD_LIBRARY_PATH=/usr/local/lib
```
### Release
```bash
make release
```
### Debug & Tests
```bash
make debug
```
## 1. Introduction

View File

@@ -1,12 +0,0 @@
digraph BayesNet {
label=<BayesNet >
fontsize=30
fontcolor=blue
labelloc=t
layout=circo
class [shape=circle, fontcolor=red, fillcolor=lightblue, style=filled ]
class -> sepallength class -> sepalwidth class -> petallength class -> petalwidth petallength [shape=circle]
petallength -> sepallength petalwidth [shape=circle]
sepallength [shape=circle]
sepallength -> sepalwidth sepalwidth [shape=circle]
sepalwidth -> petalwidth }

View File

@@ -1 +0,0 @@
null

BIN
diagrams/BayesNet.pdf Executable file

Binary file not shown.

View File

@@ -1,2 +1 @@
add_library(ArffFiles ArffFiles.cc)
#target_link_libraries(BayesNet "${TORCH_LIBRARIES}")

1
lib/libxlsxwriter Submodule

Submodule lib/libxlsxwriter added at 29355a0887

33
mac_mst.txt Normal file
View File

@@ -0,0 +1,33 @@
Weights matrix:
0.0000000, 0.0384968, 0.0795434, 0.1546867, -0.0000000, 0.1788104, 0.2214721, 0.0323837, 0.0366549,
0.0384968, 0.0000000, 0.0200662, 0.0200937, -0.0000000, 0.0637224, 0.0183005, 0.0127657, 0.0136054,
0.0795434, 0.0200662, 0.0000000, 0.0605489, -0.0000000, 0.0894469, 0.1689408, 0.0321602, 0.0223184,
0.1546867, 0.0200937, 0.0605489, 0.0000000, -0.0000000, 0.1150757, 0.1332292, 0.0422865, 0.0191138,
-0.0000000, -0.0000000, -0.0000000, -0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000,
0.1788104, 0.0637224, 0.0894469, 0.1150757, 0.0000000, 0.0000000, 0.1407102, 0.0406590, 0.0366986,
0.2214721, 0.0183005, 0.1689408, 0.1332292, 0.0000000, 0.1407102, 0.0000000, 0.0427515, 0.0349965,
0.0323837, 0.0127657, 0.0321602, 0.0422865, 0.0000000, 0.0406590, 0.0427515, 0.0000000, 0.0343376,
0.0366549, 0.0136054, 0.0223184, 0.0191138, 0.0000000, 0.0366986, 0.0349965, 0.0343376, 0.0000000,
Edge : Weight
0 - 6 : 0.2214721
0 - 5 : 0.1788104
2 - 6 : 0.1689408
0 - 3 : 0.1546867
1 - 5 : 0.0637224
6 - 7 : 0.0427515
5 - 8 : 0.0366986
4 - 5 : 0.0000000
-------------------------------------------------------------------------------
Metrics Test
Test Maximum Spanning Tree
-------------------------------------------------------------------------------
/Users/rmontanana/Code/BayesNet/tests/TestBayesMetrics.cc:58
...............................................................................
/Users/rmontanana/Code/BayesNet/tests/TestBayesMetrics.cc:69: PASSED:
REQUIRE( result == resultsMST.at(file_name) )
with expansion:
(0, 6) (0, 5) (0, 3) (5, 1) (5, 8) (5, 4) (6, 2) (6, 7)
==
(0, 6) (0, 5) (0, 3) (5, 1) (5, 8) (5, 4) (6, 2) (6, 7)

View File

@@ -3,5 +3,6 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include)
include_directories(${BayesNet_SOURCE_DIR}/lib/json/include)
add_executable(BayesNetSample sample.cc ${BayesNet_SOURCE_DIR}/src/Platform/Folding.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
target_link_libraries(BayesNetSample BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}")

View File

@@ -3,13 +3,14 @@
#include <string>
#include <map>
#include <argparse/argparse.hpp>
#include "ArffFiles.h"
#include <nlohmann/json.hpp>
#include "ArffFiles.h"v
#include "BayesMetrics.h"
#include "CPPFImdlp.h"
#include "Folding.h"
#include "Models.h"
#include "modelRegister.h"
#include <fstream>
using namespace std;
@@ -178,59 +179,59 @@ int main(int argc, char** argv)
cout << "end." << endl;
auto score = clf->score(Xd, y);
cout << "Score: " << score << endl;
// auto graph = clf->graph();
// auto dot_file = model_name + "_" + file_name;
// ofstream file(dot_file + ".dot");
// file << graph;
// file.close();
// cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << endl;
// cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl;
// string stratified_string = stratified ? " Stratified" : "";
// cout << nFolds << " Folds" << stratified_string << " Cross validation" << endl;
// cout << "==========================================" << endl;
// torch::Tensor Xt = torch::zeros({ static_cast<int>(Xd.size()), static_cast<int>(Xd[0].size()) }, torch::kInt32);
// torch::Tensor yt = torch::tensor(y, torch::kInt32);
// for (int i = 0; i < features.size(); ++i) {
// Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
// }
// float total_score = 0, total_score_train = 0, score_train, score_test;
// Fold* fold;
// if (stratified)
// fold = new StratifiedKFold(nFolds, y, seed);
// else
// fold = new KFold(nFolds, y.size(), seed);
// for (auto i = 0; i < nFolds; ++i) {
// auto [train, test] = fold->getFold(i);
// cout << "Fold: " << i + 1 << endl;
// if (tensors) {
// auto ttrain = torch::tensor(train, torch::kInt64);
// auto ttest = torch::tensor(test, torch::kInt64);
// torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain);
// torch::Tensor ytraint = yt.index({ ttrain });
// torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest);
// torch::Tensor ytestt = yt.index({ ttest });
// clf->fit(Xtraint, ytraint, features, className, states);
// auto temp = clf->predict(Xtraint);
// score_train = clf->score(Xtraint, ytraint);
// score_test = clf->score(Xtestt, ytestt);
// } else {
// auto [Xtrain, ytrain] = extract_indices(train, Xd, y);
// auto [Xtest, ytest] = extract_indices(test, Xd, y);
// clf->fit(Xtrain, ytrain, features, className, states);
// score_train = clf->score(Xtrain, ytrain);
// score_test = clf->score(Xtest, ytest);
// }
// if (dump_cpt) {
// cout << "--- CPT Tables ---" << endl;
// clf->dump_cpt();
// }
// total_score_train += score_train;
// total_score += score_test;
// cout << "Score Train: " << score_train << endl;
// cout << "Score Test : " << score_test << endl;
// cout << "-------------------------------------------------------------------------------" << endl;
// }
// cout << "**********************************************************************************" << endl;
// cout << "Average Score Train: " << total_score_train / nFolds << endl;
// cout << "Average Score Test : " << total_score / nFolds << endl;return 0;
auto graph = clf->graph();
auto dot_file = model_name + "_" + file_name;
ofstream file(dot_file + ".dot");
file << graph;
file.close();
cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << endl;
cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl;
string stratified_string = stratified ? " Stratified" : "";
cout << nFolds << " Folds" << stratified_string << " Cross validation" << endl;
cout << "==========================================" << endl;
torch::Tensor Xt = torch::zeros({ static_cast<int>(Xd.size()), static_cast<int>(Xd[0].size()) }, torch::kInt32);
torch::Tensor yt = torch::tensor(y, torch::kInt32);
for (int i = 0; i < features.size(); ++i) {
Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
}
float total_score = 0, total_score_train = 0, score_train, score_test;
platform::Fold* fold;
if (stratified)
fold = new platform::StratifiedKFold(nFolds, y, seed);
else
fold = new platform::KFold(nFolds, y.size(), seed);
for (auto i = 0; i < nFolds; ++i) {
auto [train, test] = fold->getFold(i);
cout << "Fold: " << i + 1 << endl;
if (tensors) {
auto ttrain = torch::tensor(train, torch::kInt64);
auto ttest = torch::tensor(test, torch::kInt64);
torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain);
torch::Tensor ytraint = yt.index({ ttrain });
torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest);
torch::Tensor ytestt = yt.index({ ttest });
clf->fit(Xtraint, ytraint, features, className, states);
auto temp = clf->predict(Xtraint);
score_train = clf->score(Xtraint, ytraint);
score_test = clf->score(Xtestt, ytestt);
} else {
auto [Xtrain, ytrain] = extract_indices(train, Xd, y);
auto [Xtest, ytest] = extract_indices(test, Xd, y);
clf->fit(Xtrain, ytrain, features, className, states);
score_train = clf->score(Xtrain, ytrain);
score_test = clf->score(Xtest, ytest);
}
if (dump_cpt) {
cout << "--- CPT Tables ---" << endl;
clf->dump_cpt();
}
total_score_train += score_train;
total_score += score_test;
cout << "Score Train: " << score_train << endl;
cout << "Score Test : " << score_test << endl;
cout << "-------------------------------------------------------------------------------" << endl;
}
cout << "**********************************************************************************" << endl;
cout << "Average Score Train: " << total_score_train / nFolds << endl;
cout << "Average Score Test : " << total_score / nFolds << endl;return 0;
}

View File

@@ -2,12 +2,14 @@
namespace bayesnet {
AODE::AODE() : Ensemble() {}
void AODE::buildModel()
void AODE::buildModel(const torch::Tensor& weights)
{
models.clear();
for (int i = 0; i < features.size(); ++i) {
models.push_back(std::make_unique<SPODE>(i));
}
n_models = models.size();
significanceModels = vector<double>(n_models, 1.0);
}
vector<string> AODE::graph(const string& title) const
{

View File

@@ -5,7 +5,7 @@
namespace bayesnet {
class AODE : public Ensemble {
protected:
void buildModel() override;
void buildModel(const torch::Tensor& weights) override;
public:
AODE();
virtual ~AODE() {};

View File

@@ -4,9 +4,9 @@
namespace bayesnet {
using namespace std;
AODELd::AODELd() : Ensemble(), Proposal(dataset, features, className) {}
AODELd& AODELd::fit(torch::Tensor& X_, torch::Tensor& y_, vector<string>& features_, string className_, map<string, vector<int>>& states_)
AODELd& AODELd::fit(torch::Tensor& X_, torch::Tensor& y_, const vector<string>& features_, const string& className_, map<string, vector<int>>& states_)
{
// This first part should go in a Classifier method called fit_local_discretization o fit_float...
checkInput(X_, y_);
features = features_;
className = className_;
Xf = X_;
@@ -19,15 +19,16 @@ namespace bayesnet {
return *this;
}
void AODELd::buildModel()
void AODELd::buildModel(const torch::Tensor& weights)
{
models.clear();
for (int i = 0; i < features.size(); ++i) {
models.push_back(std::make_unique<SPODELd>(i));
}
n_models = models.size();
significanceModels = vector<double>(n_models, 1.0);
}
void AODELd::trainModel()
void AODELd::trainModel(const torch::Tensor& weights)
{
for (const auto& model : models) {
model->fit(Xf, y, features, className, states);

View File

@@ -8,13 +8,13 @@ namespace bayesnet {
using namespace std;
class AODELd : public Ensemble, public Proposal {
protected:
void trainModel() override;
void buildModel() override;
void trainModel(const torch::Tensor& weights) override;
void buildModel(const torch::Tensor& weights) override;
public:
AODELd();
AODELd& fit(torch::Tensor& X_, torch::Tensor& y_, vector<string>& features_, string className_, map<string, vector<int>>& states_) override;
AODELd& fit(torch::Tensor& X_, torch::Tensor& y_, const vector<string>& features_, const string& className_, map<string, vector<int>>& states_) override;
virtual ~AODELd() = default;
vector<string> graph(const string& name = "AODE") const override;
vector<string> graph(const string& name = "AODELd") const override;
static inline string version() { return "0.0.1"; };
};
}

View File

@@ -1,21 +1,25 @@
#ifndef BASE_H
#define BASE_H
#include <torch/torch.h>
#include <nlohmann/json.hpp>
#include <vector>
namespace bayesnet {
using namespace std;
enum status_t { NORMAL, WARNING, ERROR };
class BaseClassifier {
protected:
virtual void trainModel() = 0;
virtual void trainModel(const torch::Tensor& weights) = 0;
public:
// X is nxm vector, y is nx1 vector
virtual BaseClassifier& fit(vector<vector<int>>& X, vector<int>& y, vector<string>& features, string className, map<string, vector<int>>& states) = 0;
virtual BaseClassifier& fit(vector<vector<int>>& X, vector<int>& y, const vector<string>& features, const string& className, map<string, vector<int>>& states) = 0;
// X is nxm tensor, y is nx1 tensor
virtual BaseClassifier& fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states) = 0;
virtual BaseClassifier& fit(torch::Tensor& dataset, vector<string>& features, string className, map<string, vector<int>>& states) = 0;
virtual BaseClassifier& fit(torch::Tensor& X, torch::Tensor& y, const vector<string>& features, const string& className, map<string, vector<int>>& states) = 0;
virtual BaseClassifier& fit(torch::Tensor& dataset, const vector<string>& features, const string& className, map<string, vector<int>>& states) = 0;
virtual BaseClassifier& fit(torch::Tensor& dataset, const vector<string>& features, const string& className, map<string, vector<int>>& states, const torch::Tensor& weights) = 0;
virtual ~BaseClassifier() = default;
torch::Tensor virtual predict(torch::Tensor& X) = 0;
vector<int> virtual predict(vector<vector<int>>& X) = 0;
status_t virtual getStatus() const = 0;
float virtual score(vector<vector<int>>& X, vector<int>& y) = 0;
float virtual score(torch::Tensor& X, torch::Tensor& y) = 0;
int virtual getNumberOfNodes()const = 0;
@@ -23,9 +27,10 @@ namespace bayesnet {
int virtual getNumberOfStates() const = 0;
vector<string> virtual show() const = 0;
vector<string> virtual graph(const string& title = "") const = 0;
const string inline getVersion() const { return "0.1.0"; };
const string inline getVersion() const { return "0.2.0"; };
vector<string> virtual topological_order() = 0;
void virtual dump_cpt()const = 0;
virtual void setHyperparameters(nlohmann::json& hyperparameters) = 0;
};
}
#endif

View File

@@ -1,7 +1,7 @@
#include "BayesMetrics.h"
#include "Mst.h"
namespace bayesnet {
//samples is nxm tensor used to fit the model
//samples is n+1xm tensor used to fit the model
Metrics::Metrics(const torch::Tensor& samples, const vector<string>& features, const string& className, const int classNumStates)
: samples(samples)
, features(features)
@@ -21,28 +21,57 @@ namespace bayesnet {
}
samples.index_put_({ -1, "..." }, torch::tensor(labels, torch::kInt32));
}
vector<pair<string, string>> Metrics::doCombinations(const vector<string>& source)
vector<int> Metrics::SelectKBestWeighted(const torch::Tensor& weights, bool ascending, unsigned k)
{
vector<pair<string, string>> result;
for (int i = 0; i < source.size(); ++i) {
string temp = source[i];
for (int j = i + 1; j < source.size(); ++j) {
result.push_back({ temp, source[j] });
}
// Return the K Best features
auto n = samples.size(0) - 1;
if (k == 0) {
k = n;
}
return result;
// compute scores
scoresKBest.clear();
featuresKBest.clear();
auto label = samples.index({ -1, "..." });
for (int i = 0; i < n; ++i) {
scoresKBest.push_back(mutualInformation(label, samples.index({ i, "..." }), weights));
featuresKBest.push_back(i);
}
// sort & reduce scores and features
if (ascending) {
sort(featuresKBest.begin(), featuresKBest.end(), [&](int i, int j)
{ return scoresKBest[i] < scoresKBest[j]; });
sort(scoresKBest.begin(), scoresKBest.end(), std::less<double>());
if (k < n) {
for (int i = 0; i < n - k; ++i) {
featuresKBest.erase(featuresKBest.begin());
scoresKBest.erase(scoresKBest.begin());
}
}
} else {
sort(featuresKBest.begin(), featuresKBest.end(), [&](int i, int j)
{ return scoresKBest[i] > scoresKBest[j]; });
sort(scoresKBest.begin(), scoresKBest.end(), std::greater<double>());
featuresKBest.resize(k);
scoresKBest.resize(k);
}
return featuresKBest;
}
torch::Tensor Metrics::conditionalEdge()
vector<double> Metrics::getScoresKBest() const
{
return scoresKBest;
}
torch::Tensor Metrics::conditionalEdge(const torch::Tensor& weights)
{
auto result = vector<double>();
auto source = vector<string>(features);
source.push_back(className);
auto combinations = doCombinations(source);
// Compute class prior
auto margin = torch::zeros({ classNumStates });
auto margin = torch::zeros({ classNumStates }, torch::kFloat);
for (int value = 0; value < classNumStates; ++value) {
auto mask = samples.index({ -1, "..." }) == value;
margin[value] = mask.sum().item<float>() / samples.size(1);
margin[value] = mask.sum().item<double>() / samples.size(1);
}
for (auto [first, second] : combinations) {
int index_first = find(features.begin(), features.end(), first) - features.begin();
@@ -52,8 +81,9 @@ namespace bayesnet {
auto mask = samples.index({ -1, "..." }) == value;
auto first_dataset = samples.index({ index_first, mask });
auto second_dataset = samples.index({ index_second, mask });
auto mi = mutualInformation(first_dataset, second_dataset);
auto pb = margin[value].item<float>();
auto weights_dataset = weights.index({ mask });
auto mi = mutualInformation(first_dataset, second_dataset, weights_dataset);
auto pb = margin[value].item<double>();
accumulated += pb * mi;
}
result.push_back(accumulated);
@@ -70,31 +100,32 @@ namespace bayesnet {
return matrix;
}
// To use in Python
vector<float> Metrics::conditionalEdgeWeights()
vector<float> Metrics::conditionalEdgeWeights(vector<float>& weights_)
{
auto matrix = conditionalEdge();
const torch::Tensor weights = torch::tensor(weights_);
auto matrix = conditionalEdge(weights);
std::vector<float> v(matrix.data_ptr<float>(), matrix.data_ptr<float>() + matrix.numel());
return v;
}
double Metrics::entropy(const torch::Tensor& feature)
double Metrics::entropy(const torch::Tensor& feature, const torch::Tensor& weights)
{
torch::Tensor counts = feature.bincount();
int totalWeight = counts.sum().item<int>();
torch::Tensor counts = feature.bincount(weights);
double totalWeight = counts.sum().item<double>();
torch::Tensor probs = counts.to(torch::kFloat) / totalWeight;
torch::Tensor logProbs = torch::log(probs);
torch::Tensor entropy = -probs * logProbs;
return entropy.nansum().item<double>();
}
// H(Y|X) = sum_{x in X} p(x) H(Y|X=x)
double Metrics::conditionalEntropy(const torch::Tensor& firstFeature, const torch::Tensor& secondFeature)
double Metrics::conditionalEntropy(const torch::Tensor& firstFeature, const torch::Tensor& secondFeature, const torch::Tensor& weights)
{
int numSamples = firstFeature.sizes()[0];
torch::Tensor featureCounts = secondFeature.bincount();
torch::Tensor featureCounts = secondFeature.bincount(weights);
unordered_map<int, unordered_map<int, double>> jointCounts;
double totalWeight = 0;
for (auto i = 0; i < numSamples; i++) {
jointCounts[secondFeature[i].item<int>()][firstFeature[i].item<int>()] += 1;
totalWeight += 1;
jointCounts[secondFeature[i].item<int>()][firstFeature[i].item<int>()] += weights[i].item<double>();
totalWeight += weights[i].item<float>();
}
if (totalWeight == 0)
return 0;
@@ -115,9 +146,9 @@ namespace bayesnet {
return entropyValue;
}
// I(X;Y) = H(Y) - H(Y|X)
double Metrics::mutualInformation(const torch::Tensor& firstFeature, const torch::Tensor& secondFeature)
double Metrics::mutualInformation(const torch::Tensor& firstFeature, const torch::Tensor& secondFeature, const torch::Tensor& weights)
{
return entropy(firstFeature) - conditionalEntropy(firstFeature, secondFeature);
return entropy(firstFeature, weights) - conditionalEntropy(firstFeature, secondFeature, weights);
}
/*
Compute the maximum spanning tree considering the weights as distances

View File

@@ -8,20 +8,43 @@ namespace bayesnet {
using namespace torch;
class Metrics {
private:
Tensor samples; // nxm tensor used to fit the model
vector<string> features;
string className;
int classNumStates = 0;
vector<double> scoresKBest;
vector<int> featuresKBest; // sorted indices of the features
double conditionalEntropy(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights);
protected:
Tensor samples; // n+1xm tensor used to fit the model where samples[-1] is the y vector
string className;
double entropy(const Tensor& feature, const Tensor& weights);
vector<string> features;
template <class T>
vector<pair<T, T>> doCombinations(const vector<T>& source)
{
vector<pair<T, T>> result;
for (int i = 0; i < source.size(); ++i) {
T temp = source[i];
for (int j = i + 1; j < source.size(); ++j) {
result.push_back({ temp, source[j] });
}
}
return result;
}
template <class T>
T pop_first(vector<T>& v)
{
T temp = v[0];
v.erase(v.begin());
return temp;
}
public:
Metrics() = default;
Metrics(const Tensor&, const vector<string>&, const string&, const int);
Metrics(const vector<vector<int>>&, const vector<int>&, const vector<string>&, const string&, const int);
double entropy(const Tensor&);
double conditionalEntropy(const Tensor&, const Tensor&);
double mutualInformation(const Tensor&, const Tensor&);
vector<float> conditionalEdgeWeights(); // To use in Python
Tensor conditionalEdge();
vector<pair<string, string>> doCombinations(const vector<string>&);
Metrics(const torch::Tensor& samples, const vector<string>& features, const string& className, const int classNumStates);
Metrics(const vector<vector<int>>& vsamples, const vector<int>& labels, const vector<string>& features, const string& className, const int classNumStates);
vector<int> SelectKBestWeighted(const torch::Tensor& weights, bool ascending = false, unsigned k = 0);
vector<double> getScoresKBest() const;
double mutualInformation(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights);
vector<float> conditionalEdgeWeights(vector<float>& weights); // To use in Python
Tensor conditionalEdge(const torch::Tensor& weights);
vector<pair<int, int>> maximumSpanningTree(const vector<string>& features, const Tensor& weights, const int root);
};
}

195
src/BayesNet/BoostAODE.cc Normal file
View File

@@ -0,0 +1,195 @@
#include <set>
#include <functional>
#include <limits.h>
#include "BoostAODE.h"
#include "Colors.h"
#include "Folding.h"
#include "Paths.h"
#include "CFS.h"
#include "FCBF.h"
#include "IWSS.h"
namespace bayesnet {
BoostAODE::BoostAODE() : Ensemble() {}
void BoostAODE::buildModel(const torch::Tensor& weights)
{
// Models shall be built in trainModel
models.clear();
n_models = 0;
// Prepare the validation dataset
auto y_ = dataset.index({ -1, "..." });
if (convergence) {
// Prepare train & validation sets from train data
auto fold = platform::StratifiedKFold(5, y_, 271);
dataset_ = torch::clone(dataset);
// save input dataset
auto [train, test] = fold.getFold(0);
auto train_t = torch::tensor(train);
auto test_t = torch::tensor(test);
// Get train and validation sets
X_train = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), train_t });
y_train = dataset.index({ -1, train_t });
X_test = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), test_t });
y_test = dataset.index({ -1, test_t });
dataset = X_train;
m = X_train.size(1);
auto n_classes = states.at(className).size();
metrics = Metrics(dataset, features, className, n_classes);
// Build dataset with train data
buildDataset(y_train);
} else {
// Use all data to train
X_train = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." });
y_train = y_;
}
}
void BoostAODE::setHyperparameters(nlohmann::json& hyperparameters)
{
// Check if hyperparameters are valid
const vector<string> validKeys = { "repeatSparent", "maxModels", "ascending", "convergence", "threshold", "select_features" };
checkHyperparameters(validKeys, hyperparameters);
if (hyperparameters.contains("repeatSparent")) {
repeatSparent = hyperparameters["repeatSparent"];
}
if (hyperparameters.contains("maxModels")) {
maxModels = hyperparameters["maxModels"];
}
if (hyperparameters.contains("ascending")) {
ascending = hyperparameters["ascending"];
}
if (hyperparameters.contains("convergence")) {
convergence = hyperparameters["convergence"];
}
if (hyperparameters.contains("threshold")) {
threshold = hyperparameters["threshold"];
}
if (hyperparameters.contains("select_features")) {
auto selectedAlgorithm = hyperparameters["select_features"];
vector<string> algos = { "IWSS", "FCBF", "CFS" };
selectFeatures = true;
algorithm = selectedAlgorithm;
if (find(algos.begin(), algos.end(), selectedAlgorithm) == algos.end()) {
throw invalid_argument("Invalid selectFeatures value [IWSS, FCBF, CFS]");
}
}
}
unordered_set<int> BoostAODE::initializeModels()
{
unordered_set<int> featuresUsed;
Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64);
int maxFeatures = 0;
if (algorithm == "CFS") {
featureSelector = new CFS(dataset, features, className, maxFeatures, states.at(className).size(), weights_);
} else if (algorithm == "IWSS") {
if (threshold < 0 || threshold >0.5) {
throw invalid_argument("Invalid threshold value for IWSS [0, 0.5]");
}
featureSelector = new IWSS(dataset, features, className, maxFeatures, states.at(className).size(), weights_, threshold);
} else if (algorithm == "FCBF") {
if (threshold < 1e-7 || threshold > 1) {
throw invalid_argument("Invalid threshold value [1e-7, 1]");
}
featureSelector = new FCBF(dataset, features, className, maxFeatures, states.at(className).size(), weights_, threshold);
}
featureSelector->fit();
auto cfsFeatures = featureSelector->getFeatures();
for (const int& feature : cfsFeatures) {
// cout << "Feature: [" << feature << "] " << feature << " " << features.at(feature) << endl;
featuresUsed.insert(feature);
unique_ptr<Classifier> model = std::make_unique<SPODE>(feature);
model->fit(dataset, features, className, states, weights_);
models.push_back(std::move(model));
significanceModels.push_back(1.0);
n_models++;
}
delete featureSelector;
return featuresUsed;
}
void BoostAODE::trainModel(const torch::Tensor& weights)
{
unordered_set<int> featuresUsed;
if (selectFeatures) {
featuresUsed = initializeModels();
}
if (maxModels == 0)
maxModels = .1 * n > 10 ? .1 * n : n;
Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64);
bool exitCondition = false;
// Variables to control the accuracy finish condition
double priorAccuracy = 0.0;
double delta = 1.0;
double threshold = 1e-4;
int tolerance = 5; // number of times the accuracy can be lower than the threshold
int count = 0; // number of times the accuracy is lower than the threshold
fitted = true; // to enable predict
// Step 0: Set the finish condition
// if not repeatSparent a finish condition is run out of features
// n_models == maxModels
// epsiolon sub t > 0.5 => inverse the weights policy
// validation error is not decreasing
while (!exitCondition) {
// Step 1: Build ranking with mutual information
auto featureSelection = metrics.SelectKBestWeighted(weights_, ascending, n); // Get all the features sorted
unique_ptr<Classifier> model;
auto feature = featureSelection[0];
if (!repeatSparent || featuresUsed.size() < featureSelection.size()) {
bool used = true;
for (const auto& feat : featureSelection) {
if (find(featuresUsed.begin(), featuresUsed.end(), feat) != featuresUsed.end()) {
continue;
}
used = false;
feature = feat;
break;
}
if (used) {
exitCondition = true;
continue;
}
}
featuresUsed.insert(feature);
model = std::make_unique<SPODE>(feature);
model->fit(dataset, features, className, states, weights_);
auto ypred = model->predict(X_train);
// Step 3.1: Compute the classifier amout of say
auto mask_wrong = ypred != y_train;
auto mask_right = ypred == y_train;
auto masked_weights = weights_ * mask_wrong.to(weights_.dtype());
double epsilon_t = masked_weights.sum().item<double>();
double wt = (1 - epsilon_t) / epsilon_t;
double alpha_t = epsilon_t == 0 ? 1 : 0.5 * log(wt);
// Step 3.2: Update weights for next classifier
// Step 3.2.1: Update weights of wrong samples
weights_ += mask_wrong.to(weights_.dtype()) * exp(alpha_t) * weights_;
// Step 3.2.2: Update weights of right samples
weights_ += mask_right.to(weights_.dtype()) * exp(-alpha_t) * weights_;
// Step 3.3: Normalise the weights
double totalWeights = torch::sum(weights_).item<double>();
weights_ = weights_ / totalWeights;
// Step 3.4: Store classifier and its accuracy to weigh its future vote
models.push_back(std::move(model));
significanceModels.push_back(alpha_t);
n_models++;
if (convergence) {
auto y_val_predict = predict(X_test);
double accuracy = (y_val_predict == y_test).sum().item<double>() / (double)y_test.size(0);
if (priorAccuracy == 0) {
priorAccuracy = accuracy;
} else {
delta = accuracy - priorAccuracy;
}
if (delta < threshold) {
count++;
}
}
exitCondition = n_models >= maxModels && repeatSparent || epsilon_t > 0.5 || count > tolerance;
}
if (featuresUsed.size() != features.size()) {
status = WARNING;
}
}
vector<string> BoostAODE::graph(const string& title) const
{
return Ensemble::graph(title);
}
}

32
src/BayesNet/BoostAODE.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef BOOSTAODE_H
#define BOOSTAODE_H
#include "Ensemble.h"
#include <map>
#include "SPODE.h"
#include "FeatureSelect.h"
namespace bayesnet {
class BoostAODE : public Ensemble {
public:
BoostAODE();
virtual ~BoostAODE() {};
vector<string> graph(const string& title = "BoostAODE") const override;
void setHyperparameters(nlohmann::json& hyperparameters) override;
protected:
void buildModel(const torch::Tensor& weights) override;
void trainModel(const torch::Tensor& weights) override;
private:
torch::Tensor dataset_;
torch::Tensor X_train, y_train, X_test, y_test;
unordered_set<int> initializeModels();
// Hyperparameters
bool repeatSparent = false; // if true, a feature can be selected more than once
int maxModels = 0;
bool ascending = false; //Process KBest features ascending or descending order
bool convergence = false; //if true, stop when the model does not improve
bool selectFeatures = false; // if true, use feature selection
string algorithm = ""; // Selected feature selection algorithm
FeatureSelect* featureSelector = nullptr;
double threshold = -1;
};
}
#endif

72
src/BayesNet/CFS.cc Normal file
View File

@@ -0,0 +1,72 @@
#include "CFS.h"
#include <limits>
#include "bayesnetUtils.h"
namespace bayesnet {
void CFS::fit()
{
initialize();
computeSuLabels();
auto featureOrder = argsort(suLabels); // sort descending order
auto continueCondition = true;
auto feature = featureOrder[0];
selectedFeatures.push_back(feature);
selectedScores.push_back(suLabels[feature]);
selectedFeatures.erase(selectedFeatures.begin());
while (continueCondition) {
double merit = numeric_limits<double>::lowest();
int bestFeature = -1;
for (auto feature : featureOrder) {
selectedFeatures.push_back(feature);
// Compute merit with selectedFeatures
auto meritNew = computeMeritCFS();
if (meritNew > merit) {
merit = meritNew;
bestFeature = feature;
}
selectedFeatures.pop_back();
}
if (bestFeature == -1) {
// meritNew has to be nan due to constant features
break;
}
selectedFeatures.push_back(bestFeature);
selectedScores.push_back(merit);
featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), bestFeature), featureOrder.end());
continueCondition = computeContinueCondition(featureOrder);
}
fitted = true;
}
bool CFS::computeContinueCondition(const vector<int>& featureOrder)
{
if (selectedFeatures.size() == maxFeatures || featureOrder.size() == 0) {
return false;
}
if (selectedScores.size() >= 5) {
/*
"To prevent the best first search from exploring the entire
feature subset search space, a stopping criterion is imposed.
The search will terminate if five consecutive fully expanded
subsets show no improvement over the current best subset."
as stated in Mark A.Hall Thesis
*/
double item_ant = numeric_limits<double>::lowest();
int num = 0;
vector<double> lastFive(selectedScores.end() - 5, selectedScores.end());
for (auto item : lastFive) {
if (item_ant == numeric_limits<double>::lowest()) {
item_ant = item;
}
if (item > item_ant) {
break;
} else {
num++;
item_ant = item;
}
}
if (num == 5) {
return false;
}
}
return true;
}
}

21
src/BayesNet/CFS.h Normal file
View File

@@ -0,0 +1,21 @@
#ifndef CFS_H
#define CFS_H
#include <torch/torch.h>
#include <vector>
#include "FeatureSelect.h"
using namespace std;
namespace bayesnet {
class CFS : public FeatureSelect {
public:
// dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights)
{
}
virtual ~CFS() {};
void fit() override;
private:
bool computeContinueCondition(const vector<int>& featureOrder);
};
}
#endif

View File

@@ -1,7 +1,9 @@
include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
include_directories(${BayesNet_SOURCE_DIR}/lib/json/include)
include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc
KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
target_link_libraries(BayesNet mdlp ArffFiles "${TORCH_LIBRARIES}")
KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc
Mst.cc Proposal.cc CFS.cc FCBF.cc IWSS.cc FeatureSelect.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}")

View File

@@ -5,7 +5,7 @@ namespace bayesnet {
using namespace torch;
Classifier::Classifier(Network model) : model(model), m(0), n(0), metrics(Metrics()), fitted(false) {}
Classifier& Classifier::build(vector<string>& features, string className, map<string, vector<int>>& states)
Classifier& Classifier::build(const vector<string>& features, const string& className, map<string, vector<int>>& states, const torch::Tensor& weights)
{
this->features = features;
this->className = className;
@@ -13,15 +13,14 @@ namespace bayesnet {
m = dataset.size(1);
n = dataset.size(0) - 1;
checkFitParameters();
auto n_classes = states[className].size();
auto n_classes = states.at(className).size();
metrics = Metrics(dataset, features, className, n_classes);
model.initialize();
buildModel();
trainModel();
buildModel(weights);
trainModel(weights);
fitted = true;
return *this;
}
void Classifier::buildDataset(Tensor& ytmp)
{
try {
@@ -35,19 +34,20 @@ namespace bayesnet {
exit(1);
}
}
void Classifier::trainModel()
void Classifier::trainModel(const torch::Tensor& weights)
{
model.fit(dataset, features, className, states);
model.fit(dataset, weights, features, className, states);
}
// X is nxm where n is the number of features and m the number of samples
Classifier& Classifier::fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states)
Classifier& Classifier::fit(torch::Tensor& X, torch::Tensor& y, const vector<string>& features, const string& className, map<string, vector<int>>& states)
{
dataset = X;
buildDataset(y);
return build(features, className, states);
const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kDouble);
return build(features, className, states, weights);
}
// X is nxm where n is the number of features and m the number of samples
Classifier& Classifier::fit(vector<vector<int>>& X, vector<int>& y, vector<string>& features, string className, map<string, vector<int>>& states)
Classifier& Classifier::fit(vector<vector<int>>& X, vector<int>& y, const vector<string>& features, const string& className, map<string, vector<int>>& states)
{
dataset = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, kInt32);
for (int i = 0; i < X.size(); ++i) {
@@ -55,17 +55,27 @@ namespace bayesnet {
}
auto ytmp = torch::tensor(y, kInt32);
buildDataset(ytmp);
return build(features, className, states);
const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kDouble);
return build(features, className, states, weights);
}
Classifier& Classifier::fit(torch::Tensor& dataset, vector<string>& features, string className, map<string, vector<int>>& states)
Classifier& Classifier::fit(torch::Tensor& dataset, const vector<string>& features, const string& className, map<string, vector<int>>& states)
{
this->dataset = dataset;
return build(features, className, states);
const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kDouble);
return build(features, className, states, weights);
}
Classifier& Classifier::fit(torch::Tensor& dataset, const vector<string>& features, const string& className, map<string, vector<int>>& states, const torch::Tensor& weights)
{
this->dataset = dataset;
return build(features, className, states, weights);
}
void Classifier::checkFitParameters()
{
if (torch::is_floating_point(dataset)) {
throw invalid_argument("dataset (X, y) must be of type Integer");
}
if (n != features.size()) {
throw invalid_argument("X " + to_string(n) + " and features " + to_string(features.size()) + " must have the same number of features");
throw invalid_argument("Classifier: X " + to_string(n) + " and features " + to_string(features.size()) + " must have the same number of features");
}
if (states.find(className) == states.end()) {
throw invalid_argument("className not found in states");
@@ -145,4 +155,18 @@ namespace bayesnet {
{
model.dump_cpt();
}
void Classifier::checkHyperparameters(const vector<string>& validKeys, nlohmann::json& hyperparameters)
{
for (const auto& item : hyperparameters.items()) {
if (find(validKeys.begin(), validKeys.end(), item.key()) == validKeys.end()) {
throw invalid_argument("Hyperparameter " + item.key() + " is not valid");
}
}
}
void Classifier::setHyperparameters(nlohmann::json& hyperparameters)
{
// Check if hyperparameters are valid, default is no hyperparameters
const vector<string> validKeys = { };
checkHyperparameters(validKeys, hyperparameters);
}
}

View File

@@ -10,37 +10,42 @@ using namespace torch;
namespace bayesnet {
class Classifier : public BaseClassifier {
private:
void buildDataset(torch::Tensor& y);
Classifier& build(vector<string>& features, string className, map<string, vector<int>>& states);
Classifier& build(const vector<string>& features, const string& className, map<string, vector<int>>& states, const torch::Tensor& weights);
protected:
bool fitted;
Network model;
int m, n; // m: number of samples, n: number of features
Tensor dataset; // (n+1)xm tensor
Network model;
Metrics metrics;
vector<string> features;
string className;
map<string, vector<int>> states;
Tensor dataset; // (n+1)xm tensor
status_t status = NORMAL;
void checkFitParameters();
virtual void buildModel() = 0;
void trainModel() override;
virtual void buildModel(const torch::Tensor& weights) = 0;
void trainModel(const torch::Tensor& weights) override;
void checkHyperparameters(const vector<string>& validKeys, nlohmann::json& hyperparameters);
void buildDataset(torch::Tensor& y);
public:
Classifier(Network model);
virtual ~Classifier() = default;
Classifier& fit(vector<vector<int>>& X, vector<int>& y, vector<string>& features, string className, map<string, vector<int>>& states) override;
Classifier& fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states) override;
Classifier& fit(torch::Tensor& dataset, vector<string>& features, string className, map<string, vector<int>>& states) override;
Classifier& fit(vector<vector<int>>& X, vector<int>& y, const vector<string>& features, const string& className, map<string, vector<int>>& states) override;
Classifier& fit(torch::Tensor& X, torch::Tensor& y, const vector<string>& features, const string& className, map<string, vector<int>>& states) override;
Classifier& fit(torch::Tensor& dataset, const vector<string>& features, const string& className, map<string, vector<int>>& states) override;
Classifier& fit(torch::Tensor& dataset, const vector<string>& features, const string& className, map<string, vector<int>>& states, const torch::Tensor& weights) override;
void addNodes();
int getNumberOfNodes() const override;
int getNumberOfEdges() const override;
int getNumberOfStates() const override;
Tensor predict(Tensor& X) override;
status_t getStatus() const override { return status; }
vector<int> predict(vector<vector<int>>& X) override;
float score(Tensor& X, Tensor& y) override;
float score(vector<vector<int>>& X, vector<int>& y) override;
vector<string> show() const override;
vector<string> topological_order() override;
void dump_cpt() const override;
void setHyperparameters(nlohmann::json& hyperparameters) override;
};
}
#endif

View File

@@ -3,9 +3,9 @@
namespace bayesnet {
using namespace torch;
Ensemble::Ensemble() : Classifier(Network()) {}
Ensemble::Ensemble() : Classifier(Network()), n_models(0) {}
void Ensemble::trainModel()
void Ensemble::trainModel(const torch::Tensor& weights)
{
n_models = models.size();
for (auto i = 0; i < n_models; ++i) {
@@ -17,10 +17,14 @@ namespace bayesnet {
{
auto y_pred_ = y_pred.accessor<int, 2>();
vector<int> y_pred_final;
int numClasses = states.at(className).size();
// y_pred is m x n_models with the prediction of every model for each sample
for (int i = 0; i < y_pred.size(0); ++i) {
vector<float> votes(y_pred.size(1), 0);
for (int j = 0; j < y_pred.size(1); ++j) {
votes[y_pred_[i][j]] += 1;
// votes store in each index (value of class) the significance added by each model
// i.e. votes[0] contains how much value has the value 0 of class. That value is generated by the models predictions
vector<double> votes(numClasses, 0.0);
for (int j = 0; j < n_models; ++j) {
votes[y_pred_[i][j]] += significanceModels.at(j);
}
// argsort in descending order
auto indices = argsort(votes);
@@ -34,7 +38,6 @@ namespace bayesnet {
throw logic_error("Ensemble has not been fitted");
}
Tensor y_pred = torch::zeros({ X.size(1), n_models }, kInt32);
//Create a threadpool
auto threads{ vector<thread>() };
mutex mtx;
for (auto i = 0; i < n_models; ++i) {

View File

@@ -14,7 +14,8 @@ namespace bayesnet {
protected:
unsigned n_models;
vector<unique_ptr<Classifier>> models;
void trainModel() override;
vector<double> significanceModels;
void trainModel(const torch::Tensor& weights) override;
vector<int> voting(Tensor& y_pred);
public:
Ensemble();

44
src/BayesNet/FCBF.cc Normal file
View File

@@ -0,0 +1,44 @@
#include "bayesnetUtils.h"
#include "FCBF.h"
namespace bayesnet {
FCBF::FCBF(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold) :
FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights), threshold(threshold)
{
if (threshold < 1e-7) {
throw std::invalid_argument("Threshold cannot be less than 1e-7");
}
}
void FCBF::fit()
{
initialize();
computeSuLabels();
auto featureOrder = argsort(suLabels); // sort descending order
auto featureOrderCopy = featureOrder;
for (const auto& feature : featureOrder) {
// Don't self compare
featureOrderCopy.erase(featureOrderCopy.begin());
if (suLabels.at(feature) == 0.0) {
// The feature has been removed from the list
continue;
}
if (suLabels.at(feature) < threshold) {
break;
}
// Remove redundant features
for (const auto& featureCopy : featureOrderCopy) {
double value = computeSuFeatures(feature, featureCopy);
if (value >= suLabels.at(featureCopy)) {
// Remove feature from list
suLabels[featureCopy] = 0.0;
}
}
selectedFeatures.push_back(feature);
selectedScores.push_back(suLabels[feature]);
if (selectedFeatures.size() == maxFeatures) {
break;
}
}
fitted = true;
}
}

18
src/BayesNet/FCBF.h Normal file
View File

@@ -0,0 +1,18 @@
#ifndef FCBF_H
#define FCBF_H
#include <torch/torch.h>
#include <vector>
#include "FeatureSelect.h"
using namespace std;
namespace bayesnet {
class FCBF : public FeatureSelect {
public:
// dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
FCBF(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold);
virtual ~FCBF() {};
void fit() override;
private:
double threshold = -1;
};
}
#endif

View File

@@ -0,0 +1,79 @@
#include "FeatureSelect.h"
#include <limits>
#include "bayesnetUtils.h"
namespace bayesnet {
FeatureSelect::FeatureSelect(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights)
{
}
void FeatureSelect::initialize()
{
selectedFeatures.clear();
selectedScores.clear();
}
double FeatureSelect::symmetricalUncertainty(int a, int b)
{
/*
Compute symmetrical uncertainty. Normalize* information gain (mutual
information) with the entropies of the features in order to compensate
the bias due to high cardinality features. *Range [0, 1]
(https://www.sciencedirect.com/science/article/pii/S0020025519303603)
*/
auto x = samples.index({ a, "..." });
auto y = samples.index({ b, "..." });
auto mu = mutualInformation(x, y, weights);
auto hx = entropy(x, weights);
auto hy = entropy(y, weights);
return 2.0 * mu / (hx + hy);
}
void FeatureSelect::computeSuLabels()
{
// Compute Simmetrical Uncertainty between features and labels
// https://en.wikipedia.org/wiki/Symmetric_uncertainty
for (int i = 0; i < features.size(); ++i) {
suLabels.push_back(symmetricalUncertainty(i, -1));
}
}
double FeatureSelect::computeSuFeatures(const int firstFeature, const int secondFeature)
{
// Compute Simmetrical Uncertainty between features
// https://en.wikipedia.org/wiki/Symmetric_uncertainty
try {
return suFeatures.at({ firstFeature, secondFeature });
}
catch (const out_of_range& e) {
double result = symmetricalUncertainty(firstFeature, secondFeature);
suFeatures[{firstFeature, secondFeature}] = result;
return result;
}
}
double FeatureSelect::computeMeritCFS()
{
double result;
double rcf = 0;
for (auto feature : selectedFeatures) {
rcf += suLabels[feature];
}
double rff = 0;
int n = selectedFeatures.size();
for (const auto& item : doCombinations(selectedFeatures)) {
rff += computeSuFeatures(item.first, item.second);
}
return rcf / sqrt(n + (n * n - n) * rff);
}
vector<int> FeatureSelect::getFeatures() const
{
if (!fitted) {
throw runtime_error("FeatureSelect not fitted");
}
return selectedFeatures;
}
vector<double> FeatureSelect::getScores() const
{
if (!fitted) {
throw runtime_error("FeatureSelect not fitted");
}
return selectedScores;
}
}

View File

@@ -0,0 +1,31 @@
#ifndef FEATURE_SELECT_H
#define FEATURE_SELECT_H
#include <torch/torch.h>
#include <vector>
#include "BayesMetrics.h"
using namespace std;
namespace bayesnet {
class FeatureSelect : public Metrics {
public:
// dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
FeatureSelect(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights);
virtual ~FeatureSelect() {};
virtual void fit() = 0;
vector<int> getFeatures() const;
vector<double> getScores() const;
protected:
void initialize();
void computeSuLabels();
double computeSuFeatures(const int a, const int b);
double symmetricalUncertainty(int a, int b);
double computeMeritCFS();
const torch::Tensor& weights;
int maxFeatures;
vector<int> selectedFeatures;
vector<double> selectedScores;
vector<double> suLabels;
map<pair<int, int>, double> suFeatures;
bool fitted = false;
};
}
#endif

47
src/BayesNet/IWSS.cc Normal file
View File

@@ -0,0 +1,47 @@
#include "IWSS.h"
#include <limits>
#include "bayesnetUtils.h"
namespace bayesnet {
IWSS::IWSS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold) :
FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights), threshold(threshold)
{
if (threshold < 0 || threshold > .5) {
throw std::invalid_argument("Threshold has to be in [0, 0.5]");
}
}
void IWSS::fit()
{
initialize();
computeSuLabels();
auto featureOrder = argsort(suLabels); // sort descending order
auto featureOrderCopy = featureOrder;
// Add first and second features to result
// First with its own score
auto first_feature = pop_first(featureOrderCopy);
selectedFeatures.push_back(first_feature);
selectedScores.push_back(suLabels.at(first_feature));
// Second with the score of the candidates
selectedFeatures.push_back(pop_first(featureOrderCopy));
auto merit = computeMeritCFS();
selectedScores.push_back(merit);
for (const auto feature : featureOrderCopy) {
selectedFeatures.push_back(feature);
// Compute merit with selectedFeatures
auto meritNew = computeMeritCFS();
double delta = merit != 0.0 ? abs(merit - meritNew) / merit : 0.0;
if (meritNew > merit || delta < threshold) {
if (meritNew > merit) {
merit = meritNew;
}
selectedScores.push_back(meritNew);
} else {
selectedFeatures.pop_back();
break;
}
if (selectedFeatures.size() == maxFeatures) {
break;
}
}
fitted = true;
}
}

18
src/BayesNet/IWSS.h Normal file
View File

@@ -0,0 +1,18 @@
#ifndef IWSS_H
#define IWSS_H
#include <torch/torch.h>
#include <vector>
#include "FeatureSelect.h"
using namespace std;
namespace bayesnet {
class IWSS : public FeatureSelect {
public:
// dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
IWSS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold);
virtual ~IWSS() {};
void fit() override;
private:
double threshold = -1;
};
}
#endif

View File

@@ -4,7 +4,19 @@ namespace bayesnet {
using namespace torch;
KDB::KDB(int k, float theta) : Classifier(Network()), k(k), theta(theta) {}
void KDB::buildModel()
void KDB::setHyperparameters(nlohmann::json& hyperparameters)
{
// Check if hyperparameters are valid
const vector<string> validKeys = { "k", "theta" };
checkHyperparameters(validKeys, hyperparameters);
if (hyperparameters.contains("k")) {
k = hyperparameters["k"];
}
if (hyperparameters.contains("theta")) {
theta = hyperparameters["theta"];
}
}
void KDB::buildModel(const torch::Tensor& weights)
{
/*
1. For each feature Xi, compute mutual information, I(X;C),
@@ -29,13 +41,13 @@ namespace bayesnet {
// where C is the class.
addNodes();
const Tensor& y = dataset.index({ -1, "..." });
vector <float> mi;
vector<double> mi;
for (auto i = 0; i < features.size(); i++) {
Tensor firstFeature = dataset.index({ i, "..." });
mi.push_back(metrics.mutualInformation(firstFeature, y));
mi.push_back(metrics.mutualInformation(firstFeature, y, weights));
}
// 2. Compute class conditional mutual information I(Xi;XjIC), f or each
auto conditionalEdgeWeights = metrics.conditionalEdge();
auto conditionalEdgeWeights = metrics.conditionalEdge(weights);
// 3. Let the used variable list, S, be empty.
vector<int> S;
// 4. Let the DAG network being constructed, BN, begin with a single

View File

@@ -1,5 +1,6 @@
#ifndef KDB_H
#define KDB_H
#include <torch/torch.h>
#include "Classifier.h"
#include "bayesnetUtils.h"
namespace bayesnet {
@@ -11,10 +12,11 @@ namespace bayesnet {
float theta;
void add_m_edges(int idx, vector<int>& S, Tensor& weights);
protected:
void buildModel() override;
void buildModel(const torch::Tensor& weights) override;
public:
explicit KDB(int k, float theta = 0.03);
virtual ~KDB() {};
void setHyperparameters(nlohmann::json& hyperparameters) override;
vector<string> graph(const string& name = "KDB") const override;
};
}

View File

@@ -3,9 +3,9 @@
namespace bayesnet {
using namespace std;
KDBLd::KDBLd(int k) : KDB(k), Proposal(dataset, features, className) {}
KDBLd& KDBLd::fit(torch::Tensor& X_, torch::Tensor& y_, vector<string>& features_, string className_, map<string, vector<int>>& states_)
KDBLd& KDBLd::fit(torch::Tensor& X_, torch::Tensor& y_, const vector<string>& features_, const string& className_, map<string, vector<int>>& states_)
{
// This first part should go in a Classifier method called fit_local_discretization o fit_float...
checkInput(X_, y_);
features = features_;
className = className_;
Xf = X_;

View File

@@ -10,7 +10,7 @@ namespace bayesnet {
public:
explicit KDBLd(int k);
virtual ~KDBLd() = default;
KDBLd& fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states) override;
KDBLd& fit(torch::Tensor& X, torch::Tensor& y, const vector<string>& features, const string& className, map<string, vector<int>>& states) override;
vector<string> graph(const string& name = "KDB") const override;
Tensor predict(Tensor& X) override;
static inline string version() { return "0.0.1"; };

View File

@@ -1,5 +1,6 @@
#include "Mst.h"
#include <vector>
#include <list>
/*
Based on the code from https://www.softwaretestinghelp.com/minimum-spanning-tree-tutorial/
@@ -34,7 +35,7 @@ namespace bayesnet {
void Graph::kruskal_algorithm()
{
// sort the edges ordered on decreasing weight
sort(G.begin(), G.end(), [](const auto& left, const auto& right) {return left.first > right.first;});
stable_sort(G.begin(), G.end(), [](const auto& left, const auto& right) {return left.first > right.first;});
for (int i = 0; i < G.size(); i++) {
int uSt, vEd;
uSt = find_set(G[i].second.first);
@@ -55,15 +56,24 @@ namespace bayesnet {
}
}
void insertElement(list<int>& variables, int variable)
{
if (find(variables.begin(), variables.end(), variable) == variables.end()) {
variables.push_front(variable);
}
}
vector<pair<int, int>> reorder(vector<pair<float, pair<int, int>>> T, int root_original)
{
// Create the edges of a DAG from the MST
// replacing unordered_set with list because unordered_set cannot guarantee the order of the elements inserted
auto result = vector<pair<int, int>>();
auto visited = vector<int>();
auto nextVariables = unordered_set<int>();
nextVariables.emplace(root_original);
auto nextVariables = list<int>();
nextVariables.push_front(root_original);
while (nextVariables.size() > 0) {
int root = *nextVariables.begin();
nextVariables.erase(nextVariables.begin());
int root = nextVariables.front();
nextVariables.pop_front();
for (int i = 0; i < T.size(); ++i) {
auto [weight, edge] = T[i];
auto [from, to] = edge;
@@ -71,10 +81,10 @@ namespace bayesnet {
visited.insert(visited.begin(), i);
if (from == root) {
result.push_back({ from, to });
nextVariables.emplace(to);
insertElement(nextVariables, to);
} else {
result.push_back({ to, from });
nextVariables.emplace(from);
insertElement(nextVariables, from);
}
}
}
@@ -99,7 +109,6 @@ namespace bayesnet {
{
auto num_features = features.size();
Graph g(num_features);
// Make a complete graph
for (int i = 0; i < num_features - 1; ++i) {
for (int j = i + 1; j < num_features; ++j) {

View File

@@ -3,9 +3,8 @@
#include "Network.h"
#include "bayesnetUtils.h"
namespace bayesnet {
Network::Network() : features(vector<string>()), className(""), classNumStates(0), fitted(false) {}
Network::Network(float maxT) : features(vector<string>()), className(""), classNumStates(0), maxThreads(maxT), fitted(false) {}
Network::Network(float maxT, int smoothing) : laplaceSmoothing(smoothing), features(vector<string>()), className(""), classNumStates(0), maxThreads(maxT), fitted(false) {}
Network::Network() : features(vector<string>()), className(""), classNumStates(0), fitted(false), laplaceSmoothing(0) {}
Network::Network(float maxT) : features(vector<string>()), className(""), classNumStates(0), maxThreads(maxT), fitted(false), laplaceSmoothing(0) {}
Network::Network(Network& other) : laplaceSmoothing(other.laplaceSmoothing), features(other.features), className(other.className), classNumStates(other.getClassNumStates()), maxThreads(other.
getmaxThreads()), fitted(other.fitted)
{
@@ -104,8 +103,11 @@ namespace bayesnet {
{
return nodes;
}
void Network::checkFitData(int n_samples, int n_features, int n_samples_y, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states)
void Network::checkFitData(int n_samples, int n_features, int n_samples_y, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states, const torch::Tensor& weights)
{
if (weights.size(0) != n_samples) {
throw invalid_argument("Weights (" + to_string(weights.size(0)) + ") must have the same number of elements as samples (" + to_string(n_samples) + ") in Network::fit");
}
if (n_samples != n_samples_y) {
throw invalid_argument("X and y must have the same number of samples in Network::fit (" + to_string(n_samples) + " != " + to_string(n_samples_y) + ")");
}
@@ -130,34 +132,35 @@ namespace bayesnet {
void Network::setStates(const map<string, vector<int>>& states)
{
// Set states to every Node in the network
for (int i = 0; i < features.size(); ++i) {
nodes[features[i]]->setNumStates(states.at(features[i]).size());
}
classNumStates = nodes[className]->getNumStates();
for_each(features.begin(), features.end(), [this, &states](const string& feature) {
nodes.at(feature)->setNumStates(states.at(feature).size());
});
classNumStates = nodes.at(className)->getNumStates();
}
// X comes in nxm, where n is the number of features and m the number of samples
void Network::fit(const torch::Tensor& X, const torch::Tensor& y, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states)
void Network::fit(const torch::Tensor& X, const torch::Tensor& y, const torch::Tensor& weights, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states)
{
checkFitData(X.size(1), X.size(0), y.size(0), featureNames, className, states);
checkFitData(X.size(1), X.size(0), y.size(0), featureNames, className, states, weights);
this->className = className;
Tensor ytmp = torch::transpose(y.view({ y.size(0), 1 }), 0, 1);
samples = torch::cat({ X , ytmp }, 0);
for (int i = 0; i < featureNames.size(); ++i) {
auto row_feature = X.index({ i, "..." });
}
completeFit(states);
completeFit(states, weights);
}
void Network::fit(const torch::Tensor& samples, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states)
void Network::fit(const torch::Tensor& samples, const torch::Tensor& weights, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states)
{
checkFitData(samples.size(1), samples.size(0) - 1, samples.size(1), featureNames, className, states);
checkFitData(samples.size(1), samples.size(0) - 1, samples.size(1), featureNames, className, states, weights);
this->className = className;
this->samples = samples;
completeFit(states);
completeFit(states, weights);
}
// input_data comes in nxm, where n is the number of features and m the number of samples
void Network::fit(const vector<vector<int>>& input_data, const vector<int>& labels, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states)
void Network::fit(const vector<vector<int>>& input_data, const vector<int>& labels, const vector<double>& weights_, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states)
{
checkFitData(input_data[0].size(), input_data.size(), labels.size(), featureNames, className, states);
const torch::Tensor weights = torch::tensor(weights_, torch::kFloat64);
checkFitData(input_data[0].size(), input_data.size(), labels.size(), featureNames, className, states, weights);
this->className = className;
// Build tensor of samples (nxm) (n+1 because of the class)
samples = torch::zeros({ static_cast<int>(input_data.size() + 1), static_cast<int>(input_data[0].size()) }, torch::kInt32);
@@ -165,42 +168,17 @@ namespace bayesnet {
samples.index_put_({ i, "..." }, torch::tensor(input_data[i], torch::kInt32));
}
samples.index_put_({ -1, "..." }, torch::tensor(labels, torch::kInt32));
completeFit(states);
completeFit(states, weights);
}
void Network::completeFit(const map<string, vector<int>>& states)
void Network::completeFit(const map<string, vector<int>>& states, const torch::Tensor& weights)
{
setStates(states);
int maxThreadsRunning = static_cast<int>(std::thread::hardware_concurrency() * maxThreads);
if (maxThreadsRunning < 1) {
maxThreadsRunning = 1;
}
laplaceSmoothing = 1.0 / samples.size(1); // To use in CPT computation
vector<thread> threads;
mutex mtx;
condition_variable cv;
int activeThreads = 0;
int nextNodeIndex = 0;
while (nextNodeIndex < nodes.size()) {
unique_lock<mutex> lock(mtx);
cv.wait(lock, [&activeThreads, &maxThreadsRunning]() { return activeThreads < maxThreadsRunning; });
threads.emplace_back([this, &nextNodeIndex, &mtx, &cv, &activeThreads]() {
while (true) {
unique_lock<mutex> lock(mtx);
if (nextNodeIndex >= nodes.size()) {
break; // No more work remaining
}
auto& pair = *std::next(nodes.begin(), nextNodeIndex);
++nextNodeIndex;
lock.unlock();
pair.second->computeCPT(samples, features, laplaceSmoothing);
lock.lock();
nodes[pair.first] = std::move(pair.second);
lock.unlock();
}
lock_guard<mutex> lock(mtx);
--activeThreads;
cv.notify_one();
for (auto& node : nodes) {
threads.emplace_back([this, &node, &weights]() {
node.second->computeCPT(samples, features, laplaceSmoothing, weights);
});
++activeThreads;
}
for (auto& thread : threads) {
thread.join();
@@ -223,8 +201,7 @@ namespace bayesnet {
}
if (proba)
return result;
else
return result.argmax(1);
return result.argmax(1);
}
// Return mxn tensor of probabilities
Tensor Network::predict_proba(const Tensor& samples)
@@ -343,7 +320,7 @@ namespace bayesnet {
}
// Normalize result
double sum = accumulate(result.begin(), result.end(), 0.0);
transform(result.begin(), result.end(), result.begin(), [sum](double& value) { return value / sum; });
transform(result.begin(), result.end(), result.begin(), [sum](const double& value) { return value / sum; });
return result;
}
vector<string> Network::show() const
@@ -395,7 +372,6 @@ namespace bayesnet {
auto result = features;
result.erase(remove(result.begin(), result.end(), className), result.end());
bool ending{ false };
int idx = 0;
while (!ending) {
ending = true;
for (auto feature : features) {
@@ -431,6 +407,7 @@ namespace bayesnet {
{
for (auto& node : nodes) {
cout << "* " << node.first << ": (" << node.second->getNumStates() << ") : " << node.second->getCPT().sizes() << endl;
cout << node.second->getCPT() << endl;
}
}
}

View File

@@ -13,21 +13,21 @@ namespace bayesnet {
int classNumStates;
vector<string> features; // Including classname
string className;
int laplaceSmoothing = 1;
double laplaceSmoothing;
torch::Tensor samples; // nxm tensor used to fit the model
bool isCyclic(const std::string&, std::unordered_set<std::string>&, std::unordered_set<std::string>&);
vector<double> predict_sample(const vector<int>&);
vector<double> predict_sample(const torch::Tensor&);
vector<double> exactInference(map<string, int>&);
double computeFactor(map<string, int>&);
void completeFit(const map<string, vector<int>>&);
void checkFitData(int n_features, int n_samples, int n_samples_y, const vector<string>& featureNames, const string& className, const map<string, vector<int>>&);
void completeFit(const map<string, vector<int>>& states, const torch::Tensor& weights);
void checkFitData(int n_features, int n_samples, int n_samples_y, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states, const torch::Tensor& weights);
void setStates(const map<string, vector<int>>&);
public:
Network();
explicit Network(float, int);
explicit Network(float);
explicit Network(Network&);
~Network() = default;
torch::Tensor& getSamples();
float getmaxThreads();
void addNode(const string&);
@@ -39,9 +39,12 @@ namespace bayesnet {
int getNumEdges() const;
int getClassNumStates() const;
string getClassName() const;
void fit(const vector<vector<int>>&, const vector<int>&, const vector<string>&, const string&, const map<string, vector<int>>&);
void fit(const torch::Tensor&, const torch::Tensor&, const vector<string>&, const string&, const map<string, vector<int>>&);
void fit(const torch::Tensor&, const vector<string>&, const string&, const map<string, vector<int>>&);
/*
Notice: Nodes have to be inserted in the same order as they are in the dataset, i.e., first node is first column and so on.
*/
void fit(const vector<vector<int>>& input_data, const vector<int>& labels, const vector<double>& weights, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states);
void fit(const torch::Tensor& X, const torch::Tensor& y, const torch::Tensor& weights, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states);
void fit(const torch::Tensor& samples, const torch::Tensor& weights, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states);
vector<int> predict(const vector<vector<int>>&); // Return mx1 vector of predictions
torch::Tensor predict(const torch::Tensor&); // Return mx1 tensor of predictions
torch::Tensor predict_tensor(const torch::Tensor& samples, const bool proba);
@@ -53,7 +56,7 @@ namespace bayesnet {
vector<string> graph(const string& title) const; // Returns a vector of strings representing the graph in graphviz format
void initialize();
void dump_cpt() const;
inline string version() { return "0.1.0"; }
inline string version() { return "0.2.0"; }
};
}
#endif

View File

@@ -84,7 +84,7 @@ namespace bayesnet {
}
return result;
}
void Node::computeCPT(const torch::Tensor& dataset, const vector<string>& features, const int laplaceSmoothing)
void Node::computeCPT(const torch::Tensor& dataset, const vector<string>& features, const double laplaceSmoothing, const torch::Tensor& weights)
{
dimensions.clear();
// Get dimensions of the CPT
@@ -100,7 +100,7 @@ namespace bayesnet {
}
int name_index = pos - features.begin();
for (int n_sample = 0; n_sample < dataset.size(1); ++n_sample) {
torch::List<c10::optional<torch::Tensor>> coordinates;
c10::List<c10::optional<at::Tensor>> coordinates;
coordinates.push_back(dataset.index({ name_index, n_sample }));
for (auto parent : parents) {
pos = find(features.begin(), features.end(), parent->getName());
@@ -111,17 +111,17 @@ namespace bayesnet {
coordinates.push_back(dataset.index({ parent_index, n_sample }));
}
// Increment the count of the corresponding coordinate
cpTable.index_put_({ coordinates }, cpTable.index({ coordinates }) + 1);
cpTable.index_put_({ coordinates }, cpTable.index({ coordinates }) + weights.index({ n_sample }).item<double>());
}
// Normalize the counts
cpTable = cpTable / cpTable.sum(0);
}
float Node::getFactorValue(map<string, int>& evidence)
{
torch::List<c10::optional<torch::Tensor>> coordinates;
c10::List<c10::optional<at::Tensor>> coordinates;
// following predetermined order of indices in the cpTable (see Node.h)
coordinates.push_back(torch::tensor(evidence[name]));
transform(parents.begin(), parents.end(), back_inserter(coordinates), [&evidence](const auto& parent) { return torch::tensor(evidence[parent->getName()]); });
coordinates.push_back(at::tensor(evidence[name]));
transform(parents.begin(), parents.end(), back_inserter(coordinates), [&evidence](const auto& parent) { return at::tensor(evidence[parent->getName()]); });
return cpTable.index({ coordinates }).item<float>();
}
vector<string> Node::graph(const string& className)

View File

@@ -14,8 +14,8 @@ namespace bayesnet {
int numStates; // number of states of the variable
torch::Tensor cpTable; // Order of indices is 0-> node variable, 1-> 1st parent, 2-> 2nd parent, ...
vector<int64_t> dimensions; // dimensions of the cpTable
public:
vector<pair<string, string>> combinations(const vector<string>&);
public:
explicit Node(const string&);
void clear();
void addParent(Node*);
@@ -26,7 +26,7 @@ namespace bayesnet {
vector<Node*>& getParents();
vector<Node*>& getChildren();
torch::Tensor& getCPT();
void computeCPT(const torch::Tensor&, const vector<string>&, const int);
void computeCPT(const torch::Tensor& dataset, const vector<string>& features, const double laplaceSmoothing, const torch::Tensor& weights);
int getNumStates() const;
void setNumStates(int);
unsigned minFill();

View File

@@ -9,6 +9,15 @@ namespace bayesnet {
delete value;
}
}
void Proposal::checkInput(const torch::Tensor& X, const torch::Tensor& y)
{
if (!torch::is_floating_point(X)) {
throw std::invalid_argument("X must be a floating point tensor");
}
if (torch::is_floating_point(y)) {
throw std::invalid_argument("y must be an integer tensor");
}
}
map<string, vector<int>> Proposal::localDiscretizationProposal(const map<string, vector<int>>& oldStates, Network& model)
{
// order of local discretization is important. no good 0, 1, 2...
@@ -44,15 +53,6 @@ namespace bayesnet {
auto xvf_ptr = Xf.index({ index }).data_ptr<float>();
auto xvf = vector<mdlp::precision_t>(xvf_ptr, xvf_ptr + Xf.size(1));
discretizers[feature]->fit(xvf, yxv);
//
//
//
// auto tmp = discretizers[feature]->transform(xvf);
// Xv[index] = tmp;
// auto xStates = vector<int>(discretizers[pFeatures[index]]->getCutPoints().size() + 1);
// iota(xStates.begin(), xStates.end(), 0);
// //Update new states of the feature/node
// states[feature] = xStates;
}
if (upgrade) {
// Discretize again X (only the affected indices) with the new fitted discretizers
@@ -65,7 +65,8 @@ namespace bayesnet {
//Update new states of the feature/node
states[pFeatures[index]] = xStates;
}
model.fit(pDataset, pFeatures, pClassName, states);
const torch::Tensor weights = torch::full({ pDataset.size(1) }, 1.0 / pDataset.size(1), torch::kDouble);
model.fit(pDataset, weights, pFeatures, pClassName, states);
}
return states;
}

View File

@@ -13,6 +13,7 @@ namespace bayesnet {
Proposal(torch::Tensor& pDataset, vector<string>& features_, string& className_);
virtual ~Proposal();
protected:
void checkInput(const torch::Tensor& X, const torch::Tensor& y);
torch::Tensor prepareX(torch::Tensor& X);
map<string, vector<int>> localDiscretizationProposal(const map<string, vector<int>>& states, Network& model);
map<string, vector<int>> fit_local_discretization(const torch::Tensor& y);

View File

@@ -4,7 +4,7 @@ namespace bayesnet {
SPODE::SPODE(int root) : Classifier(Network()), root(root) {}
void SPODE::buildModel()
void SPODE::buildModel(const torch::Tensor& weights)
{
// 0. Add all nodes to the model
addNodes();

View File

@@ -7,7 +7,7 @@ namespace bayesnet {
private:
int root;
protected:
void buildModel() override;
void buildModel(const torch::Tensor& weights) override;
public:
explicit SPODE(int root);
virtual ~SPODE() {};

View File

@@ -3,9 +3,9 @@
namespace bayesnet {
using namespace std;
SPODELd::SPODELd(int root) : SPODE(root), Proposal(dataset, features, className) {}
SPODELd& SPODELd::fit(torch::Tensor& X_, torch::Tensor& y_, vector<string>& features_, string className_, map<string, vector<int>>& states_)
SPODELd& SPODELd::fit(torch::Tensor& X_, torch::Tensor& y_, const vector<string>& features_, const string& className_, map<string, vector<int>>& states_)
{
// This first part should go in a Classifier method called fit_local_discretization o fit_float...
checkInput(X_, y_);
features = features_;
className = className_;
Xf = X_;
@@ -18,12 +18,13 @@ namespace bayesnet {
states = localDiscretizationProposal(states, model);
return *this;
}
SPODELd& SPODELd::fit(torch::Tensor& dataset, vector<string>& features_, string className_, map<string, vector<int>>& states_)
SPODELd& SPODELd::fit(torch::Tensor& dataset, const vector<string>& features_, const string& className_, map<string, vector<int>>& states_)
{
if (!torch::is_floating_point(dataset)) {
throw std::runtime_error("Dataset must be a floating point tensor");
}
Xf = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." }).clone();
cout << "Xf " << Xf.sizes() << " dtype: " << Xf.dtype() << endl;
y = dataset.index({ -1, "..." }).clone();
// This first part should go in a Classifier method called fit_local_discretization o fit_float...
features = features_;
className = className_;
// Fills vectors Xv & yv with the data from tensors X_ (discretized) & y

View File

@@ -9,8 +9,8 @@ namespace bayesnet {
public:
explicit SPODELd(int root);
virtual ~SPODELd() = default;
SPODELd& fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states) override;
SPODELd& fit(torch::Tensor& dataset, vector<string>& features, string className, map<string, vector<int>>& states) override;
SPODELd& fit(torch::Tensor& X, torch::Tensor& y, const vector<string>& features, const string& className, map<string, vector<int>>& states) override;
SPODELd& fit(torch::Tensor& dataset, const vector<string>& features, const string& className, map<string, vector<int>>& states) override;
vector<string> graph(const string& name = "SPODE") const override;
Tensor predict(Tensor& X) override;
static inline string version() { return "0.0.1"; };

View File

@@ -5,7 +5,7 @@ namespace bayesnet {
TAN::TAN() : Classifier(Network()) {}
void TAN::buildModel()
void TAN::buildModel(const torch::Tensor& weights)
{
// 0. Add all nodes to the model
addNodes();
@@ -15,15 +15,15 @@ namespace bayesnet {
Tensor class_dataset = dataset.index({ -1, "..." });
for (int i = 0; i < static_cast<int>(features.size()); ++i) {
Tensor feature_dataset = dataset.index({ i, "..." });
auto mi_value = metrics.mutualInformation(class_dataset, feature_dataset);
auto mi_value = metrics.mutualInformation(class_dataset, feature_dataset, weights);
mi.push_back({ i, mi_value });
}
sort(mi.begin(), mi.end(), [](const auto& left, const auto& right) {return left.second < right.second;});
auto root = mi[mi.size() - 1].first;
// 2. Compute mutual information between each feature and the class
auto weights = metrics.conditionalEdge();
auto weights_matrix = metrics.conditionalEdge(weights);
// 3. Compute the maximum spanning tree
auto mst = metrics.maximumSpanningTree(features, weights, root);
auto mst = metrics.maximumSpanningTree(features, weights_matrix, root);
// 4. Add edges from the maximum spanning tree to the model
for (auto i = 0; i < mst.size(); ++i) {
auto [from, to] = mst[i];

View File

@@ -3,11 +3,10 @@
#include "Classifier.h"
namespace bayesnet {
using namespace std;
using namespace torch;
class TAN : public Classifier {
private:
protected:
void buildModel() override;
void buildModel(const torch::Tensor& weights) override;
public:
TAN();
virtual ~TAN() {};

View File

@@ -3,9 +3,9 @@
namespace bayesnet {
using namespace std;
TANLd::TANLd() : TAN(), Proposal(dataset, features, className) {}
TANLd& TANLd::fit(torch::Tensor& X_, torch::Tensor& y_, vector<string>& features_, string className_, map<string, vector<int>>& states_)
TANLd& TANLd::fit(torch::Tensor& X_, torch::Tensor& y_, const vector<string>& features_, const string& className_, map<string, vector<int>>& states_)
{
// This first part should go in a Classifier method called fit_local_discretization o fit_float...
checkInput(X_, y_);
features = features_;
className = className_;
Xf = X_;

View File

@@ -10,7 +10,7 @@ namespace bayesnet {
public:
TANLd();
virtual ~TANLd() = default;
TANLd& fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states) override;
TANLd& fit(torch::Tensor& X, torch::Tensor& y, const vector<string>& features, const string& className, map<string, vector<int>>& states) override;
vector<string> graph(const string& name = "TAN") const override;
Tensor predict(Tensor& X) override;
static inline string version() { return "0.0.1"; };

View File

@@ -4,7 +4,7 @@ namespace bayesnet {
using namespace std;
using namespace torch;
// Return the indices in descending order
vector<int> argsort(vector<float>& nums)
vector<int> argsort(vector<double>& nums)
{
int n = nums.size();
vector<int> indices(n);

View File

@@ -5,7 +5,7 @@
namespace bayesnet {
using namespace std;
using namespace torch;
vector<int> argsort(vector<float>& nums);
vector<int> argsort(vector<double>& nums);
vector<vector<int>> tensorToVector(Tensor& tensor);
}
#endif //BAYESNET_UTILS_H

300
src/Platform/BestResults.cc Normal file
View File

@@ -0,0 +1,300 @@
#include <filesystem>
#include <set>
#include <fstream>
#include <iostream>
#include <sstream>
#include "BestResults.h"
#include "Result.h"
#include "Colors.h"
#include "Statistics.h"
#include "BestResultsExcel.h"
#include "CLocale.h"
namespace fs = std::filesystem;
// function ftime_to_string, Code taken from
// https://stackoverflow.com/a/58237530/1389271
template <typename TP>
std::string ftime_to_string(TP tp)
{
using namespace std::chrono;
auto sctp = time_point_cast<system_clock::duration>(tp - TP::clock::now()
+ system_clock::now());
auto tt = system_clock::to_time_t(sctp);
std::tm* gmt = std::gmtime(&tt);
std::stringstream buffer;
buffer << std::put_time(gmt, "%Y-%m-%d %H:%M");
return buffer.str();
}
namespace platform {
string BestResults::build()
{
auto files = loadResultFiles();
if (files.size() == 0) {
cerr << Colors::MAGENTA() << "No result files were found!" << Colors::RESET() << endl;
exit(1);
}
json bests;
for (const auto& file : files) {
auto result = Result(path, file);
auto data = result.load();
for (auto const& item : data.at("results")) {
bool update = false;
// Check if results file contains only one dataset
auto datasetName = item.at("dataset").get<string>();
if (bests.contains(datasetName)) {
if (item.at("score").get<double>() > bests[datasetName].at(0).get<double>()) {
update = true;
}
} else {
update = true;
}
if (update) {
bests[datasetName] = { item.at("score").get<double>(), item.at("hyperparameters"), file };
}
}
}
string bestFileName = path + bestResultFile();
if (FILE* fileTest = fopen(bestFileName.c_str(), "r")) {
fclose(fileTest);
cout << Colors::MAGENTA() << "File " << bestFileName << " already exists and it shall be overwritten." << Colors::RESET() << endl;
}
ofstream file(bestFileName);
file << bests;
file.close();
return bestFileName;
}
string BestResults::bestResultFile()
{
return "best_results_" + score + "_" + model + ".json";
}
pair<string, string> getModelScore(string name)
{
// results_accuracy_BoostAODE_MacBookpro16_2023-09-06_12:27:00_1.json
int i = 0;
auto pos = name.find("_");
auto pos2 = name.find("_", pos + 1);
string score = name.substr(pos + 1, pos2 - pos - 1);
pos = name.find("_", pos2 + 1);
string model = name.substr(pos2 + 1, pos - pos2 - 1);
return { model, score };
}
vector<string> BestResults::loadResultFiles()
{
vector<string> files;
using std::filesystem::directory_iterator;
string fileModel, fileScore;
for (const auto& file : directory_iterator(path)) {
auto fileName = file.path().filename().string();
if (fileName.find(".json") != string::npos && fileName.find("results_") == 0) {
tie(fileModel, fileScore) = getModelScore(fileName);
if (score == fileScore && (model == fileModel || model == "any")) {
files.push_back(fileName);
}
}
}
return files;
}
json BestResults::loadFile(const string& fileName)
{
ifstream resultData(fileName);
if (resultData.is_open()) {
json data = json::parse(resultData);
return data;
}
throw invalid_argument("Unable to open result file. [" + fileName + "]");
}
vector<string> BestResults::getModels()
{
set<string> models;
vector<string> result;
auto files = loadResultFiles();
if (files.size() == 0) {
cerr << Colors::MAGENTA() << "No result files were found!" << Colors::RESET() << endl;
exit(1);
}
string fileModel, fileScore;
for (const auto& file : files) {
// extract the model from the file name
tie(fileModel, fileScore) = getModelScore(file);
// add the model to the vector of models
models.insert(fileModel);
}
result = vector<string>(models.begin(), models.end());
return result;
}
vector<string> BestResults::getDatasets(json table)
{
vector<string> datasets;
for (const auto& dataset : table.items()) {
datasets.push_back(dataset.key());
}
return datasets;
}
void BestResults::buildAll()
{
auto models = getModels();
for (const auto& model : models) {
cout << "Building best results for model: " << model << endl;
this->model = model;
build();
}
model = "any";
}
void BestResults::reportSingle()
{
string bestFileName = path + bestResultFile();
if (FILE* fileTest = fopen(bestFileName.c_str(), "r")) {
fclose(fileTest);
} else {
cerr << Colors::MAGENTA() << "File " << bestFileName << " doesn't exist." << Colors::RESET() << endl;
exit(1);
}
auto temp = ConfigLocale();
auto date = ftime_to_string(filesystem::last_write_time(bestFileName));
auto data = loadFile(bestFileName);
auto datasets = getDatasets(data);
int maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const string& a, const string& b) { return a.size() < b.size(); })).size();
cout << Colors::GREEN() << "Best results for " << model << " and " << score << " as of " << date << endl;
cout << "--------------------------------------------------------" << endl;
cout << Colors::GREEN() << " # " << setw(maxDatasetName + 1) << left << string("Dataset") << "Score File Hyperparameters" << endl;
cout << "=== " << string(maxDatasetName, '=') << " =========== ================================================================== ================================================= " << endl;
auto i = 0;
bool odd = true;
for (auto const& item : data.items()) {
auto color = odd ? Colors::BLUE() : Colors::CYAN();
cout << color << setw(3) << fixed << right << i++ << " ";
cout << setw(maxDatasetName) << left << item.key() << " ";
cout << setw(11) << setprecision(9) << fixed << item.value().at(0).get<double>() << " ";
cout << setw(66) << item.value().at(2).get<string>() << " ";
cout << item.value().at(1) << " ";
cout << endl;
odd = !odd;
}
}
json BestResults::buildTableResults(vector<string> models)
{
json table;
auto maxDate = filesystem::file_time_type::max();
for (const auto& model : models) {
this->model = model;
string bestFileName = path + bestResultFile();
if (FILE* fileTest = fopen(bestFileName.c_str(), "r")) {
fclose(fileTest);
} else {
cerr << Colors::MAGENTA() << "File " << bestFileName << " doesn't exist." << Colors::RESET() << endl;
exit(1);
}
auto dateWrite = filesystem::last_write_time(bestFileName);
if (dateWrite < maxDate) {
maxDate = dateWrite;
}
auto data = loadFile(bestFileName);
table[model] = data;
}
table["dateTable"] = ftime_to_string(maxDate);
return table;
}
void BestResults::printTableResults(vector<string> models, json table)
{
cout << Colors::GREEN() << "Best results for " << score << " as of " << table.at("dateTable").get<string>() << endl;
cout << "------------------------------------------------" << endl;
cout << Colors::GREEN() << " # " << setw(maxDatasetName + 1) << left << string("Dataset");
for (const auto& model : models) {
cout << setw(maxModelName) << left << model << " ";
}
cout << endl;
cout << "=== " << string(maxDatasetName, '=') << " ";
for (const auto& model : models) {
cout << string(maxModelName, '=') << " ";
}
cout << endl;
auto i = 0;
bool odd = true;
map<string, double> totals;
int nDatasets = table.begin().value().size();
for (const auto& model : models) {
totals[model] = 0.0;
}
auto datasets = getDatasets(table.begin().value());
for (auto const& dataset : datasets) {
auto color = odd ? Colors::BLUE() : Colors::CYAN();
cout << color << setw(3) << fixed << right << i++ << " ";
cout << setw(maxDatasetName) << left << dataset << " ";
double maxValue = 0;
// Find out the max value for this dataset
for (const auto& model : models) {
double value = table[model].at(dataset).at(0).get<double>();
if (value > maxValue) {
maxValue = value;
}
}
// Print the row with red colors on max values
for (const auto& model : models) {
string efectiveColor = color;
double value = table[model].at(dataset).at(0).get<double>();
if (value == maxValue) {
efectiveColor = Colors::RED();
}
totals[model] += value;
cout << efectiveColor << setw(maxModelName) << setprecision(maxModelName - 2) << fixed << value << " ";
}
cout << endl;
odd = !odd;
}
cout << Colors::GREEN() << "=== " << string(maxDatasetName, '=') << " ";
for (const auto& model : models) {
cout << string(maxModelName, '=') << " ";
}
cout << endl;
cout << Colors::GREEN() << setw(5 + maxDatasetName) << " Totals...................";
double max = 0.0;
for (const auto& total : totals) {
if (total.second > max) {
max = total.second;
}
}
for (const auto& model : models) {
string efectiveColor = Colors::GREEN();
if (totals[model] == max) {
efectiveColor = Colors::RED();
}
cout << efectiveColor << right << setw(maxModelName) << setprecision(maxModelName - 4) << fixed << totals[model] << " ";
}
cout << endl;
}
void BestResults::reportAll(bool excel)
{
auto models = getModels();
// Build the table of results
json table = buildTableResults(models);
vector<string> datasets = getDatasets(table.begin().value());
maxModelName = (*max_element(models.begin(), models.end(), [](const string& a, const string& b) { return a.size() < b.size(); })).size();
maxModelName = max(12, maxModelName);
maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const string& a, const string& b) { return a.size() < b.size(); })).size();
maxDatasetName = max(25, maxDatasetName);
// Print the table of results
printTableResults(models, table);
// Compute the Friedman test
map<string, map<string, float>> ranksModels;
if (friedman) {
Statistics stats(models, datasets, table, significance);
auto result = stats.friedmanTest();
stats.postHocHolmTest(result);
ranksModels = stats.getRanks();
}
if (excel) {
BestResultsExcel excel(score, models, datasets, table, ranksModels, friedman, significance);
excel.build();
cout << Colors::YELLOW() << "** Excel file generated: " << excel.getFileName() << Colors::RESET() << endl;
}
}
}

View File

@@ -0,0 +1,32 @@
#ifndef BESTRESULTS_H
#define BESTRESULTS_H
#include <string>
#include <nlohmann/json.hpp>
using namespace std;
using json = nlohmann::json;
namespace platform {
class BestResults {
public:
explicit BestResults(const string& path, const string& score, const string& model, bool friedman, double significance = 0.05) : path(path), score(score), model(model), friedman(friedman), significance(significance) {}
string build();
void reportSingle();
void reportAll(bool excel);
void buildAll();
private:
vector<string> getModels();
vector<string> getDatasets(json table);
vector<string> loadResultFiles();
json buildTableResults(vector<string> models);
void printTableResults(vector<string> models, json table);
string bestResultFile();
json loadFile(const string& fileName);
string path;
string score;
string model;
bool friedman;
double significance;
int maxModelName = 0;
int maxDatasetName = 0;
};
}
#endif //BESTRESULTS_H

View File

@@ -0,0 +1,177 @@
#include <sstream>
#include "BestResultsExcel.h"
#include "Paths.h"
#include "Statistics.h"
namespace platform {
BestResultsExcel::BestResultsExcel(const string& score, const vector<string>& models, const vector<string>& datasets, const json& table, const map<string, map<string, float>>& ranksModels, bool friedman, double significance) :
score(score), models(models), datasets(datasets), table(table), ranksModels(ranksModels), friedman(friedman), significance(significance)
{
workbook = workbook_new((Paths::excel() + fileName).c_str());
worksheet = workbook_add_worksheet(workbook, "Best Results");
setProperties("Best Results");
createFormats();
int maxModelName = (*max_element(models.begin(), models.end(), [](const string& a, const string& b) { return a.size() < b.size(); })).size();
modelNameSize = max(modelNameSize, maxModelName);
int maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const string& a, const string& b) { return a.size() < b.size(); })).size();
datasetNameSize = max(datasetNameSize, maxDatasetName);
formatColumns();
}
BestResultsExcel::~BestResultsExcel()
{
workbook_close(workbook);
}
void BestResultsExcel::formatColumns()
{
worksheet_freeze_panes(worksheet, 4, 2);
vector<int> columns_sizes = { 5, datasetNameSize };
for (int i = 0; i < models.size(); ++i) {
columns_sizes.push_back(modelNameSize);
}
for (int i = 0; i < columns_sizes.size(); ++i) {
worksheet_set_column(worksheet, i, i, columns_sizes.at(i), NULL);
}
}
void BestResultsExcel::build()
{
// Create Sheet with scores
header(false);
body(false);
footer(false);
if (friedman) {
// Create Sheet with ranks
worksheet = workbook_add_worksheet(workbook, "Ranks");
formatColumns();
header(true);
body(true);
footer(true);
// Create Sheet with Friedman Test
doFriedman();
}
}
string BestResultsExcel::getFileName()
{
return Paths::excel() + fileName;
}
void BestResultsExcel::header(bool ranks)
{
row = 0;
string message = ranks ? "Ranks for score " + score : "Best results for " + score;
worksheet_merge_range(worksheet, 0, 0, 0, 1 + models.size(), message.c_str(), styles["headerFirst"]);
// Body header
row = 3;
int col = 1;
writeString(row, 0, "", "bodyHeader");
writeString(row, 1, "Dataset", "bodyHeader");
for (const auto& model : models) {
writeString(row, ++col, model.c_str(), "bodyHeader");
}
}
void BestResultsExcel::body(bool ranks)
{
row = 4;
int i = 0;
json origin = table.begin().value();
for (auto const& item : origin.items()) {
writeInt(row, 0, i++, "ints");
writeString(row, 1, item.key().c_str(), "text");
int col = 1;
for (const auto& model : models) {
double value = ranks ? ranksModels[item.key()][model] : table[model].at(item.key()).at(0).get<double>();
writeDouble(row, ++col, value, "result");
}
++row;
}
}
void BestResultsExcel::footer(bool ranks)
{
// Set Totals
writeString(row, 1, "Total", "bodyHeader");
int col = 1;
for (const auto& model : models) {
stringstream oss;
oss << "=sum(indirect(address(" << 5 << "," << col + 2 << ")):indirect(address(" << row << "," << col + 2 << ")))";
worksheet_write_formula(worksheet, row, ++col, oss.str().c_str(), styles["bodyHeader_odd"]);
}
if (ranks) {
row++;
writeString(row, 1, "Average ranks", "bodyHeader");
int col = 1;
for (const auto& model : models) {
stringstream oss;
oss << "=sum(indirect(address(" << 5 << "," << col + 2 << ")):indirect(address(" << row - 1 << "," << col + 2 << ")))/" << datasets.size();
worksheet_write_formula(worksheet, row, ++col, oss.str().c_str(), styles["bodyHeader_odd"]);
}
}
}
void BestResultsExcel::doFriedman()
{
worksheet = workbook_add_worksheet(workbook, "Friedman");
vector<int> columns_sizes = { 5, datasetNameSize };
for (int i = 0; i < models.size(); ++i) {
columns_sizes.push_back(modelNameSize);
}
for (int i = 0; i < columns_sizes.size(); ++i) {
worksheet_set_column(worksheet, i, i, columns_sizes.at(i), NULL);
}
worksheet_merge_range(worksheet, 0, 0, 0, 1 + models.size(), "Friedman Test", styles["headerFirst"]);
row = 2;
Statistics stats(models, datasets, table, significance, false);
auto result = stats.friedmanTest();
stats.postHocHolmTest(result);
auto friedmanResult = stats.getFriedmanResult();
auto holmResult = stats.getHolmResult();
worksheet_merge_range(worksheet, row, 0, row, 1 + models.size(), "Null hypothesis: H0 'There is no significant differences between all the classifiers.'", styles["headerSmall"]);
row += 2;
writeString(row, 1, "Friedman Q", "bodyHeader");
writeDouble(row, 2, friedmanResult.statistic, "bodyHeader");
row++;
writeString(row, 1, "Critical χ2 value", "bodyHeader");
writeDouble(row, 2, friedmanResult.criticalValue, "bodyHeader");
row++;
writeString(row, 1, "p-value", "bodyHeader");
writeDouble(row, 2, friedmanResult.pvalue, "bodyHeader");
writeString(row, 3, friedmanResult.reject ? "<" : ">", "bodyHeader");
writeDouble(row, 4, significance, "bodyHeader");
writeString(row, 5, friedmanResult.reject ? "Reject H0" : "Accept H0", "bodyHeader");
row += 3;
worksheet_merge_range(worksheet, row, 0, row, 1 + models.size(), "Holm Test", styles["headerFirst"]);
row += 2;
worksheet_merge_range(worksheet, row, 0, row, 1 + models.size(), "Null hypothesis: H0 'There is no significant differences between the control model and the other models.'", styles["headerSmall"]);
row += 2;
string controlModel = "Control Model: " + holmResult.model;
worksheet_merge_range(worksheet, row, 1, row, 7, controlModel.c_str(), styles["bodyHeader_odd"]);
row++;
writeString(row, 1, "Model", "bodyHeader");
writeString(row, 2, "p-value", "bodyHeader");
writeString(row, 3, "Rank", "bodyHeader");
writeString(row, 4, "Win", "bodyHeader");
writeString(row, 5, "Tie", "bodyHeader");
writeString(row, 6, "Loss", "bodyHeader");
writeString(row, 7, "Reject H0", "bodyHeader");
row++;
bool first = true;
for (const auto& item : holmResult.holmLines) {
writeString(row, 1, item.model, "text");
if (first) {
// Control model info
first = false;
writeString(row, 2, "", "text");
writeDouble(row, 3, item.rank, "result");
writeString(row, 4, "", "text");
writeString(row, 5, "", "text");
writeString(row, 6, "", "text");
writeString(row, 7, "", "textCentered");
} else {
// Rest of the models info
writeDouble(row, 2, item.pvalue, "result");
writeDouble(row, 3, item.rank, "result");
writeInt(row, 4, item.wtl.win, "ints");
writeInt(row, 5, item.wtl.tie, "ints");
writeInt(row, 6, item.wtl.loss, "ints");
writeString(row, 7, item.reject ? "Yes" : "No", "textCentered");
}
row++;
}
}
}

View File

@@ -0,0 +1,37 @@
#ifndef BESTRESULTS_EXCEL_H
#define BESTRESULTS_EXCEL_H
#include "ExcelFile.h"
#include <vector>
#include <map>
#include <nlohmann/json.hpp>
using namespace std;
using json = nlohmann::json;
namespace platform {
class BestResultsExcel : ExcelFile {
public:
BestResultsExcel(const string& score, const vector<string>& models, const vector<string>& datasets, const json& table, const map<string, map<string, float>>& ranks, bool friedman, double significance);
~BestResultsExcel();
void build();
string getFileName();
private:
void header(bool ranks);
void body(bool ranks);
void footer(bool ranks);
void formatColumns();
void doFriedman();
const string fileName = "BestResults.xlsx";
string score;
vector<string> models;
vector<string> datasets;
json table;
map<string, map<string, float>> ranksModels;
bool friedman;
double significance;
int modelNameSize = 12; // Min size of the column
int datasetNameSize = 25; // Min size of the column
};
}
#endif //BESTRESULTS_EXCEL_H

View File

@@ -1,7 +1,7 @@
#ifndef BESTRESULT_H
#define BESTRESULT_H
#ifndef BESTSCORE_H
#define BESTSCORE_H
#include <string>
class BestResult {
class BestScore {
public:
static std::string title() { return "STree_default (linear-ovo)"; }
static double score() { return 22.109799; }

24
src/Platform/CLocale.h Normal file
View File

@@ -0,0 +1,24 @@
#ifndef LOCALE_H
#define LOCALE_H
#include <locale>
#include <iostream>
#include <sstream>
#include <string>
using namespace std;
namespace platform {
struct separation : numpunct<char> {
char do_decimal_point() const { return ','; }
char do_thousands_sep() const { return '.'; }
string do_grouping() const { return "\03"; }
};
class ConfigLocale {
public:
explicit ConfigLocale()
{
locale mylocale(cout.getloc(), new separation);
locale::global(mylocale);
cout.imbue(mylocale);
}
};
}
#endif

View File

@@ -4,7 +4,14 @@ include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include)
include_directories(${BayesNet_SOURCE_DIR}/lib/json/include)
add_executable(main main.cc Folding.cc platformUtils.cc Experiment.cc Datasets.cc Models.cc Report.cc)
add_executable(manage manage.cc Results.cc Report.cc)
target_link_libraries(main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}")
target_link_libraries(manage "${TORCH_LIBRARIES}")
include_directories(${BayesNet_SOURCE_DIR}/lib/libxlsxwriter/include)
add_executable(b_main main.cc Folding.cc Experiment.cc Datasets.cc Dataset.cc Models.cc ReportConsole.cc ReportBase.cc)
add_executable(b_manage manage.cc Results.cc Result.cc ReportConsole.cc ReportExcel.cc ReportBase.cc Datasets.cc Dataset.cc ExcelFile.cc)
add_executable(b_list list.cc Datasets.cc Dataset.cc)
add_executable(b_best best.cc BestResults.cc Result.cc Statistics.cc BestResultsExcel.cc ExcelFile.cc)
add_executable(testx testx.cpp Datasets.cc Dataset.cc Folding.cc )
target_link_libraries(b_main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}")
target_link_libraries(b_manage "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" ArffFiles mdlp)
target_link_libraries(b_best Boost::boost "${XLSXWRITER_LIB}")
target_link_libraries(b_list ArffFiles mdlp "${TORCH_LIBRARIES}")
target_link_libraries(testx ArffFiles BayesNet "${TORCH_LIBRARIES}")

215
src/Platform/Dataset.cc Normal file
View File

@@ -0,0 +1,215 @@
#include "Dataset.h"
#include "ArffFiles.h"
#include <fstream>
namespace platform {
Dataset::Dataset(const Dataset& dataset) : path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples), n_features(dataset.n_features), features(dataset.features), states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv), fileType(dataset.fileType)
{
}
string Dataset::getName() const
{
return name;
}
string Dataset::getClassName() const
{
return className;
}
vector<string> Dataset::getFeatures() const
{
if (loaded) {
return features;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
int Dataset::getNFeatures() const
{
if (loaded) {
return n_features;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
int Dataset::getNSamples() const
{
if (loaded) {
return n_samples;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
map<string, vector<int>> Dataset::getStates() const
{
if (loaded) {
return states;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<vector<vector<float>>&, vector<int>&> Dataset::getVectors()
{
if (loaded) {
return { Xv, yv };
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<vector<vector<int>>&, vector<int>&> Dataset::getVectorsDiscretized()
{
if (loaded) {
return { Xd, yv };
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
{
if (loaded) {
buildTensors();
return { X, y };
} else {
throw invalid_argument("Dataset not loaded.");
}
}
void Dataset::load_csv()
{
ifstream file(path + "/" + name + ".csv");
if (file.is_open()) {
string line;
getline(file, line);
vector<string> tokens = split(line, ',');
features = vector<string>(tokens.begin(), tokens.end() - 1);
if (className == "-1") {
className = tokens.back();
}
for (auto i = 0; i < features.size(); ++i) {
Xv.push_back(vector<float>());
}
while (getline(file, line)) {
tokens = split(line, ',');
for (auto i = 0; i < features.size(); ++i) {
Xv[i].push_back(stof(tokens[i]));
}
yv.push_back(stoi(tokens.back()));
}
file.close();
} else {
throw invalid_argument("Unable to open dataset file.");
}
}
void Dataset::computeStates()
{
for (int i = 0; i < features.size(); ++i) {
states[features[i]] = vector<int>(*max_element(Xd[i].begin(), Xd[i].end()) + 1);
auto item = states.at(features[i]);
iota(begin(item), end(item), 0);
}
states[className] = vector<int>(*max_element(yv.begin(), yv.end()) + 1);
iota(begin(states.at(className)), end(states.at(className)), 0);
}
void Dataset::load_arff()
{
auto arff = ArffFiles();
arff.load(path + "/" + name + ".arff", className);
// Get Dataset X, y
Xv = arff.getX();
yv = arff.getY();
// Get className & Features
className = arff.getClassName();
auto attributes = arff.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& attribute) { return attribute.first; });
}
vector<string> tokenize(string line)
{
vector<string> tokens;
for (auto i = 0; i < line.size(); ++i) {
if (line[i] == ' ' || line[i] == '\t' || line[i] == '\n') {
string token = line.substr(0, i);
tokens.push_back(token);
line.erase(line.begin(), line.begin() + i + 1);
i = 0;
while (line[i] == ' ' || line[i] == '\t' || line[i] == '\n')
line.erase(line.begin(), line.begin() + i + 1);
}
}
if (line.size() > 0) {
tokens.push_back(line);
}
return tokens;
}
void Dataset::load_rdata()
{
ifstream file(path + "/" + name + "_R.dat");
if (file.is_open()) {
string line;
getline(file, line);
line = ArffFiles::trim(line);
vector<string> tokens = tokenize(line);
transform(tokens.begin(), tokens.end() - 1, back_inserter(features), [](const auto& attribute) { return ArffFiles::trim(attribute); });
if (className == "-1") {
className = ArffFiles::trim(tokens.back());
}
for (auto i = 0; i < features.size(); ++i) {
Xv.push_back(vector<float>());
}
while (getline(file, line)) {
tokens = tokenize(line);
// We have to skip the first token, which is the instance number.
for (auto i = 1; i < features.size() + 1; ++i) {
const float value = stof(tokens[i]);
Xv[i - 1].push_back(value);
}
yv.push_back(stoi(tokens.back()));
}
file.close();
} else {
throw invalid_argument("Unable to open dataset file.");
}
}
void Dataset::load()
{
if (loaded) {
return;
}
if (fileType == CSV) {
load_csv();
} else if (fileType == ARFF) {
load_arff();
} else if (fileType == RDATA) {
load_rdata();
}
if (discretize) {
Xd = discretizeDataset(Xv, yv);
computeStates();
}
n_samples = Xv[0].size();
n_features = Xv.size();
loaded = true;
}
void Dataset::buildTensors()
{
if (discretize) {
X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kInt32);
} else {
X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kFloat32);
}
for (int i = 0; i < features.size(); ++i) {
if (discretize) {
X.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
} else {
X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
}
}
y = torch::tensor(yv, torch::kInt32);
}
vector<mdlp::labels_t> Dataset::discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y)
{
vector<mdlp::labels_t> Xd;
auto fimdlp = mdlp::CPPFImdlp();
for (int i = 0; i < X.size(); i++) {
fimdlp.fit(X[i], y);
mdlp::labels_t& xd = fimdlp.transform(X[i]);
Xd.push_back(xd);
}
return Xd;
}
}

80
src/Platform/Dataset.h Normal file
View File

@@ -0,0 +1,80 @@
#ifndef DATASET_H
#define DATASET_H
#include <torch/torch.h>
#include <map>
#include <vector>
#include <string>
#include "CPPFImdlp.h"
#include "Utils.h"
namespace platform {
using namespace std;
enum fileType_t { CSV, ARFF, RDATA };
class SourceData {
public:
SourceData(string source)
{
if (source == "Surcov") {
path = "datasets/";
fileType = CSV;
} else if (source == "Arff") {
path = "datasets/";
fileType = ARFF;
} else if (source == "Tanveer") {
path = "data/";
fileType = RDATA;
} else {
throw invalid_argument("Unknown source.");
}
}
string getPath()
{
return path;
}
fileType_t getFileType()
{
return fileType;
}
private:
string path;
fileType_t fileType;
};
class Dataset {
private:
string path;
string name;
fileType_t fileType;
string className;
int n_samples{ 0 }, n_features{ 0 };
vector<string> features;
map<string, vector<int>> states;
bool loaded;
bool discretize;
torch::Tensor X, y;
vector<vector<float>> Xv;
vector<vector<int>> Xd;
vector<int> yv;
void buildTensors();
void load_csv();
void load_arff();
void load_rdata();
void computeStates();
vector<mdlp::labels_t> discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y);
public:
Dataset(const string& path, const string& name, const string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {};
explicit Dataset(const Dataset&);
string getName() const;
string getClassName() const;
vector<string> getFeatures() const;
map<string, vector<int>> getStates() const;
pair<vector<vector<float>>&, vector<int>&> getVectors();
pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized();
pair<torch::Tensor&, torch::Tensor&> getTensors();
int getNFeatures() const;
int getNSamples() const;
void load();
const bool inline isLoaded() const { return loaded; };
};
};
#endif

View File

@@ -1,21 +1,31 @@
#include "Datasets.h"
#include "platformUtils.h"
#include "ArffFiles.h"
#include <fstream>
namespace platform {
void Datasets::load()
{
ifstream catalog(path + "/all.txt");
auto sd = SourceData(sfileType);
fileType = sd.getFileType();
path = sd.getPath();
ifstream catalog(path + "all.txt");
if (catalog.is_open()) {
string line;
while (getline(catalog, line)) {
if (line.empty() || line[0] == '#') {
continue;
}
vector<string> tokens = split(line, ',');
string name = tokens[0];
string className = tokens[1];
string className;
if (tokens.size() == 1) {
className = "-1";
} else {
className = tokens[1];
}
datasets[name] = make_unique<Dataset>(path, name, className, discretize, fileType);
}
catalog.close();
} else {
throw invalid_argument("Unable to open catalog file. [" + path + "/all.txt" + "]");
throw invalid_argument("Unable to open catalog file. [" + path + "all.txt" + "]");
}
}
vector<string> Datasets::getNames()
@@ -24,208 +34,96 @@ namespace platform {
transform(datasets.begin(), datasets.end(), back_inserter(result), [](const auto& d) { return d.first; });
return result;
}
vector<string> Datasets::getFeatures(string name)
vector<string> Datasets::getFeatures(const string& name) const
{
if (datasets[name]->isLoaded()) {
return datasets[name]->getFeatures();
if (datasets.at(name)->isLoaded()) {
return datasets.at(name)->getFeatures();
} else {
throw invalid_argument("Dataset not loaded.");
}
}
map<string, vector<int>> Datasets::getStates(string name)
map<string, vector<int>> Datasets::getStates(const string& name) const
{
if (datasets[name]->isLoaded()) {
return datasets[name]->getStates();
if (datasets.at(name)->isLoaded()) {
return datasets.at(name)->getStates();
} else {
throw invalid_argument("Dataset not loaded.");
}
}
string Datasets::getClassName(string name)
void Datasets::loadDataset(const string& name) const
{
if (datasets[name]->isLoaded()) {
return datasets[name]->getClassName();
if (datasets.at(name)->isLoaded()) {
return;
} else {
datasets.at(name)->load();
}
}
string Datasets::getClassName(const string& name) const
{
if (datasets.at(name)->isLoaded()) {
return datasets.at(name)->getClassName();
} else {
throw invalid_argument("Dataset not loaded.");
}
}
int Datasets::getNSamples(string name)
int Datasets::getNSamples(const string& name) const
{
if (datasets[name]->isLoaded()) {
return datasets[name]->getNSamples();
if (datasets.at(name)->isLoaded()) {
return datasets.at(name)->getNSamples();
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<vector<vector<float>>&, vector<int>&> Datasets::getVectors(string name)
int Datasets::getNClasses(const string& name)
{
if (datasets.at(name)->isLoaded()) {
auto className = datasets.at(name)->getClassName();
if (discretize) {
auto states = getStates(name);
return states.at(className).size();
}
auto [Xv, yv] = getVectors(name);
return *max_element(yv.begin(), yv.end()) + 1;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
vector<int> Datasets::getClassesCounts(const string& name) const
{
if (datasets.at(name)->isLoaded()) {
auto [Xv, yv] = datasets.at(name)->getVectors();
vector<int> counts(*max_element(yv.begin(), yv.end()) + 1);
for (auto y : yv) {
counts[y]++;
}
return counts;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<vector<vector<float>>&, vector<int>&> Datasets::getVectors(const string& name)
{
if (!datasets[name]->isLoaded()) {
datasets[name]->load();
}
return datasets[name]->getVectors();
}
pair<vector<vector<int>>&, vector<int>&> Datasets::getVectorsDiscretized(string name)
pair<vector<vector<int>>&, vector<int>&> Datasets::getVectorsDiscretized(const string& name)
{
if (!datasets[name]->isLoaded()) {
datasets[name]->load();
}
return datasets[name]->getVectorsDiscretized();
}
pair<torch::Tensor&, torch::Tensor&> Datasets::getTensors(string name)
pair<torch::Tensor&, torch::Tensor&> Datasets::getTensors(const string& name)
{
if (!datasets[name]->isLoaded()) {
datasets[name]->load();
}
return datasets[name]->getTensors();
}
bool Datasets::isDataset(const string& name)
bool Datasets::isDataset(const string& name) const
{
return datasets.find(name) != datasets.end();
}
Dataset::Dataset(const Dataset& dataset) : path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples), n_features(dataset.n_features), features(dataset.features), states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv), fileType(dataset.fileType)
{
}
string Dataset::getName()
{
return name;
}
string Dataset::getClassName()
{
return className;
}
vector<string> Dataset::getFeatures()
{
if (loaded) {
return features;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
int Dataset::getNFeatures()
{
if (loaded) {
return n_features;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
int Dataset::getNSamples()
{
if (loaded) {
return n_samples;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
map<string, vector<int>> Dataset::getStates()
{
if (loaded) {
return states;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<vector<vector<float>>&, vector<int>&> Dataset::getVectors()
{
if (loaded) {
return { Xv, yv };
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<vector<vector<int>>&, vector<int>&> Dataset::getVectorsDiscretized()
{
if (loaded) {
return { Xd, yv };
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
{
if (loaded) {
buildTensors();
return { X, y };
} else {
throw invalid_argument("Dataset not loaded.");
}
}
void Dataset::load_csv()
{
ifstream file(path + "/" + name + ".csv");
if (file.is_open()) {
string line;
getline(file, line);
vector<string> tokens = split(line, ',');
features = vector<string>(tokens.begin(), tokens.end() - 1);
className = tokens.back();
for (auto i = 0; i < features.size(); ++i) {
Xv.push_back(vector<float>());
}
while (getline(file, line)) {
tokens = split(line, ',');
for (auto i = 0; i < features.size(); ++i) {
Xv[i].push_back(stof(tokens[i]));
}
yv.push_back(stoi(tokens.back()));
}
file.close();
} else {
throw invalid_argument("Unable to open dataset file.");
}
}
void Dataset::computeStates()
{
for (int i = 0; i < features.size(); ++i) {
states[features[i]] = vector<int>(*max_element(Xd[i].begin(), Xd[i].end()) + 1);
iota(begin(states[features[i]]), end(states[features[i]]), 0);
}
states[className] = vector<int>(*max_element(yv.begin(), yv.end()) + 1);
iota(begin(states[className]), end(states[className]), 0);
}
void Dataset::load_arff()
{
auto arff = ArffFiles();
arff.load(path + "/" + name + ".arff", className);
// Get Dataset X, y
Xv = arff.getX();
yv = arff.getY();
// Get className & Features
className = arff.getClassName();
auto attributes = arff.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& attribute) { return attribute.first; });
}
void Dataset::load()
{
if (loaded) {
return;
}
if (fileType == CSV) {
load_csv();
} else if (fileType == ARFF) {
load_arff();
}
if (discretize) {
Xd = discretizeDataset(Xv, yv);
computeStates();
}
n_samples = Xv[0].size();
n_features = Xv.size();
loaded = true;
}
void Dataset::buildTensors()
{
if (discretize) {
X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kInt32);
} else {
X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kFloat32);
}
for (int i = 0; i < features.size(); ++i) {
if (discretize) {
X.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
} else {
X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
}
}
y = torch::tensor(yv, torch::kInt32);
}
}

View File

@@ -1,64 +1,30 @@
#ifndef DATASETS_H
#define DATASETS_H
#include <torch/torch.h>
#include <map>
#include <vector>
#include <string>
#include "Dataset.h"
namespace platform {
using namespace std;
enum fileType_t { CSV, ARFF };
class Dataset {
private:
string path;
string name;
fileType_t fileType;
string className;
int n_samples{ 0 }, n_features{ 0 };
vector<string> features;
map<string, vector<int>> states;
bool loaded;
bool discretize;
torch::Tensor X, y;
vector<vector<float>> Xv;
vector<vector<int>> Xd;
vector<int> yv;
void buildTensors();
void load_csv();
void load_arff();
void computeStates();
public:
Dataset(const string& path, const string& name, const string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {};
explicit Dataset(const Dataset&);
string getName();
string getClassName();
vector<string> getFeatures();
map<string, vector<int>> getStates();
pair<vector<vector<float>>&, vector<int>&> getVectors();
pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized();
pair<torch::Tensor&, torch::Tensor&> getTensors();
int getNFeatures();
int getNSamples();
void load();
const bool inline isLoaded() const { return loaded; };
};
class Datasets {
private:
string path;
fileType_t fileType;
string sfileType;
map<string, unique_ptr<Dataset>> datasets;
bool discretize;
void load(); // Loads the list of datasets
public:
explicit Datasets(const string& path, bool discretize = false, fileType_t fileType = ARFF) : path(path), discretize(discretize), fileType(fileType) { load(); };
explicit Datasets(bool discretize, string sfileType) : discretize(discretize), sfileType(sfileType) { load(); };
vector<string> getNames();
vector<string> getFeatures(string name);
int getNSamples(string name);
string getClassName(string name);
map<string, vector<int>> getStates(string name);
pair<vector<vector<float>>&, vector<int>&> getVectors(string name);
pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized(string name);
pair<torch::Tensor&, torch::Tensor&> getTensors(string name);
bool isDataset(const string& name);
vector<string> getFeatures(const string& name) const;
int getNSamples(const string& name) const;
string getClassName(const string& name) const;
int getNClasses(const string& name);
vector<int> getClassesCounts(const string& name) const;
map<string, vector<int>> getStates(const string& name) const;
pair<vector<vector<float>>&, vector<int>&> getVectors(const string& name);
pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized(const string& name);
pair<torch::Tensor&, torch::Tensor&> getTensors(const string& name);
bool isDataset(const string& name) const;
void loadDataset(const string& name) const;
};
};

View File

@@ -4,7 +4,11 @@
#include <map>
#include <fstream>
#include <sstream>
#include "platformUtils.h"
#include <algorithm>
#include <iostream>
#include "Utils.h"
//#include "Dataset.h"
namespace platform {
class DotEnv {
private:
@@ -43,7 +47,7 @@ namespace platform {
}
std::string get(const std::string& key)
{
return env[key];
return env.at(key);
}
std::vector<int> getSeeds()
{

164
src/Platform/ExcelFile.cc Normal file
View File

@@ -0,0 +1,164 @@
#include "ExcelFile.h"
namespace platform {
ExcelFile::ExcelFile()
{
setDefault();
}
ExcelFile::ExcelFile(lxw_workbook* workbook) : workbook(workbook)
{
setDefault();
}
void ExcelFile::setDefault()
{
normalSize = 14; //font size for report body
row = 0;
colorTitle = 0xB1A0C7;
colorOdd = 0xDCE6F1;
colorEven = 0xFDE9D9;
}
lxw_workbook* ExcelFile::getWorkbook()
{
return workbook;
}
void ExcelFile::setProperties(string title)
{
char line[title.size() + 1];
strcpy(line, title.c_str());
lxw_doc_properties properties = {
.title = line,
.subject = (char*)"Machine learning results",
.author = (char*)"Ricardo Montañana Gómez",
.manager = (char*)"Dr. J. A. Gámez, Dr. J. M. Puerta",
.company = (char*)"UCLM",
.comments = (char*)"Created with libxlsxwriter and c++",
};
workbook_set_properties(workbook, &properties);
}
lxw_format* ExcelFile::efectiveStyle(const string& style)
{
lxw_format* efectiveStyle = NULL;
if (style != "") {
string suffix = row % 2 ? "_odd" : "_even";
try {
efectiveStyle = styles.at(style + suffix);
}
catch (const out_of_range& oor) {
try {
efectiveStyle = styles.at(style);
}
catch (const out_of_range& oor) {
throw invalid_argument("Style " + style + " not found");
}
}
}
return efectiveStyle;
}
void ExcelFile::writeString(int row, int col, const string& text, const string& style)
{
worksheet_write_string(worksheet, row, col, text.c_str(), efectiveStyle(style));
}
void ExcelFile::writeInt(int row, int col, const int number, const string& style)
{
worksheet_write_number(worksheet, row, col, number, efectiveStyle(style));
}
void ExcelFile::writeDouble(int row, int col, const double number, const string& style)
{
worksheet_write_number(worksheet, row, col, number, efectiveStyle(style));
}
void ExcelFile::addColor(lxw_format* style, bool odd)
{
uint32_t efectiveColor = odd ? colorEven : colorOdd;
format_set_bg_color(style, lxw_color_t(efectiveColor));
}
void ExcelFile::createStyle(const string& name, lxw_format* style, bool odd)
{
addColor(style, odd);
if (name == "textCentered") {
format_set_align(style, LXW_ALIGN_CENTER);
format_set_font_size(style, normalSize);
format_set_border(style, LXW_BORDER_THIN);
} else if (name == "text") {
format_set_font_size(style, normalSize);
format_set_border(style, LXW_BORDER_THIN);
} else if (name == "bodyHeader") {
format_set_bold(style);
format_set_font_size(style, normalSize);
format_set_align(style, LXW_ALIGN_CENTER);
format_set_align(style, LXW_ALIGN_VERTICAL_CENTER);
format_set_border(style, LXW_BORDER_THIN);
format_set_bg_color(style, lxw_color_t(colorTitle));
} else if (name == "result") {
format_set_font_size(style, normalSize);
format_set_border(style, LXW_BORDER_THIN);
format_set_num_format(style, "0.0000000");
} else if (name == "time") {
format_set_font_size(style, normalSize);
format_set_border(style, LXW_BORDER_THIN);
format_set_num_format(style, "#,##0.000000");
} else if (name == "ints") {
format_set_font_size(style, normalSize);
format_set_num_format(style, "###,##0");
format_set_border(style, LXW_BORDER_THIN);
} else if (name == "floats") {
format_set_border(style, LXW_BORDER_THIN);
format_set_font_size(style, normalSize);
format_set_num_format(style, "#,##0.00");
}
}
void ExcelFile::createFormats()
{
auto styleNames = { "text", "textCentered", "bodyHeader", "result", "time", "ints", "floats" };
lxw_format* style;
for (string name : styleNames) {
lxw_format* style = workbook_add_format(workbook);
style = workbook_add_format(workbook);
createStyle(name, style, true);
styles[name + "_odd"] = style;
style = workbook_add_format(workbook);
createStyle(name, style, false);
styles[name + "_even"] = style;
}
// Header 1st line
lxw_format* headerFirst = workbook_add_format(workbook);
format_set_bold(headerFirst);
format_set_font_size(headerFirst, 18);
format_set_align(headerFirst, LXW_ALIGN_CENTER);
format_set_align(headerFirst, LXW_ALIGN_VERTICAL_CENTER);
format_set_border(headerFirst, LXW_BORDER_THIN);
format_set_bg_color(headerFirst, lxw_color_t(colorTitle));
// Header rest
lxw_format* headerRest = workbook_add_format(workbook);
format_set_bold(headerRest);
format_set_align(headerRest, LXW_ALIGN_CENTER);
format_set_font_size(headerRest, 16);
format_set_align(headerRest, LXW_ALIGN_VERTICAL_CENTER);
format_set_border(headerRest, LXW_BORDER_THIN);
format_set_bg_color(headerRest, lxw_color_t(colorOdd));
// Header small
lxw_format* headerSmall = workbook_add_format(workbook);
format_set_bold(headerSmall);
format_set_align(headerSmall, LXW_ALIGN_LEFT);
format_set_font_size(headerSmall, 12);
format_set_border(headerSmall, LXW_BORDER_THIN);
format_set_align(headerSmall, LXW_ALIGN_VERTICAL_CENTER);
format_set_bg_color(headerSmall, lxw_color_t(colorOdd));
// Summary style
lxw_format* summaryStyle = workbook_add_format(workbook);
format_set_bold(summaryStyle);
format_set_font_size(summaryStyle, 16);
format_set_border(summaryStyle, LXW_BORDER_THIN);
format_set_align(summaryStyle, LXW_ALIGN_VERTICAL_CENTER);
styles["headerFirst"] = headerFirst;
styles["headerRest"] = headerRest;
styles["headerSmall"] = headerSmall;
styles["summaryStyle"] = summaryStyle;
}
}

43
src/Platform/ExcelFile.h Normal file
View File

@@ -0,0 +1,43 @@
#ifndef EXCELFILE_H
#define EXCELFILE_H
#include <locale>
#include <string>
#include <map>
#include "xlsxwriter.h"
using namespace std;
namespace platform {
struct separated : numpunct<char> {
char do_decimal_point() const { return ','; }
char do_thousands_sep() const { return '.'; }
string do_grouping() const { return "\03"; }
};
class ExcelFile {
public:
ExcelFile();
ExcelFile(lxw_workbook* workbook);
lxw_workbook* getWorkbook();
protected:
void setProperties(string title);
void writeString(int row, int col, const string& text, const string& style = "");
void writeInt(int row, int col, const int number, const string& style = "");
void writeDouble(int row, int col, const double number, const string& style = "");
void createFormats();
void createStyle(const string& name, lxw_format* style, bool odd);
void addColor(lxw_format* style, bool odd);
lxw_format* efectiveStyle(const string& name);
lxw_workbook* workbook;
lxw_worksheet* worksheet;
map<string, lxw_format*> styles;
int row;
int normalSize; //font size for report body
uint32_t colorTitle;
uint32_t colorOdd;
uint32_t colorEven;
private:
void setDefault();
};
}
#endif // !EXCELFILE_H

View File

@@ -1,8 +1,9 @@
#include <fstream>
#include "Experiment.h"
#include "Datasets.h"
#include "Models.h"
#include "Report.h"
#include "ReportConsole.h"
#include "Paths.h"
namespace platform {
using json = nlohmann::json;
string get_date()
@@ -25,6 +26,7 @@ namespace platform {
oss << std::put_time(timeinfo, "%H:%M:%S");
return oss.str();
}
Experiment::Experiment() : hyperparameters(json::parse("{}")) {}
string Experiment::get_file_name()
{
string result = "results_" + score_name + "_" + model + "_" + platform + "_" + get_date() + "_" + get_time() + "_" + (stratified ? "1" : "0") + ".json";
@@ -90,7 +92,7 @@ namespace platform {
void Experiment::report()
{
json data = build_json();
Report report(data);
ReportConsole report(data);
report.show();
}
@@ -100,19 +102,39 @@ namespace platform {
cout << data.dump(4) << endl;
}
void Experiment::go(vector<string> filesToProcess, const string& path)
void Experiment::go(vector<string> filesToProcess)
{
cout << "*** Starting experiment: " << title << " ***" << endl;
for (auto fileName : filesToProcess) {
cout << "- " << setw(20) << left << fileName << " " << right << flush;
cross_validation(path, fileName);
cross_validation(fileName);
cout << endl;
}
}
void Experiment::cross_validation(const string& path, const string& fileName)
string getColor(bayesnet::status_t status)
{
auto datasets = platform::Datasets(path, discretized, platform::ARFF);
switch (status) {
case bayesnet::NORMAL:
return Colors::GREEN();
case bayesnet::WARNING:
return Colors::YELLOW();
case bayesnet::ERROR:
return Colors::RED();
default:
return Colors::RESET();
}
}
void showProgress(int fold, const string& color, const string& phase)
{
string prefix = phase == "a" ? "" : "\b\b\b\b";
cout << prefix << color << fold << Colors::RESET() << "(" << color << phase << Colors::RESET() << ")" << flush;
}
void Experiment::cross_validation(const string& fileName)
{
auto datasets = platform::Datasets(discretized, Paths::datasets());
// Get dataset
auto [X, y] = datasets.getTensors(fileName);
auto states = datasets.getStates(fileName);
@@ -124,6 +146,8 @@ namespace platform {
auto result = Result();
auto [values, counts] = at::_unique(y);
result.setSamples(X.size(1)).setFeatures(X.size(0)).setClasses(values.size(0));
result.setHyperparameters(hyperparameters);
// Initialize results vectors
int nResults = nfolds * static_cast<int>(randomSeeds.size());
auto accuracy_test = torch::zeros({ nResults }, torch::kFloat64);
auto accuracy_train = torch::zeros({ nResults }, torch::kFloat64);
@@ -144,6 +168,10 @@ namespace platform {
for (int nfold = 0; nfold < nfolds; nfold++) {
auto clf = Models::instance()->create(model);
setModelVersion(clf->getVersion());
if (hyperparameters.size() != 0) {
clf->setHyperparameters(hyperparameters);
}
// Split train - test dataset
train_timer.start();
auto [train, test] = fold->getFold(nfold);
auto train_t = torch::tensor(train);
@@ -152,24 +180,31 @@ namespace platform {
auto y_train = y.index({ train_t });
auto X_test = X.index({ "...", test_t });
auto y_test = y.index({ test_t });
cout << nfold + 1 << ", " << flush;
showProgress(nfold + 1, getColor(clf->getStatus()), "a");
// Train model
clf->fit(X_train, y_train, features, className, states);
showProgress(nfold + 1, getColor(clf->getStatus()), "b");
nodes[item] = clf->getNumberOfNodes();
edges[item] = clf->getNumberOfEdges();
num_states[item] = clf->getNumberOfStates();
train_time[item] = train_timer.getDuration();
// Score train
auto accuracy_train_value = clf->score(X_train, y_train);
// Test model
showProgress(nfold + 1, getColor(clf->getStatus()), "c");
test_timer.start();
auto accuracy_test_value = clf->score(X_test, y_test);
test_time[item] = test_timer.getDuration();
accuracy_train[item] = accuracy_train_value;
accuracy_test[item] = accuracy_test_value;
cout << "\b\b\b, " << flush;
// Store results and times in vector
result.addScoreTrain(accuracy_train_value);
result.addScoreTest(accuracy_test_value);
result.addTimeTrain(train_time[item].item<double>());
result.addTimeTest(test_time[item].item<double>());
item++;
clf.reset();
}
cout << "end. " << flush;
delete fold;
@@ -177,6 +212,7 @@ namespace platform {
result.setScoreTest(torch::mean(accuracy_test).item<double>()).setScoreTrain(torch::mean(accuracy_train).item<double>());
result.setScoreTestStd(torch::std(accuracy_test).item<double>()).setScoreTrainStd(torch::std(accuracy_train).item<double>());
result.setTrainTime(torch::mean(train_time).item<double>()).setTestTime(torch::mean(test_time).item<double>());
result.setTestTimeStd(torch::std(test_time).item<double>()).setTrainTimeStd(torch::std(train_time).item<double>());
result.setNodes(torch::mean(nodes).item<double>()).setLeaves(torch::mean(edges).item<double>()).setDepth(torch::mean(num_states).item<double>());
result.setDataset(fileName);
addResult(result);

View File

@@ -29,7 +29,8 @@ namespace platform {
};
class Result {
private:
string dataset, hyperparameters, model_version;
string dataset, model_version;
json hyperparameters;
int samples{ 0 }, features{ 0 }, classes{ 0 };
double score_train{ 0 }, score_test{ 0 }, score_train_std{ 0 }, score_test_std{ 0 }, train_time{ 0 }, train_time_std{ 0 }, test_time{ 0 }, test_time_std{ 0 };
float nodes{ 0 }, leaves{ 0 }, depth{ 0 };
@@ -37,7 +38,7 @@ namespace platform {
public:
Result() = default;
Result& setDataset(const string& dataset) { this->dataset = dataset; return *this; }
Result& setHyperparameters(const string& hyperparameters) { this->hyperparameters = hyperparameters; return *this; }
Result& setHyperparameters(const json& hyperparameters) { this->hyperparameters = hyperparameters; return *this; }
Result& setSamples(int samples) { this->samples = samples; return *this; }
Result& setFeatures(int features) { this->features = features; return *this; }
Result& setClasses(int classes) { this->classes = classes; return *this; }
@@ -59,7 +60,7 @@ namespace platform {
const float get_score_train() const { return score_train; }
float get_score_test() { return score_test; }
const string& getDataset() const { return dataset; }
const string& getHyperparameters() const { return hyperparameters; }
const json& getHyperparameters() const { return hyperparameters; }
const int getSamples() const { return samples; }
const int getFeatures() const { return features; }
const int getClasses() const { return classes; }
@@ -85,11 +86,12 @@ namespace platform {
bool discretized{ false }, stratified{ false };
vector<Result> results;
vector<int> randomSeeds;
json hyperparameters = "{}";
int nfolds{ 0 };
float duration{ 0 };
json build_json();
public:
Experiment() = default;
Experiment();
Experiment& setTitle(const string& title) { this->title = title; return *this; }
Experiment& setModel(const string& model) { this->model = model; return *this; }
Experiment& setPlatform(const string& platform) { this->platform = platform; return *this; }
@@ -103,10 +105,11 @@ namespace platform {
Experiment& addResult(Result result) { results.push_back(result); return *this; }
Experiment& addRandomSeed(int randomSeed) { randomSeeds.push_back(randomSeed); return *this; }
Experiment& setDuration(float duration) { this->duration = duration; return *this; }
Experiment& setHyperparameters(const json& hyperparameters) { this->hyperparameters = hyperparameters; return *this; }
string get_file_name();
void save(const string& path);
void cross_validation(const string& path, const string& fileName);
void go(vector<string> filesToProcess, const string& path);
void cross_validation(const string& fileName);
void go(vector<string> filesToProcess);
void show();
void report();
};

View File

@@ -1,95 +1,104 @@
#include "Folding.h"
#include <algorithm>
#include <map>
Fold::Fold(int k, int n, int seed) : k(k), n(n), seed(seed)
{
random_device rd;
random_seed = default_random_engine(seed == -1 ? rd() : seed);
srand(seed == -1 ? time(0) : seed);
}
KFold::KFold(int k, int n, int seed) : Fold(k, n, seed), indices(vector<int>(n))
{
iota(begin(indices), end(indices), 0); // fill with 0, 1, ..., n - 1
shuffle(indices.begin(), indices.end(), random_seed);
}
pair<vector<int>, vector<int>> KFold::getFold(int nFold)
{
if (nFold >= k || nFold < 0) {
throw out_of_range("nFold (" + to_string(nFold) + ") must be less than k (" + to_string(k) + ")");
namespace platform {
Fold::Fold(int k, int n, int seed) : k(k), n(n), seed(seed)
{
random_device rd;
random_seed = default_random_engine(seed == -1 ? rd() : seed);
srand(seed == -1 ? time(0) : seed);
}
int nTest = n / k;
auto train = vector<int>();
auto test = vector<int>();
for (int i = 0; i < n; i++) {
if (i >= nTest * nFold && i < nTest * (nFold + 1)) {
test.push_back(indices[i]);
} else {
train.push_back(indices[i]);
}
}
return { train, test };
}
StratifiedKFold::StratifiedKFold(int k, torch::Tensor& y, int seed) : Fold(k, y.numel(), seed)
{
n = y.numel();
this->y = vector<int>(y.data_ptr<int>(), y.data_ptr<int>() + n);
build();
}
StratifiedKFold::StratifiedKFold(int k, const vector<int>& y, int seed)
: Fold(k, y.size(), seed)
{
this->y = y;
n = y.size();
build();
}
void StratifiedKFold::build()
{
stratified_indices = vector<vector<int>>(k);
int fold_size = n / k;
// Compute class counts and indices
auto class_indices = map<int, vector<int>>();
vector<int> class_counts(*max_element(y.begin(), y.end()) + 1, 0);
for (auto i = 0; i < n; ++i) {
class_counts[y[i]]++;
class_indices[y[i]].push_back(i);
}
// Shuffle class indices
for (auto& [cls, indices] : class_indices) {
KFold::KFold(int k, int n, int seed) : Fold(k, n, seed), indices(vector<int>(n))
{
iota(begin(indices), end(indices), 0); // fill with 0, 1, ..., n - 1
shuffle(indices.begin(), indices.end(), random_seed);
}
// Assign indices to folds
for (auto label = 0; label < class_counts.size(); ++label) {
auto num_samples_to_take = class_counts[label] / k;
if (num_samples_to_take == 0)
continue;
auto remainder_samples_to_take = class_counts[label] % k;
for (auto fold = 0; fold < k; ++fold) {
auto it = next(class_indices[label].begin(), num_samples_to_take);
move(class_indices[label].begin(), it, back_inserter(stratified_indices[fold])); // ##
class_indices[label].erase(class_indices[label].begin(), it);
pair<vector<int>, vector<int>> KFold::getFold(int nFold)
{
if (nFold >= k || nFold < 0) {
throw out_of_range("nFold (" + to_string(nFold) + ") must be less than k (" + to_string(k) + ")");
}
while (remainder_samples_to_take > 0) {
int fold = (rand() % static_cast<int>(k));
if (stratified_indices[fold].size() == fold_size + 1) {
int nTest = n / k;
auto train = vector<int>();
auto test = vector<int>();
for (int i = 0; i < n; i++) {
if (i >= nTest * nFold && i < nTest * (nFold + 1)) {
test.push_back(indices[i]);
} else {
train.push_back(indices[i]);
}
}
return { train, test };
}
StratifiedKFold::StratifiedKFold(int k, torch::Tensor& y, int seed) : Fold(k, y.numel(), seed)
{
n = y.numel();
this->y = vector<int>(y.data_ptr<int>(), y.data_ptr<int>() + n);
build();
}
StratifiedKFold::StratifiedKFold(int k, const vector<int>& y, int seed)
: Fold(k, y.size(), seed)
{
this->y = y;
n = y.size();
build();
}
void StratifiedKFold::build()
{
stratified_indices = vector<vector<int>>(k);
int fold_size = n / k;
// Compute class counts and indices
auto class_indices = map<int, vector<int>>();
vector<int> class_counts(*max_element(y.begin(), y.end()) + 1, 0);
for (auto i = 0; i < n; ++i) {
class_counts[y[i]]++;
class_indices[y[i]].push_back(i);
}
// Shuffle class indices
for (auto& [cls, indices] : class_indices) {
shuffle(indices.begin(), indices.end(), random_seed);
}
// Assign indices to folds
for (auto label = 0; label < class_counts.size(); ++label) {
auto num_samples_to_take = class_counts.at(label) / k;
if (num_samples_to_take == 0) {
cerr << "Warning! The number of samples in class " << label << " (" << class_counts.at(label)
<< ") is less than the number of folds (" << k << ")." << endl;
faulty = true;
continue;
}
auto it = next(class_indices[label].begin(), 1);
stratified_indices[fold].push_back(*class_indices[label].begin());
class_indices[label].erase(class_indices[label].begin(), it);
remainder_samples_to_take--;
auto remainder_samples_to_take = class_counts[label] % k;
for (auto fold = 0; fold < k; ++fold) {
auto it = next(class_indices[label].begin(), num_samples_to_take);
move(class_indices[label].begin(), it, back_inserter(stratified_indices[fold])); // ##
class_indices[label].erase(class_indices[label].begin(), it);
}
auto chosen = vector<bool>(k, false);
while (remainder_samples_to_take > 0) {
int fold = (rand() % static_cast<int>(k));
if (chosen.at(fold)) {
continue;
}
chosen[fold] = true;
auto it = next(class_indices[label].begin(), 1);
stratified_indices[fold].push_back(*class_indices[label].begin());
class_indices[label].erase(class_indices[label].begin(), it);
remainder_samples_to_take--;
}
}
}
}
pair<vector<int>, vector<int>> StratifiedKFold::getFold(int nFold)
{
if (nFold >= k || nFold < 0) {
throw out_of_range("nFold (" + to_string(nFold) + ") must be less than k (" + to_string(k) + ")");
pair<vector<int>, vector<int>> StratifiedKFold::getFold(int nFold)
{
if (nFold >= k || nFold < 0) {
throw out_of_range("nFold (" + to_string(nFold) + ") must be less than k (" + to_string(k) + ")");
}
vector<int> test_indices = stratified_indices[nFold];
vector<int> train_indices;
for (int i = 0; i < k; ++i) {
if (i == nFold) continue;
train_indices.insert(train_indices.end(), stratified_indices[i].begin(), stratified_indices[i].end());
}
return { train_indices, test_indices };
}
vector<int> test_indices = stratified_indices[nFold];
vector<int> train_indices;
for (int i = 0; i < k; ++i) {
if (i == nFold) continue;
train_indices.insert(train_indices.end(), stratified_indices[i].begin(), stratified_indices[i].end());
}
return { train_indices, test_indices };
}

View File

@@ -4,34 +4,37 @@
#include <vector>
#include <random>
using namespace std;
class Fold {
protected:
int k;
int n;
int seed;
default_random_engine random_seed;
public:
Fold(int k, int n, int seed = -1);
virtual pair<vector<int>, vector<int>> getFold(int nFold) = 0;
virtual ~Fold() = default;
int getNumberOfFolds() { return k; }
};
class KFold : public Fold {
private:
vector<int> indices;
public:
KFold(int k, int n, int seed = -1);
pair<vector<int>, vector<int>> getFold(int nFold) override;
};
class StratifiedKFold : public Fold {
private:
vector<int> y;
vector<vector<int>> stratified_indices;
void build();
public:
StratifiedKFold(int k, const vector<int>& y, int seed = -1);
StratifiedKFold(int k, torch::Tensor& y, int seed = -1);
pair<vector<int>, vector<int>> getFold(int nFold) override;
};
namespace platform {
class Fold {
protected:
int k;
int n;
int seed;
default_random_engine random_seed;
public:
Fold(int k, int n, int seed = -1);
virtual pair<vector<int>, vector<int>> getFold(int nFold) = 0;
virtual ~Fold() = default;
int getNumberOfFolds() { return k; }
};
class KFold : public Fold {
private:
vector<int> indices;
public:
KFold(int k, int n, int seed = -1);
pair<vector<int>, vector<int>> getFold(int nFold) override;
};
class StratifiedKFold : public Fold {
private:
vector<int> y;
vector<vector<int>> stratified_indices;
void build();
bool faulty = false; // Only true if the number of samples of any class is less than the number of folds.
public:
StratifiedKFold(int k, const vector<int>& y, int seed = -1);
StratifiedKFold(int k, torch::Tensor& y, int seed = -1);
pair<vector<int>, vector<int>> getFold(int nFold) override;
bool isFaulty() { return faulty; }
};
}
#endif

View File

@@ -26,7 +26,7 @@ namespace platform {
instance = it->second();
// wrap instance in a shared ptr and return
if (instance != nullptr)
return shared_ptr<bayesnet::BaseClassifier>(instance);
return unique_ptr<bayesnet::BaseClassifier>(instance);
else
return nullptr;
}

View File

@@ -10,6 +10,7 @@
#include "KDBLd.h"
#include "SPODELd.h"
#include "AODELd.h"
#include "BoostAODE.h"
namespace platform {
class Models {
private:

View File

@@ -1,10 +1,18 @@
#ifndef PATHS_H
#define PATHS_H
#include <string>
#include "DotEnv.h"
namespace platform {
class Paths {
public:
static std::string datasets() { return "datasets/"; }
static std::string results() { return "results/"; }
static std::string excel() { return "excel/"; }
static std::string cfs() { return "cfs/"; }
static std::string datasets()
{
auto env = platform::DotEnv();
return env.get("source_data");
}
};
}
#endif

View File

@@ -1,93 +0,0 @@
#include "Report.h"
#include "BestResult.h"
namespace platform {
string headerLine(const string& text)
{
int n = MAXL - text.length() - 3;
n = n < 0 ? 0 : n;
return "* " + text + string(n, ' ') + "*\n";
}
string Report::fromVector(const string& key)
{
string result = "";
for (auto& item : data[key]) {
result += to_string(item) + ", ";
}
return "[" + result.substr(0, result.size() - 2) + "]";
}
string fVector(const json& data)
{
string result = "";
for (const auto& item : data) {
result += to_string(item) + ", ";
}
return "[" + result.substr(0, result.size() - 2) + "]";
}
void Report::show()
{
header();
body();
footer();
}
void Report::header()
{
cout << Colors::MAGENTA() << string(MAXL, '*') << endl;
cout << headerLine("Report " + data["model"].get<string>() + " ver. " + data["version"].get<string>() + " with " + to_string(data["folds"].get<int>()) + " Folds cross validation and " + to_string(data["seeds"].size()) + " random seeds. " + data["date"].get<string>() + " " + data["time"].get<string>());
cout << headerLine(data["title"].get<string>());
cout << headerLine("Random seeds: " + fromVector("seeds") + " Stratified: " + (data["stratified"].get<bool>() ? "True" : "False"));
cout << headerLine("Execution took " + to_string(data["duration"].get<float>()) + " seconds, " + to_string(data["duration"].get<float>() / 3600) + " hours, on " + data["platform"].get<string>());
cout << headerLine("Score is " + data["score_name"].get<string>());
cout << string(MAXL, '*') << endl;
cout << endl;
}
void Report::body()
{
cout << Colors::GREEN() << "Dataset Sampl. Feat. Cls Nodes Edges States Score Time Hyperparameters" << endl;
cout << "============================== ====== ===== === ======= ======= ======= =============== ================== ===============" << endl;
json lastResult;
totalScore = 0;
bool odd = true;
for (const auto& r : data["results"]) {
auto color = odd ? Colors::CYAN() : Colors::BLUE();
cout << color << setw(30) << left << r["dataset"].get<string>() << " ";
cout << setw(6) << right << r["samples"].get<int>() << " ";
cout << setw(5) << right << r["features"].get<int>() << " ";
cout << setw(3) << right << r["classes"].get<int>() << " ";
cout << setw(7) << setprecision(2) << fixed << r["nodes"].get<float>() << " ";
cout << setw(7) << setprecision(2) << fixed << r["leaves"].get<float>() << " ";
cout << setw(7) << setprecision(2) << fixed << r["depth"].get<float>() << " ";
cout << setw(8) << right << setprecision(6) << fixed << r["score"].get<double>() << "±" << setw(6) << setprecision(4) << fixed << r["score_std"].get<double>() << " ";
cout << setw(11) << right << setprecision(6) << fixed << r["time"].get<double>() << "±" << setw(6) << setprecision(4) << fixed << r["time_std"].get<double>() << " ";
try {
cout << r["hyperparameters"].get<string>();
}
catch (const exception& err) {
cout << r["hyperparameters"];
}
cout << endl;
lastResult = r;
totalScore += r["score"].get<double>();
odd = !odd;
}
if (data["results"].size() == 1) {
cout << string(MAXL, '*') << endl;
cout << headerLine("Train scores: " + fVector(lastResult["scores_train"]));
cout << headerLine("Test scores: " + fVector(lastResult["scores_test"]));
cout << headerLine("Train times: " + fVector(lastResult["times_train"]));
cout << headerLine("Test times: " + fVector(lastResult["times_test"]));
cout << string(MAXL, '*') << endl;
}
}
void Report::footer()
{
cout << Colors::MAGENTA() << string(MAXL, '*') << endl;
auto score = data["score_name"].get<string>();
if (score == BestResult::scoreName()) {
cout << headerLine(score + " compared to " + BestResult::title() + " .: " + to_string(totalScore / BestResult::score()));
}
cout << string(MAXL, '*') << endl << Colors::RESET();
}
}

View File

@@ -1,26 +0,0 @@
#ifndef REPORT_H
#define REPORT_H
#include <string>
#include <iostream>
#include <nlohmann/json.hpp>
#include "Colors.h"
using json = nlohmann::json;
const int MAXL = 121;
namespace platform {
using namespace std;
class Report {
public:
explicit Report(json data_) { data = data_; };
virtual ~Report() = default;
void show();
private:
void header();
void body();
void footer();
string fromVector(const string& key);
json data;
double totalScore; // Total score of all results in a report
};
};
#endif

114
src/Platform/ReportBase.cc Normal file
View File

@@ -0,0 +1,114 @@
#include <sstream>
#include <locale>
#include "Datasets.h"
#include "ReportBase.h"
#include "BestScore.h"
#include "DotEnv.h"
namespace platform {
ReportBase::ReportBase(json data_, bool compare) : data(data_), compare(compare), margin(0.1)
{
stringstream oss;
oss << "Better than ZeroR + " << setprecision(1) << fixed << margin * 100 << "%";
meaning = {
{Symbols::equal_best, "Equal to best"},
{Symbols::better_best, "Better than best"},
{Symbols::cross, "Less than or equal to ZeroR"},
{Symbols::upward_arrow, oss.str()}
};
}
string ReportBase::fromVector(const string& key)
{
stringstream oss;
string sep = "";
oss << "[";
for (auto& item : data[key]) {
oss << sep << item.get<double>();
sep = ", ";
}
oss << "]";
return oss.str();
}
string ReportBase::fVector(const string& title, const json& data, const int width, const int precision)
{
stringstream oss;
string sep = "";
oss << title << "[";
for (const auto& item : data) {
oss << sep << fixed << setw(width) << setprecision(precision) << item.get<double>();
sep = ", ";
}
oss << "]";
return oss.str();
}
void ReportBase::show()
{
header();
body();
}
string ReportBase::compareResult(const string& dataset, double result)
{
string status = " ";
if (compare) {
double best = bestResult(dataset, data["model"].get<string>());
if (result == best) {
status = Symbols::equal_best;
} else if (result > best) {
status = Symbols::better_best;
}
} else {
if (data["score_name"].get<string>() == "accuracy") {
auto dt = Datasets(false, Paths::datasets());
dt.loadDataset(dataset);
auto numClasses = dt.getNClasses(dataset);
if (numClasses == 2) {
vector<int> distribution = dt.getClassesCounts(dataset);
double nSamples = dt.getNSamples(dataset);
vector<int>::iterator maxValue = max_element(distribution.begin(), distribution.end());
double mark = *maxValue / nSamples * (1 + margin);
if (mark > 1) {
mark = 0.9995;
}
status = result < mark ? Symbols::cross : result > mark ? Symbols::upward_arrow : "=";
}
}
}
if (status != " ") {
auto item = summary.find(status);
if (item != summary.end()) {
summary[status]++;
} else {
summary[status] = 1;
}
}
return status;
}
double ReportBase::bestResult(const string& dataset, const string& model)
{
double value = 0.0;
if (bestResults.size() == 0) {
// try to load the best results
string score = data["score_name"];
replace(score.begin(), score.end(), '_', '-');
string fileName = "best_results_" + score + "_" + model + ".json";
ifstream resultData(Paths::results() + "/" + fileName);
if (resultData.is_open()) {
bestResults = json::parse(resultData);
} else {
existBestFile = false;
}
}
try {
value = bestResults.at(dataset).at(0);
}
catch (exception) {
value = 1.0;
}
return value;
}
bool ReportBase::getExistBestFile()
{
return existBestFile;
}
}

37
src/Platform/ReportBase.h Normal file
View File

@@ -0,0 +1,37 @@
#ifndef REPORTBASE_H
#define REPORTBASE_H
#include <string>
#include <iostream>
#include "Paths.h"
#include "Symbols.h"
#include <nlohmann/json.hpp>
using json = nlohmann::json;
namespace platform {
using namespace std;
class ReportBase {
public:
explicit ReportBase(json data_, bool compare);
virtual ~ReportBase() = default;
void show();
protected:
json data;
string fromVector(const string& key);
string fVector(const string& title, const json& data, const int width, const int precision);
bool getExistBestFile();
virtual void header() = 0;
virtual void body() = 0;
virtual void showSummary() = 0;
string compareResult(const string& dataset, double result);
map<string, int> summary;
double margin;
map<string, string> meaning;
bool compare;
private:
double bestResult(const string& dataset, const string& model);
json bestResults;
bool existBestFile = true;
};
};
#endif

View File

@@ -0,0 +1,100 @@
#include <sstream>
#include <locale>
#include "ReportConsole.h"
#include "BestScore.h"
#include "CLocale.h"
namespace platform {
string ReportConsole::headerLine(const string& text, int utf = 0)
{
int n = MAXL - text.length() - 3;
n = n < 0 ? 0 : n;
return "* " + text + string(n + utf, ' ') + "*\n";
}
void ReportConsole::header()
{
stringstream oss;
cout << Colors::MAGENTA() << string(MAXL, '*') << endl;
cout << headerLine("Report " + data["model"].get<string>() + " ver. " + data["version"].get<string>() + " with " + to_string(data["folds"].get<int>()) + " Folds cross validation and " + to_string(data["seeds"].size()) + " random seeds. " + data["date"].get<string>() + " " + data["time"].get<string>());
cout << headerLine(data["title"].get<string>());
cout << headerLine("Random seeds: " + fromVector("seeds") + " Stratified: " + (data["stratified"].get<bool>() ? "True" : "False"));
oss << "Execution took " << setprecision(2) << fixed << data["duration"].get<float>() << " seconds, " << data["duration"].get<float>() / 3600 << " hours, on " << data["platform"].get<string>();
cout << headerLine(oss.str());
cout << headerLine("Score is " + data["score_name"].get<string>());
cout << string(MAXL, '*') << endl;
cout << endl;
}
void ReportConsole::body()
{
auto tmp = ConfigLocale();
cout << Colors::GREEN() << " # Dataset Sampl. Feat. Cls Nodes Edges States Score Time Hyperparameters" << endl;
cout << "=== ========================= ====== ===== === ========= ========= ========= =============== =================== ====================" << endl;
json lastResult;
double totalScore = 0.0;
bool odd = true;
int index = 0;
for (const auto& r : data["results"]) {
if (selectedIndex != -1 && index != selectedIndex) {
index++;
continue;
}
auto color = odd ? Colors::CYAN() : Colors::BLUE();
cout << color;
cout << setw(3) << index++ << " ";
cout << setw(25) << left << r["dataset"].get<string>() << " ";
cout << setw(6) << right << r["samples"].get<int>() << " ";
cout << setw(5) << right << r["features"].get<int>() << " ";
cout << setw(3) << right << r["classes"].get<int>() << " ";
cout << setw(9) << setprecision(2) << fixed << r["nodes"].get<float>() << " ";
cout << setw(9) << setprecision(2) << fixed << r["leaves"].get<float>() << " ";
cout << setw(9) << setprecision(2) << fixed << r["depth"].get<float>() << " ";
cout << setw(8) << right << setprecision(6) << fixed << r["score"].get<double>() << "±" << setw(6) << setprecision(4) << fixed << r["score_std"].get<double>();
const string status = compareResult(r["dataset"].get<string>(), r["score"].get<double>());
cout << status;
cout << setw(12) << right << setprecision(6) << fixed << r["time"].get<double>() << "±" << setw(6) << setprecision(4) << fixed << r["time_std"].get<double>() << " ";
cout << r["hyperparameters"].dump();
cout << endl;
cout << flush;
lastResult = r;
totalScore += r["score"].get<double>();
odd = !odd;
}
if (data["results"].size() == 1 || selectedIndex != -1) {
cout << string(MAXL, '*') << endl;
cout << headerLine(fVector("Train scores: ", lastResult["scores_train"], 14, 12));
cout << headerLine(fVector("Test scores: ", lastResult["scores_test"], 14, 12));
cout << headerLine(fVector("Train times: ", lastResult["times_train"], 10, 3));
cout << headerLine(fVector("Test times: ", lastResult["times_test"], 10, 3));
cout << string(MAXL, '*') << endl;
} else {
footer(totalScore);
}
}
void ReportConsole::showSummary()
{
for (const auto& item : summary) {
stringstream oss;
oss << setw(3) << left << item.first;
oss << setw(3) << right << item.second << " ";
oss << left << meaning.at(item.first);
cout << headerLine(oss.str(), 2);
}
}
void ReportConsole::footer(double totalScore)
{
cout << Colors::MAGENTA() << string(MAXL, '*') << endl;
showSummary();
auto score = data["score_name"].get<string>();
if (score == BestScore::scoreName()) {
stringstream oss;
oss << score << " compared to " << BestScore::title() << " .: " << totalScore / BestScore::score();
cout << headerLine(oss.str());
}
if (!getExistBestFile() && compare) {
cout << headerLine("*** Best Results File not found. Couldn't compare any result!");
}
cout << string(MAXL, '*') << endl << Colors::RESET();
}
}

View File

@@ -0,0 +1,24 @@
#ifndef REPORTCONSOLE_H
#define REPORTCONSOLE_H
#include <string>
#include <iostream>
#include "ReportBase.h"
#include "Colors.h"
namespace platform {
using namespace std;
const int MAXL = 133;
class ReportConsole : public ReportBase {
public:
explicit ReportConsole(json data_, bool compare = false, int index = -1) : ReportBase(data_, compare), selectedIndex(index) {};
virtual ~ReportConsole() = default;
private:
int selectedIndex;
string headerLine(const string& text, int utf);
void header() override;
void body() override;
void footer(double totalScore);
void showSummary() override;
};
};
#endif

182
src/Platform/ReportExcel.cc Normal file
View File

@@ -0,0 +1,182 @@
#include <sstream>
#include <locale>
#include "ReportExcel.h"
#include "BestScore.h"
namespace platform {
ReportExcel::ReportExcel(json data_, bool compare, lxw_workbook* workbook) : ReportBase(data_, compare), ExcelFile(workbook)
{
createFile();
}
void ReportExcel::formatColumns()
{
worksheet_freeze_panes(worksheet, 6, 1);
vector<int> columns_sizes = { 22, 10, 9, 7, 12, 12, 12, 12, 12, 3, 15, 12, 23 };
for (int i = 0; i < columns_sizes.size(); ++i) {
worksheet_set_column(worksheet, i, i, columns_sizes.at(i), NULL);
}
}
void ReportExcel::createFile()
{
if (workbook == NULL) {
workbook = workbook_new((Paths::excel() + fileName).c_str());
}
const string name = data["model"].get<string>();
string suffix = "";
string efectiveName;
int num = 1;
// Create a sheet with the name of the model
while (true) {
efectiveName = name + suffix;
if (workbook_get_worksheet_by_name(workbook, efectiveName.c_str())) {
suffix = to_string(++num);
} else {
worksheet = workbook_add_worksheet(workbook, efectiveName.c_str());
break;
}
if (num > 100) {
throw invalid_argument("Couldn't create sheet " + efectiveName);
}
}
cout << "Adding sheet " << efectiveName << " to " << Paths::excel() + fileName << endl;
setProperties(data["title"].get<string>());
createFormats();
formatColumns();
}
void ReportExcel::closeFile()
{
workbook_close(workbook);
}
void ReportExcel::header()
{
locale mylocale(cout.getloc(), new separated);
locale::global(mylocale);
cout.imbue(mylocale);
stringstream oss;
string message = data["model"].get<string>() + " ver. " + data["version"].get<string>() + " " +
data["language"].get<string>() + " ver. " + data["language_version"].get<string>() +
" with " + to_string(data["folds"].get<int>()) + " Folds cross validation and " + to_string(data["seeds"].size()) +
" random seeds. " + data["date"].get<string>() + " " + data["time"].get<string>();
worksheet_merge_range(worksheet, 0, 0, 0, 12, message.c_str(), styles["headerFirst"]);
worksheet_merge_range(worksheet, 1, 0, 1, 12, data["title"].get<string>().c_str(), styles["headerRest"]);
worksheet_merge_range(worksheet, 2, 0, 3, 0, ("Score is " + data["score_name"].get<string>()).c_str(), styles["headerRest"]);
worksheet_merge_range(worksheet, 2, 1, 3, 3, "Execution time", styles["headerRest"]);
oss << setprecision(2) << fixed << data["duration"].get<float>() << " s";
worksheet_merge_range(worksheet, 2, 4, 2, 5, oss.str().c_str(), styles["headerRest"]);
oss.str("");
oss.clear();
oss << setprecision(2) << fixed << data["duration"].get<float>() / 3600 << " h";
worksheet_merge_range(worksheet, 3, 4, 3, 5, oss.str().c_str(), styles["headerRest"]);
worksheet_merge_range(worksheet, 2, 6, 3, 7, "Platform", styles["headerRest"]);
worksheet_merge_range(worksheet, 2, 8, 3, 9, data["platform"].get<string>().c_str(), styles["headerRest"]);
worksheet_merge_range(worksheet, 2, 10, 2, 12, ("Random seeds: " + fromVector("seeds")).c_str(), styles["headerSmall"]);
oss.str("");
oss.clear();
oss << "Stratified: " << (data["stratified"].get<bool>() ? "True" : "False");
worksheet_merge_range(worksheet, 3, 10, 3, 11, oss.str().c_str(), styles["headerSmall"]);
oss.str("");
oss.clear();
oss << "Discretized: " << (data["discretized"].get<bool>() ? "True" : "False");
worksheet_write_string(worksheet, 3, 12, oss.str().c_str(), styles["headerSmall"]);
}
void ReportExcel::body()
{
auto head = vector<string>(
{ "Dataset", "Samples", "Features", "Classes", "Nodes", "Edges", "States", "Score", "Score Std.", "St.", "Time",
"Time Std.", "Hyperparameters" });
int col = 0;
for (const auto& item : head) {
writeString(5, col++, item, "bodyHeader");
}
row = 6;
col = 0;
int hypSize = 22;
json lastResult;
double totalScore = 0.0;
string hyperparameters;
for (const auto& r : data["results"]) {
writeString(row, col, r["dataset"].get<string>(), "text");
writeInt(row, col + 1, r["samples"].get<int>(), "ints");
writeInt(row, col + 2, r["features"].get<int>(), "ints");
writeInt(row, col + 3, r["classes"].get<int>(), "ints");
writeDouble(row, col + 4, r["nodes"].get<float>(), "floats");
writeDouble(row, col + 5, r["leaves"].get<float>(), "floats");
writeDouble(row, col + 6, r["depth"].get<double>(), "floats");
writeDouble(row, col + 7, r["score"].get<double>(), "result");
writeDouble(row, col + 8, r["score_std"].get<double>(), "result");
const string status = compareResult(r["dataset"].get<string>(), r["score"].get<double>());
writeString(row, col + 9, status, "textCentered");
writeDouble(row, col + 10, r["time"].get<double>(), "time");
writeDouble(row, col + 11, r["time_std"].get<double>(), "time");
try {
hyperparameters = r["hyperparameters"].get<string>();
}
catch (const exception& err) {
stringstream oss;
oss << r["hyperparameters"];
hyperparameters = oss.str();
}
if (hyperparameters.size() > hypSize) {
hypSize = hyperparameters.size();
}
writeString(row, col + 12, hyperparameters, "text");
lastResult = r;
totalScore += r["score"].get<double>();
row++;
}
// Set the right column width of hyperparameters with the maximum length
worksheet_set_column(worksheet, 12, 12, hypSize + 5, NULL);
// Show totals if only one dataset is present in the result
if (data["results"].size() == 1) {
for (const string& group : { "scores_train", "scores_test", "times_train", "times_test" }) {
row++;
col = 1;
writeString(row, col, group, "text");
for (double item : lastResult[group]) {
string style = group.find("scores") != string::npos ? "result" : "time";
writeDouble(row, ++col, item, style);
}
}
// Set with of columns to show those totals completely
worksheet_set_column(worksheet, 1, 1, 12, NULL);
for (int i = 2; i < 7; ++i) {
// doesn't work with from col to col, so...
worksheet_set_column(worksheet, i, i, 15, NULL);
}
} else {
footer(totalScore, row);
}
}
void ReportExcel::showSummary()
{
for (const auto& item : summary) {
worksheet_write_string(worksheet, row + 2, 1, item.first.c_str(), styles["summaryStyle"]);
worksheet_write_number(worksheet, row + 2, 2, item.second, styles["summaryStyle"]);
worksheet_merge_range(worksheet, row + 2, 3, row + 2, 5, meaning.at(item.first).c_str(), styles["summaryStyle"]);
row += 1;
}
}
void ReportExcel::footer(double totalScore, int row)
{
showSummary();
row += 4 + summary.size();
auto score = data["score_name"].get<string>();
if (score == BestScore::scoreName()) {
worksheet_merge_range(worksheet, row, 1, row, 5, (score + " compared to " + BestScore::title() + " .:").c_str(), efectiveStyle("text"));
writeDouble(row, 6, totalScore / BestScore::score(), "result");
}
if (!getExistBestFile() && compare) {
worksheet_write_string(worksheet, row + 1, 0, "*** Best Results File not found. Couldn't compare any result!", styles["summaryStyle"]);
}
}
}

View File

@@ -0,0 +1,25 @@
#ifndef REPORTEXCEL_H
#define REPORTEXCEL_H
#include<map>
#include "xlsxwriter.h"
#include "ReportBase.h"
#include "ExcelFile.h"
#include "Colors.h"
namespace platform {
using namespace std;
class ReportExcel : public ReportBase, public ExcelFile {
public:
explicit ReportExcel(json data_, bool compare, lxw_workbook* workbook);
private:
const string fileName = "some_results.xlsx";
void formatColumns();
void createFile();
void closeFile();
void header() override;
void body() override;
void showSummary() override;
void footer(double totalScore, int row);
};
};
#endif // !REPORTEXCEL_H

56
src/Platform/Result.cc Normal file
View File

@@ -0,0 +1,56 @@
#include <filesystem>
#include <fstream>
#include <sstream>
#include "Result.h"
#include "Colors.h"
#include "BestScore.h"
#include "CLocale.h"
namespace platform {
Result::Result(const string& path, const string& filename)
: path(path)
, filename(filename)
{
auto data = load();
date = data["date"];
score = 0;
for (const auto& result : data["results"]) {
score += result["score"].get<double>();
}
scoreName = data["score_name"];
if (scoreName == BestScore::scoreName()) {
score /= BestScore::score();
}
title = data["title"];
duration = data["duration"];
model = data["model"];
complete = data["results"].size() > 1;
}
json Result::load() const
{
ifstream resultData(path + "/" + filename);
if (resultData.is_open()) {
json data = json::parse(resultData);
return data;
}
throw invalid_argument("Unable to open result file. [" + path + "/" + filename + "]");
}
string Result::to_string() const
{
auto tmp = ConfigLocale();
stringstream oss;
double durationShow = duration > 3600 ? duration / 3600 : duration > 60 ? duration / 60 : duration;
string durationUnit = duration > 3600 ? "h" : duration > 60 ? "m" : "s";
oss << date << " ";
oss << setw(12) << left << model << " ";
oss << setw(11) << left << scoreName << " ";
oss << right << setw(11) << setprecision(7) << fixed << score << " ";
auto completeString = isComplete() ? "C" : "P";
oss << setw(1) << " " << completeString << " ";
oss << setw(7) << setprecision(2) << fixed << durationShow << " " << durationUnit << " ";
oss << setw(50) << left << title << " ";
return oss.str();
}
}

37
src/Platform/Result.h Normal file
View File

@@ -0,0 +1,37 @@
#ifndef RESULT_H
#define RESULT_H
#include <map>
#include <vector>
#include <string>
#include <nlohmann/json.hpp>
namespace platform {
using namespace std;
using json = nlohmann::json;
class Result {
public:
Result(const string& path, const string& filename);
json load() const;
string to_string() const;
string getFilename() const { return filename; };
string getDate() const { return date; };
double getScore() const { return score; };
string getTitle() const { return title; };
double getDuration() const { return duration; };
string getModel() const { return model; };
string getScoreName() const { return scoreName; };
bool isComplete() const { return complete; };
private:
string path;
string filename;
string date;
double score;
string title;
double duration;
string model;
string scoreName;
bool complete;
};
};
#endif

View File

@@ -1,37 +1,11 @@
#include <filesystem>
#include "platformUtils.h"
#include "Results.h"
#include "Report.h"
#include "BestResult.h"
#include "ReportConsole.h"
#include "ReportExcel.h"
#include "BestScore.h"
#include "Colors.h"
#include "CLocale.h"
namespace platform {
Result::Result(const string& path, const string& filename)
: path(path)
, filename(filename)
{
auto data = load();
date = data["date"];
score = 0;
for (const auto& result : data["results"]) {
score += result["score"].get<double>();
}
scoreName = data["score_name"];
if (scoreName == BestResult::scoreName()) {
score /= BestResult::score();
}
title = data["title"];
duration = data["duration"];
model = data["model"];
}
json Result::load() const
{
ifstream resultData(path + "/" + filename);
if (resultData.is_open()) {
json data = json::parse(resultData);
return data;
}
throw invalid_argument("Unable to open result file. [" + path + "/" + filename + "]");
}
void Results::load()
{
using std::filesystem::directory_iterator;
@@ -40,31 +14,30 @@ namespace platform {
if (filename.find(".json") != string::npos && filename.find("results_") == 0) {
auto result = Result(path, filename);
bool addResult = true;
if (model != "any" && result.getModel() != model || scoreName != "any" && scoreName != result.getScoreName())
if (model != "any" && result.getModel() != model || scoreName != "any" && scoreName != result.getScoreName() || complete && !result.isComplete() || partial && result.isComplete())
addResult = false;
if (addResult)
files.push_back(result);
}
}
}
string Result::to_string() const
{
stringstream oss;
oss << date << " ";
oss << setw(12) << left << model << " ";
oss << setw(11) << left << scoreName << " ";
oss << right << setw(11) << setprecision(7) << fixed << score << " ";
oss << setw(9) << setprecision(3) << fixed << duration << " ";
oss << setw(50) << left << title << " ";
return oss.str();
if (max == 0) {
max = files.size();
}
}
void Results::show() const
{
auto temp = ConfigLocale();
cout << Colors::GREEN() << "Results found: " << files.size() << endl;
cout << "-------------------" << endl;
if (complete) {
cout << Colors::MAGENTA() << "Only listing complete results" << endl;
}
if (partial) {
cout << Colors::MAGENTA() << "Only listing partial results" << endl;
}
auto i = 0;
cout << " # Date Model Score Name Score Duration Title" << endl;
cout << "=== ========== ============ =========== =========== ========= =============================================================" << endl;
cout << Colors::GREEN() << " # Date Model Score Name Score C/P Duration Title" << endl;
cout << "=== ========== ============ =========== =========== === ========= =============================================================" << endl;
bool odd = true;
for (const auto& result : files) {
auto color = odd ? Colors::BLUE() : Colors::CYAN();
@@ -94,21 +67,51 @@ namespace platform {
cout << "Invalid index" << endl;
return -1;
}
void Results::report(const int index) const
void Results::report(const int index, const bool excelReport)
{
cout << Colors::YELLOW() << "Reporting " << files.at(index).getFilename() << endl;
auto data = files.at(index).load();
Report report(data);
report.show();
if (excelReport) {
ReportExcel reporter(data, compare, workbook);
reporter.show();
openExcel = true;
workbook = reporter.getWorkbook();
} else {
ReportConsole reporter(data, compare);
reporter.show();
}
}
void Results::showIndex(const int index, const int idx) const
{
auto data = files.at(index).load();
if (idx < 0 or idx >= static_cast<int>(data["results"].size())) {
cout << "Invalid index" << endl;
return;
}
cout << Colors::YELLOW() << "Showing " << files.at(index).getFilename() << endl;
ReportConsole reporter(data, compare, idx);
reporter.show();
}
void Results::menu()
{
char option;
int index;
bool finished = false;
string filename, line, options = "qldhsr";
string color, context;
string filename, line, options = "qldhsre";
while (!finished) {
cout << Colors::RESET() << "Choose option (quit='q', list='l', delete='d', hide='h', sort='s', report='r'): ";
if (indexList) {
color = Colors::GREEN();
context = " (quit='q', list='l', delete='d', hide='h', sort='s', report='r', excel='e'): ";
options = "qldhsre";
} else {
color = Colors::MAGENTA();
context = " (quit='q', list='l'): ";
options = "ql";
}
cout << Colors::RESET() << color;
cout << "Choose option " << context;
getline(cin, line);
if (line.size() == 0)
continue;
@@ -119,12 +122,23 @@ namespace platform {
}
option = line[0];
} else {
index = stoi(line);
if (index >= 0 && index < files.size()) {
report(index);
} else {
cout << "Invalid option" << endl;
if (all_of(line.begin(), line.end(), ::isdigit)) {
int idx = stoi(line);
if (indexList) {
// The value is about the files list
index = idx;
if (index >= 0 && index < max) {
report(index, false);
indexList = false;
continue;
}
} else {
// The value is about the result showed on screen
showIndex(index, idx);
continue;
}
}
cout << "Invalid option" << endl;
continue;
}
switch (option) {
@@ -133,6 +147,7 @@ namespace platform {
break;
case 'l':
show();
indexList = true;
break;
case 'd':
index = getIndex("delete");
@@ -144,6 +159,7 @@ namespace platform {
files.erase(files.begin() + index);
cout << "File: " + filename + " deleted!" << endl;
show();
indexList = true;
break;
case 'h':
index = getIndex("hide");
@@ -155,16 +171,26 @@ namespace platform {
files.erase(files.begin() + index);
show();
menu();
indexList = true;
break;
case 's':
sortList();
indexList = true;
show();
break;
case 'r':
index = getIndex("report");
if (index == -1)
break;
report(index);
indexList = false;
report(index, false);
break;
case 'e':
index = getIndex("excel");
if (index == -1)
break;
indexList = true;
report(index, true);
break;
default:
cout << "Invalid option" << endl;
@@ -231,9 +257,13 @@ namespace platform {
cout << "No results found!" << endl;
exit(0);
}
sortDate();
show();
menu();
cout << "Done!" << endl;
if (openExcel) {
workbook_close(workbook);
}
cout << Colors::RESET() << "Done!" << endl;
}
}

View File

@@ -1,48 +1,39 @@
#ifndef RESULTS_H
#define RESULTS_H
#include "xlsxwriter.h"
#include <map>
#include <vector>
#include <string>
#include <nlohmann/json.hpp>
#include "Result.h"
namespace platform {
using namespace std;
using json = nlohmann::json;
class Result {
public:
Result(const string& path, const string& filename);
json load() const;
string to_string() const;
string getFilename() const { return filename; };
string getDate() const { return date; };
double getScore() const { return score; };
string getTitle() const { return title; };
double getDuration() const { return duration; };
string getModel() const { return model; };
string getScoreName() const { return scoreName; };
private:
string path;
string filename;
string date;
double score;
string title;
double duration;
string model;
string scoreName;
};
class Results {
public:
Results(const string& path, const int max, const string& model, const string& score) : path(path), max(max), model(model), scoreName(score) { load(); };
Results(const string& path, const int max, const string& model, const string& score, bool complete, bool partial, bool compare) :
path(path), max(max), model(model), scoreName(score), complete(complete), partial(partial), compare(compare)
{
load();
};
void manage();
private:
string path;
int max;
string model;
string scoreName;
bool complete;
bool partial;
bool indexList = true;
bool openExcel = false;
bool compare;
lxw_workbook* workbook = NULL;
vector<Result> files;
void load(); // Loads the list of results
void show() const;
void report(const int index) const;
void report(const int index, const bool excelReport);
void showIndex(const int index, const int idx) const;
int getIndex(const string& intent) const;
void menu();
void sortList();

252
src/Platform/Statistics.cc Normal file
View File

@@ -0,0 +1,252 @@
#include <sstream>
#include "Statistics.h"
#include "Colors.h"
#include "Symbols.h"
#include <boost/math/distributions/chi_squared.hpp>
#include <boost/math/distributions/normal.hpp>
#include "CLocale.h"
namespace platform {
Statistics::Statistics(const vector<string>& models, const vector<string>& datasets, const json& data, double significance, bool output) :
models(models), datasets(datasets), data(data), significance(significance), output(output)
{
nModels = models.size();
nDatasets = datasets.size();
auto temp = ConfigLocale();
};
void Statistics::fit()
{
if (nModels < 3 || nDatasets < 3) {
cerr << "nModels: " << nModels << endl;
cerr << "nDatasets: " << nDatasets << endl;
throw runtime_error("Can't make the Friedman test with less than 3 models and/or less than 3 datasets.");
}
ranksModels.clear();
computeRanks();
// Set the control model as the one with the lowest average rank
controlIdx = distance(ranks.begin(), min_element(ranks.begin(), ranks.end(), [](const auto& l, const auto& r) { return l.second < r.second; }));
computeWTL();
maxModelName = (*max_element(models.begin(), models.end(), [](const string& a, const string& b) { return a.size() < b.size(); })).size();
maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const string& a, const string& b) { return a.size() < b.size(); })).size();
fitted = true;
}
map<string, float> assignRanks(vector<pair<string, double>>& ranksOrder)
{
// sort the ranksOrder vector by value
sort(ranksOrder.begin(), ranksOrder.end(), [](const pair<string, double>& a, const pair<string, double>& b) {
return a.second > b.second;
});
//Assign ranks to values and if they are the same they share the same averaged rank
map<string, float> ranks;
for (int i = 0; i < ranksOrder.size(); i++) {
ranks[ranksOrder[i].first] = i + 1.0;
}
int i = 0;
while (i < static_cast<int>(ranksOrder.size())) {
int j = i + 1;
int sumRanks = ranks[ranksOrder[i].first];
while (j < static_cast<int>(ranksOrder.size()) && ranksOrder[i].second == ranksOrder[j].second) {
sumRanks += ranks[ranksOrder[j++].first];
}
if (j > i + 1) {
float averageRank = (float)sumRanks / (j - i);
for (int k = i; k < j; k++) {
ranks[ranksOrder[k].first] = averageRank;
}
}
i = j;
}
return ranks;
}
void Statistics::computeRanks()
{
map<string, float> ranksLine;
for (const auto& dataset : datasets) {
vector<pair<string, double>> ranksOrder;
for (const auto& model : models) {
double value = data[model].at(dataset).at(0).get<double>();
ranksOrder.push_back({ model, value });
}
// Assign the ranks
ranksLine = assignRanks(ranksOrder);
// Store the ranks of the dataset
ranksModels[dataset] = ranksLine;
if (ranks.size() == 0) {
ranks = ranksLine;
} else {
for (const auto& rank : ranksLine) {
ranks[rank.first] += rank.second;
}
}
}
// Average the ranks
for (const auto& rank : ranks) {
ranks[rank.first] /= nDatasets;
}
}
void Statistics::computeWTL()
{
// Compute the WTL matrix
for (int i = 0; i < nModels; ++i) {
wtl[i] = { 0, 0, 0 };
}
json origin = data.begin().value();
for (auto const& item : origin.items()) {
auto controlModel = models.at(controlIdx);
double controlValue = data[controlModel].at(item.key()).at(0).get<double>();
for (int i = 0; i < nModels; ++i) {
if (i == controlIdx) {
continue;
}
double value = data[models[i]].at(item.key()).at(0).get<double>();
if (value < controlValue) {
wtl[i].win++;
} else if (value == controlValue) {
wtl[i].tie++;
} else {
wtl[i].loss++;
}
}
}
}
void Statistics::postHocHolmTest(bool friedmanResult)
{
if (!fitted) {
fit();
}
stringstream oss;
// Reference https://link.springer.com/article/10.1007/s44196-022-00083-8
// Post-hoc Holm test
// Calculate the p-value for the models paired with the control model
map<int, double> stats; // p-value of each model paired with the control model
boost::math::normal dist(0.0, 1.0);
double diff = sqrt(nModels * (nModels + 1) / (6.0 * nDatasets));
for (int i = 0; i < nModels; i++) {
if (i == controlIdx) {
stats[i] = 0.0;
continue;
}
double z = abs(ranks.at(models[controlIdx]) - ranks.at(models[i])) / diff;
double p_value = (long double)2 * (1 - cdf(dist, z));
stats[i] = p_value;
}
// Sort the models by p-value
vector<pair<int, double>> statsOrder;
for (const auto& stat : stats) {
statsOrder.push_back({ stat.first, stat.second });
}
sort(statsOrder.begin(), statsOrder.end(), [](const pair<int, double>& a, const pair<int, double>& b) {
return a.second < b.second;
});
// Holm adjustment
for (int i = 0; i < statsOrder.size(); ++i) {
auto item = statsOrder.at(i);
double before = i == 0 ? 0.0 : statsOrder.at(i - 1).second;
double p_value = min((double)1.0, item.second * (nModels - i));
p_value = max(before, p_value);
statsOrder[i] = { item.first, p_value };
}
holmResult.model = models.at(controlIdx);
auto color = friedmanResult ? Colors::CYAN() : Colors::YELLOW();
oss << color;
oss << " *************************************************************************************************************" << endl;
oss << " Post-hoc Holm test: H0: 'There is no significant differences between the control model and the other models.'" << endl;
oss << " Control model: " << models.at(controlIdx) << endl;
oss << " " << left << setw(maxModelName) << string("Model") << " p-value rank win tie loss Status" << endl;
oss << " " << string(maxModelName, '=') << " ============ ========= === === ==== =============" << endl;
// sort ranks from lowest to highest
vector<pair<string, float>> ranksOrder;
for (const auto& rank : ranks) {
ranksOrder.push_back({ rank.first, rank.second });
}
sort(ranksOrder.begin(), ranksOrder.end(), [](const pair<string, float>& a, const pair<string, float>& b) {
return a.second < b.second;
});
// Show the control model info.
oss << " " << Colors::BLUE() << left << setw(maxModelName) << ranksOrder.at(0).first << " ";
oss << setw(12) << " " << setprecision(7) << fixed << " " << ranksOrder.at(0).second << endl;
for (const auto& item : ranksOrder) {
auto idx = distance(models.begin(), find(models.begin(), models.end(), item.first));
double pvalue = 0.0;
for (const auto& stat : statsOrder) {
if (stat.first == idx) {
pvalue = stat.second;
}
}
holmResult.holmLines.push_back({ item.first, pvalue, item.second, wtl.at(idx), pvalue < significance });
if (item.first == models.at(controlIdx)) {
continue;
}
auto colorStatus = pvalue > significance ? Colors::GREEN() : Colors::MAGENTA();
auto status = pvalue > significance ? Symbols::check_mark : Symbols::cross;
auto textStatus = pvalue > significance ? " accepted H0" : " rejected H0";
oss << " " << colorStatus << left << setw(maxModelName) << item.first << " ";
oss << setprecision(6) << scientific << pvalue << setprecision(7) << fixed << " " << item.second;
oss << " " << right << setw(3) << wtl.at(idx).win << " " << setw(3) << wtl.at(idx).tie << " " << setw(4) << wtl.at(idx).loss;
oss << " " << status << textStatus << endl;
}
oss << color << " *************************************************************************************************************" << endl;
oss << Colors::RESET();
if (output) {
cout << oss.str();
}
}
bool Statistics::friedmanTest()
{
if (!fitted) {
fit();
}
stringstream oss;
// Friedman test
// Calculate the Friedman statistic
oss << Colors::BLUE() << endl;
oss << "***************************************************************************************************************" << endl;
oss << Colors::GREEN() << "Friedman test: H0: 'There is no significant differences between all the classifiers.'" << Colors::BLUE() << endl;
double degreesOfFreedom = nModels - 1.0;
double sumSquared = 0;
for (const auto& rank : ranks) {
sumSquared += pow(rank.second, 2);
}
// Compute the Friedman statistic as in https://link.springer.com/article/10.1007/s44196-022-00083-8
double friedmanQ = 12.0 * nDatasets / (nModels * (nModels + 1)) * (sumSquared - (nModels * pow(nModels + 1, 2)) / 4);
// Calculate the critical value
boost::math::chi_squared chiSquared(degreesOfFreedom);
long double p_value = (long double)1.0 - cdf(chiSquared, friedmanQ);
double criticalValue = quantile(chiSquared, 1 - significance);
oss << "Friedman statistic: " << friedmanQ << endl;
oss << "Critical χ2 Value for df=" << fixed << (int)degreesOfFreedom
<< " and alpha=" << setprecision(2) << fixed << significance << ": " << setprecision(7) << scientific << criticalValue << std::endl;
oss << "p-value: " << scientific << p_value << " is " << (p_value < significance ? "less" : "greater") << " than " << setprecision(2) << fixed << significance << endl;
bool result;
if (p_value < significance) {
oss << Colors::GREEN() << "The null hypothesis H0 is rejected." << endl;
result = true;
} else {
oss << Colors::YELLOW() << "The null hypothesis H0 is accepted. Computed p-values will not be significant." << endl;
result = false;
}
oss << Colors::BLUE() << "***************************************************************************************************************" << Colors::RESET() << endl;
if (output) {
cout << oss.str();
}
friedmanResult = { friedmanQ, criticalValue, p_value, result };
return result;
}
FriedmanResult& Statistics::getFriedmanResult()
{
return friedmanResult;
}
HolmResult& Statistics::getHolmResult()
{
return holmResult;
}
map<string, map<string, float>>& Statistics::getRanks()
{
return ranksModels;
}
} // namespace platform

64
src/Platform/Statistics.h Normal file
View File

@@ -0,0 +1,64 @@
#ifndef STATISTICS_H
#define STATISTICS_H
#include <iostream>
#include <vector>
#include <map>
#include <nlohmann/json.hpp>
using namespace std;
using json = nlohmann::json;
namespace platform {
struct WTL {
int win;
int tie;
int loss;
};
struct FriedmanResult {
double statistic;
double criticalValue;
long double pvalue;
bool reject;
};
struct HolmLine {
string model;
long double pvalue;
double rank;
WTL wtl;
bool reject;
};
struct HolmResult {
string model;
vector<HolmLine> holmLines;
};
class Statistics {
public:
Statistics(const vector<string>& models, const vector<string>& datasets, const json& data, double significance = 0.05, bool output = true);
bool friedmanTest();
void postHocHolmTest(bool friedmanResult);
FriedmanResult& getFriedmanResult();
HolmResult& getHolmResult();
map<string, map<string, float>>& getRanks();
private:
void fit();
void computeRanks();
void computeWTL();
const vector<string>& models;
const vector<string>& datasets;
const json& data;
double significance;
bool output;
bool fitted = false;
int nModels = 0;
int nDatasets = 0;
int controlIdx = 0;
map<int, WTL> wtl;
map<string, float> ranks;
int maxModelName = 0;
int maxDatasetName = 0;
FriedmanResult friedmanResult;
HolmResult holmResult;
map<string, map<string, float>> ranksModels;
};
}
#endif // !STATISTICS_H

18
src/Platform/Symbols.h Normal file
View File

@@ -0,0 +1,18 @@
#ifndef SYMBOLS_H
#define SYMBOLS_H
#include <string>
using namespace std;
namespace platform {
class Symbols {
public:
inline static const string check_mark{ "\u2714" };
inline static const string exclamation{ "\u2757" };
inline static const string black_star{ "\u2605" };
inline static const string cross{ "\u2717" };
inline static const string upward_arrow{ "\u27B6" };
inline static const string down_arrow{ "\u27B4" };
inline static const string equal_best{ check_mark };
inline static const string better_best{ black_star };
};
}
#endif // !SYMBOLS_H

19
src/Platform/Utils.h Normal file
View File

@@ -0,0 +1,19 @@
#ifndef UTILS_H
#define UTILS_H
#include <sstream>
#include <string>
#include <vector>
namespace platform {
//static vector<string> split(const string& text, char delimiter);
static std::vector<std::string> split(const std::string& text, char delimiter)
{
std::vector<std::string> result;
std::stringstream ss(text);
std::string token;
while (std::getline(ss, token, delimiter)) {
result.push_back(token);
}
return result;
}
}
#endif

95
src/Platform/best.cc Normal file
View File

@@ -0,0 +1,95 @@
#include <iostream>
#include <argparse/argparse.hpp>
#include "Paths.h"
#include "BestResults.h"
#include "Colors.h"
using namespace std;
argparse::ArgumentParser manageArguments(int argc, char** argv)
{
argparse::ArgumentParser program("best");
program.add_argument("-m", "--model").default_value("").help("Filter results of the selected model) (any for all models)");
program.add_argument("-s", "--score").default_value("").help("Filter results of the score name supplied");
program.add_argument("--build").help("build best score results file").default_value(false).implicit_value(true);
program.add_argument("--report").help("report of best score results file").default_value(false).implicit_value(true);
program.add_argument("--friedman").help("Friedman test").default_value(false).implicit_value(true);
program.add_argument("--excel").help("Output to excel").default_value(false).implicit_value(true);
program.add_argument("--level").help("significance level").default_value(0.05).scan<'g', double>().action([](const string& value) {
try {
auto k = stod(value);
if (k < 0.01 || k > 0.15) {
throw runtime_error("Significance level hast to be a number in [0.01, 0.15]");
}
return k;
}
catch (const runtime_error& err) {
throw runtime_error(err.what());
}
catch (...) {
throw runtime_error("Number of folds must be an decimal number");
}});
try {
program.parse_args(argc, argv);
auto model = program.get<string>("model");
auto score = program.get<string>("score");
auto build = program.get<bool>("build");
auto report = program.get<bool>("report");
auto friedman = program.get<bool>("friedman");
auto excel = program.get<bool>("excel");
auto level = program.get<double>("level");
if (model == "" || score == "") {
throw runtime_error("Model and score name must be supplied");
}
if (friedman && model != "any") {
cerr << "Friedman test can only be used with all models" << endl;
cerr << program;
exit(1);
}
if (excel && model != "any") {
cerr << "Excel ourput can only be used with all models" << endl;
cerr << program;
exit(1);
}
if (!report && !build) {
cerr << "Either build, report or both, have to be selected to do anything!" << endl;
cerr << program;
exit(1);
}
}
catch (const exception& err) {
cerr << err.what() << endl;
cerr << program;
exit(1);
}
return program;
}
int main(int argc, char** argv)
{
auto program = manageArguments(argc, argv);
auto model = program.get<string>("model");
auto score = program.get<string>("score");
auto build = program.get<bool>("build");
auto report = program.get<bool>("report");
auto friedman = program.get<bool>("friedman");
auto excel = program.get<bool>("excel");
auto level = program.get<double>("level");
auto results = platform::BestResults(platform::Paths::results(), score, model, friedman, level);
if (build) {
if (model == "any") {
results.buildAll();
} else {
string fileName = results.build();
cout << Colors::GREEN() << fileName << " created!" << Colors::RESET() << endl;
}
}
if (report) {
if (model == "any") {
results.reportAll(excel);
} else {
results.reportSingle();
}
}
return 0;
}

Some files were not shown because too many files have changed in this diff Show More