Compare commits

...

208 Commits

Author SHA1 Message Date
f94e2d6a27 Add quiet parameter 2023-11-24 21:16:20 +01:00
2121ba9b98 Refactor input grid parameters to json file 2023-11-24 09:57:29 +01:00
8b7b59d42b Complete first step 2023-11-23 12:59:21 +01:00
bbe5302ab1 Add info to output 2023-11-22 16:38:50 +01:00
c2eb727fc7 Complete output interface of gridsearch 2023-11-22 16:30:04 +01:00
fb347ed5b9 Begin gridsearch implementation 2023-11-22 12:22:30 +01:00
b657762c0c Generate combinations sample 2023-11-22 00:18:24 +01:00
495d8a8528 Begin implementing grid combinations 2023-11-21 13:11:14 +01:00
4628e48d3c Build gridsearch structure 2023-11-20 23:32:34 +01:00
5876be4b24 Add more install instructions of Boost to README 2023-11-20 20:39:22 +01:00
dc3400197f Add comment TODO: implement number of nodes 2023-11-20 01:14:13 +01:00
26d3a57782 Add info to invalid hyperparameter exception 2023-11-19 23:02:28 +01:00
4f3a04058f Refactor Hyperparameters management 2023-11-19 22:36:27 +01:00
89c4613591 Implement hyperparameters with json file 2023-11-18 11:56:10 +01:00
28f3d87e32 Add Python Classifiers
Add STree, Odte, SVC & RandomForest Classifiers
Remove using namespace ... in project
2023-11-17 11:11:05 +01:00
e8d2c9fc0b Set intolerant convergence 2023-11-17 10:26:25 +01:00
d3cb580387 Remove n_jobs from STree 2023-11-17 10:10:31 +01:00
f088df14fd Restore the Creation model position in experiment 2023-11-17 01:10:46 +01:00
e2249eace7 Disable Warning messages in python clfs
Disable removing Python env
2023-11-16 22:38:46 +01:00
64f5a7f14a Fix header in example 2023-11-16 17:03:40 +01:00
408db2aad5 Mark override fit function 2023-11-14 18:59:41 +01:00
e03efb5f63 set tolerance=0 if feature selection in BoostAODE 2023-11-14 10:12:02 +01:00
f617886133 Add new models to example 2023-11-14 09:12:25 +01:00
69ad660040 Refactor version method in PyClassifier 2023-11-13 13:59:06 +01:00
431b3a3aa5 Fit PyWrap into BayesNet 2023-11-13 11:13:32 +01:00
6a23e2cc26 Add CMakelist integration 2023-11-12 22:14:29 +01:00
f6e00530be Add Pywrap sources 2023-11-12 21:43:07 +01:00
f9258e43b9 Remove using namespace from Library 2023-11-08 18:45:35 +01:00
92820555da Simple fix 2023-10-28 10:56:47 +02:00
5a3af51826 Activate best score in odte 2023-10-25 10:23:42 +02:00
a8f9800631 Fix mistake when no results in manage 2023-10-24 19:44:23 +02:00
84cec0c1e0 Add results files affected in best results excel 2023-10-24 16:18:52 +02:00
130139f644 Update formulas to use letters in ranges in excel 2023-10-24 13:06:31 +02:00
651f84b562 Fix mistake in conditional format in bestresults 2023-10-24 11:18:19 +02:00
553ab0fa22 Add conditional format to BestResults Excel 2023-10-24 10:56:41 +02:00
4975feabff Fix mistake in node count 2023-10-23 22:46:10 +02:00
32293af69f Fix header in manage 2023-10-23 17:04:59 +02:00
858664be2d Add total number of results in manage 2023-10-23 16:22:15 +02:00
1f705f6018 Refactor BestScore and add experiment to .env 2023-10-23 16:12:52 +02:00
7bcd2eed06 Add variable width of dataset name in reports 2023-10-22 22:58:52 +02:00
833acefbb3 Fix index limits mistake in manage 2023-10-22 20:21:50 +02:00
26b649ebae Refactor ManageResults and CommandParser 2023-10-22 20:03:34 +02:00
080eddf9cd Fix hyperparameters output in b_best 2023-10-20 22:52:48 +02:00
04e754b2f5 Adjust filename and hyperparameters in reports 2023-10-20 11:12:46 +02:00
38423048bd Add excel to best report of model 2023-10-19 18:12:55 +02:00
64fc97b892 Rename utilities sources to match final names 2023-10-19 09:57:04 +02:00
2c2159f192 Add quiet mode to b_main
Reduce output when --quiet is set, not showing fold info
2023-10-17 21:51:53 +02:00
6765552a7c Update submodule versions 2023-10-16 19:21:57 +02:00
f72aa5b9a6 Merge pull request 'Create Boost_CFS' (#11) from Boost_CFS into main
Add hyper parameter to BoostAODE. This hyper parameter decides if we select features with cfs/fcbf/iwss before start building models and build a Spode with the selected features.
The hyperparameter is select_features
2023-10-15 09:22:14 +00:00
fa7fe081ad Fix xlsx library finding 2023-10-15 11:19:58 +02:00
660e783517 Update validation for feature selection 2023-10-14 13:32:09 +02:00
b35532dd9e Implement IWSS and FCBF too for BoostAODE 2023-10-14 13:12:04 +02:00
6ef49385ea Remove unneeded method declaration FeatureSelect 2023-10-14 11:30:32 +02:00
6d5a25cdc8 Refactor CFS class creating abstract base class 2023-10-14 11:27:46 +02:00
d00b08cbe8 Fix Header for Linux 2023-10-13 14:26:47 +02:00
977ff6fddb Update CMakeLists for Linux 2023-10-13 14:01:52 +02:00
54b8939f35 Prepare BoostAODE first try 2023-10-13 13:46:22 +02:00
5022a4dc90 Complete CFS tested with Python mufs 2023-10-13 12:29:25 +02:00
40d1dad5d8 Begin CFS implementation 2023-10-11 21:17:26 +02:00
47e2b138c5 Complete first working cfs 2023-10-11 11:33:29 +02:00
e7ded68267 First cfs working version 2023-10-10 23:00:38 +02:00
ca833a34f5 try openssl sha256 2023-10-10 18:16:43 +02:00
df9b4c48d2 Begin CFS initialization 2023-10-10 13:39:11 +02:00
f288bbd6fa Begin adding cfs to BoostAODE 2023-10-10 11:52:39 +02:00
7d8aca4f59 Add Locale shared config to reports 2023-10-09 19:41:29 +02:00
8fdad78a8c Continue Test Network 2023-10-09 11:25:30 +02:00
e3ae073333 Continue test Network 2023-10-08 15:54:58 +02:00
4b732e76c2 MST change unordered_set to list 2023-10-07 19:08:13 +02:00
fe5fead27e Begin Fix Test MST 2023-10-07 01:43:26 +02:00
8c3864f3c8 Complete Folding Test 2023-10-07 01:23:36 +02:00
1287160c47 Refactor makefile to use variables 2023-10-07 00:16:25 +02:00
2f58807322 Begin refactor CMakeLists debug/release paths 2023-10-06 19:32:29 +02:00
17e079edd5 Begin Test Folding 2023-10-06 17:08:54 +02:00
b9e0028e9d Refactor Makefile 2023-10-06 01:28:27 +02:00
e0d39fe631 Fix BayesMetrics Test 2023-10-06 01:14:55 +02:00
36b0277576 Add Maximum Spanning Tree test 2023-10-05 15:45:36 +02:00
da8d018ec4 Refactor Makefile 2023-10-05 11:45:00 +02:00
5f0676691c Add First BayesMetrics Tests 2023-10-05 01:14:16 +02:00
3448fb1299 Refactor Tests and add BayesMetrics test 2023-10-04 23:19:23 +02:00
5e938d5cca Add ranks sheet to excel best results 2023-10-04 16:26:57 +02:00
55e742438f Add constant references to Statistics 2023-10-04 13:40:45 +02:00
c4ae3fe429 Add Control model rank info to report 2023-10-04 12:42:35 +02:00
93e4ff94db Add significance level as parameter in best 2023-10-02 15:46:40 +02:00
57c27f739c Remove unused code in BestResults 2023-10-02 15:31:02 +02:00
a434d7f1ae Add a Linux config in launch.json 2023-09-30 18:44:21 +02:00
294666c516 Fix a Linux problem in Datasets 2023-09-30 18:43:47 +02:00
fd04e78ad9 Restore sample.cc 2023-09-29 18:50:25 +02:00
66ec1b343b Remove platformUtils and split Datasets & Dataset 2023-09-29 18:20:46 +02:00
bb423da42f Add csv and R_dat files to platform 2023-09-29 13:52:50 +02:00
db17c14042 Change names of executables to b_... 2023-09-29 09:17:50 +02:00
a4401cb78f Linux CMakeLists.txt adjustment 2023-09-29 00:30:47 +02:00
9d3d9cc6c6 Complete Excel output for bestResults with Friedman test 2023-09-28 18:52:37 +02:00
cfcf3c16df Add best results Excel 2023-09-28 17:12:04 +02:00
85202260f3 Separate specific Excel methods to ExcelFile 2023-09-28 13:07:11 +02:00
82acb3cab5 Enhance output of Best results reports 2023-09-28 12:08:56 +02:00
623ceed396 Merge pull request 'Add Friedman Test & post hoc tests to BestResults' (#10) from boost into main
Reviewed-on: #10
2023-09-28 07:44:55 +00:00
926de2bebd Add boost info to README 2023-09-28 09:44:33 +02:00
71704e3547 Enhance output info in Statistics 2023-09-28 01:27:18 +02:00
3b06534327 Remove duplicated code in BestResults 2023-09-28 00:59:34 +02:00
ac89a451e3 Duplicate statistics tests in class 2023-09-28 00:45:15 +02:00
00c6cf663b Fix order of output in posthoc 2023-09-27 19:11:47 +02:00
5043c12be8 Complete posthoc with Holm adjust 2023-09-27 18:34:16 +02:00
11320e2cc7 Complete friedman test as in exreport 2023-09-27 12:36:03 +02:00
ce66483b65 Update boost version requirement for Linux 2023-09-26 14:12:53 +02:00
cab8e14b2d Add friedman hyperparameter 2023-09-26 11:26:59 +02:00
f0d0abe891 Add boost library link to linux build 2023-09-26 01:07:50 +02:00
dcba146e12 Begin adding Friedman test to BestResults 2023-09-26 01:04:59 +02:00
3ea0285119 Fix ranks to match friedman test ranks 2023-09-25 18:38:12 +02:00
e3888e1503 Merge pull request 'bestResults' (#9) from bestResults into main
Reviewed-on: https://gitea.rmontanana.es:3000/rmontanana/BayesNet/pulls/9

Add best results management, build, report, build all & report all
2023-09-25 12:02:17 +00:00
06de13df98 Add date/time to header of report best 2023-09-25 10:04:53 +02:00
de4fa6a04f Add color to totals 2023-09-23 10:30:39 +02:00
3a7bf4e672 Fix ranking order mistake 2023-09-23 01:33:23 +02:00
cd0bc02a74 Add report/build all with totals and ranks 2023-09-23 01:14:02 +02:00
c8597a794e Begin report all models 2023-09-22 18:13:32 +02:00
b30416364d Fix mistake in best results file name 2023-09-22 14:14:39 +02:00
3a16589220 Add best config for debug in vscode 2023-09-22 01:04:36 +02:00
c4f9187e2a Complete best build and report 2023-09-22 01:03:55 +02:00
c4d0a5b4e6 Split Result from Results 2023-09-21 23:30:17 +02:00
7bfafe555f Begin BestResults build 2023-09-21 23:04:11 +02:00
337b6f7e79 Rename BestResult to BestScore 2023-09-21 19:30:07 +02:00
5fa0b957dd Fix mistake in idx range in manage 2023-09-20 19:12:07 +02:00
67252fc41d Fix CMakeLists libxlsxwriter for Linux 2023-09-20 19:02:53 +02:00
94ae9456a0 Fix libxlsxwriter linking problem 2023-09-20 18:50:11 +02:00
781993e326 Resolve some warnings 2023-09-20 17:54:15 +02:00
8257a6ae39 Add message of not exist Best Results 2023-09-20 13:50:34 +02:00
fc81730dfc Merge pull request 'Exchange OpenXLSX to libxlsxwriter' (#8) from libxlsxwriter into main
Add multiple sheets to excel file
Add format and color to sheets
Add comparison with ZeroR
Add comparison with Best Results
Separate contextual menu from general in manage
2023-09-20 11:17:16 +00:00
d8734ff082 Separate contextual menu from general 2023-09-20 13:15:33 +02:00
03533461c8 Add compare to best results in manage 2023-09-20 12:51:19 +02:00
68f22a673d Add comparison to report console 2023-09-20 11:40:01 +02:00
b9bc0088f3 Add format to unique dataset results summary 2023-09-20 10:30:45 +02:00
c280e254ca Remove OpenXLSX submodule 2023-09-20 01:09:58 +02:00
3d0f29fda3 Remove .vscode/settings.json from repository 2023-09-20 01:01:40 +02:00
20a6ebab7c Support to add any number of sheets to excel 2023-09-20 00:58:01 +02:00
925f71166c Fix mistake in comparison 2023-09-19 23:46:49 +02:00
f69f415b92 Complete comparison with ZeroR 2023-09-19 17:55:03 +02:00
1bdfbd1620 Complete adding color to format 2023-09-19 14:07:41 +02:00
06fb135526 First approach 2023-09-18 23:26:22 +02:00
501ea0ab4e Fix CMakeList manage build with Linux 2023-09-18 19:27:40 +02:00
847c6761d7 Add Linux specific link library to cmake 2023-09-17 10:42:19 +02:00
6030885fc3 Add partial result filter to manage 2023-09-16 17:27:18 +02:00
89df7f4db0 Add library to manage link 2023-09-14 01:41:49 +02:00
41257ed566 If ! convergence don't predict test 2023-09-10 19:50:36 +02:00
506369e46b Add Convergence hyperparameter 2023-09-07 11:27:35 +02:00
d908f389f5 Begin using validation as finish condition 2023-09-06 10:51:07 +02:00
5a7c8f1818 Add status to classifier and Experiment 2023-09-05 13:39:43 +02:00
64fc7bd9dd Add show dataset detail in report 2023-09-05 09:26:49 +02:00
0b7beda78c Add threads without limit to network fit 2023-09-04 21:24:11 +02:00
05b670dfc0 Add detail to fold progress in main 2023-09-03 16:33:48 +02:00
de62d42b74 Fix make debug command 2023-09-03 14:13:10 +02:00
edb957d22e Add filter complete results to manage 2023-09-03 14:07:11 +02:00
4de5cb4c6c Merge pull request 'Solve Ensemble models exceptions on certain datasets' (#7) from solveexceptions into main
Reviewed-on: #7
2023-09-02 15:29:33 +00:00
c35030f137 Upgrade models version and Add class diagram 2023-09-02 14:39:43 +02:00
182b07ed90 Solve voting vector error 2023-09-02 13:58:12 +02:00
7806f961e2 Remove threads 2023-08-31 20:30:28 +02:00
7c3e315ae7 Add Linux specific options to compile 2023-08-29 18:20:55 +02:00
284ef6dfd1 Add significanceModels to AODELd 2023-08-24 12:58:53 +02:00
1c6af619b5 Exception if hyperparameters not valid 2023-08-24 12:09:35 +02:00
86ffdfd6f3 Add const feature and className to fit models 2023-08-23 23:15:39 +02:00
d82148079d Add KDB hyperparameters K and theta 2023-08-23 00:44:10 +02:00
067430fd1b Add xlsxopen submodule 2023-08-22 23:45:11 +02:00
f5d0d16365 Merge pull request 'Add excel report to manage results' (#6) from xlsx into main
Reviewed-on: https://gitea.rmontanana.es:11000/rmontanana/BayesNet/pulls/6
2023-08-22 21:40:11 +00:00
97ca8ac084 Move check valid hyperparameters to Classifier 2023-08-22 22:12:20 +02:00
1c1385b768 Fix maxModels mistake in BoostAODE if !repeatSp
Throw exception if wrong hyperparameter is supplied
2023-08-22 21:55:17 +02:00
35432b6294 Fix time std was not saved in experiment 2023-08-22 12:30:27 +02:00
c59dd30e53 Complete Excel Report with data 2023-08-22 11:55:15 +02:00
d2da0ddb88 Create ReportExcel eq to ReportConsole 2023-08-21 17:51:49 +02:00
8066701c3c Refactor Report class into ReportBase & ReportCons 2023-08-21 17:16:29 +02:00
0f66ac73d0 Revert "Refactor Report into ReportBase & ReportConsole"
This reverts commit 4370bf51d7.
2023-08-21 17:15:14 +02:00
4370bf51d7 Refactor Report into ReportBase & ReportConsole 2023-08-21 17:14:23 +02:00
2b7353b9e0 Add default sorting by date in manage 2023-08-21 16:30:10 +02:00
b686b3c9c3 Enhance copy in Makefile 2023-08-21 12:18:23 +02:00
2dd04a6c44 enhance saving results and add Makefile copy 2023-08-21 11:57:45 +02:00
1da83662d0 Always save results 2023-08-21 10:55:20 +02:00
3ac9593c65 Fix mistake in sample 2023-08-20 20:36:46 +02:00
6b317accf1 Add hyperparameters and processing order to Boost 2023-08-20 20:31:23 +02:00
4964aab722 Add hyperparameters management in experiments 2023-08-20 17:57:38 +02:00
7a6ec73d63 Merge pull request 'boostAode' (#5) from boostAode into main
Reviewed-on: https://gitea.rmontanana.es:11000/rmontanana/BayesNet/pulls/5
Implement boostAODE
add list datasets
add manage results
2023-08-20 09:02:07 +00:00
1a534888d6 Fix report format 2023-08-19 23:30:44 +02:00
59ffd179f4 Fix report format 2023-08-19 21:26:48 +02:00
9972738deb Add list datasets and add locale format 2023-08-19 19:05:16 +02:00
bafcb26bb6 Add manage to build target 2023-08-18 13:43:53 +02:00
2d7999d5f2 Add manage to release targets 2023-08-18 13:43:13 +02:00
a6bb22dfb5 Complete first BoostAODE 2023-08-18 11:50:34 +02:00
704dc937be Remove FeatureSel, add SelectKBest to BayesMetrics 2023-08-16 19:05:18 +02:00
a3e665eed6 make weights double 2023-08-16 12:46:09 +02:00
918a7b4180 Remove unneeded output 2023-08-16 12:36:38 +02:00
80b20f35b4 Fix weights mistakes in computation 2023-08-16 12:32:51 +02:00
4d4780c1d5 Add BoostAODE model based on AODE 2023-08-15 16:16:04 +02:00
fa612c531e Complete Adding weights to Models 2023-08-15 15:59:56 +02:00
24b68f9ae2 Add weights as parameter 2023-08-15 15:04:56 +02:00
a062ebf445 Merge pull request 'reports' (#4) from reports into boostAode
Reviewed-on: https://gitea.rmontanana.es:11000/rmontanana/BayesNet/pulls/4
2023-08-14 16:58:48 +00:00
2a3fc9aa45 Add colors and enhance input control 2023-08-14 17:03:06 +02:00
55d21294d5 Add class Paths and enhance input 2023-08-14 00:40:31 +02:00
3691cb4a61 Add totals and filter by scoreName and model 2023-08-13 18:13:00 +02:00
054567c65a Add sorting capacity 2023-08-13 17:10:18 +02:00
2729b92f06 Summary list 2023-08-13 16:19:17 +02:00
f26ea1f0ac Add weights to BayesMetrics 2023-08-13 12:56:06 +02:00
af0419c9da First approx with const 1 weights 2023-08-13 00:59:02 +02:00
90c92e5c56 Merge pull request 'Add states as result in Proposal methods' (#3) from optimize_memory into main
Reviewed-on: https://gitea.rmontanana.es:11000/rmontanana/BayesNet/pulls/3
2023-08-12 14:16:55 +00:00
182b52a887 Add states as result in Proposal methods 2023-08-12 16:16:17 +02:00
6679b90a82 Merge pull request 'optimize_memory' (#2) from optimize_memory into main
Reviewed-on: https://gitea.rmontanana.es:11000/rmontanana/BayesNet/pulls/2
2023-08-12 14:15:03 +00:00
405887f833 Solved Ld poor results 2023-08-12 11:49:18 +02:00
3a85481a5a Redo pass states to Network Fit needed in crossval
fix mistake in headerline (report)
2023-08-12 11:10:53 +02:00
0ad5505c16 Spodeld working with poor accuracy 2023-08-10 02:06:18 +02:00
323444b74a const functions 2023-08-08 01:53:41 +02:00
ef1bffcac3 Fixed normal classifiers 2023-08-07 13:50:11 +02:00
06db8f51ce Refactor library and models to lighten data stored
Refactor Ensemble to inherit from Classifier instead of BaseClassifier
2023-08-07 12:49:37 +02:00
e74565ba01 update clang-tidy 2023-08-07 00:44:12 +02:00
154 changed files with 7965 additions and 1908 deletions

View File

@@ -13,5 +13,4 @@ HeaderFilterRegex: 'src/*'
AnalyzeTemporaryDtors: false AnalyzeTemporaryDtors: false
WarningsAsErrors: '' WarningsAsErrors: ''
FormatStyle: file FormatStyle: file
FormatStyleOptions: ''
... ...

31
.clang-uml Normal file
View File

@@ -0,0 +1,31 @@
compilation_database_dir: build
output_directory: puml
diagrams:
BayesNet:
type: class
glob:
- src/BayesNet/*.cc
- src/Platform/*.cc
using_namespace: bayesnet
include:
namespaces:
- bayesnet
- platform
plantuml:
after:
- "note left of {{ alias(\"MyProjectMain\") }}: Main class of myproject library."
sequence:
type: sequence
glob:
- src/Platform/main.cc
combine_free_functions_into_file_participants: true
using_namespace:
- std
- bayesnet
- platform
include:
paths:
- src/BayesNet
- src/Platform
start_from:
- function: main(int,const char **)

5
.gitignore vendored
View File

@@ -31,7 +31,10 @@
*.exe *.exe
*.out *.out
*.app *.app
build/ build/**
build_*/**
*.dSYM/** *.dSYM/**
cmake-build*/** cmake-build*/**
.idea .idea
puml/**
.vscode/settings.json

3
.gitmodules vendored
View File

@@ -10,3 +10,6 @@
[submodule "lib/json"] [submodule "lib/json"]
path = lib/json path = lib/json
url = https://github.com/nlohmann/json.git url = https://github.com/nlohmann/json.git
[submodule "lib/libxlsxwriter"]
path = lib/libxlsxwriter
url = https://github.com/jmcnamara/libxlsxwriter.git

18
.vscode/c_cpp_properties.json vendored Normal file
View File

@@ -0,0 +1,18 @@
{
"configurations": [
{
"name": "Mac",
"includePath": [
"${workspaceFolder}/**"
],
"defines": [],
"macFrameworkPath": [
"/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks"
],
"cStandard": "c17",
"cppStandard": "c++17",
"compileCommands": "${workspaceFolder}/cmake-build-release/compile_commands.json"
}
],
"version": 4
}

85
.vscode/launch.json vendored
View File

@@ -5,40 +5,103 @@
"type": "lldb", "type": "lldb",
"request": "launch", "request": "launch",
"name": "sample", "name": "sample",
"program": "${workspaceFolder}/build/sample/BayesNetSample", "program": "${workspaceFolder}/build_debug/sample/BayesNetSample",
"args": [ "args": [
"-d", "-d",
"iris", "iris",
"-m", "-m",
"KDB", "TANLd",
"-s", "-s",
"271", "271",
"-p", "-p",
"/Users/rmontanana/Code/discretizbench/datasets/", "/home/rmontanana/Code/discretizbench/datasets/",
], ],
//"cwd": "${workspaceFolder}/build/sample/", //"cwd": "${workspaceFolder}/build/sample/",
}, },
{ {
"type": "lldb", "type": "lldb",
"request": "launch", "request": "launch",
"name": "experiment", "name": "experimentPy",
"program": "${workspaceFolder}/build/src/Platform/main", "program": "${workspaceFolder}/build_debug/src/Platform/b_main",
"args": [ "args": [
"-m", "-m",
"AODELd", "STree",
"-p",
"/Users/rmontanana/Code/discretizbench/datasets",
"--stratified", "--stratified",
"-d", "-d",
"iris" "iris",
//"--discretize"
// "--hyperparameters",
// "{\"repeatSparent\": true, \"maxModels\": 12}"
], ],
"cwd": "/Users/rmontanana/Code/discretizbench", "cwd": "/home/rmontanana/Code/discretizbench",
},
{
"type": "lldb",
"request": "launch",
"name": "experimentBayes",
"program": "${workspaceFolder}/build_debug/src/Platform/b_main",
"args": [
"-m",
"TAN",
"--stratified",
"--discretize",
"-d",
"iris",
"--hyperparameters",
"{\"repeatSparent\": true, \"maxModels\": 12}"
],
"cwd": "/home/rmontanana/Code/discretizbench",
},
{
"type": "lldb",
"request": "launch",
"name": "best",
"program": "${workspaceFolder}/build_debug/src/Platform/b_best",
"args": [
"-m",
"BoostAODE",
"-s",
"accuracy",
"--build",
],
"cwd": "/home/rmontanana/Code/discretizbench",
},
{
"type": "lldb",
"request": "launch",
"name": "manage",
"program": "${workspaceFolder}/build_debug/src/Platform/b_manage",
"args": [
"-n",
"20"
],
"cwd": "/home/rmontanana/Code/discretizbench",
},
{
"type": "lldb",
"request": "launch",
"name": "list",
"program": "${workspaceFolder}/build_debug/src/Platform/b_list",
"args": [],
//"cwd": "/Users/rmontanana/Code/discretizbench",
"cwd": "/home/rmontanana/Code/covbench",
},
{
"type": "lldb",
"request": "launch",
"name": "test",
"program": "${workspaceFolder}/build_debug/tests/unit_tests",
"args": [
"-c=\"Metrics Test\"",
// "-s",
],
"cwd": "${workspaceFolder}/build/tests",
}, },
{ {
"name": "Build & debug active file", "name": "Build & debug active file",
"type": "cppdbg", "type": "cppdbg",
"request": "launch", "request": "launch",
"program": "${workspaceFolder}/build/bayesnet", "program": "${workspaceFolder}/build_debug/bayesnet",
"args": [], "args": [],
"stopAtEntry": false, "stopAtEntry": false,
"cwd": "${workspaceFolder}", "cwd": "${workspaceFolder}",

109
.vscode/settings.json vendored
View File

@@ -1,109 +0,0 @@
{
"files.associations": {
"*.rmd": "markdown",
"*.py": "python",
"vector": "cpp",
"__bit_reference": "cpp",
"__bits": "cpp",
"__config": "cpp",
"__debug": "cpp",
"__errc": "cpp",
"__hash_table": "cpp",
"__locale": "cpp",
"__mutex_base": "cpp",
"__node_handle": "cpp",
"__nullptr": "cpp",
"__split_buffer": "cpp",
"__string": "cpp",
"__threading_support": "cpp",
"__tuple": "cpp",
"array": "cpp",
"atomic": "cpp",
"bitset": "cpp",
"cctype": "cpp",
"chrono": "cpp",
"clocale": "cpp",
"cmath": "cpp",
"compare": "cpp",
"complex": "cpp",
"concepts": "cpp",
"cstdarg": "cpp",
"cstddef": "cpp",
"cstdint": "cpp",
"cstdio": "cpp",
"cstdlib": "cpp",
"cstring": "cpp",
"ctime": "cpp",
"cwchar": "cpp",
"cwctype": "cpp",
"exception": "cpp",
"initializer_list": "cpp",
"ios": "cpp",
"iosfwd": "cpp",
"istream": "cpp",
"limits": "cpp",
"locale": "cpp",
"memory": "cpp",
"mutex": "cpp",
"new": "cpp",
"optional": "cpp",
"ostream": "cpp",
"ratio": "cpp",
"sstream": "cpp",
"stdexcept": "cpp",
"streambuf": "cpp",
"string": "cpp",
"string_view": "cpp",
"system_error": "cpp",
"tuple": "cpp",
"type_traits": "cpp",
"typeinfo": "cpp",
"unordered_map": "cpp",
"variant": "cpp",
"algorithm": "cpp",
"iostream": "cpp",
"iomanip": "cpp",
"numeric": "cpp",
"set": "cpp",
"__tree": "cpp",
"deque": "cpp",
"list": "cpp",
"map": "cpp",
"unordered_set": "cpp",
"any": "cpp",
"condition_variable": "cpp",
"forward_list": "cpp",
"fstream": "cpp",
"stack": "cpp",
"thread": "cpp",
"__memory": "cpp",
"filesystem": "cpp",
"*.toml": "toml",
"utility": "cpp",
"__verbose_abort": "cpp",
"bit": "cpp",
"random": "cpp",
"*.tcc": "cpp",
"functional": "cpp",
"iterator": "cpp",
"memory_resource": "cpp",
"format": "cpp",
"valarray": "cpp",
"regex": "cpp",
"span": "cpp",
"cfenv": "cpp",
"cinttypes": "cpp",
"csetjmp": "cpp",
"future": "cpp",
"queue": "cpp",
"typeindex": "cpp",
"shared_mutex": "cpp",
"*.ipp": "cpp",
"cassert": "cpp",
"charconv": "cpp",
"source_location": "cpp",
"ranges": "cpp"
},
"cmake.configureOnOpen": false,
"C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools"
}

23
.vscode/tasks.json vendored
View File

@@ -32,6 +32,29 @@
], ],
"group": "build", "group": "build",
"detail": "Task generated by Debugger." "detail": "Task generated by Debugger."
},
{
"type": "cppbuild",
"label": "C/C++: g++ build active file",
"command": "/usr/bin/g++",
"args": [
"-fdiagnostics-color=always",
"-g",
"${file}",
"-o",
"${fileDirname}/${fileBasenameNoExtension}"
],
"options": {
"cwd": "${fileDirname}"
},
"problemMatcher": [
"$gcc"
],
"group": {
"kind": "build",
"isDefault": true
},
"detail": "Task generated by Debugger."
} }
] ]
} }

View File

@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.20) cmake_minimum_required(VERSION 3.20)
project(BayesNet project(BayesNet
VERSION 0.1.0 VERSION 0.2.0
DESCRIPTION "Bayesian Network and basic classifiers Library." DESCRIPTION "Bayesian Network and basic classifiers Library."
HOMEPAGE_URL "https://github.com/rmontanana/bayesnet" HOMEPAGE_URL "https://github.com/rmontanana/bayesnet"
LANGUAGES CXX LANGUAGES CXX
@@ -24,6 +24,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
# Options # Options
# ------- # -------
@@ -31,17 +32,30 @@ option(ENABLE_CLANG_TIDY "Enable to add clang tidy." OFF)
option(ENABLE_TESTING "Unit testing build" OFF) option(ENABLE_TESTING "Unit testing build" OFF)
option(CODE_COVERAGE "Collect coverage from test library" OFF) option(CODE_COVERAGE "Collect coverage from test library" OFF)
# Boost Library
set(Boost_USE_STATIC_LIBS OFF)
set(Boost_USE_MULTITHREADED ON)
set(Boost_USE_STATIC_RUNTIME OFF)
find_package(Boost 1.66.0 REQUIRED COMPONENTS python3 numpy3)
if(Boost_FOUND)
message("Boost_INCLUDE_DIRS=${Boost_INCLUDE_DIRS}")
include_directories(${Boost_INCLUDE_DIRS})
endif()
# Python
find_package(Python3 3.11...3.11.9 COMPONENTS Interpreter Development REQUIRED)
message("Python3_LIBRARIES=${Python3_LIBRARIES}")
# CMakes modules # CMakes modules
# -------------- # --------------
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH}) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH})
include(AddGitSubmodule) include(AddGitSubmodule)
if (CODE_COVERAGE) if (CODE_COVERAGE)
enable_testing() enable_testing()
include(CodeCoverage) include(CodeCoverage)
MESSAGE("Code coverage enabled") MESSAGE("Code coverage enabled")
set(CMAKE_C_FLAGS " ${CMAKE_C_FLAGS} -fprofile-arcs -ftest-coverage") set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage -O0 -g")
set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage")
SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage") SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage")
endif (CODE_COVERAGE) endif (CODE_COVERAGE)
@@ -56,15 +70,21 @@ add_git_submodule("lib/mdlp")
add_git_submodule("lib/argparse") add_git_submodule("lib/argparse")
add_git_submodule("lib/json") add_git_submodule("lib/json")
find_library(XLSXWRITER_LIB NAMES libxlsxwriter.dylib libxlsxwriter.so PATHS ${BayesNet_SOURCE_DIR}/lib/libxlsxwriter/lib)
message("XLSXWRITER_LIB=${XLSXWRITER_LIB}")
# Subdirectories # Subdirectories
# -------------- # --------------
add_subdirectory(config) add_subdirectory(config)
add_subdirectory(lib/Files) add_subdirectory(lib/Files)
add_subdirectory(src/BayesNet) add_subdirectory(src/BayesNet)
add_subdirectory(src/Platform) add_subdirectory(src/Platform)
add_subdirectory(src/PyClassifiers)
add_subdirectory(sample) add_subdirectory(sample)
file(GLOB BayesNet_HEADERS CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/BayesNet/*.h ${BayesNet_SOURCE_DIR}/BayesNet/*.hpp) file(GLOB BayesNet_HEADERS CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/BayesNet/*.h ${BayesNet_SOURCE_DIR}/BayesNet/*.h)
file(GLOB BayesNet_SOURCES CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/BayesNet/*.cc ${BayesNet_SOURCE_DIR}/src/BayesNet/*.cpp) file(GLOB BayesNet_SOURCES CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/BayesNet/*.cc ${BayesNet_SOURCE_DIR}/src/BayesNet/*.cpp)
file(GLOB Platform_SOURCES CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/Platform/*.cc ${BayesNet_SOURCE_DIR}/src/Platform/*.cpp) file(GLOB Platform_SOURCES CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/Platform/*.cc ${BayesNet_SOURCE_DIR}/src/Platform/*.cpp)
@@ -73,8 +93,7 @@ file(GLOB Platform_SOURCES CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/Platform
if (ENABLE_TESTING) if (ENABLE_TESTING)
MESSAGE("Testing enabled") MESSAGE("Testing enabled")
add_git_submodule("lib/catch2") add_git_submodule("lib/catch2")
include(CTest) include(CTest)
add_subdirectory(tests) add_subdirectory(tests)
endif (ENABLE_TESTING) endif (ENABLE_TESTING)

120
Makefile
View File

@@ -1,6 +1,26 @@
SHELL := /bin/bash SHELL := /bin/bash
.DEFAULT_GOAL := help .DEFAULT_GOAL := help
.PHONY: coverage setup help build test .PHONY: coverage setup help build test clean debug release
f_release = build_release
f_debug = build_debug
app_targets = b_best b_list b_main b_manage b_grid
test_targets = unit_tests_bayesnet unit_tests_platform
n_procs = -j 16
# ClearTests: canned recipe used by the test-related targets.
#   1. Removes every executable named in $(test_targets) that exists under
#      $(f_debug)/tests, so the next build relinks them.
#   2. Deletes all *.gcda coverage data files found below the current directory.
# The previous version wrote "$(find ...)" and "${nfiles}", which make expands
# itself (to empty strings) before the shell runs, so the guard was always false
# and the .gcda files were never removed. "find -exec ... +" does not invoke rm
# when nothing matches, so no guard is needed; errors stay silenced and do not
# fail the recipe, keeping the cleanup best-effort.
define ClearTests
@for t in $(test_targets); do \
if [ -f $(f_debug)/tests/$$t ]; then \
echo ">>> Cleaning $$t..." ; \
rm -f $(f_debug)/tests/$$t ; \
fi ; \
done
@find . -name "*.gcda" -exec rm -f {} + 2>/dev/null || true
endef
setup: ## Install dependencies for tests and coverage setup: ## Install dependencies for tests and coverage
@if [ "$(shell uname)" = "Darwin" ]; then \ @if [ "$(shell uname)" = "Darwin" ]; then \
@@ -11,49 +31,87 @@ setup: ## Install dependencies for tests and coverage
pip install gcovr; \ pip install gcovr; \
fi fi
dest ?= ${HOME}/bin
install: ## Copy binary files to bin folder
@echo "Destination folder: $(dest)"
make buildr
@echo "*******************************************"
@echo ">>> Copying files to $(dest)"
@echo "*******************************************"
@for item in $(app_targets); do \
echo ">>> Copying $$item" ; \
cp $(f_release)/src/Platform/$$item $(dest) ; \
done
dependency: ## Create a dependency graph diagram of the project (build/dependency.png) dependency: ## Create a dependency graph diagram of the project (build/dependency.png)
cd build && cmake .. --graphviz=dependency.dot && dot -Tpng dependency.dot -o dependency.png @echo ">>> Creating dependency graph diagram of the project...";
$(MAKE) debug
cd $(f_debug) && cmake .. --graphviz=dependency.dot && dot -Tpng dependency.dot -o dependency.png
build: ## Build the main and BayesNetSample buildd: ## Build the debug targets
cmake --build build -t main -t BayesNetSample -j 32 cmake --build $(f_debug) -t $(app_targets) $(n_procs)
clean: ## Clean the debug info buildr: ## Build the release targets
@echo ">>> Cleaning Debug BayesNet ..."; cmake --build $(f_release) -t $(app_targets) $(n_procs)
find . -name "*.gcda" -print0 | xargs -0 rm
clean: ## Clean the tests info
@echo ">>> Cleaning Debug BayesNet tests...";
$(call ClearTests)
@echo ">>> Done"; @echo ">>> Done";
clang-uml: ## Create uml class and sequence diagrams
clang-uml -p --add-compile-flag -I /usr/lib/gcc/x86_64-redhat-linux/8/include/
debug: ## Build a debug version of the project debug: ## Build a debug version of the project
@echo ">>> Building Debug BayesNet ..."; @echo ">>> Building Debug BayesNet...";
@if [ -d ./build ]; then rm -rf ./build; fi @if [ -d ./$(f_debug) ]; then rm -rf ./$(f_debug); fi
@mkdir build; @mkdir $(f_debug);
cmake -S . -B build -D CMAKE_BUILD_TYPE=Debug -D ENABLE_TESTING=ON -D CODE_COVERAGE=ON; \ @cmake -S . -B $(f_debug) -D CMAKE_BUILD_TYPE=Debug -D ENABLE_TESTING=ON -D CODE_COVERAGE=ON
cmake --build build -j 32;
@echo ">>> Done"; @echo ">>> Done";
release: ## Build a Release version of the project release: ## Build a Release version of the project
@echo ">>> Building Release BayesNet ..."; @echo ">>> Building Release BayesNet...";
@if [ -d ./build ]; then rm -rf ./build; fi @if [ -d ./$(f_release) ]; then rm -rf ./$(f_release); fi
@mkdir build; @mkdir $(f_release);
cmake -S . -B build -D CMAKE_BUILD_TYPE=Release; \ @cmake -S . -B $(f_release) -D CMAKE_BUILD_TYPE=Release
cmake --build build -t main -t BayesNetSample -j 32;
@echo ">>> Done"; @echo ">>> Done";
test: ## Run tests opt = ""
@echo "* Running tests..."; test: ## Run tests (opt="-s") to verbose output the tests, (opt="-c='Test Maximum Spanning Tree'") to run only that section
find . -name "*.gcda" -print0 | xargs -0 rm @echo ">>> Running BayesNet & Platform tests...";
@cd build; \ @$(MAKE) clean
cmake --build . --target unit_tests ; @cmake --build $(f_debug) -t $(test_targets) $(n_procs)
@cd build/tests; \ @for t in $(test_targets); do \
./unit_tests; if [ -f $(f_debug)/tests/$$t ]; then \
cd $(f_debug)/tests ; \
./$$t $(opt) ; \
fi ; \
done
@echo ">>> Done";
opt = ""
testp: ## Run platform tests (opt="-s") to verbose output the tests, (opt="-c='Stratified Fold Test'") to run only that section
@echo ">>> Running Platform tests...";
@$(MAKE) clean
@cmake --build $(f_debug) --target unit_tests_platform $(n_procs)
@if [ -f $(f_debug)/tests/unit_tests_platform ]; then cd $(f_debug)/tests ; ./unit_tests_platform $(opt) ; fi ;
@echo ">>> Done";
opt = ""
testb: ## Run BayesNet tests (opt="-s") to verbose output the tests, (opt="-c='Test Maximum Spanning Tree'") to run only that section
@echo ">>> Running BayesNet tests...";
@$(MAKE) clean
@cmake --build $(f_debug) --target unit_tests_bayesnet $(n_procs)
@if [ -f $(f_debug)/tests/unit_tests_bayesnet ]; then cd $(f_debug)/tests ; ./unit_tests_bayesnet $(opt) ; fi ;
@echo ">>> Done";
coverage: ## Run tests and generate coverage report (build/index.html) coverage: ## Run tests and generate coverage report (build/index.html)
@echo "*Building tests..."; @echo ">>> Building tests with coverage...";
find . -name "*.gcda" -print0 | xargs -0 rm @$(MAKE) test
@cd build; \ @cd $(f_debug) ; \
cmake --build . --target unit_tests ; gcovr --config ../gcovr.cfg tests ;
@cd build/tests; \ @echo ">>> Done";
./unit_tests;
gcovr ;
help: ## Show help message help: ## Show help message
@IFS=$$'\n' ; \ @IFS=$$'\n' ; \

View File

@@ -1,5 +1,65 @@
# BayesNet # BayesNet
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
Bayesian Network Classifier with libtorch from scratch Bayesian Network Classifier with libtorch from scratch
## 0. Setup
The following dependencies have to be set up before compiling BayesNet.
### boost library
[Getting Started](<https://www.boost.org/doc/libs/1_83_0/more/getting_started/index.html>)
The best option is to install the packages that your Linux distribution provides in its repository. If this is the case:
```bash
sudo dnf install boost-devel
```
If this is not possible and the library is installed from the compressed package, the following environment variable has to be set, pointing to the folder where the package was unzipped:
```bash
export BOOST_ROOT=/path/to/library/
```
In some cases the library needs to be built. To do so:
```bash
cd /path/to/library
mkdir own
./bootstrap.sh --prefix=/path/to/library/own
./b2 install
export BOOST_ROOT=/path/to/library/own/
```
Don't forget to add the `export BOOST_ROOT` statement to `.bashrc` or to whichever shell startup file you use.
### libxlsxwriter
```bash
cd lib/libxlsxwriter
make
make install DESTDIR=/home/rmontanana/Code PREFIX=
```
The following environment variable has to be set:
```bash
export LD_LIBRARY_PATH=/usr/local/lib
```
### Release
```bash
make release
```
### Debug & Tests
```bash
make debug
```
## 1. Introduction ## 1. Introduction

View File

@@ -1,12 +0,0 @@
digraph BayesNet {
label=<BayesNet >
fontsize=30
fontcolor=blue
labelloc=t
layout=circo
class [shape=circle, fontcolor=red, fillcolor=lightblue, style=filled ]
class -> sepallength class -> sepalwidth class -> petallength class -> petalwidth petallength [shape=circle]
petallength -> sepallength petalwidth [shape=circle]
sepallength [shape=circle]
sepallength -> sepalwidth sepalwidth [shape=circle]
sepalwidth -> petalwidth }

View File

@@ -1 +0,0 @@
null

BIN
diagrams/BayesNet.pdf Executable file

Binary file not shown.

162
grid_stree.json Normal file
View File

@@ -0,0 +1,162 @@
{
"balance-scale": {
"C": 10000.0,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000
},
"balloons": {
"C": 7,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000
},
"breast-cancer-wisc-diag": {
"C": 0.2,
"max_iter": 10000
},
"breast-cancer-wisc-prog": {
"C": 0.2,
"max_iter": 10000
},
"breast-cancer-wisc": {},
"breast-cancer": {},
"cardiotocography-10clases": {},
"cardiotocography-3clases": {},
"conn-bench-sonar-mines-rocks": {},
"cylinder-bands": {},
"dermatology": {
"C": 55,
"max_iter": 10000
},
"echocardiogram": {
"C": 7,
"gamma": 0.1,
"kernel": "poly",
"max_features": "auto",
"max_iter": 10000
},
"fertility": {
"C": 0.05,
"max_features": "auto",
"max_iter": 10000
},
"haberman-survival": {},
"heart-hungarian": {
"C": 0.05,
"max_iter": 10000
},
"hepatitis": {
"C": 7,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000
},
"ilpd-indian-liver": {},
"ionosphere": {
"C": 7,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000
},
"iris": {},
"led-display": {},
"libras": {
"C": 0.08,
"max_iter": 10000
},
"low-res-spect": {
"C": 0.05,
"max_iter": 10000
},
"lymphography": {
"C": 0.05,
"max_iter": 10000
},
"mammographic": {},
"molec-biol-promoter": {
"C": 0.05,
"gamma": 0.1,
"kernel": "poly",
"max_iter": 10000
},
"musk-1": {
"C": 0.05,
"gamma": 0.1,
"kernel": "poly",
"max_iter": 10000
},
"oocytes_merluccius_nucleus_4d": {
"C": 8.25,
"gamma": 0.1,
"kernel": "poly"
},
"oocytes_merluccius_states_2f": {},
"oocytes_trisopterus_nucleus_2f": {},
"oocytes_trisopterus_states_5b": {
"C": 0.11,
"max_iter": 10000
},
"parkinsons": {},
"pima": {},
"pittsburg-bridges-MATERIAL": {
"C": 7,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000
},
"pittsburg-bridges-REL-L": {},
"pittsburg-bridges-SPAN": {
"C": 0.05,
"max_iter": 10000
},
"pittsburg-bridges-T-OR-D": {},
"planning": {
"C": 7,
"gamma": 10.0,
"kernel": "rbf",
"max_iter": 10000
},
"post-operative": {
"C": 55,
"degree": 5,
"gamma": 0.1,
"kernel": "poly",
"max_iter": 10000
},
"seeds": {
"C": 10000.0,
"max_iter": 10000
},
"statlog-australian-credit": {
"C": 0.05,
"max_features": "auto",
"max_iter": 10000
},
"statlog-german-credit": {},
"statlog-heart": {},
"statlog-image": {
"C": 7,
"max_iter": 10000
},
"statlog-vehicle": {},
"synthetic-control": {
"C": 0.55,
"max_iter": 10000
},
"tic-tac-toe": {
"C": 0.2,
"gamma": 0.1,
"kernel": "poly",
"max_iter": 10000
},
"vertebral-column-2clases": {},
"wine": {
"C": 0.55,
"max_iter": 10000
},
"zoo": {
"C": 0.1,
"max_iter": 10000
}
}

View File

@@ -4,11 +4,9 @@
#include <map> #include <map>
#include <iostream> #include <iostream>
using namespace std;
ArffFiles::ArffFiles() = default; ArffFiles::ArffFiles() = default;
vector<string> ArffFiles::getLines() const std::vector<std::string> ArffFiles::getLines() const
{ {
return lines; return lines;
} }
@@ -18,48 +16,48 @@ unsigned long int ArffFiles::getSize() const
return lines.size(); return lines.size();
} }
vector<pair<string, string>> ArffFiles::getAttributes() const std::vector<std::pair<std::string, std::string>> ArffFiles::getAttributes() const
{ {
return attributes; return attributes;
} }
string ArffFiles::getClassName() const std::string ArffFiles::getClassName() const
{ {
return className; return className;
} }
string ArffFiles::getClassType() const std::string ArffFiles::getClassType() const
{ {
return classType; return classType;
} }
vector<vector<float>>& ArffFiles::getX() std::vector<std::vector<float>>& ArffFiles::getX()
{ {
return X; return X;
} }
vector<int>& ArffFiles::getY() std::vector<int>& ArffFiles::getY()
{ {
return y; return y;
} }
void ArffFiles::loadCommon(string fileName) void ArffFiles::loadCommon(std::string fileName)
{ {
ifstream file(fileName); std::ifstream file(fileName);
if (!file.is_open()) { if (!file.is_open()) {
throw invalid_argument("Unable to open file"); throw std::invalid_argument("Unable to open file");
} }
string line; std::string line;
string keyword; std::string keyword;
string attribute; std::string attribute;
string type; std::string type;
string type_w; std::string type_w;
while (getline(file, line)) { while (getline(file, line)) {
if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
continue; continue;
} }
if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) { if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) {
stringstream ss(line); std::stringstream ss(line);
ss >> keyword >> attribute; ss >> keyword >> attribute;
type = ""; type = "";
while (ss >> type_w) while (ss >> type_w)
@@ -74,35 +72,35 @@ void ArffFiles::loadCommon(string fileName)
} }
file.close(); file.close();
if (attributes.empty()) if (attributes.empty())
throw invalid_argument("No attributes found"); throw std::invalid_argument("No attributes found");
} }
void ArffFiles::load(const string& fileName, bool classLast) void ArffFiles::load(const std::string& fileName, bool classLast)
{ {
int labelIndex; int labelIndex;
loadCommon(fileName); loadCommon(fileName);
if (classLast) { if (classLast) {
className = get<0>(attributes.back()); className = std::get<0>(attributes.back());
classType = get<1>(attributes.back()); classType = std::get<1>(attributes.back());
attributes.pop_back(); attributes.pop_back();
labelIndex = static_cast<int>(attributes.size()); labelIndex = static_cast<int>(attributes.size());
} else { } else {
className = get<0>(attributes.front()); className = std::get<0>(attributes.front());
classType = get<1>(attributes.front()); classType = std::get<1>(attributes.front());
attributes.erase(attributes.begin()); attributes.erase(attributes.begin());
labelIndex = 0; labelIndex = 0;
} }
generateDataset(labelIndex); generateDataset(labelIndex);
} }
void ArffFiles::load(const string& fileName, const string& name) void ArffFiles::load(const std::string& fileName, const std::string& name)
{ {
int labelIndex; int labelIndex;
loadCommon(fileName); loadCommon(fileName);
bool found = false; bool found = false;
for (int i = 0; i < attributes.size(); ++i) { for (int i = 0; i < attributes.size(); ++i) {
if (attributes[i].first == name) { if (attributes[i].first == name) {
className = get<0>(attributes[i]); className = std::get<0>(attributes[i]);
classType = get<1>(attributes[i]); classType = std::get<1>(attributes[i]);
attributes.erase(attributes.begin() + i); attributes.erase(attributes.begin() + i);
labelIndex = i; labelIndex = i;
found = true; found = true;
@@ -110,19 +108,19 @@ void ArffFiles::load(const string& fileName, const string& name)
} }
} }
if (!found) { if (!found) {
throw invalid_argument("Class name not found"); throw std::invalid_argument("Class name not found");
} }
generateDataset(labelIndex); generateDataset(labelIndex);
} }
void ArffFiles::generateDataset(int labelIndex) void ArffFiles::generateDataset(int labelIndex)
{ {
X = vector<vector<float>>(attributes.size(), vector<float>(lines.size())); X = std::vector<std::vector<float>>(attributes.size(), std::vector<float>(lines.size()));
auto yy = vector<string>(lines.size(), ""); auto yy = std::vector<std::string>(lines.size(), "");
auto removeLines = vector<int>(); // Lines with missing values auto removeLines = std::vector<int>(); // Lines with missing values
for (size_t i = 0; i < lines.size(); i++) { for (size_t i = 0; i < lines.size(); i++) {
stringstream ss(lines[i]); std::stringstream ss(lines[i]);
string value; std::string value;
int pos = 0; int pos = 0;
int xIndex = 0; int xIndex = 0;
while (getline(ss, value, ',')) { while (getline(ss, value, ',')) {
@@ -146,21 +144,21 @@ void ArffFiles::generateDataset(int labelIndex)
y = factorize(yy); y = factorize(yy);
} }
string ArffFiles::trim(const string& source) std::string ArffFiles::trim(const std::string& source)
{ {
string s(source); std::string s(source);
s.erase(0, s.find_first_not_of(" '\n\r\t")); s.erase(0, s.find_first_not_of(" '\n\r\t"));
s.erase(s.find_last_not_of(" '\n\r\t") + 1); s.erase(s.find_last_not_of(" '\n\r\t") + 1);
return s; return s;
} }
vector<int> ArffFiles::factorize(const vector<string>& labels_t) std::vector<int> ArffFiles::factorize(const std::vector<std::string>& labels_t)
{ {
vector<int> yy; std::vector<int> yy;
yy.reserve(labels_t.size()); yy.reserve(labels_t.size());
map<string, int> labelMap; std::map<std::string, int> labelMap;
int i = 0; int i = 0;
for (const string& label : labels_t) { for (const std::string& label : labels_t) {
if (labelMap.find(label) == labelMap.end()) { if (labelMap.find(label) == labelMap.end()) {
labelMap[label] = i++; labelMap[label] = i++;
} }

View File

@@ -4,31 +4,29 @@
#include <string> #include <string>
#include <vector> #include <vector>
using namespace std;
class ArffFiles { class ArffFiles {
private: private:
vector<string> lines; std::vector<std::string> lines;
vector<pair<string, string>> attributes; std::vector<std::pair<std::string, std::string>> attributes;
string className; std::string className;
string classType; std::string classType;
vector<vector<float>> X; std::vector<std::vector<float>> X;
vector<int> y; std::vector<int> y;
void generateDataset(int); void generateDataset(int);
void loadCommon(string); void loadCommon(std::string);
public: public:
ArffFiles(); ArffFiles();
void load(const string&, bool = true); void load(const std::string&, bool = true);
void load(const string&, const string&); void load(const std::string&, const std::string&);
vector<string> getLines() const; std::vector<std::string> getLines() const;
unsigned long int getSize() const; unsigned long int getSize() const;
string getClassName() const; std::string getClassName() const;
string getClassType() const; std::string getClassType() const;
static string trim(const string&); static std::string trim(const std::string&);
vector<vector<float>>& getX(); std::vector<std::vector<float>>& getX();
vector<int>& getY(); std::vector<int>& getY();
vector<pair<string, string>> getAttributes() const; std::vector<std::pair<std::string, std::string>> getAttributes() const;
static vector<int> factorize(const vector<string>& labels_t); static std::vector<int> factorize(const std::vector<std::string>& labels_t);
}; };
#endif #endif

View File

@@ -1,2 +1 @@
add_library(ArffFiles ArffFiles.cc) add_library(ArffFiles ArffFiles.cc)
#target_link_libraries(BayesNet "${TORCH_LIBRARIES}")

1
lib/libxlsxwriter Submodule

Submodule lib/libxlsxwriter added at 29355a0887

View File

@@ -3,5 +3,6 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
include_directories(${BayesNet_SOURCE_DIR}/lib/Files) include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp) include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include) include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include)
include_directories(${BayesNet_SOURCE_DIR}/lib/json/include)
add_executable(BayesNetSample sample.cc ${BayesNet_SOURCE_DIR}/src/Platform/Folding.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) add_executable(BayesNetSample sample.cc ${BayesNet_SOURCE_DIR}/src/Platform/Folding.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
target_link_libraries(BayesNetSample BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}") target_link_libraries(BayesNetSample BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}")

View File

@@ -3,22 +3,21 @@
#include <string> #include <string>
#include <map> #include <map>
#include <argparse/argparse.hpp> #include <argparse/argparse.hpp>
#include <nlohmann/json.hpp>
#include "ArffFiles.h" #include "ArffFiles.h"
#include "BayesMetrics.h" #include "BayesMetrics.h"
#include "CPPFImdlp.h" #include "CPPFImdlp.h"
#include "Folding.h" #include "Folding.h"
#include "Models.h" #include "Models.h"
#include "modelRegister.h" #include "modelRegister.h"
#include <fstream>
const std::string PATH = "../../data/";
using namespace std; pair<std::vector<mdlp::labels_t>, map<std::string, int>> discretize(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y, std::vector<std::string> features)
const string PATH = "../../data/";
pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t>& X, mdlp::labels_t& y, vector<string> features)
{ {
vector<mdlp::labels_t>Xd; std::vector<mdlp::labels_t>Xd;
map<string, int> maxes; map<std::string, int> maxes;
auto fimdlp = mdlp::CPPFImdlp(); auto fimdlp = mdlp::CPPFImdlp();
for (int i = 0; i < X.size(); i++) { for (int i = 0; i < X.size(); i++) {
@@ -30,7 +29,7 @@ pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t
return { Xd, maxes }; return { Xd, maxes };
} }
bool file_exists(const std::string& name) bool file_exists(const std::std::std::string& name)
{ {
if (FILE* file = fopen(name.c_str(), "r")) { if (FILE* file = fopen(name.c_str(), "r")) {
fclose(file); fclose(file);
@@ -39,12 +38,12 @@ bool file_exists(const std::string& name)
return false; return false;
} }
} }
pair<vector<vector<int>>, vector<int>> extract_indices(vector<int> indices, vector<vector<int>> X, vector<int> y) pair<std::vector<std::vector<int>>, std::vector<int>> extract_indices(std::vector<int> indices, std::vector<std::vector<int>> X, std::vector<int> y)
{ {
vector<vector<int>> Xr; // nxm std::vector<std::vector<int>> Xr; // nxm
vector<int> yr; std::vector<int> yr;
for (int col = 0; col < X.size(); ++col) { for (int col = 0; col < X.size(); ++col) {
Xr.push_back(vector<int>()); Xr.push_back(std::vector<int>());
} }
for (auto index : indices) { for (auto index : indices) {
for (int col = 0; col < X.size(); ++col) { for (int col = 0; col < X.size(); ++col) {
@@ -57,7 +56,7 @@ pair<vector<vector<int>>, vector<int>> extract_indices(vector<int> indices, vect
int main(int argc, char** argv) int main(int argc, char** argv)
{ {
map<string, bool> datasets = { map<std::string, bool> datasets = {
{"diabetes", true}, {"diabetes", true},
{"ecoli", true}, {"ecoli", true},
{"glass", true}, {"glass", true},
@@ -67,13 +66,13 @@ int main(int argc, char** argv)
{"liver-disorders", true}, {"liver-disorders", true},
{"mfeat-factors", true}, {"mfeat-factors", true},
}; };
auto valid_datasets = vector<string>(); auto valid_datasets = std::vector<std::string>();
transform(datasets.begin(), datasets.end(), back_inserter(valid_datasets), transform(datasets.begin(), datasets.end(), back_inserter(valid_datasets),
[](const pair<string, bool>& pair) { return pair.first; }); [](const pair<std::string, bool>& pair) { return pair.first; });
argparse::ArgumentParser program("BayesNetSample"); argparse::ArgumentParser program("BayesNetSample");
program.add_argument("-d", "--dataset") program.add_argument("-d", "--dataset")
.help("Dataset file name") .help("Dataset file name")
.action([valid_datasets](const std::string& value) { .action([valid_datasets](const std::std::std::string& value) {
if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) { if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) {
return value; return value;
} }
@@ -82,23 +81,23 @@ int main(int argc, char** argv)
); );
program.add_argument("-p", "--path") program.add_argument("-p", "--path")
.help(" folder where the data files are located, default") .help(" folder where the data files are located, default")
.default_value(string{ PATH } .default_value(std::string{ PATH }
); );
program.add_argument("-m", "--model") program.add_argument("-m", "--model")
.help("Model to use " + platform::Models::instance()->toString()) .help("Model to use " + platform::Models::instance()->tostd::string())
.action([](const std::string& value) { .action([](const std::std::std::string& value) {
static const vector<string> choices = platform::Models::instance()->getNames(); static const std::vector<std::string> choices = platform::Models::instance()->getNames();
if (find(choices.begin(), choices.end(), value) != choices.end()) { if (find(choices.begin(), choices.end(), value) != choices.end()) {
return value; return value;
} }
throw runtime_error("Model must be one of " + platform::Models::instance()->toString()); throw runtime_error("Model must be one of " + platform::Models::instance()->tostd::string());
} }
); );
program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true); program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true);
program.add_argument("--dumpcpt").help("Dump CPT Tables").default_value(false).implicit_value(true); program.add_argument("--dumpcpt").help("Dump CPT Tables").default_value(false).implicit_value(true);
program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true); program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true);
program.add_argument("--tensors").help("Use tensors to store samples").default_value(false).implicit_value(true); program.add_argument("--tensors").help("Use tensors to store samples").default_value(false).implicit_value(true);
program.add_argument("-f", "--folds").help("Number of folds").default_value(5).scan<'i', int>().action([](const string& value) { program.add_argument("-f", "--folds").help("Number of folds").default_value(5).scan<'i', int>().action([](const std::std::string& value) {
try { try {
auto k = stoi(value); auto k = stoi(value);
if (k < 2) { if (k < 2) {
@@ -114,13 +113,13 @@ int main(int argc, char** argv)
}}); }});
program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>(); program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>();
bool class_last, stratified, tensors, dump_cpt; bool class_last, stratified, tensors, dump_cpt;
string model_name, file_name, path, complete_file_name; std::string model_name, file_name, path, complete_file_name;
int nFolds, seed; int nFolds, seed;
try { try {
program.parse_args(argc, argv); program.parse_args(argc, argv);
file_name = program.get<string>("dataset"); file_name = program.get<std::string>("dataset");
path = program.get<string>("path"); path = program.get<std::string>("path");
model_name = program.get<string>("model"); model_name = program.get<std::string>("model");
complete_file_name = path + file_name + ".arff"; complete_file_name = path + file_name + ".arff";
stratified = program.get<bool>("stratified"); stratified = program.get<bool>("stratified");
tensors = program.get<bool>("tensors"); tensors = program.get<bool>("tensors");
@@ -133,7 +132,7 @@ int main(int argc, char** argv)
} }
} }
catch (const exception& err) { catch (const exception& err) {
cerr << err.what() << endl; cerr << err.what() << std::endl;
cerr << program; cerr << program;
exit(1); exit(1);
} }
@@ -144,93 +143,93 @@ int main(int argc, char** argv)
auto handler = ArffFiles(); auto handler = ArffFiles();
handler.load(complete_file_name, class_last); handler.load(complete_file_name, class_last);
// Get Dataset X, y // Get Dataset X, y
vector<mdlp::samples_t>& X = handler.getX(); std::vector<mdlp::samples_t>& X = handler.getX();
mdlp::labels_t& y = handler.getY(); mdlp::labels_t& y = handler.getY();
// Get className & Features // Get className & Features
auto className = handler.getClassName(); auto className = handler.getClassName();
vector<string> features; std::vector<std::string> features;
auto attributes = handler.getAttributes(); auto attributes = handler.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features), transform(attributes.begin(), attributes.end(), back_inserter(features),
[](const pair<string, string>& item) { return item.first; }); [](const pair<std::string, std::string>& item) { return item.first; });
// Discretize Dataset // Discretize Dataset
auto [Xd, maxes] = discretize(X, y, features); auto [Xd, maxes] = discretize(X, y, features);
maxes[className] = *max_element(y.begin(), y.end()) + 1; maxes[className] = *max_element(y.begin(), y.end()) + 1;
map<string, vector<int>> states; map<std::string, std::vector<int>> states;
for (auto feature : features) { for (auto feature : features) {
states[feature] = vector<int>(maxes[feature]); states[feature] = std::vector<int>(maxes[feature]);
} }
states[className] = vector<int>(maxes[className]); states[className] = std::vector<int>(maxes[className]);
auto clf = platform::Models::instance()->create(model_name); auto clf = platform::Models::instance()->create(model_name);
clf->fit(Xd, y, features, className, states); clf->fit(Xd, y, features, className, states);
if (dump_cpt) { if (dump_cpt) {
cout << "--- CPT Tables ---" << endl; std::cout << "--- CPT Tables ---" << std::endl;
clf->dump_cpt(); clf->dump_cpt();
} }
auto lines = clf->show(); auto lines = clf->show();
for (auto line : lines) { for (auto line : lines) {
cout << line << endl; std::cout << line << std::endl;
} }
cout << "--- Topological Order ---" << endl; std::cout << "--- Topological Order ---" << std::endl;
auto order = clf->topological_order(); auto order = clf->topological_order();
for (auto name : order) { for (auto name : order) {
cout << name << ", "; std::cout << name << ", ";
} }
cout << "end." << endl; std::cout << "end." << std::endl;
auto score = clf->score(Xd, y); auto score = clf->score(Xd, y);
cout << "Score: " << score << endl; std::cout << "Score: " << score << std::endl;
// auto graph = clf->graph(); auto graph = clf->graph();
// auto dot_file = model_name + "_" + file_name; auto dot_file = model_name + "_" + file_name;
// ofstream file(dot_file + ".dot"); ofstream file(dot_file + ".dot");
// file << graph; file << graph;
// file.close(); file.close();
// cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << endl; std::cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << std::endl;
// cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl; std::cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << std::endl;
// string stratified_string = stratified ? " Stratified" : ""; std::string stratified_std::string = stratified ? " Stratified" : "";
// cout << nFolds << " Folds" << stratified_string << " Cross validation" << endl; std::cout << nFolds << " Folds" << stratified_std::string << " Cross validation" << std::endl;
// cout << "==========================================" << endl; std::cout << "==========================================" << std::endl;
// torch::Tensor Xt = torch::zeros({ static_cast<int>(Xd.size()), static_cast<int>(Xd[0].size()) }, torch::kInt32); torch::Tensor Xt = torch::zeros({ static_cast<int>(Xd.size()), static_cast<int>(Xd[0].size()) }, torch::kInt32);
// torch::Tensor yt = torch::tensor(y, torch::kInt32); torch::Tensor yt = torch::tensor(y, torch::kInt32);
// for (int i = 0; i < features.size(); ++i) { for (int i = 0; i < features.size(); ++i) {
// Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32)); Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
// } }
// float total_score = 0, total_score_train = 0, score_train, score_test; float total_score = 0, total_score_train = 0, score_train, score_test;
// Fold* fold; platform::Fold* fold;
// if (stratified) if (stratified)
// fold = new StratifiedKFold(nFolds, y, seed); fold = new platform::StratifiedKFold(nFolds, y, seed);
// else else
// fold = new KFold(nFolds, y.size(), seed); fold = new platform::KFold(nFolds, y.size(), seed);
// for (auto i = 0; i < nFolds; ++i) { for (auto i = 0; i < nFolds; ++i) {
// auto [train, test] = fold->getFold(i); auto [train, test] = fold->getFold(i);
// cout << "Fold: " << i + 1 << endl; std::cout << "Fold: " << i + 1 << std::endl;
// if (tensors) { if (tensors) {
// auto ttrain = torch::tensor(train, torch::kInt64); auto ttrain = torch::tensor(train, torch::kInt64);
// auto ttest = torch::tensor(test, torch::kInt64); auto ttest = torch::tensor(test, torch::kInt64);
// torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain); torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain);
// torch::Tensor ytraint = yt.index({ ttrain }); torch::Tensor ytraint = yt.index({ ttrain });
// torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest); torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest);
// torch::Tensor ytestt = yt.index({ ttest }); torch::Tensor ytestt = yt.index({ ttest });
// clf->fit(Xtraint, ytraint, features, className, states); clf->fit(Xtraint, ytraint, features, className, states);
// auto temp = clf->predict(Xtraint); auto temp = clf->predict(Xtraint);
// score_train = clf->score(Xtraint, ytraint); score_train = clf->score(Xtraint, ytraint);
// score_test = clf->score(Xtestt, ytestt); score_test = clf->score(Xtestt, ytestt);
// } else { } else {
// auto [Xtrain, ytrain] = extract_indices(train, Xd, y); auto [Xtrain, ytrain] = extract_indices(train, Xd, y);
// auto [Xtest, ytest] = extract_indices(test, Xd, y); auto [Xtest, ytest] = extract_indices(test, Xd, y);
// clf->fit(Xtrain, ytrain, features, className, states); clf->fit(Xtrain, ytrain, features, className, states);
// score_train = clf->score(Xtrain, ytrain); score_train = clf->score(Xtrain, ytrain);
// score_test = clf->score(Xtest, ytest); score_test = clf->score(Xtest, ytest);
// } }
// if (dump_cpt) { if (dump_cpt) {
// cout << "--- CPT Tables ---" << endl; std::cout << "--- CPT Tables ---" << std::endl;
// clf->dump_cpt(); clf->dump_cpt();
// } }
// total_score_train += score_train; total_score_train += score_train;
// total_score += score_test; total_score += score_test;
// cout << "Score Train: " << score_train << endl; std::cout << "Score Train: " << score_train << std::endl;
// cout << "Score Test : " << score_test << endl; std::cout << "Score Test : " << score_test << std::endl;
// cout << "-------------------------------------------------------------------------------" << endl; std::cout << "-------------------------------------------------------------------------------" << std::endl;
// } }
// cout << "**********************************************************************************" << endl; std::cout << "**********************************************************************************" << std::endl;
// cout << "Average Score Train: " << total_score_train / nFolds << endl; std::cout << "Average Score Train: " << total_score_train / nFolds << std::endl;
// cout << "Average Score Test : " << total_score / nFolds << endl;return 0; std::cout << "Average Score Test : " << total_score / nFolds << std::endl;return 0;
} }

View File

@@ -2,14 +2,16 @@
namespace bayesnet { namespace bayesnet {
AODE::AODE() : Ensemble() {} AODE::AODE() : Ensemble() {}
void AODE::train() void AODE::buildModel(const torch::Tensor& weights)
{ {
models.clear(); models.clear();
for (int i = 0; i < features.size(); ++i) { for (int i = 0; i < features.size(); ++i) {
models.push_back(std::make_unique<SPODE>(i)); models.push_back(std::make_unique<SPODE>(i));
} }
n_models = models.size();
significanceModels = std::vector<double>(n_models, 1.0);
} }
vector<string> AODE::graph(const string& title) std::vector<std::string> AODE::graph(const std::string& title) const
{ {
return Ensemble::graph(title); return Ensemble::graph(title);
} }

View File

@@ -5,11 +5,11 @@
namespace bayesnet { namespace bayesnet {
class AODE : public Ensemble { class AODE : public Ensemble {
protected: protected:
void train() override; void buildModel(const torch::Tensor& weights) override;
public: public:
AODE(); AODE();
virtual ~AODE() {}; virtual ~AODE() {};
vector<string> graph(const string& title = "AODE") override; std::vector<std::string> graph(const std::string& title = "AODE") const override;
}; };
} }
#endif #endif

View File

@@ -1,33 +1,38 @@
#include "AODELd.h" #include "AODELd.h"
namespace bayesnet { namespace bayesnet {
using namespace std; AODELd::AODELd() : Ensemble(), Proposal(dataset, features, className) {}
AODELd::AODELd() : Ensemble(), Proposal(Ensemble::Xv, Ensemble::yv, features, className) {} AODELd& AODELd::fit(torch::Tensor& X_, torch::Tensor& y_, const std::vector<std::string>& features_, const std::string& className_, map<std::string, std::vector<int>>& states_)
AODELd& AODELd::fit(torch::Tensor& X_, torch::Tensor& y_, vector<string>& features_, string className_, map<string, vector<int>>& states_)
{ {
checkInput(X_, y_);
features = features_; features = features_;
className = className_; className = className_;
states = states_; Xf = X_;
train(); y = y_;
for (const auto& model : models) { // Fills std::vectors Xv & yv with the data from tensors X_ (discretized) & y
model->fit(X_, y_, features_, className_, states_); states = fit_local_discretization(y);
} // We have discretized the input data
n_models = models.size(); // 1st we need to fit the model to build the normal TAN structure, TAN::fit initializes the base Bayesian network
fitted = true; Ensemble::fit(dataset, features, className, states);
return *this; return *this;
} }
void AODELd::train() void AODELd::buildModel(const torch::Tensor& weights)
{ {
models.clear(); models.clear();
for (int i = 0; i < features.size(); ++i) { for (int i = 0; i < features.size(); ++i) {
models.push_back(std::make_unique<SPODELd>(i)); models.push_back(std::make_unique<SPODELd>(i));
} }
n_models = models.size();
significanceModels = std::vector<double>(n_models, 1.0);
} }
Tensor AODELd::predict(Tensor& X) void AODELd::trainModel(const torch::Tensor& weights)
{ {
return Ensemble::predict(X); for (const auto& model : models) {
model->fit(Xf, y, features, className, states);
}
} }
vector<string> AODELd::graph(const string& name) std::vector<std::string> AODELd::graph(const std::string& name) const
{ {
return Ensemble::graph(name); return Ensemble::graph(name);
} }

View File

@@ -5,16 +5,16 @@
#include "SPODELd.h" #include "SPODELd.h"
namespace bayesnet { namespace bayesnet {
using namespace std;
class AODELd : public Ensemble, public Proposal { class AODELd : public Ensemble, public Proposal {
protected:
void trainModel(const torch::Tensor& weights) override;
void buildModel(const torch::Tensor& weights) override;
public: public:
AODELd(); AODELd();
AODELd& fit(torch::Tensor& X_, torch::Tensor& y_, const std::vector<std::string>& features_, const std::string& className_, map<std::string, std::vector<int>>& states_) override;
virtual ~AODELd() = default; virtual ~AODELd() = default;
AODELd& fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states) override; std::vector<std::string> graph(const std::string& name = "AODELd") const override;
vector<string> graph(const string& name = "AODE") override; static inline std::string version() { return "0.0.1"; };
Tensor predict(Tensor& X) override;
void train() override;
static inline string version() { return "0.0.1"; };
}; };
} }
#endif // !AODELD_H #endif // !AODELD_H

View File

@@ -1,28 +1,37 @@
#ifndef BASE_H #ifndef BASE_H
#define BASE_H #define BASE_H
#include <torch/torch.h> #include <torch/torch.h>
#include <nlohmann/json.hpp>
#include <vector> #include <vector>
namespace bayesnet { namespace bayesnet {
using namespace std; enum status_t { NORMAL, WARNING, ERROR };
class BaseClassifier { class BaseClassifier {
public: public:
// X is nxm vector, y is nx1 vector // X is nxm std::vector, y is nx1 std::vector
virtual BaseClassifier& fit(vector<vector<int>>& X, vector<int>& y, vector<string>& features, string className, map<string, vector<int>>& states) = 0; virtual BaseClassifier& fit(std::vector<std::vector<int>>& X, std::vector<int>& y, const std::vector<std::string>& features, const std::string& className, std::map<std::string, std::vector<int>>& states) = 0;
// X is nxm tensor, y is nx1 tensor // X is nxm tensor, y is nx1 tensor
virtual BaseClassifier& fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states) = 0; virtual BaseClassifier& fit(torch::Tensor& X, torch::Tensor& y, const std::vector<std::string>& features, const std::string& className, std::map<std::string, std::vector<int>>& states) = 0;
virtual BaseClassifier& fit(torch::Tensor& dataset, const std::vector<std::string>& features, const std::string& className, std::map<std::string, std::vector<int>>& states) = 0;
virtual BaseClassifier& fit(torch::Tensor& dataset, const std::vector<std::string>& features, const std::string& className, std::map<std::string, std::vector<int>>& states, const torch::Tensor& weights) = 0;
virtual ~BaseClassifier() = default; virtual ~BaseClassifier() = default;
torch::Tensor virtual predict(torch::Tensor& X) = 0; torch::Tensor virtual predict(torch::Tensor& X) = 0;
vector<int> virtual predict(vector<vector<int>>& X) = 0; std::vector<int> virtual predict(std::vector<std::vector<int >>& X) = 0;
float virtual score(vector<vector<int>>& X, vector<int>& y) = 0; status_t virtual getStatus() const = 0;
float virtual score(std::vector<std::vector<int>>& X, std::vector<int>& y) = 0;
float virtual score(torch::Tensor& X, torch::Tensor& y) = 0; float virtual score(torch::Tensor& X, torch::Tensor& y) = 0;
int virtual getNumberOfNodes() = 0; int virtual getNumberOfNodes()const = 0;
int virtual getNumberOfEdges() = 0; int virtual getNumberOfEdges()const = 0;
int virtual getNumberOfStates() = 0; int virtual getNumberOfStates() const = 0;
vector<string> virtual show() = 0; std::vector<std::string> virtual show() const = 0;
vector<string> virtual graph(const string& title = "") = 0; std::vector<std::string> virtual graph(const std::string& title = "") const = 0;
const string inline getVersion() const { return "0.1.0"; }; virtual std::string getVersion() = 0;
vector<string> virtual topological_order() = 0; std::vector<std::string> virtual topological_order() = 0;
void virtual dump_cpt() = 0; void virtual dump_cpt()const = 0;
virtual void setHyperparameters(const nlohmann::json& hyperparameters) = 0;
std::vector<std::string>& getValidHyperparameters() { return validHyperparameters; }
protected:
virtual void trainModel(const torch::Tensor& weights) = 0;
std::vector<std::string> validHyperparameters;
}; };
} }
#endif #endif

View File

@@ -1,16 +1,16 @@
#include "BayesMetrics.h" #include "BayesMetrics.h"
#include "Mst.h" #include "Mst.h"
namespace bayesnet { namespace bayesnet {
//samples is nxm tensor used to fit the model //samples is n+1xm tensor used to fit the model
Metrics::Metrics(torch::Tensor& samples, vector<string>& features, string& className, int classNumStates) Metrics::Metrics(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int classNumStates)
: samples(samples) : samples(samples)
, features(features) , features(features)
, className(className) , className(className)
, classNumStates(classNumStates) , classNumStates(classNumStates)
{ {
} }
//samples is nxm vector used to fit the model //samples is nxm std::vector used to fit the model
Metrics::Metrics(const vector<vector<int>>& vsamples, const vector<int>& labels, const vector<string>& features, const string& className, const int classNumStates) Metrics::Metrics(const std::vector<std::vector<int>>& vsamples, const std::vector<int>& labels, const std::vector<std::string>& features, const std::string& className, const int classNumStates)
: features(features) : features(features)
, className(className) , className(className)
, classNumStates(classNumStates) , classNumStates(classNumStates)
@@ -21,28 +21,57 @@ namespace bayesnet {
} }
samples.index_put_({ -1, "..." }, torch::tensor(labels, torch::kInt32)); samples.index_put_({ -1, "..." }, torch::tensor(labels, torch::kInt32));
} }
vector<pair<string, string>> Metrics::doCombinations(const vector<string>& source) std::vector<int> Metrics::SelectKBestWeighted(const torch::Tensor& weights, bool ascending, unsigned k)
{ {
vector<pair<string, string>> result; // Return the K Best features
for (int i = 0; i < source.size(); ++i) { auto n = samples.size(0) - 1;
string temp = source[i]; if (k == 0) {
for (int j = i + 1; j < source.size(); ++j) { k = n;
result.push_back({ temp, source[j] });
}
} }
return result; // compute scores
scoresKBest.clear();
featuresKBest.clear();
auto label = samples.index({ -1, "..." });
for (int i = 0; i < n; ++i) {
scoresKBest.push_back(mutualInformation(label, samples.index({ i, "..." }), weights));
featuresKBest.push_back(i);
}
// sort & reduce scores and features
if (ascending) {
sort(featuresKBest.begin(), featuresKBest.end(), [&](int i, int j)
{ return scoresKBest[i] < scoresKBest[j]; });
sort(scoresKBest.begin(), scoresKBest.end(), std::less<double>());
if (k < n) {
for (int i = 0; i < n - k; ++i) {
featuresKBest.erase(featuresKBest.begin());
scoresKBest.erase(scoresKBest.begin());
}
}
} else {
sort(featuresKBest.begin(), featuresKBest.end(), [&](int i, int j)
{ return scoresKBest[i] > scoresKBest[j]; });
sort(scoresKBest.begin(), scoresKBest.end(), std::greater<double>());
featuresKBest.resize(k);
scoresKBest.resize(k);
}
return featuresKBest;
} }
torch::Tensor Metrics::conditionalEdge() std::vector<double> Metrics::getScoresKBest() const
{ {
auto result = vector<double>(); return scoresKBest;
auto source = vector<string>(features); }
torch::Tensor Metrics::conditionalEdge(const torch::Tensor& weights)
{
auto result = std::vector<double>();
auto source = std::vector<std::string>(features);
source.push_back(className); source.push_back(className);
auto combinations = doCombinations(source); auto combinations = doCombinations(source);
// Compute class prior // Compute class prior
auto margin = torch::zeros({ classNumStates }); auto margin = torch::zeros({ classNumStates }, torch::kFloat);
for (int value = 0; value < classNumStates; ++value) { for (int value = 0; value < classNumStates; ++value) {
auto mask = samples.index({ -1, "..." }) == value; auto mask = samples.index({ -1, "..." }) == value;
margin[value] = mask.sum().item<float>() / samples.size(1); margin[value] = mask.sum().item<double>() / samples.size(1);
} }
for (auto [first, second] : combinations) { for (auto [first, second] : combinations) {
int index_first = find(features.begin(), features.end(), first) - features.begin(); int index_first = find(features.begin(), features.end(), first) - features.begin();
@@ -52,8 +81,9 @@ namespace bayesnet {
auto mask = samples.index({ -1, "..." }) == value; auto mask = samples.index({ -1, "..." }) == value;
auto first_dataset = samples.index({ index_first, mask }); auto first_dataset = samples.index({ index_first, mask });
auto second_dataset = samples.index({ index_second, mask }); auto second_dataset = samples.index({ index_second, mask });
auto mi = mutualInformation(first_dataset, second_dataset); auto weights_dataset = weights.index({ mask });
auto pb = margin[value].item<float>(); auto mi = mutualInformation(first_dataset, second_dataset, weights_dataset);
auto pb = margin[value].item<double>();
accumulated += pb * mi; accumulated += pb * mi;
} }
result.push_back(accumulated); result.push_back(accumulated);
@@ -70,31 +100,32 @@ namespace bayesnet {
return matrix; return matrix;
} }
// To use in Python // To use in Python
vector<float> Metrics::conditionalEdgeWeights() std::vector<float> Metrics::conditionalEdgeWeights(std::vector<float>& weights_)
{ {
auto matrix = conditionalEdge(); const torch::Tensor weights = torch::tensor(weights_);
auto matrix = conditionalEdge(weights);
std::vector<float> v(matrix.data_ptr<float>(), matrix.data_ptr<float>() + matrix.numel()); std::vector<float> v(matrix.data_ptr<float>(), matrix.data_ptr<float>() + matrix.numel());
return v; return v;
} }
double Metrics::entropy(torch::Tensor& feature) double Metrics::entropy(const torch::Tensor& feature, const torch::Tensor& weights)
{ {
torch::Tensor counts = feature.bincount(); torch::Tensor counts = feature.bincount(weights);
int totalWeight = counts.sum().item<int>(); double totalWeight = counts.sum().item<double>();
torch::Tensor probs = counts.to(torch::kFloat) / totalWeight; torch::Tensor probs = counts.to(torch::kFloat) / totalWeight;
torch::Tensor logProbs = torch::log(probs); torch::Tensor logProbs = torch::log(probs);
torch::Tensor entropy = -probs * logProbs; torch::Tensor entropy = -probs * logProbs;
return entropy.nansum().item<double>(); return entropy.nansum().item<double>();
} }
// H(Y|X) = sum_{x in X} p(x) H(Y|X=x) // H(Y|X) = sum_{x in X} p(x) H(Y|X=x)
double Metrics::conditionalEntropy(torch::Tensor& firstFeature, torch::Tensor& secondFeature) double Metrics::conditionalEntropy(const torch::Tensor& firstFeature, const torch::Tensor& secondFeature, const torch::Tensor& weights)
{ {
int numSamples = firstFeature.sizes()[0]; int numSamples = firstFeature.sizes()[0];
torch::Tensor featureCounts = secondFeature.bincount(); torch::Tensor featureCounts = secondFeature.bincount(weights);
unordered_map<int, unordered_map<int, double>> jointCounts; std::unordered_map<int, std::unordered_map<int, double>> jointCounts;
double totalWeight = 0; double totalWeight = 0;
for (auto i = 0; i < numSamples; i++) { for (auto i = 0; i < numSamples; i++) {
jointCounts[secondFeature[i].item<int>()][firstFeature[i].item<int>()] += 1; jointCounts[secondFeature[i].item<int>()][firstFeature[i].item<int>()] += weights[i].item<double>();
totalWeight += 1; totalWeight += weights[i].item<float>();
} }
if (totalWeight == 0) if (totalWeight == 0)
return 0; return 0;
@@ -115,16 +146,16 @@ namespace bayesnet {
return entropyValue; return entropyValue;
} }
// I(X;Y) = H(Y) - H(Y|X) // I(X;Y) = H(Y) - H(Y|X)
double Metrics::mutualInformation(torch::Tensor& firstFeature, torch::Tensor& secondFeature) double Metrics::mutualInformation(const torch::Tensor& firstFeature, const torch::Tensor& secondFeature, const torch::Tensor& weights)
{ {
return entropy(firstFeature) - conditionalEntropy(firstFeature, secondFeature); return entropy(firstFeature, weights) - conditionalEntropy(firstFeature, secondFeature, weights);
} }
/* /*
Compute the maximum spanning tree considering the weights as distances Compute the maximum spanning tree considering the weights as distances
and the indices of the weights as nodes of this square matrix using and the indices of the weights as nodes of this square matrix using
Kruskal algorithm Kruskal algorithm
*/ */
vector<pair<int, int>> Metrics::maximumSpanningTree(vector<string> features, Tensor& weights, int root) std::vector<std::pair<int, int>> Metrics::maximumSpanningTree(const std::vector<std::string>& features, const torch::Tensor& weights, const int root)
{ {
auto mst = MST(features, weights, root); auto mst = MST(features, weights, root);
return mst.maximumSpanningTree(); return mst.maximumSpanningTree();

View File

@@ -4,25 +4,46 @@
#include <vector> #include <vector>
#include <string> #include <string>
namespace bayesnet { namespace bayesnet {
using namespace std;
using namespace torch;
class Metrics { class Metrics {
private: private:
Tensor samples; // nxm tensor used to fit the model
vector<string> features;
string className;
int classNumStates = 0; int classNumStates = 0;
std::vector<double> scoresKBest;
std::vector<int> featuresKBest; // sorted indices of the features
double conditionalEntropy(const torch::Tensor& firstFeature, const torch::Tensor& secondFeature, const torch::Tensor& weights);
protected:
torch::Tensor samples; // n+1xm torch::Tensor used to fit the model where samples[-1] is the y std::vector
std::string className;
double entropy(const torch::Tensor& feature, const torch::Tensor& weights);
std::vector<std::string> features;
template <class T>
std::vector<std::pair<T, T>> doCombinations(const std::vector<T>& source)
{
std::vector<std::pair<T, T>> result;
for (int i = 0; i < source.size(); ++i) {
T temp = source[i];
for (int j = i + 1; j < source.size(); ++j) {
result.push_back({ temp, source[j] });
}
}
return result;
}
template <class T>
T pop_first(std::vector<T>& v)
{
T temp = v[0];
v.erase(v.begin());
return temp;
}
public: public:
Metrics() = default; Metrics() = default;
Metrics(Tensor&, vector<string>&, string&, int); Metrics(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int classNumStates);
Metrics(const vector<vector<int>>&, const vector<int>&, const vector<string>&, const string&, const int); Metrics(const std::vector<std::vector<int>>& vsamples, const std::vector<int>& labels, const std::vector<std::string>& features, const std::string& className, const int classNumStates);
double entropy(Tensor&); std::vector<int> SelectKBestWeighted(const torch::Tensor& weights, bool ascending = false, unsigned k = 0);
double conditionalEntropy(Tensor&, Tensor&); std::vector<double> getScoresKBest() const;
double mutualInformation(Tensor&, Tensor&); double mutualInformation(const torch::Tensor& firstFeature, const torch::Tensor& secondFeature, const torch::Tensor& weights);
vector<float> conditionalEdgeWeights(); // To use in Python std::vector<float> conditionalEdgeWeights(std::vector<float>& weights); // To use in Python
Tensor conditionalEdge(); torch::Tensor conditionalEdge(const torch::Tensor& weights);
vector<pair<string, string>> doCombinations(const vector<string>&); std::vector<std::pair<int, int>> maximumSpanningTree(const std::vector<std::string>& features, const torch::Tensor& weights, const int root);
vector<pair<int, int>> maximumSpanningTree(vector<string> features, Tensor& weights, int root);
}; };
} }
#endif #endif

197
src/BayesNet/BoostAODE.cc Normal file
View File

@@ -0,0 +1,197 @@
#include <set>
#include <functional>
#include <limits.h>
#include "BoostAODE.h"
#include "Colors.h"
#include "Folding.h"
#include "Paths.h"
#include "CFS.h"
#include "FCBF.h"
#include "IWSS.h"
namespace bayesnet {
// Default constructor: starts from a default Ensemble and publishes the names of the
// hyperparameters that setHyperparameters() reads from its json argument.
BoostAODE::BoostAODE() : Ensemble()
{
    validHyperparameters = {
        "repeatSparent",
        "maxModels",
        "ascending",
        "convergence",
        "threshold",
        "select_features"
    };
}
// Resets the ensemble and prepares the tensors used for training.
//
// No SPODE is created here; trainModel adds the members one at a time.
// - convergence == false: every sample of `dataset` is used for training.
// - convergence == true : the samples are split with fold 0 of a stratified 5-fold
//   partition (third argument 271, presumably the random seed). `dataset_` keeps a copy
//   of the full input, `dataset` is rebuilt from the training samples only and
//   X_test / y_test hold the validation samples.
// `weights` is part of the overridden interface and is not used by this implementation.
void BoostAODE::buildModel(const torch::Tensor& weights)
{
    // Models shall be built in trainModel
    models.clear();
    n_models = 0;
    // The last row of dataset holds the class labels
    auto y_ = dataset.index({ -1, "..." });
    if (!convergence) {
        // Use all data to train
        X_train = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." });
        y_train = y_;
        return;
    }
    // Prepare train & validation sets from train data
    auto fold = platform::StratifiedKFold(5, y_, 271);
    // Save the input dataset before it is replaced by the training split
    dataset_ = torch::clone(dataset);
    auto [train, test] = fold.getFold(0);
    auto train_t = torch::tensor(train);
    auto test_t = torch::tensor(test);
    // Get train and validation sets (all rows but the last one are features)
    const auto featureRows = torch::indexing::Slice(0, dataset.size(0) - 1);
    X_train = dataset.index({ featureRows, train_t });
    y_train = dataset.index({ -1, train_t });
    X_test = dataset.index({ featureRows, test_t });
    y_test = dataset.index({ -1, test_t });
    dataset = X_train;
    m = X_train.size(1);
    // Build dataset with train data (appends y_train as the last row)
    buildDataset(y_train);
    // Metrics reads the labels from the last row of the tensor it is given, so it has to
    // be created only after the label row has been appended to the training dataset.
    auto n_classes = states.at(className).size();
    metrics = Metrics(dataset, features, className, n_classes);
}
// Copies the recognised hyperparameters from `hyperparameters` into the classifier.
//
// Keys read: "repeatSparent", "maxModels", "ascending", "convergence", "threshold" and
// "select_features". Keys that are absent leave the current value untouched.
// Throws std::invalid_argument when "select_features" is not one of IWSS, FCBF or CFS;
// in that case the feature-selection settings are left unchanged.
void BoostAODE::setHyperparameters(const nlohmann::json& hyperparameters)
{
    if (hyperparameters.contains("repeatSparent")) {
        repeatSparent = hyperparameters["repeatSparent"];
    }
    if (hyperparameters.contains("maxModels")) {
        maxModels = hyperparameters["maxModels"];
    }
    if (hyperparameters.contains("ascending")) {
        ascending = hyperparameters["ascending"];
    }
    if (hyperparameters.contains("convergence")) {
        convergence = hyperparameters["convergence"];
    }
    if (hyperparameters.contains("threshold")) {
        threshold = hyperparameters["threshold"];
    }
    if (hyperparameters.contains("select_features")) {
        const std::string selectedAlgorithm = hyperparameters["select_features"].get<std::string>();
        const std::vector<std::string> algos = { "IWSS", "FCBF", "CFS" };
        // Validate first: a rejected value must not switch feature selection on
        if (std::find(algos.begin(), algos.end(), selectedAlgorithm) == algos.end()) {
            throw std::invalid_argument("Invalid selectFeatures value [IWSS, FCBF, CFS]");
        }
        selectFeatures = true;
        algorithm = selectedAlgorithm;
    }
}
std::unordered_set<int> BoostAODE::initializeModels()
{
std::unordered_set<int> featuresUsed;
torch::Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64);
int maxFeatures = 0;
if (algorithm == "CFS") {
featureSelector = new CFS(dataset, features, className, maxFeatures, states.at(className).size(), weights_);
} else if (algorithm == "IWSS") {
if (threshold < 0 || threshold >0.5) {
throw std::invalid_argument("Invalid threshold value for IWSS [0, 0.5]");
}
featureSelector = new IWSS(dataset, features, className, maxFeatures, states.at(className).size(), weights_, threshold);
} else if (algorithm == "FCBF") {
if (threshold < 1e-7 || threshold > 1) {
throw std::invalid_argument("Invalid threshold value [1e-7, 1]");
}
featureSelector = new FCBF(dataset, features, className, maxFeatures, states.at(className).size(), weights_, threshold);
}
featureSelector->fit();
auto cfsFeatures = featureSelector->getFeatures();
for (const int& feature : cfsFeatures) {
// std::cout << "Feature: [" << feature << "] " << feature << " " << features.at(feature) << std::endl;
featuresUsed.insert(feature);
std::unique_ptr<Classifier> model = std::make_unique<SPODE>(feature);
model->fit(dataset, features, className, states, weights_);
models.push_back(std::move(model));
significanceModels.push_back(1.0);
n_models++;
}
delete featureSelector;
return featuresUsed;
}
// Boosting loop: repeatedly picks a feature, fits a SPODE with that feature as super-parent
// using the current sample weights, re-weights the samples and stores the model together
// with its significance (alpha_t).
//
// The loop ends when one of these holds:
//   - every ranked feature has already been used (only checked when repeatSparent is false
//     or there are still unused features),
//   - n_models >= maxModels and repeatSparent is true,
//   - the weighted error epsilon_t of the last model is above 0.5,
//   - with convergence enabled, the validation accuracy gain fell below 1e-4 more than
//     `tolerance` times (tolerance is 5, or 0 when feature selection seeded the ensemble).
// Sets status to WARNING when not every feature ended up being used.
// `weights` is part of the overridden interface; the loop starts from uniform weights.
void BoostAODE::trainModel(const torch::Tensor& weights)
{
    std::unordered_set<int> featuresUsed;
    int tolerance = 5; // number of times the accuracy can be lower than the threshold
    if (selectFeatures) {
        featuresUsed = initializeModels();
        tolerance = 0; // Remove tolerance if features are selected
    }
    if (maxModels == 0)
        maxModels = .1 * n > 10 ? .1 * n : n;
    torch::Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64);
    bool exitCondition = false;
    // Variables to control the accuracy finish condition
    double priorAccuracy = 0.0;
    double delta = 1.0;
    // Named differently from the `threshold` hyperparameter member to avoid shadowing it
    const double convergenceThreshold = 1e-4;
    int count = 0; // number of times the accuracy is lower than the threshold
    fitted = true; // to enable predict
    while (!exitCondition) {
        // Step 1: Build ranking with mutual information
        auto featureSelection = metrics.SelectKBestWeighted(weights_, ascending, n); // Get all the features sorted
        if (featureSelection.empty()) {
            // Nothing to rank: there is no feature to build a model on
            break;
        }
        std::unique_ptr<Classifier> model;
        auto feature = featureSelection[0];
        if (!repeatSparent || featuresUsed.size() < featureSelection.size()) {
            // Take the best ranked feature that has not been used yet
            bool used = true;
            for (const auto& feat : featureSelection) {
                if (featuresUsed.count(feat) != 0) {
                    continue;
                }
                used = false;
                feature = feat;
                break;
            }
            if (used) {
                // Ran out of features
                exitCondition = true;
                continue;
            }
        }
        featuresUsed.insert(feature);
        model = std::make_unique<SPODE>(feature);
        model->fit(dataset, features, className, states, weights_);
        auto ypred = model->predict(X_train);
        // Step 3.1: Compute the classifier amount of say
        auto mask_wrong = (ypred != y_train).to(weights_.dtype());
        auto mask_right = (ypred == y_train).to(weights_.dtype());
        auto masked_weights = weights_ * mask_wrong;
        const double epsilon_t = masked_weights.sum().item<double>();
        // The ratio is only evaluated when epsilon_t is not zero
        const double alpha_t = epsilon_t == 0 ? 1 : 0.5 * log((1 - epsilon_t) / epsilon_t);
        // Step 3.2: Update weights for next classifier
        // NOTE(review): both updates ADD the scaled weights to the current ones
        // (w += mask * exp(+-alpha_t) * w) instead of multiplying; confirm this is intended.
        // Step 3.2.1: Update weights of wrong samples
        weights_ += mask_wrong * exp(alpha_t) * weights_;
        // Step 3.2.2: Update weights of right samples
        weights_ += mask_right * exp(-alpha_t) * weights_;
        // Step 3.3: Normalise the weights
        const double totalWeights = torch::sum(weights_).item<double>();
        weights_ = weights_ / totalWeights;
        // Step 3.4: Store classifier and its accuracy to weigh its future vote
        models.push_back(std::move(model));
        significanceModels.push_back(alpha_t);
        n_models++;
        if (convergence) {
            auto y_val_predict = predict(X_test);
            const double accuracy = (y_val_predict == y_test).sum().item<double>() / (double)y_test.size(0);
            // NOTE(review): priorAccuracy is assigned only while it is still 0, so delta is
            // measured against the first non-zero validation accuracy; confirm this is intended.
            if (priorAccuracy == 0) {
                priorAccuracy = accuracy;
            } else {
                delta = accuracy - priorAccuracy;
            }
            if (delta < convergenceThreshold) {
                count++;
            }
        }
        exitCondition = (n_models >= maxModels && repeatSparent) || epsilon_t > 0.5 || count > tolerance;
    }
    if (featuresUsed.size() != features.size()) {
        status = WARNING;
    }
}
// Returns the graph description lines of the ensemble; the work is delegated unchanged
// to Ensemble::graph with the given title.
std::vector<std::string> BoostAODE::graph(const std::string& title) const
{
    auto lines = Ensemble::graph(title);
    return lines;
}
}

32
src/BayesNet/BoostAODE.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef BOOSTAODE_H
#define BOOSTAODE_H
#include "Ensemble.h"
#include <map>
#include "SPODE.h"
#include "FeatureSelect.h"
namespace bayesnet {
// Ensemble classifier whose members are SPODE models added one at a time by a boosting
// loop (see trainModel); behaviour is tuned through setHyperparameters.
class BoostAODE : public Ensemble {
public:
BoostAODE();
virtual ~BoostAODE() = default;
// Graph description of the ensemble; `title` defaults to "BoostAODE".
std::vector<std::string> graph(const std::string& title = "BoostAODE") const override;
// Reads the hyperparameters declared below from a json object.
void setHyperparameters(const nlohmann::json& hyperparameters) override;
protected:
// Clears the ensemble and prepares the train (and, with convergence, validation) tensors.
void buildModel(const torch::Tensor& weights) override;
// Runs the boosting loop that creates, fits and stores the SPODE members.
void trainModel(const torch::Tensor& weights) override;
private:
// Copy of the input dataset taken in buildModel when convergence is enabled.
torch::Tensor dataset_;
// Training tensors, plus the validation tensors (X_test, y_test) that are only assigned
// when convergence is enabled.
torch::Tensor X_train, y_train, X_test, y_test;
// Seeds the ensemble from the selected feature-selection algorithm and returns the
// indices of the features it picked.
std::unordered_set<int> initializeModels();
// Hyperparameters
bool repeatSparent = false; // if true, a feature can be selected more than once
// Upper bound on the number of models; 0 lets trainModel derive it from the number of features.
int maxModels = 0;
bool ascending = false; //Process KBest features ascending or descending order
bool convergence = false; //if true, stop when the model does not improve
bool selectFeatures = false; // if true, use feature selection
std::string algorithm = ""; // Selected feature selection algorithm
// NOTE(review): raw pointer, defaults to nullptr; check the implementation file for
// who allocates and releases it.
FeatureSelect* featureSelector = nullptr;
// Threshold handed to the IWSS / FCBF selectors; -1 until set through the hyperparameters.
double threshold = -1;
};
}
#endif

72
src/BayesNet/CFS.cc Normal file
View File

@@ -0,0 +1,72 @@
#include "CFS.h"
#include <limits>
#include "bayesnetUtils.h"
namespace bayesnet {
// Greedy forward selection driven by the CFS merit.
//
// The feature with the best score in suLabels seeds the selection. Each following step
// tentatively adds every remaining candidate, keeps the one giving the highest merit
// (computeMeritCFS) and removes it from the candidate list. The search stops when
// computeContinueCondition says so or when no candidate yields a comparable merit.
// selectedFeatures and selectedScores are filled in parallel (same length, same order).
void CFS::fit()
{
    initialize();
    computeSuLabels();
    auto featureOrder = argsort(suLabels); // sort descending order
    if (!featureOrder.empty()) {
        // Seed with the best ranked feature and take it out of the candidates, so it is
        // neither evaluated again nor left without its matching score.
        const auto seed = featureOrder.front();
        selectedFeatures.push_back(seed);
        selectedScores.push_back(suLabels[seed]);
        featureOrder.erase(featureOrder.begin());
    }
    auto continueCondition = true;
    while (continueCondition) {
        double merit = std::numeric_limits<double>::lowest();
        int bestFeature = -1;
        for (auto candidate : featureOrder) {
            selectedFeatures.push_back(candidate);
            // Compute merit with selectedFeatures
            auto meritNew = computeMeritCFS();
            if (meritNew > merit) {
                merit = meritNew;
                bestFeature = candidate;
            }
            selectedFeatures.pop_back();
        }
        if (bestFeature == -1) {
            // No candidates left, or meritNew has to be nan due to constant features
            break;
        }
        selectedFeatures.push_back(bestFeature);
        selectedScores.push_back(merit);
        featureOrder.erase(std::remove(featureOrder.begin(), featureOrder.end(), bestFeature), featureOrder.end());
        continueCondition = computeContinueCondition(featureOrder);
    }
    fitted = true;
}
// Decides whether the forward search in fit() should go on.
//
// Returns false when the requested number of features has been reached, when there are
// no candidates left in featureOrder, or when the last five selected scores show no
// improvement from one to the next. Returns true otherwise.
bool CFS::computeContinueCondition(const std::vector<int>& featureOrder)
{
    if (featureOrder.size() == 0 || selectedFeatures.size() == maxFeatures) {
        return false;
    }
    if (selectedScores.size() < 5) {
        // Not enough history yet to apply the stopping criterion
        return true;
    }
    /*
    "To prevent the best first search from exploring the entire
    feature subset search space, a stopping criterion is imposed.
    The search will terminate if five consecutive fully expanded
    subsets show no improvement over the current best subset."
    as stated in Mark A.Hall Thesis
    */
    const double unset = std::numeric_limits<double>::lowest();
    double previous = unset;
    int nonImproving = 0;
    for (auto it = selectedScores.end() - 5; it != selectedScores.end(); ++it) {
        const double current = *it;
        if (previous == unset) {
            // First score of the window (or sentinel value): nothing to compare against
            previous = current;
        }
        if (current > previous) {
            // An improvement was found inside the window
            break;
        }
        ++nonImproving;
        previous = current;
    }
    return nonImproving != 5;
}
}

20
src/BayesNet/CFS.h Normal file
View File

@@ -0,0 +1,20 @@
#ifndef CFS_H
#define CFS_H
#include <torch/torch.h>
#include <vector>
#include "FeatureSelect.h"
namespace bayesnet {
// Feature selector that grows the selected subset greedily using the CFS merit (see fit()).
class CFS : public FeatureSelect {
public:
    // samples is an (n+1) x m tensor of integers whose last row (samples[-1]) holds the labels y.
    // All arguments are forwarded unchanged to FeatureSelect.
    CFS(const torch::Tensor& samples,
        const std::vector<std::string>& features,
        const std::string& className,
        const int maxFeatures,
        const int classNumStates,
        const torch::Tensor& weights)
        : FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights) {}
    virtual ~CFS() = default;
    // Runs the selection.
    void fit() override;
private:
    // Stopping rule of the search; false means "stop".
    bool computeContinueCondition(const std::vector<int>& featureOrder);
};
}
#endif

View File

@@ -1,5 +1,12 @@
include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp) include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
include_directories(${BayesNet_SOURCE_DIR}/lib/Files) include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
include_directories(${BayesNet_SOURCE_DIR}/lib/json/include)
include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
include_directories(${BayesNet_SOURCE_DIR}/src/PyClassifiers)
include_directories(${Python3_INCLUDE_DIRS})
add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc
KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc Mst.cc Proposal.cc) KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc
target_link_libraries(BayesNet mdlp ArffFiles "${TORCH_LIBRARIES}") Mst.cc Proposal.cc CFS.cc FCBF.cc IWSS.cc FeatureSelect.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}")

View File

@@ -2,113 +2,125 @@
#include "bayesnetUtils.h" #include "bayesnetUtils.h"
namespace bayesnet { namespace bayesnet {
using namespace torch;
Classifier::Classifier(Network model) : model(model), m(0), n(0), metrics(Metrics()), fitted(false) {} Classifier::Classifier(Network model) : model(model), m(0), n(0), metrics(Metrics()), fitted(false) {}
Classifier& Classifier::build(vector<string>& features, string className, map<string, vector<int>>& states) Classifier& Classifier::build(const std::vector<std::string>& features, const std::string& className, std::map<std::string, std::vector<int>>& states, const torch::Tensor& weights)
{ {
Tensor ytmp = torch::transpose(y.view({ y.size(0), 1 }), 0, 1);
samples = torch::cat({ X, ytmp }, 0);
this->features = features; this->features = features;
this->className = className; this->className = className;
this->states = states; this->states = states;
m = dataset.size(1);
n = dataset.size(0) - 1;
checkFitParameters(); checkFitParameters();
auto n_classes = states[className].size(); auto n_classes = states.at(className).size();
metrics = Metrics(samples, features, className, n_classes); metrics = Metrics(dataset, features, className, n_classes);
model.initialize(); model.initialize();
train(); buildModel(weights);
if (Xv.empty()) { trainModel(weights);
// fit with tensors
model.fit(X, y, features, className);
} else {
// fit with vectors
model.fit(Xv, yv, features, className);
}
fitted = true; fitted = true;
return *this; return *this;
} }
// X is nxm where n is the number of features and m the number of samples void Classifier::buildDataset(torch::Tensor& ytmp)
Classifier& Classifier::fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states)
{ {
this->X = X; try {
this->y = y; auto yresized = torch::transpose(ytmp.view({ ytmp.size(0), 1 }), 0, 1);
Xv = vector<vector<int>>(); dataset = torch::cat({ dataset, yresized }, 0);
yv = vector<int>(y.data_ptr<int>(), y.data_ptr<int>() + y.size(0)); }
return build(features, className, states); catch (const std::exception& e) {
} std::cerr << e.what() << '\n';
void Classifier::generateTensorXFromVector() std::cout << "X dimensions: " << dataset.sizes() << "\n";
{ std::cout << "y dimensions: " << ytmp.sizes() << "\n";
X = torch::zeros({ static_cast<int>(Xv.size()), static_cast<int>(Xv[0].size()) }, kInt32); exit(1);
for (int i = 0; i < Xv.size(); ++i) {
X.index_put_({ i, "..." }, torch::tensor(Xv[i], kInt32));
} }
} }
// X is nxm where n is the number of features and m the number of samples void Classifier::trainModel(const torch::Tensor& weights)
Classifier& Classifier::fit(vector<vector<int>>& X, vector<int>& y, vector<string>& features, string className, map<string, vector<int>>& states)
{ {
Xv = X; model.fit(dataset, weights, features, className, states);
generateTensorXFromVector(); }
this->y = torch::tensor(y, kInt32); // X is nxm where n is the number of features and m the number of samples
yv = y; Classifier& Classifier::fit(torch::Tensor& X, torch::Tensor& y, const std::vector<std::string>& features, const std::string& className, std::map<std::string, std::vector<int>>& states)
return build(features, className, states); {
dataset = X;
buildDataset(y);
const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kDouble);
return build(features, className, states, weights);
}
// X is nxm where n is the number of features and m the number of samples
Classifier& Classifier::fit(std::vector<std::vector<int>>& X, std::vector<int>& y, const std::vector<std::string>& features, const std::string& className, std::map<std::string, std::vector<int>>& states)
{
dataset = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kInt32);
for (int i = 0; i < X.size(); ++i) {
dataset.index_put_({ i, "..." }, torch::tensor(X[i], torch::kInt32));
}
auto ytmp = torch::tensor(y, torch::kInt32);
buildDataset(ytmp);
const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kDouble);
return build(features, className, states, weights);
}
Classifier& Classifier::fit(torch::Tensor& dataset, const std::vector<std::string>& features, const std::string& className, std::map<std::string, std::vector<int>>& states)
{
this->dataset = dataset;
const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kDouble);
return build(features, className, states, weights);
}
Classifier& Classifier::fit(torch::Tensor& dataset, const std::vector<std::string>& features, const std::string& className, std::map<std::string, std::vector<int>>& states, const torch::Tensor& weights)
{
this->dataset = dataset;
return build(features, className, states, weights);
} }
void Classifier::checkFitParameters() void Classifier::checkFitParameters()
{ {
auto sizes = X.sizes(); if (torch::is_floating_point(dataset)) {
m = sizes[1]; throw std::invalid_argument("dataset (X, y) must be of type Integer");
n = sizes[0];
if (m != y.size(0)) {
throw invalid_argument("X and y must have the same number of samples");
} }
if (n != features.size()) { if (n != features.size()) {
throw invalid_argument("X and features must have the same number of features"); throw std::invalid_argument("Classifier: X " + std::to_string(n) + " and features " + std::to_string(features.size()) + " must have the same number of features");
} }
if (states.find(className) == states.end()) { if (states.find(className) == states.end()) {
throw invalid_argument("className not found in states"); throw std::invalid_argument("className not found in states");
} }
for (auto feature : features) { for (auto feature : features) {
if (states.find(feature) == states.end()) { if (states.find(feature) == states.end()) {
throw invalid_argument("feature [" + feature + "] not found in states"); throw std::invalid_argument("feature [" + feature + "] not found in states");
} }
} }
} }
Tensor Classifier::predict(Tensor& X) torch::Tensor Classifier::predict(torch::Tensor& X)
{ {
if (!fitted) { if (!fitted) {
throw logic_error("Classifier has not been fitted"); throw std::logic_error("Classifier has not been fitted");
} }
return model.predict(X); return model.predict(X);
} }
vector<int> Classifier::predict(vector<vector<int>>& X) std::vector<int> Classifier::predict(std::vector<std::vector<int>>& X)
{ {
if (!fitted) { if (!fitted) {
throw logic_error("Classifier has not been fitted"); throw std::logic_error("Classifier has not been fitted");
} }
auto m_ = X[0].size(); auto m_ = X[0].size();
auto n_ = X.size(); auto n_ = X.size();
vector<vector<int>> Xd(n_, vector<int>(m_, 0)); std::vector<std::vector<int>> Xd(n_, std::vector<int>(m_, 0));
for (auto i = 0; i < n_; i++) { for (auto i = 0; i < n_; i++) {
Xd[i] = vector<int>(X[i].begin(), X[i].end()); Xd[i] = std::vector<int>(X[i].begin(), X[i].end());
} }
auto yp = model.predict(Xd); auto yp = model.predict(Xd);
return yp; return yp;
} }
float Classifier::score(Tensor& X, Tensor& y) float Classifier::score(torch::Tensor& X, torch::Tensor& y)
{ {
if (!fitted) { if (!fitted) {
throw logic_error("Classifier has not been fitted"); throw std::logic_error("Classifier has not been fitted");
} }
Tensor y_pred = predict(X); torch::Tensor y_pred = predict(X);
return (y_pred == y).sum().item<float>() / y.size(0); return (y_pred == y).sum().item<float>() / y.size(0);
} }
float Classifier::score(vector<vector<int>>& X, vector<int>& y) float Classifier::score(std::vector<std::vector<int>>& X, std::vector<int>& y)
{ {
if (!fitted) { if (!fitted) {
throw logic_error("Classifier has not been fitted"); throw std::logic_error("Classifier has not been fitted");
} }
return model.score(X, y); return model.score(X, y);
} }
vector<string> Classifier::show() std::vector<std::string> Classifier::show() const
{ {
return model.show(); return model.show();
} }
@@ -120,26 +132,29 @@ namespace bayesnet {
} }
model.addNode(className); model.addNode(className);
} }
int Classifier::getNumberOfNodes() int Classifier::getNumberOfNodes() const
{ {
// Features does not include class // Features does not include class
return fitted ? model.getFeatures().size() + 1 : 0; return fitted ? model.getFeatures().size() : 0;
} }
int Classifier::getNumberOfEdges() int Classifier::getNumberOfEdges() const
{ {
return fitted ? model.getEdges().size() : 0; return fitted ? model.getNumEdges() : 0;
} }
int Classifier::getNumberOfStates() int Classifier::getNumberOfStates() const
{ {
return fitted ? model.getStates() : 0; return fitted ? model.getStates() : 0;
} }
vector<string> Classifier::topological_order() std::vector<std::string> Classifier::topological_order()
{ {
return model.topological_sort(); return model.topological_sort();
} }
void Classifier::dump_cpt() void Classifier::dump_cpt() const
{ {
model.dump_cpt(); model.dump_cpt();
} }
void Classifier::setHyperparameters(const nlohmann::json& hyperparameters)
{
//For classifiers that don't have hyperparameters
}
} }

View File

@@ -4,45 +4,46 @@
#include "BaseClassifier.h" #include "BaseClassifier.h"
#include "Network.h" #include "Network.h"
#include "BayesMetrics.h" #include "BayesMetrics.h"
using namespace std;
using namespace torch;
namespace bayesnet { namespace bayesnet {
class Classifier : public BaseClassifier { class Classifier : public BaseClassifier {
private: private:
bool fitted; Classifier& build(const std::vector<std::string>& features, const std::string& className, std::map<std::string, std::vector<int>>& states, const torch::Tensor& weights);
Classifier& build(vector<string>& features, string className, map<string, vector<int>>& states);
protected: protected:
Network model; bool fitted;
int m, n; // m: number of samples, n: number of features int m, n; // m: number of samples, n: number of features
Tensor X; // nxm tensor Network model;
vector<vector<int>> Xv; // nxm vector
Tensor y;
vector<int> yv;
Tensor samples; // (n+1)xm tensor
Metrics metrics; Metrics metrics;
vector<string> features; std::vector<std::string> features;
string className; std::string className;
map<string, vector<int>> states; std::map<std::string, std::vector<int>> states;
torch::Tensor dataset; // (n+1)xm tensor
status_t status = NORMAL;
void checkFitParameters(); void checkFitParameters();
void generateTensorXFromVector(); virtual void buildModel(const torch::Tensor& weights) = 0;
virtual void train() = 0; void trainModel(const torch::Tensor& weights) override;
void buildDataset(torch::Tensor& y);
public: public:
Classifier(Network model); Classifier(Network model);
virtual ~Classifier() = default; virtual ~Classifier() = default;
Classifier& fit(vector<vector<int>>& X, vector<int>& y, vector<string>& features, string className, map<string, vector<int>>& states) override; Classifier& fit(std::vector<std::vector<int>>& X, std::vector<int>& y, const std::vector<std::string>& features, const std::string& className, std::map<std::string, std::vector<int>>& states) override;
Classifier& fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states) override; Classifier& fit(torch::Tensor& X, torch::Tensor& y, const std::vector<std::string>& features, const std::string& className, std::map<std::string, std::vector<int>>& states) override;
Classifier& fit(torch::Tensor& dataset, const std::vector<std::string>& features, const std::string& className, std::map<std::string, std::vector<int>>& states) override;
Classifier& fit(torch::Tensor& dataset, const std::vector<std::string>& features, const std::string& className, std::map<std::string, std::vector<int>>& states, const torch::Tensor& weights) override;
void addNodes(); void addNodes();
int getNumberOfNodes() override; int getNumberOfNodes() const override;
int getNumberOfEdges() override; int getNumberOfEdges() const override;
int getNumberOfStates() override; int getNumberOfStates() const override;
Tensor predict(Tensor& X) override; torch::Tensor predict(torch::Tensor& X) override;
vector<int> predict(vector<vector<int>>& X) override; status_t getStatus() const override { return status; }
float score(Tensor& X, Tensor& y) override; std::string getVersion() override { return "0.2.0"; };
float score(vector<vector<int>>& X, vector<int>& y) override; std::vector<int> predict(std::vector<std::vector<int>>& X) override;
vector<string> show() override; float score(torch::Tensor& X, torch::Tensor& y) override;
vector<string> topological_order() override; float score(std::vector<std::vector<int>>& X, std::vector<int>& y) override;
void dump_cpt() override; std::vector<std::string> show() const override;
std::vector<std::string> topological_order() override;
void dump_cpt() const override;
void setHyperparameters(const nlohmann::json& hyperparameters) override; //For classifiers that don't have hyperparameters
}; };
} }
#endif #endif

View File

@@ -1,65 +1,29 @@
#include "Ensemble.h" #include "Ensemble.h"
namespace bayesnet { namespace bayesnet {
using namespace torch;
Ensemble::Ensemble() : n_models(0), metrics(Metrics()), fitted(false) {} Ensemble::Ensemble() : Classifier(Network()), n_models(0) {}
Ensemble& Ensemble::build(vector<string>& features, string className, map<string, vector<int>>& states)
void Ensemble::trainModel(const torch::Tensor& weights)
{ {
Tensor ytmp = torch::transpose(y.view({ y.size(0), 1 }), 0, 1);
samples = torch::cat({ X, ytmp }, 0);
this->features = features;
this->className = className;
this->states = states;
auto n_classes = states[className].size();
metrics = Metrics(samples, features, className, n_classes);
// Build models
train();
// Train models
n_models = models.size(); n_models = models.size();
for (auto i = 0; i < n_models; ++i) { for (auto i = 0; i < n_models; ++i) {
if (Xv.empty()) { // fit with std::vectors
// fit with tensors models[i]->fit(dataset, features, className, states);
models[i]->fit(X, y, features, className, states);
} else {
// fit with vectors
models[i]->fit(Xv, yv, features, className, states);
}
}
fitted = true;
return *this;
}
void Ensemble::generateTensorXFromVector()
{
X = torch::zeros({ static_cast<int>(Xv.size()), static_cast<int>(Xv[0].size()) }, kInt32);
for (int i = 0; i < Xv.size(); ++i) {
X.index_put_({ i, "..." }, torch::tensor(Xv[i], kInt32));
} }
} }
Ensemble& Ensemble::fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states) std::vector<int> Ensemble::voting(torch::Tensor& y_pred)
{
this->X = X;
this->y = y;
Xv = vector<vector<int>>();
yv = vector<int>(y.data_ptr<int>(), y.data_ptr<int>() + y.size(0));
return build(features, className, states);
}
Ensemble& Ensemble::fit(vector<vector<int>>& X, vector<int>& y, vector<string>& features, string className, map<string, vector<int>>& states)
{
Xv = X;
generateTensorXFromVector();
this->y = torch::tensor(y, kInt32);
yv = y;
return build(features, className, states);
}
vector<int> Ensemble::voting(Tensor& y_pred)
{ {
auto y_pred_ = y_pred.accessor<int, 2>(); auto y_pred_ = y_pred.accessor<int, 2>();
vector<int> y_pred_final; std::vector<int> y_pred_final;
int numClasses = states.at(className).size();
// y_pred is m x n_models with the prediction of every model for each sample
for (int i = 0; i < y_pred.size(0); ++i) { for (int i = 0; i < y_pred.size(0); ++i) {
vector<float> votes(y_pred.size(1), 0); // votes store in each index (value of class) the significance added by each model
for (int j = 0; j < y_pred.size(1); ++j) { // i.e. votes[0] contains how much value has the value 0 of class. That value is generated by the models predictions
votes[y_pred_[i][j]] += 1; std::vector<double> votes(numClasses, 0.0);
for (int j = 0; j < n_models; ++j) {
votes[y_pred_[i][j]] += significanceModels.at(j);
} }
// argsort in descending order // argsort in descending order
auto indices = argsort(votes); auto indices = argsort(votes);
@@ -67,19 +31,18 @@ namespace bayesnet {
} }
return y_pred_final; return y_pred_final;
} }
Tensor Ensemble::predict(Tensor& X) torch::Tensor Ensemble::predict(torch::Tensor& X)
{ {
if (!fitted) { if (!fitted) {
throw logic_error("Ensemble has not been fitted"); throw std::logic_error("Ensemble has not been fitted");
} }
Tensor y_pred = torch::zeros({ X.size(1), n_models }, kInt32); torch::Tensor y_pred = torch::zeros({ X.size(1), n_models }, torch::kInt32);
//Create a threadpool auto threads{ std::vector<std::thread>() };
auto threads{ vector<thread>() }; std::mutex mtx;
mutex mtx;
for (auto i = 0; i < n_models; ++i) { for (auto i = 0; i < n_models; ++i) {
threads.push_back(thread([&, i]() { threads.push_back(std::thread([&, i]() {
auto ypredict = models[i]->predict(X); auto ypredict = models[i]->predict(X);
lock_guard<mutex> lock(mtx); std::lock_guard<std::mutex> lock(mtx);
y_pred.index_put_({ "...", i }, ypredict); y_pred.index_put_({ "...", i }, ypredict);
})); }));
} }
@@ -88,27 +51,27 @@ namespace bayesnet {
} }
return torch::tensor(voting(y_pred)); return torch::tensor(voting(y_pred));
} }
vector<int> Ensemble::predict(vector<vector<int>>& X) std::vector<int> Ensemble::predict(std::vector<std::vector<int>>& X)
{ {
if (!fitted) { if (!fitted) {
throw logic_error("Ensemble has not been fitted"); throw std::logic_error("Ensemble has not been fitted");
} }
long m_ = X[0].size(); long m_ = X[0].size();
long n_ = X.size(); long n_ = X.size();
vector<vector<int>> Xd(n_, vector<int>(m_, 0)); std::vector<std::vector<int>> Xd(n_, std::vector<int>(m_, 0));
for (auto i = 0; i < n_; i++) { for (auto i = 0; i < n_; i++) {
Xd[i] = vector<int>(X[i].begin(), X[i].end()); Xd[i] = std::vector<int>(X[i].begin(), X[i].end());
} }
Tensor y_pred = torch::zeros({ m_, n_models }, kInt32); torch::Tensor y_pred = torch::zeros({ m_, n_models }, torch::kInt32);
for (auto i = 0; i < n_models; ++i) { for (auto i = 0; i < n_models; ++i) {
y_pred.index_put_({ "...", i }, torch::tensor(models[i]->predict(Xd), kInt32)); y_pred.index_put_({ "...", i }, torch::tensor(models[i]->predict(Xd), torch::kInt32));
} }
return voting(y_pred); return voting(y_pred);
} }
float Ensemble::score(Tensor& X, Tensor& y) float Ensemble::score(torch::Tensor& X, torch::Tensor& y)
{ {
if (!fitted) { if (!fitted) {
throw logic_error("Ensemble has not been fitted"); throw std::logic_error("Ensemble has not been fitted");
} }
auto y_pred = predict(X); auto y_pred = predict(X);
int correct = 0; int correct = 0;
@@ -119,10 +82,10 @@ namespace bayesnet {
} }
return (double)correct / y_pred.size(0); return (double)correct / y_pred.size(0);
} }
float Ensemble::score(vector<vector<int>>& X, vector<int>& y) float Ensemble::score(std::vector<std::vector<int>>& X, std::vector<int>& y)
{ {
if (!fitted) { if (!fitted) {
throw logic_error("Ensemble has not been fitted"); throw std::logic_error("Ensemble has not been fitted");
} }
auto y_pred = predict(X); auto y_pred = predict(X);
int correct = 0; int correct = 0;
@@ -132,27 +95,26 @@ namespace bayesnet {
} }
} }
return (double)correct / y_pred.size(); return (double)correct / y_pred.size();
} }
vector<string> Ensemble::show() std::vector<std::string> Ensemble::show() const
{ {
auto result = vector<string>(); auto result = std::vector<std::string>();
for (auto i = 0; i < n_models; ++i) { for (auto i = 0; i < n_models; ++i) {
auto res = models[i]->show(); auto res = models[i]->show();
result.insert(result.end(), res.begin(), res.end()); result.insert(result.end(), res.begin(), res.end());
} }
return result; return result;
} }
vector<string> Ensemble::graph(const string& title) std::vector<std::string> Ensemble::graph(const std::string& title) const
{ {
auto result = vector<string>(); auto result = std::vector<std::string>();
for (auto i = 0; i < n_models; ++i) { for (auto i = 0; i < n_models; ++i) {
auto res = models[i]->graph(title + "_" + to_string(i)); auto res = models[i]->graph(title + "_" + std::to_string(i));
result.insert(result.end(), res.begin(), res.end()); result.insert(result.end(), res.begin(), res.end());
} }
return result; return result;
} }
int Ensemble::getNumberOfNodes() int Ensemble::getNumberOfNodes() const
{ {
int nodes = 0; int nodes = 0;
for (auto i = 0; i < n_models; ++i) { for (auto i = 0; i < n_models; ++i) {
@@ -160,7 +122,7 @@ namespace bayesnet {
} }
return nodes; return nodes;
} }
int Ensemble::getNumberOfEdges() int Ensemble::getNumberOfEdges() const
{ {
int edges = 0; int edges = 0;
for (auto i = 0; i < n_models; ++i) { for (auto i = 0; i < n_models; ++i) {
@@ -168,7 +130,7 @@ namespace bayesnet {
} }
return edges; return edges;
} }
int Ensemble::getNumberOfStates() int Ensemble::getNumberOfStates() const
{ {
int nstates = 0; int nstates = 0;
for (auto i = 0; i < n_models; ++i) { for (auto i = 0; i < n_models; ++i) {

View File

@@ -4,48 +4,34 @@
#include "Classifier.h" #include "Classifier.h"
#include "BayesMetrics.h" #include "BayesMetrics.h"
#include "bayesnetUtils.h" #include "bayesnetUtils.h"
using namespace std;
using namespace torch;
namespace bayesnet { namespace bayesnet {
class Ensemble : public BaseClassifier { class Ensemble : public Classifier {
private: private:
Ensemble& build(vector<string>& features, string className, map<string, vector<int>>& states); Ensemble& build(std::vector<std::string>& features, std::string className, std::map<std::string, std::vector<int>>& states);
protected: protected:
unsigned n_models; unsigned n_models;
bool fitted; std::vector<std::unique_ptr<Classifier>> models;
vector<unique_ptr<Classifier>> models; std::vector<double> significanceModels;
Tensor X; void trainModel(const torch::Tensor& weights) override;
vector<vector<int>> Xv; std::vector<int> voting(torch::Tensor& y_pred);
Tensor y;
vector<int> yv;
Tensor samples;
Metrics metrics;
vector<string> features;
string className;
map<string, vector<int>> states;
void virtual train() = 0;
vector<int> voting(Tensor& y_pred);
void generateTensorXFromVector();
public: public:
Ensemble(); Ensemble();
virtual ~Ensemble() = default; virtual ~Ensemble() = default;
Ensemble& fit(vector<vector<int>>& X, vector<int>& y, vector<string>& features, string className, map<string, vector<int>>& states) override; torch::Tensor predict(torch::Tensor& X) override;
Ensemble& fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states) override; std::vector<int> predict(std::vector<std::vector<int>>& X) override;
Tensor predict(Tensor& X) override; float score(torch::Tensor& X, torch::Tensor& y) override;
vector<int> predict(vector<vector<int>>& X) override; float score(std::vector<std::vector<int>>& X, std::vector<int>& y) override;
float score(Tensor& X, Tensor& y) override; int getNumberOfNodes() const override;
float score(vector<vector<int>>& X, vector<int>& y) override; int getNumberOfEdges() const override;
int getNumberOfNodes() override; int getNumberOfStates() const override;
int getNumberOfEdges() override; std::vector<std::string> show() const override;
int getNumberOfStates() override; std::vector<std::string> graph(const std::string& title) const override;
vector<string> show() override; std::vector<std::string> topological_order() override
vector<string> graph(const string& title) override;
vector<string> topological_order() override
{ {
return vector<string>(); return std::vector<std::string>();
} }
void dump_cpt() override void dump_cpt() const override
{ {
} }
}; };

44
src/BayesNet/FCBF.cc Normal file
View File

@@ -0,0 +1,44 @@
#include "bayesnetUtils.h"
#include "FCBF.h"
namespace bayesnet {
// Builds an FCBF selector on top of FeatureSelect.
// All arguments except `threshold` are forwarded unchanged to the base class.
// Throws std::invalid_argument when `threshold` is below 1e-7.
FCBF::FCBF(const torch::Tensor& samples,
    const std::vector<std::string>& features,
    const std::string& className,
    const int maxFeatures,
    const int classNumStates,
    const torch::Tensor& weights,
    const double threshold)
    : FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights),
    threshold(threshold)
{
    // `threshold` here is the constructor argument (it shadows the member).
    const bool tooSmall = threshold < 1e-7;
    if (tooSmall) {
        throw std::invalid_argument("Threshold cannot be less than 1e-7");
    }
}
// Selects features by walking the ranking produced by argsort(suLabels).
// For every feature still alive (score != 0.0) and at or above `threshold`,
// each later-ranked feature whose SU with the current feature is at least its
// own SU with the class gets its score zeroed, which marks it as removed.
// Stops at the first feature below `threshold` or once `maxFeatures` features
// have been selected. Results land in selectedFeatures / selectedScores.
void FCBF::fit()
{
    initialize();
    computeSuLabels();
    const auto ranking = argsort(suLabels); // sort descending order
    for (auto current = ranking.begin(); current != ranking.end(); ++current) {
        const auto& feature = *current;
        const double ownScore = suLabels.at(feature);
        if (ownScore == 0.0) {
            // Zeroed by an earlier, better-ranked feature: already discarded
            continue;
        }
        if (ownScore < threshold) {
            break;
        }
        // Compare only against the features ranked after this one (no self compare)
        auto rival = current;
        for (++rival; rival != ranking.end(); ++rival) {
            const double shared = computeSuFeatures(feature, *rival);
            if (shared >= suLabels.at(*rival)) {
                // Redundant with respect to the current feature: drop it
                suLabels[*rival] = 0.0;
            }
        }
        selectedFeatures.push_back(feature);
        selectedScores.push_back(suLabels[feature]);
        if (selectedFeatures.size() == maxFeatures) {
            break;
        }
    }
    fitted = true;
}
}

17
src/BayesNet/FCBF.h Normal file
View File

@@ -0,0 +1,17 @@
#ifndef FCBF_H
#define FCBF_H
#include <torch/torch.h>
#include <vector>
#include "FeatureSelect.h"
namespace bayesnet {
// Feature selector derived from FeatureSelect; adds a score threshold that the
// constructor validates and fit() uses as a stopping criterion.
class FCBF : public FeatureSelect {
public:
    // samples is an (n+1) x m tensor of integers where samples[-1] is the y vector
    FCBF(const torch::Tensor& samples,
        const std::vector<std::string>& features,
        const std::string& className,
        const int maxFeatures,
        const int classNumStates,
        const torch::Tensor& weights,
        const double threshold);
    virtual ~FCBF() = default;
    void fit() override;
private:
    // Minimum score a feature needs in fit(); -1 until set by the constructor
    double threshold = -1;
};
}
#endif

View File

@@ -0,0 +1,79 @@
#include "FeatureSelect.h"
#include <limits>
#include "bayesnetUtils.h"
namespace bayesnet {
// Forwards the dataset description to Metrics and stores the selection limits.
// A `maxFeatures` of 0 is replaced by the number of rows of `samples` minus one.
FeatureSelect::FeatureSelect(const torch::Tensor& samples,
    const std::vector<std::string>& features,
    const std::string& className,
    const int maxFeatures,
    const int classNumStates,
    const torch::Tensor& weights)
    : Metrics(samples, features, className, classNumStates),
    // Members are listed in the order the class declares them
    weights(weights),
    maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures)
{
}
void FeatureSelect::initialize()
{
selectedFeatures.clear();
selectedScores.clear();
}
// Returns the symmetrical uncertainty between rows `a` and `b` of `samples`
// (negative indices are accepted by the tensor index call, e.g. -1 for the last row).
// Yields 0.0 when both entropies are zero instead of the NaN that 0/0 would produce.
double FeatureSelect::symmetricalUncertainty(int a, int b)
{
    /*
    Compute symmetrical uncertainty. Normalize* information gain (mutual
    information) with the entropies of the features in order to compensate
    the bias due to high cardinality features. *Range [0, 1]
    (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
    */
    auto x = samples.index({ a, "..." });
    auto y = samples.index({ b, "..." });
    auto mu = mutualInformation(x, y, weights);
    auto hx = entropy(x, weights);
    auto hy = entropy(y, weights);
    const auto totalEntropy = hx + hy;
    if (totalEntropy == 0) {
        // Two constant variables share no information; avoid dividing 0 by 0
        return 0.0;
    }
    return 2.0 * mu / totalEntropy;
}
void FeatureSelect::computeSuLabels()
{
// Compute Simmetrical Uncertainty between features and labels
// https://en.wikipedia.org/wiki/Symmetric_uncertainty
for (int i = 0; i < features.size(); ++i) {
suLabels.push_back(symmetricalUncertainty(i, -1));
}
}
double FeatureSelect::computeSuFeatures(const int firstFeature, const int secondFeature)
{
// Compute Simmetrical Uncertainty between features
// https://en.wikipedia.org/wiki/Symmetric_uncertainty
try {
return suFeatures.at({ firstFeature, secondFeature });
}
catch (const std::out_of_range& e) {
double result = symmetricalUncertainty(firstFeature, secondFeature);
suFeatures[{firstFeature, secondFeature}] = result;
return result;
}
}
// Computes the merit of the current selectedFeatures set:
//   sum(SU(feature, class)) / sqrt(n + (n * n - n) * sum(SU(feature_i, feature_j)))
// where n is the number of selected features and the second sum runs over the
// pairs produced by doCombinations(selectedFeatures).
double FeatureSelect::computeMeritCFS()
{
    // Accumulated feature-class correlation
    double classCorrelation = 0;
    for (const auto selected : selectedFeatures) {
        classCorrelation += suLabels[selected];
    }
    // Accumulated feature-feature correlation
    double featureCorrelation = 0;
    const int count = selectedFeatures.size();
    for (const auto& pairing : doCombinations(selectedFeatures)) {
        featureCorrelation += computeSuFeatures(pairing.first, pairing.second);
    }
    const auto denominator = sqrt(count + (count * count - count) * featureCorrelation);
    return classCorrelation / denominator;
}
// Returns a copy of the selected feature indices.
// Throws std::runtime_error if fit() has not completed.
std::vector<int> FeatureSelect::getFeatures() const
{
    if (fitted) {
        return selectedFeatures;
    }
    throw std::runtime_error("FeatureSelect not fitted");
}
// Returns a copy of the scores recorded for the selected features.
// Throws std::runtime_error if fit() has not completed.
std::vector<double> FeatureSelect::getScores() const
{
    if (fitted) {
        return selectedScores;
    }
    throw std::runtime_error("FeatureSelect not fitted");
}
}

View File

@@ -0,0 +1,30 @@
#ifndef FEATURE_SELECT_H
#define FEATURE_SELECT_H
#include <torch/torch.h>
#include <vector>
#include "BayesMetrics.h"
namespace bayesnet {
// Base class for the feature selection algorithms built on Metrics.
// Subclasses implement fit(), which fills selectedFeatures / selectedScores and
// sets `fitted`; the getters throw until that has happened.
class FeatureSelect : public Metrics {
public:
    // samples is an (n+1) x m tensor of integers where samples[-1] is the y vector
    FeatureSelect(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights);
    virtual ~FeatureSelect() = default;
    virtual void fit() = 0;
    std::vector<int> getFeatures() const;
    std::vector<double> getScores() const;
protected:
    // Clears the previous selection before a new fit
    void initialize();
    // Fills suLabels with SU(feature, class) for every feature
    void computeSuLabels();
    // SU between two features, cached in suFeatures
    double computeSuFeatures(const int a, const int b);
    // SU between rows a and b of samples
    double symmetricalUncertainty(int a, int b);
    // Merit of the current selectedFeatures set
    double computeMeritCFS();
    // Held by value: a torch::Tensor is a reference-counted handle, so the copy
    // shares storage with the caller's tensor but cannot dangle if the argument
    // passed to the constructor was a temporary.
    const torch::Tensor weights;
    int maxFeatures;
    std::vector<int> selectedFeatures;
    std::vector<double> selectedScores;
    std::vector<double> suLabels;
    // Cache of feature-feature SU keyed by (first, second) feature index
    std::map<std::pair<int, int>, double> suFeatures;
    bool fitted = false;
};
}
#endif

47
src/BayesNet/IWSS.cc Normal file
View File

@@ -0,0 +1,47 @@
#include "IWSS.h"
#include <limits>
#include "bayesnetUtils.h"
namespace bayesnet {
// Builds an IWSS selector on top of FeatureSelect.
// All arguments except `threshold` are forwarded unchanged to the base class.
// Throws std::invalid_argument when `threshold` lies outside [0, 0.5].
IWSS::IWSS(const torch::Tensor& samples,
    const std::vector<std::string>& features,
    const std::string& className,
    const int maxFeatures,
    const int classNumStates,
    const torch::Tensor& weights,
    const double threshold)
    : FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights),
    threshold(threshold)
{
    // `threshold` here is the constructor argument (it shadows the member).
    const bool belowRange = threshold < 0;
    const bool aboveRange = threshold > .5;
    if (belowRange || aboveRange) {
        throw std::invalid_argument("Threshold has to be in [0, 0.5]");
    }
}
// Incremental selection over the ranking produced by argsort(suLabels).
// The best-ranked feature is always taken (scored with its own SU), the second
// one is taken and scored with the merit of the pair, and every further
// candidate is kept while it either improves the merit or degrades it by a
// relative amount smaller than `threshold`; the first rejected candidate ends
// the search. Selection also stops once `maxFeatures` features are held.
//
// Differences with the previous implementation:
//  - the relative loss no longer goes through an unqualified abs(), which can
//    bind to the C integer overload and truncate the difference to 0;
//  - rankings with fewer than two features are handled without popping from an
//    empty container;
//  - a maxFeatures of 1 or 2 is honoured (it used to be checked only after a
//    third feature had been accepted).
void IWSS::fit()
{
    initialize();
    computeSuLabels();
    auto candidates = argsort(suLabels); // sort descending order
    if (candidates.empty()) {
        // Nothing to rank: the selection is legitimately empty
        fitted = true;
        return;
    }
    // Add first and second features to result
    // First with its own score
    // NOTE(review): pop_first is expected to remove and return the head of the container
    const auto first_feature = pop_first(candidates);
    selectedFeatures.push_back(first_feature);
    selectedScores.push_back(suLabels.at(first_feature));
    if (candidates.empty() || static_cast<int>(selectedFeatures.size()) == maxFeatures) {
        fitted = true;
        return;
    }
    // Second with the score of the candidates
    selectedFeatures.push_back(pop_first(candidates));
    auto merit = computeMeritCFS();
    selectedScores.push_back(merit);
    for (const auto feature : candidates) {
        if (static_cast<int>(selectedFeatures.size()) == maxFeatures) {
            break;
        }
        selectedFeatures.push_back(feature);
        // Compute merit with selectedFeatures
        const auto meritNew = computeMeritCFS();
        bool accepted = meritNew > merit;
        if (accepted) {
            merit = meritNew;
        } else {
            // Here meritNew <= merit, so (merit - meritNew) is already the
            // absolute difference and no abs() call is needed
            const double delta = merit != 0.0 ? (merit - meritNew) / merit : 0.0;
            accepted = delta < threshold;
        }
        if (!accepted) {
            selectedFeatures.pop_back();
            break;
        }
        selectedScores.push_back(meritNew);
    }
    fitted = true;
}
}

17
src/BayesNet/IWSS.h Normal file
View File

@@ -0,0 +1,17 @@
#ifndef IWSS_H
#define IWSS_H
#include <torch/torch.h>
#include <vector>
#include "FeatureSelect.h"
namespace bayesnet {
// Feature selector derived from FeatureSelect; adds a threshold that the
// constructor validates and fit() uses as the tolerated relative loss of merit.
class IWSS : public FeatureSelect {
public:
    // samples is an (n+1) x m tensor of integers where samples[-1] is the y vector
    IWSS(const torch::Tensor& samples,
        const std::vector<std::string>& features,
        const std::string& className,
        const int maxFeatures,
        const int classNumStates,
        const torch::Tensor& weights,
        const double threshold);
    virtual ~IWSS() = default;
    void fit() override;
private:
    // Acceptance threshold used by fit(); -1 until set by the constructor
    double threshold = -1;
};
}
#endif

View File

@@ -1,10 +1,21 @@
#include "KDB.h" #include "KDB.h"
namespace bayesnet { namespace bayesnet {
using namespace torch; KDB::KDB(int k, float theta) : Classifier(Network()), k(k), theta(theta)
{
validHyperparameters = { "k", "theta" };
KDB::KDB(int k, float theta) : Classifier(Network()), k(k), theta(theta) {} }
void KDB::train() void KDB::setHyperparameters(const nlohmann::json& hyperparameters)
{
if (hyperparameters.contains("k")) {
k = hyperparameters["k"];
}
if (hyperparameters.contains("theta")) {
theta = hyperparameters["theta"];
}
}
void KDB::buildModel(const torch::Tensor& weights)
{ {
/* /*
1. For each feature Xi, compute mutual information, I(X;C), 1. For each feature Xi, compute mutual information, I(X;C),
@@ -28,15 +39,16 @@ namespace bayesnet {
// 1. For each feature Xi, compute mutual information, I(X;C), // 1. For each feature Xi, compute mutual information, I(X;C),
// where C is the class. // where C is the class.
addNodes(); addNodes();
vector <float> mi; const torch::Tensor& y = dataset.index({ -1, "..." });
std::vector<double> mi;
for (auto i = 0; i < features.size(); i++) { for (auto i = 0; i < features.size(); i++) {
Tensor firstFeature = X.index({ i, "..." }); torch::Tensor firstFeature = dataset.index({ i, "..." });
mi.push_back(metrics.mutualInformation(firstFeature, y)); mi.push_back(metrics.mutualInformation(firstFeature, y, weights));
} }
// 2. Compute class conditional mutual information I(Xi;XjIC), f or each // 2. Compute class conditional mutual information I(Xi;XjIC), f or each
auto conditionalEdgeWeights = metrics.conditionalEdge(); auto conditionalEdgeWeights = metrics.conditionalEdge(weights);
// 3. Let the used variable list, S, be empty. // 3. Let the used variable list, S, be empty.
vector<int> S; std::vector<int> S;
// 4. Let the DAG network being constructed, BN, begin with a single // 4. Let the DAG network being constructed, BN, begin with a single
// class node, C. // class node, C.
// 5. Repeat until S includes all domain features // 5. Repeat until S includes all domain features
@@ -54,9 +66,9 @@ namespace bayesnet {
S.push_back(idx); S.push_back(idx);
} }
} }
void KDB::add_m_edges(int idx, vector<int>& S, Tensor& weights) void KDB::add_m_edges(int idx, std::vector<int>& S, torch::Tensor& weights)
{ {
auto n_edges = min(k, static_cast<int>(S.size())); auto n_edges = std::min(k, static_cast<int>(S.size()));
auto cond_w = clone(weights); auto cond_w = clone(weights);
bool exit_cond = k == 0; bool exit_cond = k == 0;
int num = 0; int num = 0;
@@ -68,7 +80,7 @@ namespace bayesnet {
model.addEdge(features[max_minfo], features[idx]); model.addEdge(features[max_minfo], features[idx]);
num++; num++;
} }
catch (const invalid_argument& e) { catch (const std::invalid_argument& e) {
// Loops are not allowed // Loops are not allowed
} }
} }
@@ -78,11 +90,11 @@ namespace bayesnet {
exit_cond = num == n_edges || candidates.size(0) == 0; exit_cond = num == n_edges || candidates.size(0) == 0;
} }
} }
vector<string> KDB::graph(const string& title) std::vector<std::string> KDB::graph(const std::string& title) const
{ {
string header{ title }; std::string header{ title };
if (title == "KDB") { if (title == "KDB") {
header += " (k=" + to_string(k) + ", theta=" + to_string(theta) + ")"; header += " (k=" + std::to_string(k) + ", theta=" + std::to_string(theta) + ")";
} }
return model.graph(header); return model.graph(header);
} }

View File

@@ -1,21 +1,21 @@
#ifndef KDB_H #ifndef KDB_H
#define KDB_H #define KDB_H
#include <torch/torch.h>
#include "Classifier.h" #include "Classifier.h"
#include "bayesnetUtils.h" #include "bayesnetUtils.h"
namespace bayesnet { namespace bayesnet {
using namespace std;
using namespace torch;
class KDB : public Classifier { class KDB : public Classifier {
private: private:
int k; int k;
float theta; float theta;
void add_m_edges(int idx, vector<int>& S, Tensor& weights); void add_m_edges(int idx, std::vector<int>& S, torch::Tensor& weights);
protected: protected:
void train() override; void buildModel(const torch::Tensor& weights) override;
public: public:
explicit KDB(int k, float theta = 0.03); explicit KDB(int k, float theta = 0.03);
virtual ~KDB() {}; virtual ~KDB() = default;
vector<string> graph(const string& name = "KDB") override; void setHyperparameters(const nlohmann::json& hyperparameters) override;
std::vector<std::string> graph(const std::string& name = "KDB") const override;
}; };
} }
#endif #endif

View File

@@ -1,34 +1,28 @@
#include "KDBLd.h" #include "KDBLd.h"
namespace bayesnet { namespace bayesnet {
using namespace std; KDBLd::KDBLd(int k) : KDB(k), Proposal(dataset, features, className) {}
KDBLd::KDBLd(int k) : KDB(k), Proposal(KDB::Xv, KDB::yv, features, className) {} KDBLd& KDBLd::fit(torch::Tensor& X_, torch::Tensor& y_, const std::vector<std::string>& features_, const std::string& className_, map<std::string, std::vector<int>>& states_)
KDBLd& KDBLd::fit(torch::Tensor& X_, torch::Tensor& y_, vector<string>& features_, string className_, map<string, vector<int>>& states_)
{ {
// This first part should go in a Classifier method called fit_local_discretization o fit_float... checkInput(X_, y_);
features = features_; features = features_;
className = className_; className = className_;
Xf = X_; Xf = X_;
y = y_; y = y_;
// Fills vectors Xv & yv with the data from tensors X_ (discretized) & y // Fills std::vectors Xv & yv with the data from tensors X_ (discretized) & y
fit_local_discretization(states, y); states = fit_local_discretization(y);
generateTensorXFromVector();
// We have discretized the input data // We have discretized the input data
// 1st we need to fit the model to build the normal KDB structure, KDB::fit initializes the base Bayesian network // 1st we need to fit the model to build the normal KDB structure, KDB::fit initializes the base Bayesian network
KDB::fit(KDB::Xv, KDB::yv, features, className, states); KDB::fit(dataset, features, className, states);
localDiscretizationProposal(states, model); states = localDiscretizationProposal(states, model);
generateTensorXFromVector();
Tensor ytmp = torch::transpose(y.view({ y.size(0), 1 }), 0, 1);
samples = torch::cat({ X, ytmp }, 0);
model.fit(KDB::Xv, KDB::yv, features, className);
return *this; return *this;
} }
Tensor KDBLd::predict(Tensor& X) torch::Tensor KDBLd::predict(torch::Tensor& X)
{ {
auto Xt = prepareX(X); auto Xt = prepareX(X);
return KDB::predict(Xt); return KDB::predict(Xt);
} }
vector<string> KDBLd::graph(const string& name) std::vector<std::string> KDBLd::graph(const std::string& name) const
{ {
return KDB::graph(name); return KDB::graph(name);
} }

View File

@@ -4,16 +4,15 @@
#include "Proposal.h" #include "Proposal.h"
namespace bayesnet { namespace bayesnet {
using namespace std;
class KDBLd : public KDB, public Proposal { class KDBLd : public KDB, public Proposal {
private: private:
public: public:
explicit KDBLd(int k); explicit KDBLd(int k);
virtual ~KDBLd() = default; virtual ~KDBLd() = default;
KDBLd& fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states) override; KDBLd& fit(torch::Tensor& X, torch::Tensor& y, const std::vector<std::string>& features, const std::string& className, map<std::string, std::vector<int>>& states) override;
vector<string> graph(const string& name = "KDB") override; std::vector<std::string> graph(const std::string& name = "KDB") const override;
Tensor predict(Tensor& X) override; torch::Tensor predict(torch::Tensor& X) override;
static inline string version() { return "0.0.1"; }; static inline std::string version() { return "0.0.1"; };
}; };
} }
#endif // !KDBLD_H #endif // !KDBLD_H

View File

@@ -1,13 +1,13 @@
#include "Mst.h" #include "Mst.h"
#include <vector> #include <vector>
#include <list>
/* /*
Based on the code from https://www.softwaretestinghelp.com/minimum-spanning-tree-tutorial/ Based on the code from https://www.softwaretestinghelp.com/minimum-spanning-tree-tutorial/
*/ */
namespace bayesnet { namespace bayesnet {
using namespace std; Graph::Graph(int V) : V(V), parent(std::vector<int>(V))
Graph::Graph(int V) : V(V), parent(vector<int>(V))
{ {
for (int i = 0; i < V; i++) for (int i = 0; i < V; i++)
parent[i] = i; parent[i] = i;
@@ -34,36 +34,45 @@ namespace bayesnet {
void Graph::kruskal_algorithm() void Graph::kruskal_algorithm()
{ {
// sort the edges ordered on decreasing weight // sort the edges ordered on decreasing weight
sort(G.begin(), G.end(), [](const auto& left, const auto& right) {return left.first > right.first;}); stable_sort(G.begin(), G.end(), [](const auto& left, const auto& right) {return left.first > right.first;});
for (int i = 0; i < G.size(); i++) { for (int i = 0; i < G.size(); i++) {
int uSt, vEd; int uSt, vEd;
uSt = find_set(G[i].second.first); uSt = find_set(G[i].second.first);
vEd = find_set(G[i].second.second); vEd = find_set(G[i].second.second);
if (uSt != vEd) { if (uSt != vEd) {
T.push_back(G[i]); // add to mst vector T.push_back(G[i]); // add to mst std::vector
union_set(uSt, vEd); union_set(uSt, vEd);
} }
} }
} }
void Graph::display_mst() void Graph::display_mst()
{ {
cout << "Edge :" << " Weight" << endl; std::cout << "Edge :" << " Weight" << std::endl;
for (int i = 0; i < T.size(); i++) { for (int i = 0; i < T.size(); i++) {
cout << T[i].second.first << " - " << T[i].second.second << " : " std::cout << T[i].second.first << " - " << T[i].second.second << " : "
<< T[i].first; << T[i].first;
cout << endl; std::cout << std::endl;
} }
} }
vector<pair<int, int>> reorder(vector<pair<float, pair<int, int>>> T, int root_original) void insertElement(std::list<int>& variables, int variable)
{ {
auto result = vector<pair<int, int>>(); if (std::find(variables.begin(), variables.end(), variable) == variables.end()) {
auto visited = vector<int>(); variables.push_front(variable);
auto nextVariables = unordered_set<int>(); }
nextVariables.emplace(root_original); }
std::vector<std::pair<int, int>> reorder(std::vector<std::pair<float, std::pair<int, int>>> T, int root_original)
{
// Create the edges of a DAG from the MST
// replacing unordered_set with list because unordered_set cannot guarantee the order of the elements inserted
auto result = std::vector<std::pair<int, int>>();
auto visited = std::vector<int>();
auto nextVariables = std::list<int>();
nextVariables.push_front(root_original);
while (nextVariables.size() > 0) { while (nextVariables.size() > 0) {
int root = *nextVariables.begin(); int root = nextVariables.front();
nextVariables.erase(nextVariables.begin()); nextVariables.pop_front();
for (int i = 0; i < T.size(); ++i) { for (int i = 0; i < T.size(); ++i) {
auto [weight, edge] = T[i]; auto [weight, edge] = T[i];
auto [from, to] = edge; auto [from, to] = edge;
@@ -71,10 +80,10 @@ namespace bayesnet {
visited.insert(visited.begin(), i); visited.insert(visited.begin(), i);
if (from == root) { if (from == root) {
result.push_back({ from, to }); result.push_back({ from, to });
nextVariables.emplace(to); insertElement(nextVariables, to);
} else { } else {
result.push_back({ to, from }); result.push_back({ to, from });
nextVariables.emplace(from); insertElement(nextVariables, from);
} }
} }
} }
@@ -94,12 +103,11 @@ namespace bayesnet {
return result; return result;
} }
MST::MST(vector<string>& features, Tensor& weights, int root) : features(features), weights(weights), root(root) {} MST::MST(const std::vector<std::string>& features, const torch::Tensor& weights, const int root) : features(features), weights(weights), root(root) {}
vector<pair<int, int>> MST::maximumSpanningTree() std::vector<std::pair<int, int>> MST::maximumSpanningTree()
{ {
auto num_features = features.size(); auto num_features = features.size();
Graph g(num_features); Graph g(num_features);
// Make a complete graph // Make a complete graph
for (int i = 0; i < num_features - 1; ++i) { for (int i = 0; i < num_features - 1; ++i) {
for (int j = i + 1; j < num_features; ++j) { for (int j = i + 1; j < num_features; ++j) {

View File

@@ -4,24 +4,22 @@
#include <vector> #include <vector>
#include <string> #include <string>
namespace bayesnet { namespace bayesnet {
using namespace std;
using namespace torch;
class MST { class MST {
private: private:
Tensor weights; torch::Tensor weights;
vector<string> features; std::vector<std::string> features;
int root = 0; int root = 0;
public: public:
MST() = default; MST() = default;
MST(vector<string>& features, Tensor& weights, int root); MST(const std::vector<std::string>& features, const torch::Tensor& weights, const int root);
vector<pair<int, int>> maximumSpanningTree(); std::vector<std::pair<int, int>> maximumSpanningTree();
}; };
class Graph { class Graph {
private: private:
int V; // number of nodes in graph int V; // number of nodes in graph
vector <pair<float, pair<int, int>>> G; // vector for graph std::vector <std::pair<float, std::pair<int, int>>> G; // std::vector for graph
vector <pair<float, pair<int, int>>> T; // vector for mst std::vector <std::pair<float, std::pair<int, int>>> T; // std::vector for mst
vector<int> parent; std::vector<int> parent;
public: public:
explicit Graph(int V); explicit Graph(int V);
void addEdge(int u, int v, float wt); void addEdge(int u, int v, float wt);
@@ -29,7 +27,7 @@ namespace bayesnet {
void union_set(int u, int v); void union_set(int u, int v);
void kruskal_algorithm(); void kruskal_algorithm();
void display_mst(); void display_mst();
vector <pair<float, pair<int, int>>> get_mst() { return T; } std::vector <std::pair<float, std::pair<int, int>>> get_mst() { return T; }
}; };
} }
#endif #endif

View File

@@ -3,24 +3,22 @@
#include "Network.h" #include "Network.h"
#include "bayesnetUtils.h" #include "bayesnetUtils.h"
namespace bayesnet { namespace bayesnet {
Network::Network() : features(vector<string>()), className(""), classNumStates(0), fitted(false) {} Network::Network() : features(std::vector<std::string>()), className(""), classNumStates(0), fitted(false), laplaceSmoothing(0) {}
Network::Network(float maxT) : features(vector<string>()), className(""), classNumStates(0), maxThreads(maxT), fitted(false) {} Network::Network(float maxT) : features(std::vector<std::string>()), className(""), classNumStates(0), maxThreads(maxT), fitted(false), laplaceSmoothing(0) {}
Network::Network(float maxT, int smoothing) : laplaceSmoothing(smoothing), features(vector<string>()), className(""), classNumStates(0), maxThreads(maxT), fitted(false) {}
Network::Network(Network& other) : laplaceSmoothing(other.laplaceSmoothing), features(other.features), className(other.className), classNumStates(other.getClassNumStates()), maxThreads(other. Network::Network(Network& other) : laplaceSmoothing(other.laplaceSmoothing), features(other.features), className(other.className), classNumStates(other.getClassNumStates()), maxThreads(other.
getmaxThreads()), fitted(other.fitted) getmaxThreads()), fitted(other.fitted)
{ {
for (const auto& pair : other.nodes) { for (const auto& node : other.nodes) {
nodes[pair.first] = std::make_unique<Node>(*pair.second); nodes[node.first] = std::make_unique<Node>(*node.second);
} }
} }
void Network::initialize() void Network::initialize()
{ {
features = vector<string>(); features = std::vector<std::string>();
className = ""; className = "";
classNumStates = 0; classNumStates = 0;
fitted = false; fitted = false;
nodes.clear(); nodes.clear();
dataset.clear();
samples = torch::Tensor(); samples = torch::Tensor();
} }
float Network::getmaxThreads() float Network::getmaxThreads()
@@ -31,10 +29,10 @@ namespace bayesnet {
{ {
return samples; return samples;
} }
void Network::addNode(const string& name) void Network::addNode(const std::string& name)
{ {
if (name == "") { if (name == "") {
throw invalid_argument("Node name cannot be empty"); throw std::invalid_argument("Node name cannot be empty");
} }
if (nodes.find(name) != nodes.end()) { if (nodes.find(name) != nodes.end()) {
return; return;
@@ -44,15 +42,15 @@ namespace bayesnet {
} }
nodes[name] = std::make_unique<Node>(name); nodes[name] = std::make_unique<Node>(name);
} }
vector<string> Network::getFeatures() std::vector<std::string> Network::getFeatures() const
{ {
return features; return features;
} }
int Network::getClassNumStates() int Network::getClassNumStates() const
{ {
return classNumStates; return classNumStates;
} }
int Network::getStates() int Network::getStates() const
{ {
int result = 0; int result = 0;
for (auto& node : nodes) { for (auto& node : nodes) {
@@ -60,11 +58,11 @@ namespace bayesnet {
} }
return result; return result;
} }
string Network::getClassName() std::string Network::getClassName() const
{ {
return className; return className;
} }
bool Network::isCyclic(const string& nodeId, unordered_set<string>& visited, unordered_set<string>& recStack) bool Network::isCyclic(const std::string& nodeId, std::unordered_set<std::string>& visited, std::unordered_set<std::string>& recStack)
{ {
if (visited.find(nodeId) == visited.end()) // if node hasn't been visited yet if (visited.find(nodeId) == visited.end()) // if node hasn't been visited yet
{ {
@@ -80,124 +78,107 @@ namespace bayesnet {
recStack.erase(nodeId); // remove node from recursion stack before function ends recStack.erase(nodeId); // remove node from recursion stack before function ends
return false; return false;
} }
void Network::addEdge(const string& parent, const string& child) void Network::addEdge(const std::string& parent, const std::string& child)
{ {
if (nodes.find(parent) == nodes.end()) { if (nodes.find(parent) == nodes.end()) {
throw invalid_argument("Parent node " + parent + " does not exist"); throw std::invalid_argument("Parent node " + parent + " does not exist");
} }
if (nodes.find(child) == nodes.end()) { if (nodes.find(child) == nodes.end()) {
throw invalid_argument("Child node " + child + " does not exist"); throw std::invalid_argument("Child node " + child + " does not exist");
} }
// Temporarily add edge to check for cycles // Temporarily add edge to check for cycles
nodes[parent]->addChild(nodes[child].get()); nodes[parent]->addChild(nodes[child].get());
nodes[child]->addParent(nodes[parent].get()); nodes[child]->addParent(nodes[parent].get());
unordered_set<string> visited; std::unordered_set<std::string> visited;
unordered_set<string> recStack; std::unordered_set<std::string> recStack;
if (isCyclic(nodes[child]->getName(), visited, recStack)) // if adding this edge forms a cycle if (isCyclic(nodes[child]->getName(), visited, recStack)) // if adding this edge forms a cycle
{ {
// remove problematic edge // remove problematic edge
nodes[parent]->removeChild(nodes[child].get()); nodes[parent]->removeChild(nodes[child].get());
nodes[child]->removeParent(nodes[parent].get()); nodes[child]->removeParent(nodes[parent].get());
throw invalid_argument("Adding this edge forms a cycle in the graph."); throw std::invalid_argument("Adding this edge forms a cycle in the graph.");
} }
} }
map<string, std::unique_ptr<Node>>& Network::getNodes() std::map<std::string, std::unique_ptr<Node>>& Network::getNodes()
{ {
return nodes; return nodes;
} }
void Network::checkFitData(int n_samples, int n_features, int n_samples_y, const vector<string>& featureNames, const string& className) void Network::checkFitData(int n_samples, int n_features, int n_samples_y, const std::vector<std::string>& featureNames, const std::string& className, const std::map<std::string, std::vector<int>>& states, const torch::Tensor& weights)
{ {
if (weights.size(0) != n_samples) {
throw std::invalid_argument("Weights (" + std::to_string(weights.size(0)) + ") must have the same number of elements as samples (" + std::to_string(n_samples) + ") in Network::fit");
}
if (n_samples != n_samples_y) { if (n_samples != n_samples_y) {
throw invalid_argument("X and y must have the same number of samples in Network::fit (" + to_string(n_samples) + " != " + to_string(n_samples_y) + ")"); throw std::invalid_argument("X and y must have the same number of samples in Network::fit (" + std::to_string(n_samples) + " != " + std::to_string(n_samples_y) + ")");
} }
if (n_features != featureNames.size()) { if (n_features != featureNames.size()) {
throw invalid_argument("X and features must have the same number of features in Network::fit (" + to_string(n_features) + " != " + to_string(featureNames.size()) + ")"); throw std::invalid_argument("X and features must have the same number of features in Network::fit (" + std::to_string(n_features) + " != " + std::to_string(featureNames.size()) + ")");
} }
if (n_features != features.size() - 1) { if (n_features != features.size() - 1) {
throw invalid_argument("X and local features must have the same number of features in Network::fit (" + to_string(n_features) + " != " + to_string(features.size() - 1) + ")"); throw std::invalid_argument("X and local features must have the same number of features in Network::fit (" + std::to_string(n_features) + " != " + std::to_string(features.size() - 1) + ")");
} }
if (find(features.begin(), features.end(), className) == features.end()) { if (find(features.begin(), features.end(), className) == features.end()) {
throw invalid_argument("className not found in Network::features"); throw std::invalid_argument("className not found in Network::features");
} }
for (auto& feature : featureNames) { for (auto& feature : featureNames) {
if (find(features.begin(), features.end(), feature) == features.end()) { if (find(features.begin(), features.end(), feature) == features.end()) {
throw invalid_argument("Feature " + feature + " not found in Network::features"); throw std::invalid_argument("Feature " + feature + " not found in Network::features");
}
if (states.find(feature) == states.end()) {
throw std::invalid_argument("Feature " + feature + " not found in states");
} }
} }
} }
void Network::setStates() void Network::setStates(const std::map<std::string, std::vector<int>>& states)
{ {
// Set states to every Node in the network // Set states to every Node in the network
for (int i = 0; i < features.size(); ++i) { for_each(features.begin(), features.end(), [this, &states](const std::string& feature) {
nodes[features[i]]->setNumStates(static_cast<int>(torch::max(samples.index({ i, "..." })).item<int>()) + 1); nodes.at(feature)->setNumStates(states.at(feature).size());
} });
classNumStates = nodes[className]->getNumStates(); classNumStates = nodes.at(className)->getNumStates();
} }
// X comes in nxm, where n is the number of features and m the number of samples // X comes in nxm, where n is the number of features and m the number of samples
void Network::fit(torch::Tensor& X, torch::Tensor& y, const vector<string>& featureNames, const string& className) void Network::fit(const torch::Tensor& X, const torch::Tensor& y, const torch::Tensor& weights, const std::vector<std::string>& featureNames, const std::string& className, const std::map<std::string, std::vector<int>>& states)
{ {
checkFitData(X.size(1), X.size(0), y.size(0), featureNames, className); checkFitData(X.size(1), X.size(0), y.size(0), featureNames, className, states, weights);
this->className = className; this->className = className;
dataset.clear(); torch::Tensor ytmp = torch::transpose(y.view({ y.size(0), 1 }), 0, 1);
Tensor ytmp = torch::transpose(y.view({ y.size(0), 1 }), 0, 1);
samples = torch::cat({ X , ytmp }, 0); samples = torch::cat({ X , ytmp }, 0);
for (int i = 0; i < featureNames.size(); ++i) { for (int i = 0; i < featureNames.size(); ++i) {
auto row_feature = X.index({ i, "..." }); auto row_feature = X.index({ i, "..." });
dataset[featureNames[i]] = vector<int>(row_feature.data_ptr<int>(), row_feature.data_ptr<int>() + row_feature.size(0));;
} }
dataset[className] = vector<int>(y.data_ptr<int>(), y.data_ptr<int>() + y.size(0)); completeFit(states, weights);
completeFit(); }
void Network::fit(const torch::Tensor& samples, const torch::Tensor& weights, const std::vector<std::string>& featureNames, const std::string& className, const std::map<std::string, std::vector<int>>& states)
{
checkFitData(samples.size(1), samples.size(0) - 1, samples.size(1), featureNames, className, states, weights);
this->className = className;
this->samples = samples;
completeFit(states, weights);
} }
// input_data comes in nxm, where n is the number of features and m the number of samples // input_data comes in nxm, where n is the number of features and m the number of samples
void Network::fit(const vector<vector<int>>& input_data, const vector<int>& labels, const vector<string>& featureNames, const string& className) void Network::fit(const std::vector<std::vector<int>>& input_data, const std::vector<int>& labels, const std::vector<double>& weights_, const std::vector<std::string>& featureNames, const std::string& className, const std::map<std::string, std::vector<int>>& states)
{ {
checkFitData(input_data[0].size(), input_data.size(), labels.size(), featureNames, className); const torch::Tensor weights = torch::tensor(weights_, torch::kFloat64);
checkFitData(input_data[0].size(), input_data.size(), labels.size(), featureNames, className, states, weights);
this->className = className; this->className = className;
dataset.clear(); // Build tensor of samples (nxm) (n+1 because of the class)
// Build dataset & tensor of samples (nxm) (n+1 because of the class)
samples = torch::zeros({ static_cast<int>(input_data.size() + 1), static_cast<int>(input_data[0].size()) }, torch::kInt32); samples = torch::zeros({ static_cast<int>(input_data.size() + 1), static_cast<int>(input_data[0].size()) }, torch::kInt32);
for (int i = 0; i < featureNames.size(); ++i) { for (int i = 0; i < featureNames.size(); ++i) {
dataset[featureNames[i]] = input_data[i];
samples.index_put_({ i, "..." }, torch::tensor(input_data[i], torch::kInt32)); samples.index_put_({ i, "..." }, torch::tensor(input_data[i], torch::kInt32));
} }
dataset[className] = labels;
samples.index_put_({ -1, "..." }, torch::tensor(labels, torch::kInt32)); samples.index_put_({ -1, "..." }, torch::tensor(labels, torch::kInt32));
completeFit(); completeFit(states, weights);
} }
void Network::completeFit() void Network::completeFit(const std::map<std::string, std::vector<int>>& states, const torch::Tensor& weights)
{ {
setStates(); setStates(states);
int maxThreadsRunning = static_cast<int>(std::thread::hardware_concurrency() * maxThreads); laplaceSmoothing = 1.0 / samples.size(1); // To use in CPT computation
if (maxThreadsRunning < 1) { std::vector<std::thread> threads;
maxThreadsRunning = 1; for (auto& node : nodes) {
} threads.emplace_back([this, &node, &weights]() {
vector<thread> threads; node.second->computeCPT(samples, features, laplaceSmoothing, weights);
mutex mtx;
condition_variable cv;
int activeThreads = 0;
int nextNodeIndex = 0;
while (nextNodeIndex < nodes.size()) {
unique_lock<mutex> lock(mtx);
cv.wait(lock, [&activeThreads, &maxThreadsRunning]() { return activeThreads < maxThreadsRunning; });
threads.emplace_back([this, &nextNodeIndex, &mtx, &cv, &activeThreads]() {
while (true) {
unique_lock<mutex> lock(mtx);
if (nextNodeIndex >= nodes.size()) {
break; // No more work remaining
}
auto& pair = *std::next(nodes.begin(), nextNodeIndex);
++nextNodeIndex;
lock.unlock();
pair.second->computeCPT(dataset, laplaceSmoothing);
lock.lock();
nodes[pair.first] = std::move(pair.second);
lock.unlock();
}
lock_guard<mutex> lock(mtx);
--activeThreads;
cv.notify_one();
}); });
++activeThreads;
} }
for (auto& thread : threads) { for (auto& thread : threads) {
thread.join(); thread.join();
@@ -207,12 +188,12 @@ namespace bayesnet {
torch::Tensor Network::predict_tensor(const torch::Tensor& samples, const bool proba) torch::Tensor Network::predict_tensor(const torch::Tensor& samples, const bool proba)
{ {
if (!fitted) { if (!fitted) {
throw logic_error("You must call fit() before calling predict()"); throw std::logic_error("You must call fit() before calling predict()");
} }
torch::Tensor result; torch::Tensor result;
result = torch::zeros({ samples.size(1), classNumStates }, torch::kFloat64); result = torch::zeros({ samples.size(1), classNumStates }, torch::kFloat64);
for (int i = 0; i < samples.size(1); ++i) { for (int i = 0; i < samples.size(1); ++i) {
auto sample = samples.index({ "...", i }); const torch::Tensor sample = samples.index({ "...", i });
auto psample = predict_sample(sample); auto psample = predict_sample(sample);
auto temp = torch::tensor(psample, torch::kFloat64); auto temp = torch::tensor(psample, torch::kFloat64);
// result.index_put_({ i, "..." }, torch::tensor(predict_sample(sample), torch::kFloat64)); // result.index_put_({ i, "..." }, torch::tensor(predict_sample(sample), torch::kFloat64));
@@ -220,36 +201,35 @@ namespace bayesnet {
} }
if (proba) if (proba)
return result; return result;
else return result.argmax(1);
return result.argmax(1);
} }
// Return mxn tensor of probabilities // Return mxn tensor of probabilities
Tensor Network::predict_proba(const Tensor& samples) torch::Tensor Network::predict_proba(const torch::Tensor& samples)
{ {
return predict_tensor(samples, true); return predict_tensor(samples, true);
} }
// Return mxn tensor of probabilities // Return mxn tensor of probabilities
Tensor Network::predict(const Tensor& samples) torch::Tensor Network::predict(const torch::Tensor& samples)
{ {
return predict_tensor(samples, false); return predict_tensor(samples, false);
} }
// Return mx1 vector of predictions // Return mx1 std::vector of predictions
// tsamples is nxm vector of samples // tsamples is nxm std::vector of samples
vector<int> Network::predict(const vector<vector<int>>& tsamples) std::vector<int> Network::predict(const std::vector<std::vector<int>>& tsamples)
{ {
if (!fitted) { if (!fitted) {
throw logic_error("You must call fit() before calling predict()"); throw std::logic_error("You must call fit() before calling predict()");
} }
vector<int> predictions; std::vector<int> predictions;
vector<int> sample; std::vector<int> sample;
for (int row = 0; row < tsamples[0].size(); ++row) { for (int row = 0; row < tsamples[0].size(); ++row) {
sample.clear(); sample.clear();
for (int col = 0; col < tsamples.size(); ++col) { for (int col = 0; col < tsamples.size(); ++col) {
sample.push_back(tsamples[col][row]); sample.push_back(tsamples[col][row]);
} }
vector<double> classProbabilities = predict_sample(sample); std::vector<double> classProbabilities = predict_sample(sample);
// Find the class with the maximum posterior probability // Find the class with the maximum posterior probability
auto maxElem = max_element(classProbabilities.begin(), classProbabilities.end()); auto maxElem = max_element(classProbabilities.begin(), classProbabilities.end());
int predictedClass = distance(classProbabilities.begin(), maxElem); int predictedClass = distance(classProbabilities.begin(), maxElem);
@@ -257,14 +237,14 @@ namespace bayesnet {
} }
return predictions; return predictions;
} }
// Return mxn vector of probabilities // Return mxn std::vector of probabilities
vector<vector<double>> Network::predict_proba(const vector<vector<int>>& tsamples) std::vector<std::vector<double>> Network::predict_proba(const std::vector<std::vector<int>>& tsamples)
{ {
if (!fitted) { if (!fitted) {
throw logic_error("You must call fit() before calling predict_proba()"); throw std::logic_error("You must call fit() before calling predict_proba()");
} }
vector<vector<double>> predictions; std::vector<std::vector<double>> predictions;
vector<int> sample; std::vector<int> sample;
for (int row = 0; row < tsamples[0].size(); ++row) { for (int row = 0; row < tsamples[0].size(); ++row) {
sample.clear(); sample.clear();
for (int col = 0; col < tsamples.size(); ++col) { for (int col = 0; col < tsamples.size(); ++col) {
@@ -274,9 +254,9 @@ namespace bayesnet {
} }
return predictions; return predictions;
} }
double Network::score(const vector<vector<int>>& tsamples, const vector<int>& labels) double Network::score(const std::vector<std::vector<int>>& tsamples, const std::vector<int>& labels)
{ {
vector<int> y_pred = predict(tsamples); std::vector<int> y_pred = predict(tsamples);
int correct = 0; int correct = 0;
for (int i = 0; i < y_pred.size(); ++i) { for (int i = 0; i < y_pred.size(); ++i) {
if (y_pred[i] == labels[i]) { if (y_pred[i] == labels[i]) {
@@ -285,35 +265,35 @@ namespace bayesnet {
} }
return (double)correct / y_pred.size(); return (double)correct / y_pred.size();
} }
// Return 1xn vector of probabilities // Return 1xn std::vector of probabilities
vector<double> Network::predict_sample(const vector<int>& sample) std::vector<double> Network::predict_sample(const std::vector<int>& sample)
{ {
// Ensure the sample size is equal to the number of features // Ensure the sample size is equal to the number of features
if (sample.size() != features.size() - 1) { if (sample.size() != features.size() - 1) {
throw invalid_argument("Sample size (" + to_string(sample.size()) + throw std::invalid_argument("Sample size (" + std::to_string(sample.size()) +
") does not match the number of features (" + to_string(features.size() - 1) + ")"); ") does not match the number of features (" + std::to_string(features.size() - 1) + ")");
} }
map<string, int> evidence; std::map<std::string, int> evidence;
for (int i = 0; i < sample.size(); ++i) { for (int i = 0; i < sample.size(); ++i) {
evidence[features[i]] = sample[i]; evidence[features[i]] = sample[i];
} }
return exactInference(evidence); return exactInference(evidence);
} }
// Return 1xn vector of probabilities // Return 1xn std::vector of probabilities
vector<double> Network::predict_sample(const Tensor& sample) std::vector<double> Network::predict_sample(const torch::Tensor& sample)
{ {
// Ensure the sample size is equal to the number of features // Ensure the sample size is equal to the number of features
if (sample.size(0) != features.size() - 1) { if (sample.size(0) != features.size() - 1) {
throw invalid_argument("Sample size (" + to_string(sample.size(0)) + throw std::invalid_argument("Sample size (" + std::to_string(sample.size(0)) +
") does not match the number of features (" + to_string(features.size() - 1) + ")"); ") does not match the number of features (" + std::to_string(features.size() - 1) + ")");
} }
map<string, int> evidence; std::map<std::string, int> evidence;
for (int i = 0; i < sample.size(0); ++i) { for (int i = 0; i < sample.size(0); ++i) {
evidence[features[i]] = sample[i].item<int>(); evidence[features[i]] = sample[i].item<int>();
} }
return exactInference(evidence); return exactInference(evidence);
} }
double Network::computeFactor(map<string, int>& completeEvidence) double Network::computeFactor(std::map<std::string, int>& completeEvidence)
{ {
double result = 1.0; double result = 1.0;
for (auto& node : getNodes()) { for (auto& node : getNodes()) {
@@ -321,34 +301,34 @@ namespace bayesnet {
} }
return result; return result;
} }
vector<double> Network::exactInference(map<string, int>& evidence) std::vector<double> Network::exactInference(std::map<std::string, int>& evidence)
{ {
vector<double> result(classNumStates, 0.0); std::vector<double> result(classNumStates, 0.0);
vector<thread> threads; std::vector<std::thread> threads;
mutex mtx; std::mutex mtx;
for (int i = 0; i < classNumStates; ++i) { for (int i = 0; i < classNumStates; ++i) {
threads.emplace_back([this, &result, &evidence, i, &mtx]() { threads.emplace_back([this, &result, &evidence, i, &mtx]() {
auto completeEvidence = map<string, int>(evidence); auto completeEvidence = std::map<std::string, int>(evidence);
completeEvidence[getClassName()] = i; completeEvidence[getClassName()] = i;
double factor = computeFactor(completeEvidence); double factor = computeFactor(completeEvidence);
lock_guard<mutex> lock(mtx); std::lock_guard<std::mutex> lock(mtx);
result[i] = factor; result[i] = factor;
}); });
} }
for (auto& thread : threads) { for (auto& thread : threads) {
thread.join(); thread.join();
} }
// Normalize result // Normalize result
double sum = accumulate(result.begin(), result.end(), 0.0); double sum = accumulate(result.begin(), result.end(), 0.0);
transform(result.begin(), result.end(), result.begin(), [sum](double& value) { return value / sum; }); transform(result.begin(), result.end(), result.begin(), [sum](const double& value) { return value / sum; });
return result; return result;
} }
vector<string> Network::show() std::vector<std::string> Network::show() const
{ {
vector<string> result; std::vector<std::string> result;
// Draw the network // Draw the network
for (auto& node : nodes) { for (auto& node : nodes) {
string line = node.first + " -> "; std::string line = node.first + " -> ";
for (auto child : node.second->getChildren()) { for (auto child : node.second->getChildren()) {
line += child->getName() + ", "; line += child->getName() + ", ";
} }
@@ -356,12 +336,12 @@ namespace bayesnet {
} }
return result; return result;
} }
vector<string> Network::graph(const string& title) std::vector<std::string> Network::graph(const std::string& title) const
{ {
auto output = vector<string>(); auto output = std::vector<std::string>();
auto prefix = "digraph BayesNet {\nlabel=<BayesNet "; auto prefix = "digraph BayesNet {\nlabel=<BayesNet ";
auto suffix = ">\nfontsize=30\nfontcolor=blue\nlabelloc=t\nlayout=circo\n"; auto suffix = ">\nfontsize=30\nfontcolor=blue\nlabelloc=t\nlayout=circo\n";
string header = prefix + title + suffix; std::string header = prefix + title + suffix;
output.push_back(header); output.push_back(header);
for (auto& node : nodes) { for (auto& node : nodes) {
auto result = node.second->graph(className); auto result = node.second->graph(className);
@@ -370,9 +350,9 @@ namespace bayesnet {
output.push_back("}\n"); output.push_back("}\n");
return output; return output;
} }
vector<pair<string, string>> Network::getEdges() std::vector<std::pair<std::string, std::string>> Network::getEdges() const
{ {
auto edges = vector<pair<string, string>>(); auto edges = std::vector<std::pair<std::string, std::string>>();
for (const auto& node : nodes) { for (const auto& node : nodes) {
auto head = node.first; auto head = node.first;
for (const auto& child : node.second->getChildren()) { for (const auto& child : node.second->getChildren()) {
@@ -382,13 +362,16 @@ namespace bayesnet {
} }
return edges; return edges;
} }
vector<string> Network::topological_sort() int Network::getNumEdges() const
{
return getEdges().size();
}
std::vector<std::string> Network::topological_sort()
{ {
/* Check if al the fathers of every node are before the node */ /* Check if al the fathers of every node are before the node */
auto result = features; auto result = features;
result.erase(remove(result.begin(), result.end(), className), result.end()); result.erase(remove(result.begin(), result.end(), className), result.end());
bool ending{ false }; bool ending{ false };
int idx = 0;
while (!ending) { while (!ending) {
ending = true; ending = true;
for (auto feature : features) { for (auto feature : features) {
@@ -410,20 +393,21 @@ namespace bayesnet {
ending = false; ending = false;
} }
} else { } else {
throw logic_error("Error in topological sort because of node " + feature + " is not in result"); throw std::logic_error("Error in topological sort because of node " + feature + " is not in result");
} }
} else { } else {
throw logic_error("Error in topological sort because of node father " + fatherName + " is not in result"); throw std::logic_error("Error in topological sort because of node father " + fatherName + " is not in result");
} }
} }
} }
} }
return result; return result;
} }
void Network::dump_cpt() void Network::dump_cpt() const
{ {
for (auto& node : nodes) { for (auto& node : nodes) {
cout << "* " << node.first << ": (" << node.second->getNumStates() << ") : " << node.second->getCPT().sizes() << endl; std::cout << "* " << node.first << ": (" << node.second->getNumStates() << ") : " << node.second->getCPT().sizes() << std::endl;
std::cout << node.second->getCPT() << std::endl;
} }
} }
} }

View File

@@ -7,58 +7,56 @@
namespace bayesnet { namespace bayesnet {
class Network { class Network {
private: private:
map<string, unique_ptr<Node>> nodes; std::map<std::string, std::unique_ptr<Node>> nodes;
map<string, vector<int>> dataset;
bool fitted; bool fitted;
float maxThreads = 0.95; float maxThreads = 0.95;
int classNumStates; int classNumStates;
vector<string> features; // Including class std::vector<std::string> features; // Including classname
string className; std::string className;
int laplaceSmoothing = 1; double laplaceSmoothing;
torch::Tensor samples; // nxm tensor used to fit the model torch::Tensor samples; // nxm tensor used to fit the model
bool isCyclic(const std::string&, std::unordered_set<std::string>&, std::unordered_set<std::string>&); bool isCyclic(const std::string&, std::unordered_set<std::string>&, std::unordered_set<std::string>&);
vector<double> predict_sample(const vector<int>&); std::vector<double> predict_sample(const std::vector<int>&);
vector<double> predict_sample(const torch::Tensor&); std::vector<double> predict_sample(const torch::Tensor&);
vector<double> exactInference(map<string, int>&); std::vector<double> exactInference(std::map<std::string, int>&);
double computeFactor(map<string, int>&); double computeFactor(std::map<std::string, int>&);
double mutual_info(torch::Tensor&, torch::Tensor&); void completeFit(const std::map<std::string, std::vector<int>>& states, const torch::Tensor& weights);
double entropy(torch::Tensor&); void checkFitData(int n_features, int n_samples, int n_samples_y, const std::vector<std::string>& featureNames, const std::string& className, const std::map<std::string, std::vector<int>>& states, const torch::Tensor& weights);
double conditionalEntropy(torch::Tensor&, torch::Tensor&); void setStates(const std::map<std::string, std::vector<int>>&);
double mutualInformation(torch::Tensor&, torch::Tensor&);
void completeFit();
void checkFitData(int n_features, int n_samples, int n_samples_y, const vector<string>& featureNames, const string& className);
void setStates();
public: public:
Network(); Network();
explicit Network(float, int);
explicit Network(float); explicit Network(float);
explicit Network(Network&); explicit Network(Network&);
~Network() = default;
torch::Tensor& getSamples(); torch::Tensor& getSamples();
float getmaxThreads(); float getmaxThreads();
void addNode(const string&); void addNode(const std::string&);
void addEdge(const string&, const string&); void addEdge(const std::string&, const std::string&);
map<string, std::unique_ptr<Node>>& getNodes(); std::map<std::string, std::unique_ptr<Node>>& getNodes();
vector<string> getFeatures(); std::vector<std::string> getFeatures() const;
int getStates(); int getStates() const;
vector<pair<string, string>> getEdges(); std::vector<std::pair<std::string, std::string>> getEdges() const;
int getClassNumStates(); int getNumEdges() const;
string getClassName(); int getClassNumStates() const;
void fit(const vector<vector<int>>&, const vector<int>&, const vector<string>&, const string&); std::string getClassName() const;
void fit(torch::Tensor&, torch::Tensor&, const vector<string>&, const string&); /*
vector<int> predict(const vector<vector<int>>&); // Return mx1 vector of predictions Notice: Nodes have to be inserted in the same order as they are in the dataset, i.e., first node is first column and so on.
*/
void fit(const std::vector<std::vector<int>>& input_data, const std::vector<int>& labels, const std::vector<double>& weights, const std::vector<std::string>& featureNames, const std::string& className, const std::map<std::string, std::vector<int>>& states);
void fit(const torch::Tensor& X, const torch::Tensor& y, const torch::Tensor& weights, const std::vector<std::string>& featureNames, const std::string& className, const std::map<std::string, std::vector<int>>& states);
void fit(const torch::Tensor& samples, const torch::Tensor& weights, const std::vector<std::string>& featureNames, const std::string& className, const std::map<std::string, std::vector<int>>& states);
std::vector<int> predict(const std::vector<std::vector<int>>&); // Return mx1 std::vector of predictions
torch::Tensor predict(const torch::Tensor&); // Return mx1 tensor of predictions torch::Tensor predict(const torch::Tensor&); // Return mx1 tensor of predictions
//Computes the conditional edge weight of variable index u and v conditioned on class_node
torch::Tensor conditionalEdgeWeight();
torch::Tensor predict_tensor(const torch::Tensor& samples, const bool proba); torch::Tensor predict_tensor(const torch::Tensor& samples, const bool proba);
vector<vector<double>> predict_proba(const vector<vector<int>>&); // Return mxn vector of probabilities std::vector<std::vector<double>> predict_proba(const std::vector<std::vector<int>>&); // Return mxn std::vector of probabilities
torch::Tensor predict_proba(const torch::Tensor&); // Return mxn tensor of probabilities torch::Tensor predict_proba(const torch::Tensor&); // Return mxn tensor of probabilities
double score(const vector<vector<int>>&, const vector<int>&); double score(const std::vector<std::vector<int>>&, const std::vector<int>&);
vector<string> topological_sort(); std::vector<std::string> topological_sort();
vector<string> show(); std::vector<std::string> show() const;
vector<string> graph(const string& title); // Returns a vector of strings representing the graph in graphviz format std::vector<std::string> graph(const std::string& title) const; // Returns a std::vector of std::strings representing the graph in graphviz format
void initialize(); void initialize();
void dump_cpt(); void dump_cpt() const;
inline string version() { return "0.1.0"; } inline std::string version() { return "0.2.0"; }
}; };
} }
#endif #endif

View File

@@ -3,7 +3,7 @@
namespace bayesnet { namespace bayesnet {
Node::Node(const std::string& name) Node::Node(const std::string& name)
: name(name), numStates(0), cpTable(torch::Tensor()), parents(vector<Node*>()), children(vector<Node*>()) : name(name), numStates(0), cpTable(torch::Tensor()), parents(std::vector<Node*>()), children(std::vector<Node*>())
{ {
} }
void Node::clear() void Node::clear()
@@ -14,7 +14,7 @@ namespace bayesnet {
dimensions.clear(); dimensions.clear();
numStates = 0; numStates = 0;
} }
string Node::getName() const std::string Node::getName() const
{ {
return name; return name;
} }
@@ -34,11 +34,11 @@ namespace bayesnet {
{ {
children.push_back(child); children.push_back(child);
} }
vector<Node*>& Node::getParents() std::vector<Node*>& Node::getParents()
{ {
return parents; return parents;
} }
vector<Node*>& Node::getChildren() std::vector<Node*>& Node::getChildren()
{ {
return children; return children;
} }
@@ -63,28 +63,28 @@ namespace bayesnet {
*/ */
unsigned Node::minFill() unsigned Node::minFill()
{ {
unordered_set<string> neighbors; std::unordered_set<std::string> neighbors;
for (auto child : children) { for (auto child : children) {
neighbors.emplace(child->getName()); neighbors.emplace(child->getName());
} }
for (auto parent : parents) { for (auto parent : parents) {
neighbors.emplace(parent->getName()); neighbors.emplace(parent->getName());
} }
auto source = vector<string>(neighbors.begin(), neighbors.end()); auto source = std::vector<std::string>(neighbors.begin(), neighbors.end());
return combinations(source).size(); return combinations(source).size();
} }
vector<pair<string, string>> Node::combinations(const vector<string>& source) std::vector<std::pair<std::string, std::string>> Node::combinations(const std::vector<std::string>& source)
{ {
vector<pair<string, string>> result; std::vector<std::pair<std::string, std::string>> result;
for (int i = 0; i < source.size(); ++i) { for (int i = 0; i < source.size(); ++i) {
string temp = source[i]; std::string temp = source[i];
for (int j = i + 1; j < source.size(); ++j) { for (int j = i + 1; j < source.size(); ++j) {
result.push_back({ temp, source[j] }); result.push_back({ temp, source[j] });
} }
} }
return result; return result;
} }
void Node::computeCPT(map<string, vector<int>>& dataset, const int laplaceSmoothing) void Node::computeCPT(const torch::Tensor& dataset, const std::vector<std::string>& features, const double laplaceSmoothing, const torch::Tensor& weights)
{ {
dimensions.clear(); dimensions.clear();
// Get dimensions of the CPT // Get dimensions of the CPT
@@ -94,27 +94,39 @@ namespace bayesnet {
// Create a tensor of zeros with the dimensions of the CPT // Create a tensor of zeros with the dimensions of the CPT
cpTable = torch::zeros(dimensions, torch::kFloat) + laplaceSmoothing; cpTable = torch::zeros(dimensions, torch::kFloat) + laplaceSmoothing;
// Fill table with counts // Fill table with counts
for (int n_sample = 0; n_sample < dataset[name].size(); ++n_sample) { auto pos = find(features.begin(), features.end(), name);
torch::List<c10::optional<torch::Tensor>> coordinates; if (pos == features.end()) {
coordinates.push_back(torch::tensor(dataset[name][n_sample])); throw std::logic_error("Feature " + name + " not found in dataset");
transform(parents.begin(), parents.end(), back_inserter(coordinates), [&dataset, &n_sample](const auto& parent) { return torch::tensor(dataset[parent->getName()][n_sample]); }); }
int name_index = pos - features.begin();
for (int n_sample = 0; n_sample < dataset.size(1); ++n_sample) {
c10::List<c10::optional<at::Tensor>> coordinates;
coordinates.push_back(dataset.index({ name_index, n_sample }));
for (auto parent : parents) {
pos = find(features.begin(), features.end(), parent->getName());
if (pos == features.end()) {
throw std::logic_error("Feature parent " + parent->getName() + " not found in dataset");
}
int parent_index = pos - features.begin();
coordinates.push_back(dataset.index({ parent_index, n_sample }));
}
// Increment the count of the corresponding coordinate // Increment the count of the corresponding coordinate
cpTable.index_put_({ coordinates }, cpTable.index({ coordinates }) + 1); cpTable.index_put_({ coordinates }, cpTable.index({ coordinates }) + weights.index({ n_sample }).item<double>());
} }
// Normalize the counts // Normalize the counts
cpTable = cpTable / cpTable.sum(0); cpTable = cpTable / cpTable.sum(0);
} }
float Node::getFactorValue(map<string, int>& evidence) float Node::getFactorValue(std::map<std::string, int>& evidence)
{ {
torch::List<c10::optional<torch::Tensor>> coordinates; c10::List<c10::optional<at::Tensor>> coordinates;
// following predetermined order of indices in the cpTable (see Node.h) // following predetermined order of indices in the cpTable (see Node.h)
coordinates.push_back(torch::tensor(evidence[name])); coordinates.push_back(at::tensor(evidence[name]));
transform(parents.begin(), parents.end(), back_inserter(coordinates), [&evidence](const auto& parent) { return torch::tensor(evidence[parent->getName()]); }); transform(parents.begin(), parents.end(), std::back_inserter(coordinates), [&evidence](const auto& parent) { return at::tensor(evidence[parent->getName()]); });
return cpTable.index({ coordinates }).item<float>(); return cpTable.index({ coordinates }).item<float>();
} }
vector<string> Node::graph(const string& className) std::vector<std::string> Node::graph(const std::string& className)
{ {
auto output = vector<string>(); auto output = std::vector<std::string>();
auto suffix = name == className ? ", fontcolor=red, fillcolor=lightblue, style=filled " : ""; auto suffix = name == className ? ", fontcolor=red, fillcolor=lightblue, style=filled " : "";
output.push_back(name + " [shape=circle" + suffix + "] \n"); output.push_back(name + " [shape=circle" + suffix + "] \n");
transform(children.begin(), children.end(), back_inserter(output), [this](const auto& child) { return name + " -> " + child->getName(); }); transform(children.begin(), children.end(), back_inserter(output), [this](const auto& child) { return name + " -> " + child->getName(); });

View File

@@ -5,33 +5,32 @@
#include <vector> #include <vector>
#include <string> #include <string>
namespace bayesnet { namespace bayesnet {
using namespace std;
class Node { class Node {
private: private:
string name; std::string name;
vector<Node*> parents; std::vector<Node*> parents;
vector<Node*> children; std::vector<Node*> children;
int numStates; // number of states of the variable int numStates; // number of states of the variable
torch::Tensor cpTable; // Order of indices is 0-> node variable, 1-> 1st parent, 2-> 2nd parent, ... torch::Tensor cpTable; // Order of indices is 0-> node variable, 1-> 1st parent, 2-> 2nd parent, ...
vector<int64_t> dimensions; // dimensions of the cpTable std::vector<int64_t> dimensions; // dimensions of the cpTable
std::vector<std::pair<std::string, std::string>> combinations(const std::vector<std::string>&);
public: public:
vector<pair<string, string>> combinations(const vector<string>&); explicit Node(const std::string&);
explicit Node(const string&);
void clear(); void clear();
void addParent(Node*); void addParent(Node*);
void addChild(Node*); void addChild(Node*);
void removeParent(Node*); void removeParent(Node*);
void removeChild(Node*); void removeChild(Node*);
string getName() const; std::string getName() const;
vector<Node*>& getParents(); std::vector<Node*>& getParents();
vector<Node*>& getChildren(); std::vector<Node*>& getChildren();
torch::Tensor& getCPT(); torch::Tensor& getCPT();
void computeCPT(map<string, vector<int>>&, const int); void computeCPT(const torch::Tensor& dataset, const std::vector<std::string>& features, const double laplaceSmoothing, const torch::Tensor& weights);
int getNumStates() const; int getNumStates() const;
void setNumStates(int); void setNumStates(int);
unsigned minFill(); unsigned minFill();
vector<string> graph(const string& clasName); // Returns a vector of strings representing the graph in graphviz format std::vector<std::string> graph(const std::string& clasName); // Returns a std::vector of std::strings representing the graph in graphviz format
float getFactorValue(map<string, int>&); float getFactorValue(std::map<std::string, int>&);
}; };
} }
#endif #endif

View File

@@ -2,21 +2,30 @@
#include "ArffFiles.h" #include "ArffFiles.h"
namespace bayesnet { namespace bayesnet {
Proposal::Proposal(vector<vector<int>>& Xv_, vector<int>& yv_, vector<string>& features_, string& className_) : Xv(Xv_), yv(yv_), pFeatures(features_), pClassName(className_) {} Proposal::Proposal(torch::Tensor& dataset_, std::vector<std::string>& features_, std::string& className_) : pDataset(dataset_), pFeatures(features_), pClassName(className_) {}
Proposal::~Proposal() Proposal::~Proposal()
{ {
for (auto& [key, value] : discretizers) { for (auto& [key, value] : discretizers) {
delete value; delete value;
} }
} }
void Proposal::localDiscretizationProposal(map<string, vector<int>>& states, Network& model) void Proposal::checkInput(const torch::Tensor& X, const torch::Tensor& y)
{
if (!torch::is_floating_point(X)) {
throw std::invalid_argument("X must be a floating point tensor");
}
if (torch::is_floating_point(y)) {
throw std::invalid_argument("y must be an integer tensor");
}
}
map<std::string, std::vector<int>> Proposal::localDiscretizationProposal(const map<std::string, std::vector<int>>& oldStates, Network& model)
{ {
// order of local discretization is important. no good 0, 1, 2... // order of local discretization is important. no good 0, 1, 2...
// although we rediscretize features after the local discretization of every feature // although we rediscretize features after the local discretization of every feature
auto order = model.topological_sort(); auto order = model.topological_sort();
auto& nodes = model.getNodes(); auto& nodes = model.getNodes();
vector<int> indicesToReDiscretize; map<std::string, std::vector<int>> states = oldStates;
auto n_samples = Xf.size(1); std::vector<int> indicesToReDiscretize;
bool upgrade = false; // Flag to check if we need to upgrade the model bool upgrade = false; // Flag to check if we need to upgrade the model
for (auto feature : order) { for (auto feature : order) {
auto nodeParents = nodes[feature]->getParents(); auto nodeParents = nodes[feature]->getParents();
@@ -24,76 +33,75 @@ namespace bayesnet {
upgrade = true; upgrade = true;
int index = find(pFeatures.begin(), pFeatures.end(), feature) - pFeatures.begin(); int index = find(pFeatures.begin(), pFeatures.end(), feature) - pFeatures.begin();
indicesToReDiscretize.push_back(index); // We need to re-discretize this feature indicesToReDiscretize.push_back(index); // We need to re-discretize this feature
vector<string> parents; std::vector<std::string> parents;
transform(nodeParents.begin(), nodeParents.end(), back_inserter(parents), [](const auto& p) { return p->getName(); }); transform(nodeParents.begin(), nodeParents.end(), back_inserter(parents), [](const auto& p) { return p->getName(); });
// Remove class as parent as it will be added later // Remove class as parent as it will be added later
parents.erase(remove(parents.begin(), parents.end(), pClassName), parents.end()); parents.erase(remove(parents.begin(), parents.end(), pClassName), parents.end());
// Get the indices of the parents // Get the indices of the parents
vector<int> indices; std::vector<int> indices;
indices.push_back(-1); // Add class index
transform(parents.begin(), parents.end(), back_inserter(indices), [&](const auto& p) {return find(pFeatures.begin(), pFeatures.end(), p) - pFeatures.begin(); }); transform(parents.begin(), parents.end(), back_inserter(indices), [&](const auto& p) {return find(pFeatures.begin(), pFeatures.end(), p) - pFeatures.begin(); });
// Now we fit the discretizer of the feature, conditioned on its parents and the class i.e. discretizer.fit(X[index], X[indices] + y) // Now we fit the discretizer of the feature, conditioned on its parents and the class i.e. discretizer.fit(X[index], X[indices] + y)
vector<string> yJoinParents; std::vector<std::string> yJoinParents(Xf.size(1));
transform(yv.begin(), yv.end(), back_inserter(yJoinParents), [&](const auto& p) {return to_string(p); });
for (auto idx : indices) { for (auto idx : indices) {
for (int i = 0; i < n_samples; ++i) { for (int i = 0; i < Xf.size(1); ++i) {
yJoinParents[i] += to_string(Xv[idx][i]); yJoinParents[i] += to_string(pDataset.index({ idx, i }).item<int>());
} }
} }
auto arff = ArffFiles(); auto arff = ArffFiles();
auto yxv = arff.factorize(yJoinParents); auto yxv = arff.factorize(yJoinParents);
auto xvf_ptr = Xf.index({ index }).data_ptr<float>(); auto xvf_ptr = Xf.index({ index }).data_ptr<float>();
auto xvf = vector<mdlp::precision_t>(xvf_ptr, xvf_ptr + Xf.size(1)); auto xvf = std::vector<mdlp::precision_t>(xvf_ptr, xvf_ptr + Xf.size(1));
discretizers[feature]->fit(xvf, yxv); discretizers[feature]->fit(xvf, yxv);
//
//
//
// auto tmp = discretizers[feature]->transform(xvf);
// Xv[index] = tmp;
// auto xStates = vector<int>(discretizers[pFeatures[index]]->getCutPoints().size() + 1);
// iota(xStates.begin(), xStates.end(), 0);
// //Update new states of the feature/node
// states[feature] = xStates;
} }
if (upgrade) { if (upgrade) {
// Discretize again X (only the affected indices) with the new fitted discretizers // Discretize again X (only the affected indices) with the new fitted discretizers
for (auto index : indicesToReDiscretize) { for (auto index : indicesToReDiscretize) {
auto Xt_ptr = Xf.index({ index }).data_ptr<float>(); auto Xt_ptr = Xf.index({ index }).data_ptr<float>();
auto Xt = vector<float>(Xt_ptr, Xt_ptr + Xf.size(1)); auto Xt = std::vector<float>(Xt_ptr, Xt_ptr + Xf.size(1));
Xv[index] = discretizers[pFeatures[index]]->transform(Xt); pDataset.index_put_({ index, "..." }, torch::tensor(discretizers[pFeatures[index]]->transform(Xt)));
auto xStates = vector<int>(discretizers[pFeatures[index]]->getCutPoints().size() + 1); auto xStates = std::vector<int>(discretizers[pFeatures[index]]->getCutPoints().size() + 1);
iota(xStates.begin(), xStates.end(), 0); iota(xStates.begin(), xStates.end(), 0);
//Update new states of the feature/node //Update new states of the feature/node
states[pFeatures[index]] = xStates; states[pFeatures[index]] = xStates;
} }
const torch::Tensor weights = torch::full({ pDataset.size(1) }, 1.0 / pDataset.size(1), torch::kDouble);
model.fit(pDataset, weights, pFeatures, pClassName, states);
} }
return states;
} }
void Proposal::fit_local_discretization(map<string, vector<int>>& states, torch::Tensor& y) map<std::string, std::vector<int>> Proposal::fit_local_discretization(const torch::Tensor& y)
{ {
// Sharing Xv and yv with Classifier // Discretize the continuous input data and build pDataset (Classifier::dataset)
Xv = vector<vector<int>>(); int m = Xf.size(1);
yv = vector<int>(y.data_ptr<int>(), y.data_ptr<int>() + y.size(0)); int n = Xf.size(0);
map<std::string, std::vector<int>> states;
pDataset = torch::zeros({ n + 1, m }, torch::kInt32);
auto yv = std::vector<int>(y.data_ptr<int>(), y.data_ptr<int>() + y.size(0));
// discretize input data by feature(row) // discretize input data by feature(row)
for (int i = 0; i < pFeatures.size(); ++i) { for (auto i = 0; i < pFeatures.size(); ++i) {
auto* discretizer = new mdlp::CPPFImdlp(); auto* discretizer = new mdlp::CPPFImdlp();
auto Xt_ptr = Xf.index({ i }).data_ptr<float>(); auto Xt_ptr = Xf.index({ i }).data_ptr<float>();
auto Xt = vector<float>(Xt_ptr, Xt_ptr + Xf.size(1)); auto Xt = std::vector<float>(Xt_ptr, Xt_ptr + Xf.size(1));
discretizer->fit(Xt, yv); discretizer->fit(Xt, yv);
Xv.push_back(discretizer->transform(Xt)); pDataset.index_put_({ i, "..." }, torch::tensor(discretizer->transform(Xt)));
auto xStates = vector<int>(discretizer->getCutPoints().size() + 1); auto xStates = std::vector<int>(discretizer->getCutPoints().size() + 1);
iota(xStates.begin(), xStates.end(), 0); iota(xStates.begin(), xStates.end(), 0);
states[pFeatures[i]] = xStates; states[pFeatures[i]] = xStates;
discretizers[pFeatures[i]] = discretizer; discretizers[pFeatures[i]] = discretizer;
} }
int n_classes = torch::max(y).item<int>() + 1; int n_classes = torch::max(y).item<int>() + 1;
auto yStates = vector<int>(n_classes); auto yStates = std::vector<int>(n_classes);
iota(yStates.begin(), yStates.end(), 0); iota(yStates.begin(), yStates.end(), 0);
states[pClassName] = yStates; states[pClassName] = yStates;
pDataset.index_put_({ n, "..." }, y);
return states;
} }
torch::Tensor Proposal::prepareX(torch::Tensor& X) torch::Tensor Proposal::prepareX(torch::Tensor& X)
{ {
auto Xtd = torch::zeros_like(X, torch::kInt32); auto Xtd = torch::zeros_like(X, torch::kInt32);
for (int i = 0; i < X.size(0); ++i) { for (int i = 0; i < X.size(0); ++i) {
auto Xt = vector<float>(X[i].data_ptr<float>(), X[i].data_ptr<float>() + X.size(1)); auto Xt = std::vector<float>(X[i].data_ptr<float>(), X[i].data_ptr<float>() + X.size(1));
auto Xd = discretizers[pFeatures[i]]->transform(Xt); auto Xd = discretizers[pFeatures[i]]->transform(Xt);
Xtd.index_put_({ i }, torch::tensor(Xd, torch::kInt32)); Xtd.index_put_({ i }, torch::tensor(Xd, torch::kInt32));
} }

View File

@@ -10,20 +10,21 @@
namespace bayesnet { namespace bayesnet {
class Proposal { class Proposal {
public: public:
Proposal(vector<vector<int>>& Xv_, vector<int>& yv_, vector<string>& features_, string& className_); Proposal(torch::Tensor& pDataset, std::vector<std::string>& features_, std::string& className_);
virtual ~Proposal(); virtual ~Proposal();
protected: protected:
void checkInput(const torch::Tensor& X, const torch::Tensor& y);
torch::Tensor prepareX(torch::Tensor& X); torch::Tensor prepareX(torch::Tensor& X);
void localDiscretizationProposal(map<string, vector<int>>& states, Network& model); map<std::string, std::vector<int>> localDiscretizationProposal(const map<std::string, std::vector<int>>& states, Network& model);
void fit_local_discretization(map<string, vector<int>>& states, torch::Tensor& y); map<std::string, std::vector<int>> fit_local_discretization(const torch::Tensor& y);
torch::Tensor Xf; // X continuous nxm tensor torch::Tensor Xf; // X continuous nxm tensor
map<string, mdlp::CPPFImdlp*> discretizers; torch::Tensor y; // y discrete nx1 tensor
map<std::string, mdlp::CPPFImdlp*> discretizers;
private: private:
vector<string>& pFeatures; torch::Tensor& pDataset; // (n+1)xm tensor
string& pClassName; std::vector<std::string>& pFeatures;
vector<vector<int>>& Xv; // X discrete nxm vector std::string& pClassName;
vector<int>& yv;
}; };
} }
#endif #endif

View File

@@ -4,7 +4,7 @@ namespace bayesnet {
SPODE::SPODE(int root) : Classifier(Network()), root(root) {} SPODE::SPODE(int root) : Classifier(Network()), root(root) {}
void SPODE::train() void SPODE::buildModel(const torch::Tensor& weights)
{ {
// 0. Add all nodes to the model // 0. Add all nodes to the model
addNodes(); addNodes();
@@ -17,7 +17,7 @@ namespace bayesnet {
} }
} }
} }
vector<string> SPODE::graph(const string& name) std::vector<std::string> SPODE::graph(const std::string& name) const
{ {
return model.graph(name); return model.graph(name);
} }

View File

@@ -7,11 +7,11 @@ namespace bayesnet {
private: private:
int root; int root;
protected: protected:
void train() override; void buildModel(const torch::Tensor& weights) override;
public: public:
explicit SPODE(int root); explicit SPODE(int root);
virtual ~SPODE() {}; virtual ~SPODE() = default;
vector<string> graph(const string& name = "SPODE") override; std::vector<std::string> graph(const std::string& name = "SPODE") const override;
}; };
} }
#endif #endif

View File

@@ -1,34 +1,46 @@
#include "SPODELd.h" #include "SPODELd.h"
namespace bayesnet { namespace bayesnet {
using namespace std; SPODELd::SPODELd(int root) : SPODE(root), Proposal(dataset, features, className) {}
SPODELd::SPODELd(int root) : SPODE(root), Proposal(SPODE::Xv, SPODE::yv, features, className) {} SPODELd& SPODELd::fit(torch::Tensor& X_, torch::Tensor& y_, const std::vector<std::string>& features_, const std::string& className_, map<std::string, std::vector<int>>& states_)
SPODELd& SPODELd::fit(torch::Tensor& X_, torch::Tensor& y_, vector<string>& features_, string className_, map<string, vector<int>>& states_)
{ {
// This first part should go in a Classifier method called fit_local_discretization o fit_float... checkInput(X_, y_);
features = features_; features = features_;
className = className_; className = className_;
Xf = X_; Xf = X_;
y = y_; y = y_;
// Fills vectors Xv & yv with the data from tensors X_ (discretized) & y // Fills std::vectors Xv & yv with the data from tensors X_ (discretized) & y
fit_local_discretization(states, y); states = fit_local_discretization(y);
generateTensorXFromVector();
// We have discretized the input data // We have discretized the input data
// 1st we need to fit the model to build the normal SPODE structure, SPODE::fit initializes the base Bayesian network // 1st we need to fit the model to build the normal SPODE structure, SPODE::fit initializes the base Bayesian network
SPODE::fit(SPODE::Xv, SPODE::yv, features, className, states); SPODE::fit(dataset, features, className, states);
localDiscretizationProposal(states, model); states = localDiscretizationProposal(states, model);
generateTensorXFromVector();
Tensor ytmp = torch::transpose(y.view({ y.size(0), 1 }), 0, 1);
samples = torch::cat({ X, ytmp }, 0);
model.fit(SPODE::Xv, SPODE::yv, features, className);
return *this; return *this;
} }
Tensor SPODELd::predict(Tensor& X) SPODELd& SPODELd::fit(torch::Tensor& dataset, const std::vector<std::string>& features_, const std::string& className_, map<std::string, std::vector<int>>& states_)
{
if (!torch::is_floating_point(dataset)) {
throw std::runtime_error("Dataset must be a floating point tensor");
}
Xf = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." }).clone();
y = dataset.index({ -1, "..." }).clone();
features = features_;
className = className_;
// Fills std::vectors Xv & yv with the data from tensors X_ (discretized) & y
states = fit_local_discretization(y);
// We have discretized the input data
// 1st we need to fit the model to build the normal SPODE structure, SPODE::fit initializes the base Bayesian network
SPODE::fit(dataset, features, className, states);
states = localDiscretizationProposal(states, model);
return *this;
}
torch::Tensor SPODELd::predict(torch::Tensor& X)
{ {
auto Xt = prepareX(X); auto Xt = prepareX(X);
return SPODE::predict(Xt); return SPODE::predict(Xt);
} }
vector<string> SPODELd::graph(const string& name) std::vector<std::string> SPODELd::graph(const std::string& name) const
{ {
return SPODE::graph(name); return SPODE::graph(name);
} }

View File

@@ -4,16 +4,15 @@
#include "Proposal.h" #include "Proposal.h"
namespace bayesnet { namespace bayesnet {
using namespace std;
class SPODELd : public SPODE, public Proposal { class SPODELd : public SPODE, public Proposal {
private:
public: public:
explicit SPODELd(int root); explicit SPODELd(int root);
virtual ~SPODELd() = default; virtual ~SPODELd() = default;
SPODELd& fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states) override; SPODELd& fit(torch::Tensor& X, torch::Tensor& y, const std::vector<std::string>& features, const std::string& className, map<std::string, std::vector<int>>& states) override;
vector<string> graph(const string& name = "SPODE") override; SPODELd& fit(torch::Tensor& dataset, const std::vector<std::string>& features, const std::string& className, map<std::string, std::vector<int>>& states) override;
Tensor predict(Tensor& X) override; std::vector<std::string> graph(const std::string& name = "SPODE") const override;
static inline string version() { return "0.0.1"; }; torch::Tensor predict(torch::Tensor& X) override;
static inline std::string version() { return "0.0.1"; };
}; };
} }
#endif // !SPODELD_H #endif // !SPODELD_H

View File

@@ -1,29 +1,27 @@
#include "TAN.h" #include "TAN.h"
namespace bayesnet { namespace bayesnet {
using namespace torch;
TAN::TAN() : Classifier(Network()) {} TAN::TAN() : Classifier(Network()) {}
void TAN::train() void TAN::buildModel(const torch::Tensor& weights)
{ {
// 0. Add all nodes to the model // 0. Add all nodes to the model
addNodes(); addNodes();
// 1. Compute mutual information between each feature and the class and set the root node // 1. Compute mutual information between each feature and the class and set the root node
// as the highest mutual information with the class // as the highest mutual information with the class
auto mi = vector <pair<int, float >>(); auto mi = std::vector <std::pair<int, float >>();
Tensor class_dataset = samples.index({ -1, "..." }); torch::Tensor class_dataset = dataset.index({ -1, "..." });
for (int i = 0; i < static_cast<int>(features.size()); ++i) { for (int i = 0; i < static_cast<int>(features.size()); ++i) {
Tensor feature_dataset = samples.index({ i, "..." }); torch::Tensor feature_dataset = dataset.index({ i, "..." });
auto mi_value = metrics.mutualInformation(class_dataset, feature_dataset); auto mi_value = metrics.mutualInformation(class_dataset, feature_dataset, weights);
mi.push_back({ i, mi_value }); mi.push_back({ i, mi_value });
} }
sort(mi.begin(), mi.end(), [](const auto& left, const auto& right) {return left.second < right.second;}); sort(mi.begin(), mi.end(), [](const auto& left, const auto& right) {return left.second < right.second;});
auto root = mi[mi.size() - 1].first; auto root = mi[mi.size() - 1].first;
// 2. Compute mutual information between each feature and the class // 2. Compute mutual information between each feature and the class
auto weights = metrics.conditionalEdge(); auto weights_matrix = metrics.conditionalEdge(weights);
// 3. Compute the maximum spanning tree // 3. Compute the maximum spanning tree
auto mst = metrics.maximumSpanningTree(features, weights, root); auto mst = metrics.maximumSpanningTree(features, weights_matrix, root);
// 4. Add edges from the maximum spanning tree to the model // 4. Add edges from the maximum spanning tree to the model
for (auto i = 0; i < mst.size(); ++i) { for (auto i = 0; i < mst.size(); ++i) {
auto [from, to] = mst[i]; auto [from, to] = mst[i];
@@ -34,7 +32,7 @@ namespace bayesnet {
model.addEdge(className, feature); model.addEdge(className, feature);
} }
} }
vector<string> TAN::graph(const string& title) std::vector<std::string> TAN::graph(const std::string& title) const
{ {
return model.graph(title); return model.graph(title);
} }

View File

@@ -2,16 +2,14 @@
#define TAN_H #define TAN_H
#include "Classifier.h" #include "Classifier.h"
namespace bayesnet { namespace bayesnet {
using namespace std;
using namespace torch;
class TAN : public Classifier { class TAN : public Classifier {
private: private:
protected: protected:
void train() override; void buildModel(const torch::Tensor& weights) override;
public: public:
TAN(); TAN();
virtual ~TAN() {}; virtual ~TAN() = default;
vector<string> graph(const string& name = "TAN") override; std::vector<std::string> graph(const std::string& name = "TAN") const override;
}; };
} }
#endif #endif

View File

@@ -1,34 +1,29 @@
#include "TANLd.h" #include "TANLd.h"
namespace bayesnet { namespace bayesnet {
using namespace std; TANLd::TANLd() : TAN(), Proposal(dataset, features, className) {}
TANLd::TANLd() : TAN(), Proposal(TAN::Xv, TAN::yv, features, className) {} TANLd& TANLd::fit(torch::Tensor& X_, torch::Tensor& y_, const std::vector<std::string>& features_, const std::string& className_, map<std::string, std::vector<int>>& states_)
TANLd& TANLd::fit(torch::Tensor& X_, torch::Tensor& y_, vector<string>& features_, string className_, map<string, vector<int>>& states_)
{ {
// This first part should go in a Classifier method called fit_local_discretization o fit_float... checkInput(X_, y_);
features = features_; features = features_;
className = className_; className = className_;
Xf = X_; Xf = X_;
y = y_; y = y_;
// Fills vectors Xv & yv with the data from tensors X_ (discretized) & y // Fills std::vectors Xv & yv with the data from tensors X_ (discretized) & y
fit_local_discretization(states, y); states = fit_local_discretization(y);
generateTensorXFromVector();
// We have discretized the input data // We have discretized the input data
// 1st we need to fit the model to build the normal TAN structure, TAN::fit initializes the base Bayesian network // 1st we need to fit the model to build the normal TAN structure, TAN::fit initializes the base Bayesian network
TAN::fit(TAN::Xv, TAN::yv, features, className, states); TAN::fit(dataset, features, className, states);
localDiscretizationProposal(states, model); states = localDiscretizationProposal(states, model);
generateTensorXFromVector();
Tensor ytmp = torch::transpose(y.view({ y.size(0), 1 }), 0, 1);
samples = torch::cat({ X, ytmp }, 0);
model.fit(TAN::Xv, TAN::yv, features, className);
return *this; return *this;
} }
Tensor TANLd::predict(Tensor& X) torch::Tensor TANLd::predict(torch::Tensor& X)
{ {
auto Xt = prepareX(X); auto Xt = prepareX(X);
return TAN::predict(Xt); return TAN::predict(Xt);
} }
vector<string> TANLd::graph(const string& name) std::vector<std::string> TANLd::graph(const std::string& name) const
{ {
return TAN::graph(name); return TAN::graph(name);
} }

View File

@@ -4,16 +4,15 @@
#include "Proposal.h" #include "Proposal.h"
namespace bayesnet { namespace bayesnet {
using namespace std;
class TANLd : public TAN, public Proposal { class TANLd : public TAN, public Proposal {
private: private:
public: public:
TANLd(); TANLd();
virtual ~TANLd() = default; virtual ~TANLd() = default;
TANLd& fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states) override; TANLd& fit(torch::Tensor& X, torch::Tensor& y, const std::vector<std::string>& features, const std::string& className, map<std::string, std::vector<int>>& states) override;
vector<string> graph(const string& name = "TAN") override; std::vector<std::string> graph(const std::string& name = "TAN") const override;
Tensor predict(Tensor& X) override; torch::Tensor predict(torch::Tensor& X) override;
static inline string version() { return "0.0.1"; }; static inline std::string version() { return "0.0.1"; };
}; };
} }
#endif // !TANLD_H #endif // !TANLD_H

View File

@@ -1,25 +1,23 @@
#include "bayesnetUtils.h" #include "bayesnetUtils.h"
namespace bayesnet { namespace bayesnet {
using namespace std;
using namespace torch;
// Return the indices in descending order // Return the indices in descending order
vector<int> argsort(vector<float>& nums) std::vector<int> argsort(std::vector<double>& nums)
{ {
int n = nums.size(); int n = nums.size();
vector<int> indices(n); std::vector<int> indices(n);
iota(indices.begin(), indices.end(), 0); iota(indices.begin(), indices.end(), 0);
sort(indices.begin(), indices.end(), [&nums](int i, int j) {return nums[i] > nums[j];}); sort(indices.begin(), indices.end(), [&nums](int i, int j) {return nums[i] > nums[j];});
return indices; return indices;
} }
vector<vector<int>> tensorToVector(Tensor& tensor) std::vector<std::vector<int>> tensorToVector(torch::Tensor& tensor)
{ {
// convert mxn tensor to nxm vector // convert mxn tensor to nxm std::vector
vector<vector<int>> result; std::vector<std::vector<int>> result;
// Iterate over cols // Iterate over cols
for (int i = 0; i < tensor.size(1); ++i) { for (int i = 0; i < tensor.size(1); ++i) {
auto col_tensor = tensor.index({ "...", i }); auto col_tensor = tensor.index({ "...", i });
auto col = vector<int>(col_tensor.data_ptr<int>(), col_tensor.data_ptr<int>() + tensor.size(0)); auto col = std::vector<int>(col_tensor.data_ptr<int>(), col_tensor.data_ptr<int>() + tensor.size(0));
result.push_back(col); result.push_back(col);
} }
return result; return result;

View File

@@ -3,9 +3,7 @@
#include <torch/torch.h> #include <torch/torch.h>
#include <vector> #include <vector>
namespace bayesnet { namespace bayesnet {
using namespace std; std::vector<int> argsort(std::vector<double>& nums);
using namespace torch; std::vector<std::vector<int>> tensorToVector(torch::Tensor& tensor);
vector<int> argsort(vector<float>& nums);
vector<vector<int>> tensorToVector(Tensor& tensor);
} }
#endif //BAYESNET_UTILS_H #endif //BAYESNET_UTILS_H

343
src/Platform/BestResults.cc Normal file
View File

@@ -0,0 +1,343 @@
#include <filesystem>
#include <set>
#include <fstream>
#include <iostream>
#include <sstream>
#include <algorithm>
#include "BestResults.h"
#include "Result.h"
#include "Colors.h"
#include "Statistics.h"
#include "BestResultsExcel.h"
#include "CLocale.h"
namespace fs = std::filesystem;
// ftime_to_string: formats a time point of any clock as "YYYY-MM-DD HH:MM" (UTC).
// Conversion technique taken from https://stackoverflow.com/a/58237530/1389271
template <typename TP>
std::string ftime_to_string(TP tp)
{
    // Re-express tp on the system clock: measure its offset from "now" on its own
    // clock and apply that offset to "now" on the system clock.
    const auto offsetFromNow = tp - TP::clock::now();
    const auto systemPoint = std::chrono::time_point_cast<std::chrono::system_clock::duration>(offsetFromNow + std::chrono::system_clock::now());
    const auto seconds = std::chrono::system_clock::to_time_t(systemPoint);
    std::stringstream formatted;
    formatted << std::put_time(std::gmtime(&seconds), "%Y-%m-%d %H:%M");
    return formatted.str();
}
namespace platform {
std::string BestResults::build()
{
auto files = loadResultFiles();
if (files.size() == 0) {
std::cerr << Colors::MAGENTA() << "No result files were found!" << Colors::RESET() << std::endl;
exit(1);
}
json bests;
for (const auto& file : files) {
auto result = Result(path, file);
auto data = result.load();
for (auto const& item : data.at("results")) {
bool update = false;
// Check if results file contains only one dataset
auto datasetName = item.at("dataset").get<std::string>();
if (bests.contains(datasetName)) {
if (item.at("score").get<double>() > bests[datasetName].at(0).get<double>()) {
update = true;
}
} else {
update = true;
}
if (update) {
bests[datasetName] = { item.at("score").get<double>(), item.at("hyperparameters"), file };
}
}
}
std::string bestFileName = path + bestResultFile();
if (FILE* fileTest = fopen(bestFileName.c_str(), "r")) {
fclose(fileTest);
std::cout << Colors::MAGENTA() << "File " << bestFileName << " already exists and it shall be overwritten." << Colors::RESET() << std::endl;
}
std::ofstream file(bestFileName);
file << bests;
file.close();
return bestFileName;
}
// Name (without directory) of the best-results file for the current score and model:
// best_results_<score>_<model>.json
std::string BestResults::bestResultFile()
{
    std::string name = "best_results_";
    name += score;
    name += "_";
    name += model;
    name += ".json";
    return name;
}
// Extracts { model, score } from a result file name such as
//   results_accuracy_BoostAODE_MacBookpro16_2023-09-06_12:27:00_1.json
// where the score is the text between the 1st and 2nd '_' and the model the text
// between the 2nd and 3rd '_' (or up to the end of the name if there is no 3rd '_').
// Returns two empty strings when the name has fewer than two '_' separators, instead
// of relying on std::string::npos wrap-around arithmetic.
std::pair<std::string, std::string> getModelScore(std::string name)
{
    const auto scoreSeparator = name.find('_');
    if (scoreSeparator == std::string::npos) {
        return { "", "" };
    }
    const auto modelSeparator = name.find('_', scoreSeparator + 1);
    if (modelSeparator == std::string::npos) {
        return { "", "" };
    }
    const auto modelEnd = name.find('_', modelSeparator + 1);
    std::string score = name.substr(scoreSeparator + 1, modelSeparator - scoreSeparator - 1);
    // substr clamps the count, so npos means "up to the end of the name"
    const auto modelLength = modelEnd == std::string::npos ? std::string::npos : modelEnd - modelSeparator - 1;
    std::string model = name.substr(modelSeparator + 1, modelLength);
    return { model, score };
}
// Lists the file names in `path` that look like result files (contain ".json" and
// start with "results_") and whose score matches the configured score. The model
// must match too, unless the configured model is "any".
std::vector<std::string> BestResults::loadResultFiles()
{
    std::vector<std::string> selected;
    for (const auto& entry : std::filesystem::directory_iterator(path)) {
        const auto fileName = entry.path().filename().string();
        const bool isResultFile = fileName.find(".json") != std::string::npos && fileName.find("results_") == 0;
        if (!isResultFile) {
            continue;
        }
        const auto [fileModel, fileScore] = getModelScore(fileName);
        const bool modelMatches = model == fileModel || model == "any";
        if (score == fileScore && modelMatches) {
            selected.push_back(fileName);
        }
    }
    return selected;
}
// Parses the JSON document stored in fileName.
// Throws std::invalid_argument when the file cannot be opened.
json BestResults::loadFile(const std::string& fileName)
{
    std::ifstream resultData(fileName);
    if (!resultData.is_open()) {
        throw std::invalid_argument("Unable to open result file. [" + fileName + "]");
    }
    return json::parse(resultData);
}
std::vector<std::string> BestResults::getModels()
{
std::set<std::string> models;
std::vector<std::string> result;
auto files = loadResultFiles();
if (files.size() == 0) {
std::cerr << Colors::MAGENTA() << "No result files were found!" << Colors::RESET() << std::endl;
exit(1);
}
std::string fileModel, fileScore;
for (const auto& file : files) {
// extract the model from the file name
tie(fileModel, fileScore) = getModelScore(file);
// add the model to the std::vector of models
models.insert(fileModel);
}
result = std::vector<std::string>(models.begin(), models.end());
return result;
}
// Collects the keys of `table` (one per dataset) in iteration order.
std::vector<std::string> BestResults::getDatasets(json table)
{
    std::vector<std::string> names;
    names.reserve(table.size());
    for (const auto& entry : table.items()) {
        names.emplace_back(entry.key());
    }
    return names;
}
void BestResults::buildAll()
{
auto models = getModels();
for (const auto& model : models) {
std::cout << "Building best results for model: " << model << std::endl;
this->model = model;
build();
}
model = "any";
}
// Prints the best-results file of the current score/model as a table with one row per
// dataset (index, dataset, score, source file, hyperparameters) followed by the sum
// of the scores. Terminates the process with exit(1) when the file does not exist.
void BestResults::listFile()
{
    std::string bestFileName = path + bestResultFile();
    if (FILE* fileTest = fopen(bestFileName.c_str(), "r")) {
        fclose(fileTest);
    } else {
        std::cerr << Colors::MAGENTA() << "File " << bestFileName << " doesn't exist." << Colors::RESET() << std::endl;
        exit(1);
    }
    // NOTE(review): presumably configures the locale used for number formatting while
    // `temp` is alive -- confirm against CLocale.h
    auto temp = ConfigLocale();
    auto date = ftime_to_string(std::filesystem::last_write_time(bestFileName));
    auto data = loadFile(bestFileName);
    auto datasets = getDatasets(data);
    // Column widths. Computed with a plain loop so that a file without datasets yields
    // a width of 0 instead of dereferencing the end iterator of an empty range.
    int maxDatasetName = 0;
    for (const auto& dataset : datasets) {
        maxDatasetName = std::max(maxDatasetName, static_cast<int>(dataset.size()));
    }
    int maxFileName = 0;
    int maxHyper = 15;
    for (auto const& item : data.items()) {
        maxHyper = std::max(maxHyper, static_cast<int>(item.value().at(1).dump().size()));
        maxFileName = std::max(maxFileName, static_cast<int>(item.value().at(2).get<std::string>().size()));
    }
    std::stringstream oss;
    oss << Colors::GREEN() << "Best results for " << model << " as of " << date << std::endl;
    std::cout << oss.str();
    // The underline is 8 characters shorter than the title string (which also holds
    // the colour prefix and the line break)
    std::cout << std::string(oss.str().size() - 8, '-') << std::endl;
    std::cout << Colors::GREEN() << " # " << std::setw(maxDatasetName + 1) << std::left << "Dataset" << "Score " << std::setw(maxFileName) << "File" << " Hyperparameters" << std::endl;
    std::cout << "=== " << std::string(maxDatasetName, '=') << " =========== " << std::string(maxFileName, '=') << " " << std::string(maxHyper, '=') << std::endl;
    auto i = 0;
    bool odd = true;
    double total = 0;
    for (auto const& item : data.items()) {
        // Alternate row colours
        auto color = odd ? Colors::BLUE() : Colors::CYAN();
        double value = item.value().at(0).get<double>();
        std::cout << color << std::setw(3) << std::fixed << std::right << i++ << " ";
        std::cout << std::setw(maxDatasetName) << std::left << item.key() << " ";
        std::cout << std::setw(11) << std::setprecision(9) << std::fixed << value << " ";
        std::cout << std::setw(maxFileName) << item.value().at(2).get<std::string>() << " ";
        std::cout << item.value().at(1) << " ";
        std::cout << std::endl;
        total += value;
        odd = !odd;
    }
    std::cout << Colors::GREEN() << "=== " << std::string(maxDatasetName, '=') << " ===========" << std::endl;
    std::cout << std::setw(5 + maxDatasetName) << "Total.................. " << std::setw(11) << std::setprecision(8) << std::fixed << total << std::endl;
}
// Loads the best-results file of every model into one JSON object keyed by model name.
// An extra "dateTable" entry holds a formatted write time of those files.
// Side effect: the `model` member ends up set to the last model processed.
// Terminates the process with exit(1) when a best-results file is missing.
json BestResults::buildTableResults(std::vector<std::string> models)
{
    json table;
    // NOTE(review): the original variable was called maxDate, but the comparison keeps
    // the EARLIEST write time among the files -- confirm which one is intended.
    auto earliestWrite = std::filesystem::file_time_type::max();
    for (const auto& modelName : models) {
        model = modelName;
        const std::string bestFileName = path + bestResultFile();
        FILE* fileTest = fopen(bestFileName.c_str(), "r");
        if (fileTest == nullptr) {
            std::cerr << Colors::MAGENTA() << "File " << bestFileName << " doesn't exist." << Colors::RESET() << std::endl;
            exit(1);
        }
        fclose(fileTest);
        earliestWrite = std::min(earliestWrite, std::filesystem::last_write_time(bestFileName));
        table[modelName] = loadFile(bestFileName);
    }
    table["dateTable"] = ftime_to_string(earliestWrite);
    return table;
}
void BestResults::printTableResults(std::vector<std::string> models, json table)
{
std::stringstream oss;
oss << Colors::GREEN() << "Best results for " << score << " as of " << table.at("dateTable").get<std::string>() << std::endl;
std::cout << oss.str();
std::cout << std::string(oss.str().size() - 8, '-') << std::endl;
std::cout << Colors::GREEN() << " # " << std::setw(maxDatasetName + 1) << std::left << std::string("Dataset");
for (const auto& model : models) {
std::cout << std::setw(maxModelName) << std::left << model << " ";
}
std::cout << std::endl;
std::cout << "=== " << std::string(maxDatasetName, '=') << " ";
for (const auto& model : models) {
std::cout << std::string(maxModelName, '=') << " ";
}
std::cout << std::endl;
auto i = 0;
bool odd = true;
std::map<std::string, double> totals;
int nDatasets = table.begin().value().size();
for (const auto& model : models) {
totals[model] = 0.0;
}
auto datasets = getDatasets(table.begin().value());
for (auto const& dataset : datasets) {
auto color = odd ? Colors::BLUE() : Colors::CYAN();
std::cout << color << std::setw(3) << std::fixed << std::right << i++ << " ";
std::cout << std::setw(maxDatasetName) << std::left << dataset << " ";
double maxValue = 0;
// Find out the max value for this dataset
for (const auto& model : models) {
double value = table[model].at(dataset).at(0).get<double>();
if (value > maxValue) {
maxValue = value;
}
}
// Print the row with red colors on max values
for (const auto& model : models) {
std::string efectiveColor = color;
double value = table[model].at(dataset).at(0).get<double>();
if (value == maxValue) {
efectiveColor = Colors::RED();
}
totals[model] += value;
std::cout << efectiveColor << std::setw(maxModelName) << std::setprecision(maxModelName - 2) << std::fixed << value << " ";
}
std::cout << std::endl;
odd = !odd;
}
std::cout << Colors::GREEN() << "=== " << std::string(maxDatasetName, '=') << " ";
for (const auto& model : models) {
std::cout << std::string(maxModelName, '=') << " ";
}
std::cout << std::endl;
std::cout << Colors::GREEN() << std::setw(5 + maxDatasetName) << " Totals...................";
double max = 0.0;
for (const auto& total : totals) {
if (total.second > max) {
max = total.second;
}
}
for (const auto& model : models) {
std::string efectiveColor = Colors::GREEN();
if (totals[model] == max) {
efectiveColor = Colors::RED();
}
std::cout << efectiveColor << std::right << std::setw(maxModelName) << std::setprecision(maxModelName - 4) << std::fixed << totals[model] << " ";
}
std::cout << std::endl;
}
void BestResults::reportSingle(bool excel)
{
listFile();
if (excel) {
auto models = getModels();
// Build the table of results
json table = buildTableResults(models);
std::vector<std::string> datasets = getDatasets(table.begin().value());
BestResultsExcel excel(score, datasets);
excel.reportSingle(model, path + bestResultFile());
messageExcelFile(excel.getFileName());
}
}
void BestResults::reportAll(bool excel)
{
auto models = getModels();
// Build the table of results
json table = buildTableResults(models);
std::vector<std::string> datasets = getDatasets(table.begin().value());
maxModelName = (*max_element(models.begin(), models.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
maxModelName = std::max(12, maxModelName);
maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
maxDatasetName = std::max(25, maxDatasetName);
// Print the table of results
printTableResults(models, table);
// Compute the Friedman test
std::map<std::string, std::map<std::string, float>> ranksModels;
if (friedman) {
Statistics stats(models, datasets, table, significance);
auto result = stats.friedmanTest();
stats.postHocHolmTest(result);
ranksModels = stats.getRanks();
}
if (excel) {
BestResultsExcel excel(score, datasets);
excel.reportAll(models, table, ranksModels, friedman, significance);
if (friedman) {
int idx = -1;
double min = 2000;
// Find out the control model
auto totals = std::vector<double>(models.size(), 0.0);
for (const auto& dataset : datasets) {
for (int i = 0; i < models.size(); ++i) {
totals[i] += ranksModels[dataset][models[i]];
}
}
for (int i = 0; i < models.size(); ++i) {
if (totals[i] < min) {
min = totals[i];
idx = i;
}
}
model = models.at(idx);
excel.reportSingle(model, path + bestResultFile());
}
messageExcelFile(excel.getFileName());
}
}
// Tells the user on stdout where the Excel file was written.
void BestResults::messageExcelFile(const std::string& fileName)
{
    std::cout << Colors::YELLOW() << "** Excel file generated: " << fileName;
    std::cout << Colors::RESET() << std::endl;
}
}

View File

@@ -0,0 +1,36 @@
#ifndef BESTRESULTS_H
#define BESTRESULTS_H
#include <string>
#include <nlohmann/json.hpp>
using json = nlohmann::json;
namespace platform {
// Collects, per dataset, the best result stored in the result files of a directory and
// reports them on the console and optionally in an Excel workbook.
class BestResults {
public:
// path: directory holding the result files; it is concatenated directly with file names.
// score: score name the result files must match.
// model: model name the result files must match ("any" matches every model).
// friedman: whether reportAll also runs the Friedman / Holm tests.
// significance: significance level handed to those tests.
explicit BestResults(const std::string& path, const std::string& score, const std::string& model, bool friedman, double significance = 0.05)
: path(path), score(score), model(model), friedman(friedman), significance(significance)
{
}
// Writes the best-results file for the current score/model and returns its name.
std::string build();
// Lists the best-results file of the current model; optionally exports it to Excel.
void reportSingle(bool excel);
// Prints the table of best results of every model; optionally exports it to Excel.
void reportAll(bool excel);
// Calls build() once per model found in the result files.
void buildAll();
private:
// Distinct model names found in the matching result files.
std::vector<std::string> getModels();
// Keys of the given JSON object, one per dataset.
std::vector<std::string> getDatasets(json table);
// Names of the result files in `path` matching the current score and model.
std::vector<std::string> loadResultFiles();
// Prints the name of the generated Excel file.
void messageExcelFile(const std::string& fileName);
// Loads the best-results file of each model into one JSON object keyed by model.
json buildTableResults(std::vector<std::string> models);
// Prints the datasets x models table built by buildTableResults.
void printTableResults(std::vector<std::string> models, json table);
// File name (without directory) of the best-results file for the current score/model.
std::string bestResultFile();
// Parses a JSON file; throws std::invalid_argument if it cannot be opened.
json loadFile(const std::string& fileName);
// Prints the best-results file of the current score/model.
void listFile();
std::string path;
std::string score;
// Current model; several methods reassign it while iterating over models.
std::string model;
bool friedman;
double significance;
// Column widths used by printTableResults; computed in reportAll.
int maxModelName = 0;
int maxDatasetName = 0;
};
}
#endif //BESTRESULTS_H

View File

@@ -0,0 +1,300 @@
#include <sstream>
#include "BestResultsExcel.h"
#include "Paths.h"
#include <map>
#include <nlohmann/json.hpp>
#include "Statistics.h"
#include "ReportExcel.h"
namespace platform {
// Parses the JSON document stored in fileName.
// Throws std::invalid_argument when the file cannot be opened.
json loadResultData(const std::string& fileName)
{
    std::ifstream resultData(fileName);
    if (!resultData.is_open()) {
        throw std::invalid_argument("Unable to open result file. [" + fileName + "]");
    }
    return json::parse(resultData);
}
// Converts a zero-based column index into its spreadsheet column name:
// 0 -> "A", 25 -> "Z", 26 -> "AA", 27 -> "AB", 701 -> "ZZ", 702 -> "AAA", ...
// Returns an empty string for a negative index.
std::string getColumnName(int colNum)
{
    std::string columnName;
    // Column names are bijective base-26 (there is no "zero" letter), so after taking
    // each letter the remaining quotient must be reduced by one.
    for (int remaining = colNum; remaining >= 0; remaining = remaining / 26 - 1) {
        columnName.insert(columnName.begin(), static_cast<char>('A' + remaining % 26));
    }
    return columnName;
}
// Creates the workbook at Paths::excel() + fileName, sets its properties and formats,
// and widens the dataset column to fit the longest dataset name.
BestResultsExcel::BestResultsExcel(const std::string& score, const std::vector<std::string>& datasets) : score(score), datasets(datasets)
{
    const std::string workbookPath = Paths::excel() + fileName;
    workbook = workbook_new(workbookPath.c_str());
    setProperties("Best Results");
    const auto shorterThan = [](const std::string& a, const std::string& b) { return a.size() < b.size(); };
    const auto longest = std::max_element(datasets.begin(), datasets.end(), shorterThan);
    const int maxDatasetName = longest->size();
    datasetNameSize = std::max(datasetNameSize, maxDatasetName);
    createFormats();
}
// Stores the report inputs, adds the "Best Results" worksheet, sizes the model columns
// to the longest model name and writes the sheets through build().
void BestResultsExcel::reportAll(const std::vector<std::string>& models, const json& table, const std::map<std::string, std::map<std::string, float>>& ranks, bool friedman, double significance)
{
    this->models = models;
    this->table = table;
    this->friedman = friedman;
    this->significance = significance;
    ranksModels = ranks;
    worksheet = workbook_add_worksheet(workbook, "Best Results");
    const auto shorterThan = [](const std::string& a, const std::string& b) { return a.size() < b.size(); };
    const auto longest = std::max_element(models.begin(), models.end(), shorterThan);
    const int maxModelName = longest->size();
    modelNameSize = std::max(modelNameSize, maxModelName);
    formatColumns();
    build();
}
void BestResultsExcel::reportSingle(const std::string& model, const std::string& fileName)
{
worksheet = workbook_add_worksheet(workbook, "Report");
if (FILE* fileTest = fopen(fileName.c_str(), "r")) {
fclose(fileTest);
} else {
std::cerr << "File " << fileName << " doesn't exist." << std::endl;
exit(1);
}
json data = loadResultData(fileName);
std::string title = "Best results for " + model;
worksheet_merge_range(worksheet, 0, 0, 0, 4, title.c_str(), styles["headerFirst"]);
// Body header
row = 3;
int col = 1;
writeString(row, 0, "", "bodyHeader");
writeString(row, 1, "Dataset", "bodyHeader");
writeString(row, 2, "Score", "bodyHeader");
writeString(row, 3, "File", "bodyHeader");
writeString(row, 4, "Hyperparameters", "bodyHeader");
auto i = 0;
std::string hyperparameters;
int hypSize = 22;
std::map<std::string, std::string> files; // map of files imported and their tabs
for (auto const& item : data.items()) {
row++;
writeInt(row, 0, i++, "ints");
writeString(row, 1, item.key().c_str(), "text");
writeDouble(row, 2, item.value().at(0).get<double>(), "result");
auto fileName = item.value().at(2).get<std::string>();
std::string hyperlink = "";
try {
hyperlink = files.at(fileName);
}
catch (const std::out_of_range& oor) {
auto tabName = "table_" + std::to_string(i);
auto worksheetNew = workbook_add_worksheet(workbook, tabName.c_str());
json data = loadResultData(Paths::results() + fileName);
auto report = ReportExcel(data, false, workbook, worksheetNew);
report.show();
hyperlink = "#table_" + std::to_string(i);
files[fileName] = hyperlink;
}
hyperlink += "!H" + std::to_string(i + 6);
std::string fileNameText = "=HYPERLINK(\"" + hyperlink + "\",\"" + fileName + "\")";
worksheet_write_formula(worksheet, row, 3, fileNameText.c_str(), efectiveStyle("text"));
hyperparameters = item.value().at(1).dump();
if (hyperparameters.size() > hypSize) {
hypSize = hyperparameters.size();
}
writeString(row, 4, hyperparameters, "text");
}
row++;
// Set Totals
writeString(row, 1, "Total", "bodyHeader");
std::stringstream oss;
auto colName = getColumnName(2);
oss << "=sum(" << colName << "5:" << colName << row << ")";
worksheet_write_formula(worksheet, row, 2, oss.str().c_str(), styles["bodyHeader_odd"]);
// Set format
worksheet_freeze_panes(worksheet, 4, 2);
std::vector<int> columns_sizes = { 5, datasetNameSize, modelNameSize, 66, hypSize + 1 };
for (int i = 0; i < columns_sizes.size(); ++i) {
worksheet_set_column(worksheet, i, i, columns_sizes.at(i), NULL);
}
}
// Closes the workbook when the report object goes away.
BestResultsExcel::~BestResultsExcel()
{
    workbook_close(workbook);
}
void BestResultsExcel::formatColumns()
{
worksheet_freeze_panes(worksheet, 4, 2);
std::vector<int> columns_sizes = { 5, datasetNameSize };
for (int i = 0; i < models.size(); ++i) {
columns_sizes.push_back(modelNameSize);
}
for (int i = 0; i < columns_sizes.size(); ++i) {
worksheet_set_column(worksheet, i, i, columns_sizes.at(i), NULL);
}
}
// Adds a conditional format to the model columns of the current worksheet that
// highlights, on every dataset row, the cells equal to `formula` applied to that row
// (`formula` is a spreadsheet function name; the callers pass "max" and "min").
void BestResultsExcel::addConditionalFormat(std::string formula)
{
    // Add conditional format for max/min values in scores/ranks sheets
    lxw_format* custom_format = workbook_add_format(workbook);
    format_set_bg_color(custom_format, 0xFFC7CE);
    format_set_font_color(custom_format, 0x9C0006);
    // e.g. "=C5=max($C5:$F5)": compare each cell with the aggregate of its own row
    const std::string lastColumn = getColumnName(models.size() + 1);
    const std::string formulaValue = "=C5=" + formula + "($C5:$" + lastColumn + "5)";
    // A zero-initialised object with automatic storage replaces the calloc'd one, which
    // was never freed. NOTE(review): libxlsxwriter's examples free their struct right
    // after the call, so the library is presumed not to keep the pointer -- confirm.
    lxw_conditional_format conditional_format{};
    conditional_format.type = LXW_CONDITIONAL_TYPE_FORMULA;
    conditional_format.value_string = formulaValue.c_str();
    conditional_format.format = custom_format;
    worksheet_conditional_format_range(worksheet, 4, 2, datasets.size() + 3, models.size() + 1, &conditional_format);
}
// Writes the scores sheet into the current worksheet and, when `friedman` is set,
// adds a "Ranks" sheet and the Friedman test sheet.
void BestResultsExcel::build()
{
    // Scores sheet, highlighting the maximum of every row
    header(false);
    body(false);
    addConditionalFormat("max");
    footer(false);
    if (!friedman) {
        return;
    }
    // Ranks sheet, highlighting the minimum of every row
    worksheet = workbook_add_worksheet(workbook, "Ranks");
    formatColumns();
    header(true);
    body(true);
    addConditionalFormat("min");
    footer(true);
    // Friedman test sheet
    doFriedman();
}
std::string BestResultsExcel::getFileName()
{
return Paths::excel() + fileName;
}
// Writes the sheet title (merged across all columns) and the column header row:
// an empty cell, "Dataset" and one cell per model. Leaves `row` at the header row (3).
void BestResultsExcel::header(bool ranks)
{
    row = 0;
    const std::string message = (ranks ? "Ranks for score " : "Best results for ") + score;
    worksheet_merge_range(worksheet, 0, 0, 0, 1 + models.size(), message.c_str(), styles["headerFirst"]);
    // Body header
    row = 3;
    writeString(row, 0, "", "bodyHeader");
    writeString(row, 1, "Dataset", "bodyHeader");
    int modelCol = 2;
    for (const auto& model : models) {
        writeString(row, modelCol, model.c_str(), "bodyHeader");
        ++modelCol;
    }
}
void BestResultsExcel::body(bool ranks)
{
row = 4;
int i = 0;
json origin = table.begin().value();
for (auto const& item : origin.items()) {
writeInt(row, 0, i++, "ints");
writeString(row, 1, item.key().c_str(), "text");
int col = 1;
for (const auto& model : models) {
double value = ranks ? ranksModels[item.key()][model] : table[model].at(item.key()).at(0).get<double>();
writeDouble(row, ++col, value, "result");
}
++row;
}
}
void BestResultsExcel::footer(bool ranks)
{
// Set Totals
writeString(row, 1, "Total", "bodyHeader");
int col = 1;
for (const auto& model : models) {
std::stringstream oss;
auto colName = getColumnName(col + 1);
oss << "=SUM(" << colName << "5:" << colName << row << ")";
worksheet_write_formula(worksheet, row, ++col, oss.str().c_str(), styles["bodyHeader_odd"]);
}
if (ranks) {
row++;
writeString(row, 1, "Average ranks", "bodyHeader");
int col = 1;
for (const auto& model : models) {
auto colName = getColumnName(col + 1);
std::stringstream oss;
oss << "=SUM(" << colName << "5:" << colName << row - 1 << ")/" << datasets.size();
worksheet_write_formula(worksheet, row, ++col, oss.str().c_str(), styles["bodyHeader_odd"]);
}
}
}
void BestResultsExcel::doFriedman()
{
worksheet = workbook_add_worksheet(workbook, "Friedman");
std::vector<int> columns_sizes = { 5, datasetNameSize };
for (int i = 0; i < models.size(); ++i) {
columns_sizes.push_back(modelNameSize);
}
for (int i = 0; i < columns_sizes.size(); ++i) {
worksheet_set_column(worksheet, i, i, columns_sizes.at(i), NULL);
}
worksheet_merge_range(worksheet, 0, 0, 0, 1 + models.size(), "Friedman Test", styles["headerFirst"]);
row = 2;
Statistics stats(models, datasets, table, significance, false);
auto result = stats.friedmanTest();
stats.postHocHolmTest(result);
auto friedmanResult = stats.getFriedmanResult();
auto holmResult = stats.getHolmResult();
worksheet_merge_range(worksheet, row, 0, row, 1 + models.size(), "Null hypothesis: H0 'There is no significant differences between all the classifiers.'", styles["headerSmall"]);
row += 2;
writeString(row, 1, "Friedman Q", "bodyHeader");
writeDouble(row, 2, friedmanResult.statistic, "bodyHeader");
row++;
writeString(row, 1, "Critical χ2 value", "bodyHeader");
writeDouble(row, 2, friedmanResult.criticalValue, "bodyHeader");
row++;
writeString(row, 1, "p-value", "bodyHeader");
writeDouble(row, 2, friedmanResult.pvalue, "bodyHeader");
writeString(row, 3, friedmanResult.reject ? "<" : ">", "bodyHeader");
writeDouble(row, 4, significance, "bodyHeader");
writeString(row, 5, friedmanResult.reject ? "Reject H0" : "Accept H0", "bodyHeader");
row += 3;
worksheet_merge_range(worksheet, row, 0, row, 1 + models.size(), "Holm Test", styles["headerFirst"]);
row += 2;
worksheet_merge_range(worksheet, row, 0, row, 1 + models.size(), "Null hypothesis: H0 'There is no significant differences between the control model and the other models.'", styles["headerSmall"]);
row += 2;
std::string controlModel = "Control Model: " + holmResult.model;
worksheet_merge_range(worksheet, row, 1, row, 7, controlModel.c_str(), styles["bodyHeader_odd"]);
row++;
writeString(row, 1, "Model", "bodyHeader");
writeString(row, 2, "p-value", "bodyHeader");
writeString(row, 3, "Rank", "bodyHeader");
writeString(row, 4, "Win", "bodyHeader");
writeString(row, 5, "Tie", "bodyHeader");
writeString(row, 6, "Loss", "bodyHeader");
writeString(row, 7, "Reject H0", "bodyHeader");
row++;
bool first = true;
for (const auto& item : holmResult.holmLines) {
writeString(row, 1, item.model, "text");
if (first) {
// Control model info
first = false;
writeString(row, 2, "", "text");
writeDouble(row, 3, item.rank, "result");
writeString(row, 4, "", "text");
writeString(row, 5, "", "text");
writeString(row, 6, "", "text");
writeString(row, 7, "", "textCentered");
} else {
// Rest of the models info
writeDouble(row, 2, item.pvalue, "result");
writeDouble(row, 3, item.rank, "result");
writeInt(row, 4, item.wtl.win, "ints");
writeInt(row, 5, item.wtl.tie, "ints");
writeInt(row, 6, item.wtl.loss, "ints");
writeString(row, 7, item.reject ? "Yes" : "No", "textCentered");
}
row++;
}
}
}

View File

@@ -0,0 +1,39 @@
#ifndef BESTRESULTS_EXCEL_H
#define BESTRESULTS_EXCEL_H
#include "ExcelFile.h"
#include <vector>
#include <map>
#include <nlohmann/json.hpp>
using json = nlohmann::json;
namespace platform {
// Excel report with the best results of every model on every dataset.
// Builds on ExcelFile (private inheritance) for the workbook, worksheet,
// styles and cell-writing helpers.
class BestResultsExcel : ExcelFile {
public:
    BestResultsExcel(const std::string& score, const std::vector<std::string>& datasets);
    ~BestResultsExcel();
    // Report for all the models; friedman enables the Ranks and Friedman sheets
    void reportAll(const std::vector<std::string>& models, const json& table, const std::map<std::string, std::map<std::string, float>>& ranks, bool friedman, double significance);
    // Report for a single model
    void reportSingle(const std::string& model, const std::string& fileName);
    // Excel output folder followed by the report file name
    std::string getFileName();
private:
    void build();
    void header(bool ranks);
    void body(bool ranks);
    void footer(bool ranks);
    void formatColumns();
    void doFriedman();
    void addConditionalFormat(std::string formula);
    const std::string fileName = "BestResults.xlsx";
    std::string score;
    std::vector<std::string> models;
    std::vector<std::string> datasets;
    json table;
    std::map<std::string, std::map<std::string, float>> ranksModels;
    // Initialized so that build() never branches on an indeterminate value
    bool friedman{ false };
    double significance{ 0.0 };
    int modelNameSize = 12; // Min size of the column
    int datasetNameSize = 25; // Min size of the column
};
}
#endif //BESTRESULTS_EXCEL_H

28
src/Platform/BestScore.h Normal file
View File

@@ -0,0 +1,28 @@
#ifndef BESTSCORE_H
#define BESTSCORE_H
#include <string>
#include <map>
#include <utility>
#include "DotEnv.h"
namespace platform {
class BestScore {
public:
static std::pair<std::string, double> getScore(const std::string& metric)
{
static std::map<std::pair<std::string, std::string>, std::pair<std::string, double>> data = {
{{"discretiz", "accuracy"}, {"STree_default (linear-ovo)", 22.109799}},
{{"odte", "accuracy"}, {"STree_default (linear-ovo)", 22.109799}},
};
auto env = platform::DotEnv();
std::string experiment = env.get("experiment");
try {
return data[{experiment, metric}];
}
catch (...) {
return { "", 0.0 };
}
}
};
}
#endif

22
src/Platform/CLocale.h Normal file
View File

@@ -0,0 +1,22 @@
#ifndef LOCALE_H
#define LOCALE_H
#include <locale>
#include <iostream>
#include <string>
namespace platform {
// Number punctuation facet: ',' as decimal point and '.' as thousands
// separator, with digits grouped in threes.
struct separation : std::numpunct<char> {
    // override makes the compiler verify that these really replace the
    // std::numpunct virtuals instead of silently declaring new functions.
    char do_decimal_point() const override { return ','; }
    char do_thousands_sep() const override { return '.'; }
    std::string do_grouping() const override { return "\03"; }
};
class ConfigLocale {
public:
explicit ConfigLocale()
{
std::locale mylocale(std::cout.getloc(), new separation);
std::locale::global(mylocale);
std::cout.imbue(mylocale);
}
};
}
#endif

View File

@@ -1,8 +1,21 @@
include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet) include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
include_directories(${BayesNet_SOURCE_DIR}/src/Platform) include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
include_directories(${BayesNet_SOURCE_DIR}/src/PyClassifiers)
include_directories(${BayesNet_SOURCE_DIR}/lib/Files) include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp) include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include) include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include)
include_directories(${BayesNet_SOURCE_DIR}/lib/json/include) include_directories(${BayesNet_SOURCE_DIR}/lib/json/include)
add_executable(main main.cc Folding.cc platformUtils.cc Experiment.cc Datasets.cc Models.cc Report.cc) include_directories(${BayesNet_SOURCE_DIR}/lib/libxlsxwriter/include)
target_link_libraries(main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}") include_directories(${Python3_INCLUDE_DIRS})
add_executable(b_best b_best.cc BestResults.cc Result.cc Statistics.cc BestResultsExcel.cc ReportExcel.cc ReportBase.cc Datasets.cc Dataset.cc ExcelFile.cc)
add_executable(b_grid b_grid.cc GridSearch.cc GridData.cc HyperParameters.cc Folding.cc Datasets.cc Dataset.cc)
add_executable(b_list b_list.cc Datasets.cc Dataset.cc)
add_executable(b_main b_main.cc Folding.cc Experiment.cc Datasets.cc Dataset.cc Models.cc HyperParameters.cc ReportConsole.cc ReportBase.cc)
add_executable(b_manage b_manage.cc Results.cc ManageResults.cc CommandParser.cc Result.cc ReportConsole.cc ReportExcel.cc ReportBase.cc Datasets.cc Dataset.cc ExcelFile.cc)
target_link_libraries(b_best Boost::boost "${XLSXWRITER_LIB}" "${TORCH_LIBRARIES}" ArffFiles mdlp)
target_link_libraries(b_grid BayesNet PyWrap)
target_link_libraries(b_list ArffFiles mdlp "${TORCH_LIBRARIES}")
target_link_libraries(b_main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}" PyWrap)
target_link_libraries(b_manage "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" ArffFiles mdlp)

14
src/Platform/Colors.h Normal file
View File

@@ -0,0 +1,14 @@
#ifndef COLORS_H
#define COLORS_H
// Terminal escape sequences used to color console output.
class Colors {
public:
    // Restores the default attributes
    static std::string RESET() { return std::string("\033[0m"); }
    // Bold foreground colors
    static std::string RED() { return std::string("\033[1;31m"); }
    static std::string GREEN() { return std::string("\033[1;32m"); }
    static std::string YELLOW() { return std::string("\033[1;33m"); }
    static std::string BLUE() { return std::string("\033[1;34m"); }
    static std::string MAGENTA() { return std::string("\033[1;35m"); }
    static std::string CYAN() { return std::string("\033[1;36m"); }
    static std::string WHITE() { return std::string("\033[1;37m"); }
};
#endif // COLORS_H

View File

@@ -0,0 +1,87 @@
#include "CommandParser.h"
#include <algorithm>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include "Colors.h"
#include "Utils.h"
namespace platform {
// Prints message in red on standard output, then resets the color.
void CommandParser::messageError(const std::string& message)
{
    const std::string colored = Colors::RED() + message + Colors::RESET();
    std::cout << colored << std::endl;
}
std::pair<char, int> CommandParser::parse(const std::string& color, const std::vector<std::tuple<std::string, char, bool>>& options, const char defaultCommand, const int maxIndex)
{
bool finished = false;
while (!finished) {
std::stringstream oss;
std::string line;
oss << color << "Choose option (";
bool first = true;
for (auto& option : options) {
if (first) {
first = false;
} else {
oss << ", ";
}
oss << std::get<char>(option) << "=" << std::get<std::string>(option);
}
oss << "): ";
std::cout << oss.str();
getline(std::cin, line);
std::cout << Colors::RESET();
line = trim(line);
if (line.size() == 0)
continue;
if (all_of(line.begin(), line.end(), ::isdigit)) {
command = defaultCommand;
index = stoi(line);
if (index > maxIndex || index < 0) {
messageError("Index out of range");
continue;
}
finished = true;
break;
}
bool found = false;
for (auto& option : options) {
if (line[0] == std::get<char>(option)) {
found = true;
// it's a match
line.erase(line.begin());
line = trim(line);
if (std::get<bool>(option)) {
// The option requires a value
if (line.size() == 0) {
messageError("Option " + std::get<std::string>(option) + " requires a value");
break;
}
try {
index = stoi(line);
if (index > maxIndex || index < 0) {
messageError("Index out of range");
break;
}
}
catch (const std::invalid_argument& ia) {
messageError("Invalid value: " + line);
break;
}
} else {
if (line.size() > 0) {
messageError("option " + std::get<std::string>(option) + " doesn't accept values");
break;
}
}
command = std::get<char>(option);
finished = true;
break;
}
}
if (!found) {
messageError("I don't know " + line);
}
}
return { command, index };
}
} /* namespace platform */

View File

@@ -0,0 +1,20 @@
#ifndef COMMAND_PARSER_H
#define COMMAND_PARSER_H
#include <string>
#include <vector>
#include <tuple>
namespace platform {
// Interactive reader of single-letter commands with an optional numeric index.
class CommandParser {
public:
    CommandParser() = default;
    // Prompts until a valid command is typed and returns {command, index}
    std::pair<char, int> parse(const std::string& color, const std::vector<std::tuple<std::string, char, bool>>& options, const char defaultCommand, const int maxIndex);
    char getCommand() const { return command; }
    int getIndex() const { return index; }
private:
    void messageError(const std::string& message);
    // Initialized because parse() returns index untouched for options that take
    // no value, and the getters can be called before any parse().
    char command{ '\0' };
    int index{ 0 };
};
} /* namespace platform */
#endif /* COMMAND_PARSER_H */

215
src/Platform/Dataset.cc Normal file
View File

@@ -0,0 +1,215 @@
#include "Dataset.h"
#include "ArffFiles.h"
#include <fstream>
namespace platform {
// Copy constructor: member-wise copy of the source dataset.
Dataset::Dataset(const Dataset& dataset) :
    path(dataset.path),
    name(dataset.name),
    fileType(dataset.fileType),
    className(dataset.className),
    n_samples(dataset.n_samples),
    n_features(dataset.n_features),
    features(dataset.features),
    states(dataset.states),
    loaded(dataset.loaded),
    discretize(dataset.discretize),
    X(dataset.X),
    y(dataset.y),
    Xv(dataset.Xv),
    Xd(dataset.Xd),
    yv(dataset.yv)
{
}
std::string Dataset::getName() const
{
return name;
}
std::string Dataset::getClassName() const
{
return className;
}
// Feature names. Throws std::invalid_argument if the dataset is not loaded.
std::vector<std::string> Dataset::getFeatures() const
{
    if (!loaded) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return features;
}
// Number of features. Throws std::invalid_argument if the dataset is not loaded.
int Dataset::getNFeatures() const
{
    if (!loaded) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return n_features;
}
// Number of samples. Throws std::invalid_argument if the dataset is not loaded.
int Dataset::getNSamples() const
{
    if (!loaded) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return n_samples;
}
// States of every variable, keyed by variable name.
// Throws std::invalid_argument if the dataset is not loaded.
std::map<std::string, std::vector<int>> Dataset::getStates() const
{
    if (!loaded) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return states;
}
pair<std::vector<std::vector<float>>&, std::vector<int>&> Dataset::getVectors()
{
if (loaded) {
return { Xv, yv };
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
pair<std::vector<std::vector<int>>&, std::vector<int>&> Dataset::getVectorsDiscretized()
{
if (loaded) {
return { Xd, yv };
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
{
if (loaded) {
buildTensors();
return { X, y };
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
void Dataset::load_csv()
{
ifstream file(path + "/" + name + ".csv");
if (file.is_open()) {
std::string line;
getline(file, line);
std::vector<std::string> tokens = split(line, ',');
features = std::vector<std::string>(tokens.begin(), tokens.end() - 1);
if (className == "-1") {
className = tokens.back();
}
for (auto i = 0; i < features.size(); ++i) {
Xv.push_back(std::vector<float>());
}
while (getline(file, line)) {
tokens = split(line, ',');
for (auto i = 0; i < features.size(); ++i) {
Xv[i].push_back(stof(tokens[i]));
}
yv.push_back(stoi(tokens.back()));
}
file.close();
} else {
throw std::invalid_argument("Unable to open dataset file.");
}
}
void Dataset::computeStates()
{
for (int i = 0; i < features.size(); ++i) {
states[features[i]] = std::vector<int>(*max_element(Xd[i].begin(), Xd[i].end()) + 1);
auto item = states.at(features[i]);
iota(begin(item), end(item), 0);
}
states[className] = std::vector<int>(*max_element(yv.begin(), yv.end()) + 1);
iota(begin(states.at(className)), end(states.at(className)), 0);
}
void Dataset::load_arff()
{
auto arff = ArffFiles();
arff.load(path + "/" + name + ".arff", className);
// Get Dataset X, y
Xv = arff.getX();
yv = arff.getY();
// Get className & Features
className = arff.getClassName();
auto attributes = arff.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& attribute) { return attribute.first; });
}
// Splits line into the tokens separated by runs of spaces, tabs or newlines.
// Leading and trailing separators are ignored, so no empty token is ever
// produced; an empty or all-separator line yields an empty vector.
std::vector<std::string> tokenize(std::string line)
{
    static const char* const separators = " \t\n";
    std::vector<std::string> tokens;
    auto start = line.find_first_not_of(separators);
    while (start != std::string::npos) {
        const auto end = line.find_first_of(separators, start);
        // When end is npos, substr clamps the count and takes the rest of the line
        tokens.push_back(line.substr(start, end - start));
        start = line.find_first_not_of(separators, end);
    }
    return tokens;
}
void Dataset::load_rdata()
{
ifstream file(path + "/" + name + "_R.dat");
if (file.is_open()) {
std::string line;
getline(file, line);
line = ArffFiles::trim(line);
std::vector<std::string> tokens = tokenize(line);
transform(tokens.begin(), tokens.end() - 1, back_inserter(features), [](const auto& attribute) { return ArffFiles::trim(attribute); });
if (className == "-1") {
className = ArffFiles::trim(tokens.back());
}
for (auto i = 0; i < features.size(); ++i) {
Xv.push_back(std::vector<float>());
}
while (getline(file, line)) {
tokens = tokenize(line);
// We have to skip the first token, which is the instance number.
for (auto i = 1; i < features.size() + 1; ++i) {
const float value = stof(tokens[i]);
Xv[i - 1].push_back(value);
}
yv.push_back(stoi(tokens.back()));
}
file.close();
} else {
throw std::invalid_argument("Unable to open dataset file.");
}
}
void Dataset::load()
{
if (loaded) {
return;
}
if (fileType == CSV) {
load_csv();
} else if (fileType == ARFF) {
load_arff();
} else if (fileType == RDATA) {
load_rdata();
}
if (discretize) {
Xd = discretizeDataset(Xv, yv);
computeStates();
}
n_samples = Xv[0].size();
n_features = Xv.size();
loaded = true;
}
// Rebuilds X (n_features x n_samples) and y from the vectors: X holds the
// discretized values as kInt32 when discretize is set, the raw values as
// kFloat32 otherwise; y is always kInt32.
void Dataset::buildTensors()
{
    const int rows = static_cast<int>(n_features);
    const int cols = static_cast<int>(n_samples);
    const int nColumns = static_cast<int>(features.size());
    if (discretize) {
        X = torch::zeros({ rows, cols }, torch::kInt32);
        for (int i = 0; i < nColumns; ++i) {
            X.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
        }
    } else {
        X = torch::zeros({ rows, cols }, torch::kFloat32);
        for (int i = 0; i < nColumns; ++i) {
            X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
        }
    }
    y = torch::tensor(yv, torch::kInt32);
}
// Fits an mdlp::CPPFImdlp discretizer on every column of X against the labels y
// and returns the transformed columns in the same order.
std::vector<mdlp::labels_t> Dataset::discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y)
{
    std::vector<mdlp::labels_t> discretized;
    mdlp::CPPFImdlp discretizer;
    for (auto& column : X) {
        discretizer.fit(column, y);
        // transform() hands back a reference, so copy the result before the next fit
        discretized.push_back(discretizer.transform(column));
    }
    return discretized;
}
}

78
src/Platform/Dataset.h Normal file
View File

@@ -0,0 +1,78 @@
#ifndef DATASET_H
#define DATASET_H
#include <torch/torch.h>
#include <map>
#include <vector>
#include <string>
#include "CPPFImdlp.h"
#include "Utils.h"
namespace platform {
// File formats a dataset can be read from
enum fileType_t { CSV, ARFF, RDATA };
// Maps a dataset source name to the folder and file format of its datasets.
class SourceData {
public:
    // source must be "Surcov", "Arff" or "Tanveer"; any other value throws
    // std::invalid_argument.
    SourceData(const std::string& source)
    {
        if (source == "Surcov") {
            path = "datasets/";
            fileType = CSV;
        } else if (source == "Arff") {
            path = "datasets/";
            fileType = ARFF;
        } else if (source == "Tanveer") {
            path = "data/";
            fileType = RDATA;
        } else {
            throw std::invalid_argument("Unknown source.");
        }
    }
    // Folder of the datasets, with a trailing '/'
    std::string getPath() const
    {
        return path;
    }
    fileType_t getFileType() const
    {
        return fileType;
    }
private:
    std::string path;
    fileType_t fileType{ CSV };
};
class Dataset {
private:
std::string path;
std::string name;
fileType_t fileType;
std::string className;
int n_samples{ 0 }, n_features{ 0 };
std::vector<std::string> features;
std::map<std::string, std::vector<int>> states;
bool loaded;
bool discretize;
torch::Tensor X, y;
std::vector<std::vector<float>> Xv;
std::vector<std::vector<int>> Xd;
std::vector<int> yv;
void buildTensors();
void load_csv();
void load_arff();
void load_rdata();
void computeStates();
std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y);
public:
Dataset(const std::string& path, const std::string& name, const std::string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {};
explicit Dataset(const Dataset&);
std::string getName() const;
std::string getClassName() const;
std::vector<string> getFeatures() const;
std::map<std::string, std::vector<int>> getStates() const;
std::pair<vector<std::vector<float>>&, std::vector<int>&> getVectors();
std::pair<vector<std::vector<int>>&, std::vector<int>&> getVectorsDiscretized();
std::pair<torch::Tensor&, torch::Tensor&> getTensors();
int getNFeatures() const;
int getNSamples() const;
void load();
const bool inline isLoaded() const { return loaded; };
};
};
#endif

View File

@@ -1,231 +1,129 @@
#include "Datasets.h" #include "Datasets.h"
#include "platformUtils.h" #include <fstream>
#include "ArffFiles.h"
namespace platform { namespace platform {
void Datasets::load() void Datasets::load()
{ {
ifstream catalog(path + "/all.txt"); auto sd = SourceData(sfileType);
fileType = sd.getFileType();
path = sd.getPath();
ifstream catalog(path + "all.txt");
if (catalog.is_open()) { if (catalog.is_open()) {
string line; std::string line;
while (getline(catalog, line)) { while (getline(catalog, line)) {
vector<string> tokens = split(line, ','); if (line.empty() || line[0] == '#') {
string name = tokens[0]; continue;
string className = tokens[1]; }
std::vector<std::string> tokens = split(line, ',');
std::string name = tokens[0];
std::string className;
if (tokens.size() == 1) {
className = "-1";
} else {
className = tokens[1];
}
datasets[name] = make_unique<Dataset>(path, name, className, discretize, fileType); datasets[name] = make_unique<Dataset>(path, name, className, discretize, fileType);
} }
catalog.close(); catalog.close();
} else { } else {
throw invalid_argument("Unable to open catalog file. [" + path + "/all.txt" + "]"); throw std::invalid_argument("Unable to open catalog file. [" + path + "all.txt" + "]");
} }
} }
vector<string> Datasets::getNames() std::vector<std::string> Datasets::getNames()
{ {
vector<string> result; std::vector<std::string> result;
transform(datasets.begin(), datasets.end(), back_inserter(result), [](const auto& d) { return d.first; }); transform(datasets.begin(), datasets.end(), back_inserter(result), [](const auto& d) { return d.first; });
return result; return result;
} }
vector<string> Datasets::getFeatures(string name) std::vector<std::string> Datasets::getFeatures(const std::string& name) const
{ {
if (datasets[name]->isLoaded()) { if (datasets.at(name)->isLoaded()) {
return datasets[name]->getFeatures(); return datasets.at(name)->getFeatures();
} else { } else {
throw invalid_argument("Dataset not loaded."); throw std::invalid_argument("Dataset not loaded.");
} }
} }
map<string, vector<int>> Datasets::getStates(string name) map<std::string, std::vector<int>> Datasets::getStates(const std::string& name) const
{ {
if (datasets[name]->isLoaded()) { if (datasets.at(name)->isLoaded()) {
return datasets[name]->getStates(); return datasets.at(name)->getStates();
} else { } else {
throw invalid_argument("Dataset not loaded."); throw std::invalid_argument("Dataset not loaded.");
} }
} }
string Datasets::getClassName(string name) void Datasets::loadDataset(const std::string& name) const
{ {
if (datasets[name]->isLoaded()) { if (datasets.at(name)->isLoaded()) {
return datasets[name]->getClassName(); return;
} else { } else {
throw invalid_argument("Dataset not loaded."); datasets.at(name)->load();
} }
} }
int Datasets::getNSamples(string name) std::string Datasets::getClassName(const std::string& name) const
{ {
if (datasets[name]->isLoaded()) { if (datasets.at(name)->isLoaded()) {
return datasets[name]->getNSamples(); return datasets.at(name)->getClassName();
} else { } else {
throw invalid_argument("Dataset not loaded."); throw std::invalid_argument("Dataset not loaded.");
} }
} }
pair<vector<vector<float>>&, vector<int>&> Datasets::getVectors(string name) int Datasets::getNSamples(const std::string& name) const
{
if (datasets.at(name)->isLoaded()) {
return datasets.at(name)->getNSamples();
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
int Datasets::getNClasses(const std::string& name)
{
if (datasets.at(name)->isLoaded()) {
auto className = datasets.at(name)->getClassName();
if (discretize) {
auto states = getStates(name);
return states.at(className).size();
}
auto [Xv, yv] = getVectors(name);
return *std::max_element(yv.begin(), yv.end()) + 1;
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
std::vector<int> Datasets::getClassesCounts(const std::string& name) const
{
if (datasets.at(name)->isLoaded()) {
auto [Xv, yv] = datasets.at(name)->getVectors();
std::vector<int> counts(*std::max_element(yv.begin(), yv.end()) + 1);
for (auto y : yv) {
counts[y]++;
}
return counts;
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
pair<std::vector<std::vector<float>>&, std::vector<int>&> Datasets::getVectors(const std::string& name)
{ {
if (!datasets[name]->isLoaded()) { if (!datasets[name]->isLoaded()) {
datasets[name]->load(); datasets[name]->load();
} }
return datasets[name]->getVectors(); return datasets[name]->getVectors();
} }
pair<vector<vector<int>>&, vector<int>&> Datasets::getVectorsDiscretized(string name) pair<std::vector<std::vector<int>>&, std::vector<int>&> Datasets::getVectorsDiscretized(const std::string& name)
{ {
if (!datasets[name]->isLoaded()) { if (!datasets[name]->isLoaded()) {
datasets[name]->load(); datasets[name]->load();
} }
return datasets[name]->getVectorsDiscretized(); return datasets[name]->getVectorsDiscretized();
} }
pair<torch::Tensor&, torch::Tensor&> Datasets::getTensors(string name) pair<torch::Tensor&, torch::Tensor&> Datasets::getTensors(const std::string& name)
{ {
if (!datasets[name]->isLoaded()) { if (!datasets[name]->isLoaded()) {
datasets[name]->load(); datasets[name]->load();
} }
return datasets[name]->getTensors(); return datasets[name]->getTensors();
} }
bool Datasets::isDataset(const string& name) bool Datasets::isDataset(const std::string& name) const
{ {
return datasets.find(name) != datasets.end(); return datasets.find(name) != datasets.end();
} }
Dataset::Dataset(const Dataset& dataset) : path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples), n_features(dataset.n_features), features(dataset.features), states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv), fileType(dataset.fileType)
{
}
string Dataset::getName()
{
return name;
}
string Dataset::getClassName()
{
return className;
}
vector<string> Dataset::getFeatures()
{
if (loaded) {
return features;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
int Dataset::getNFeatures()
{
if (loaded) {
return n_features;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
int Dataset::getNSamples()
{
if (loaded) {
return n_samples;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
map<string, vector<int>> Dataset::getStates()
{
if (loaded) {
return states;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<vector<vector<float>>&, vector<int>&> Dataset::getVectors()
{
if (loaded) {
return { Xv, yv };
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<vector<vector<int>>&, vector<int>&> Dataset::getVectorsDiscretized()
{
if (loaded) {
return { Xd, yv };
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
{
if (loaded) {
buildTensors();
return { X, y };
} else {
throw invalid_argument("Dataset not loaded.");
}
}
void Dataset::load_csv()
{
ifstream file(path + "/" + name + ".csv");
if (file.is_open()) {
string line;
getline(file, line);
vector<string> tokens = split(line, ',');
features = vector<string>(tokens.begin(), tokens.end() - 1);
className = tokens.back();
for (auto i = 0; i < features.size(); ++i) {
Xv.push_back(vector<float>());
}
while (getline(file, line)) {
tokens = split(line, ',');
for (auto i = 0; i < features.size(); ++i) {
Xv[i].push_back(stof(tokens[i]));
}
yv.push_back(stoi(tokens.back()));
}
file.close();
} else {
throw invalid_argument("Unable to open dataset file.");
}
}
void Dataset::computeStates()
{
for (int i = 0; i < features.size(); ++i) {
states[features[i]] = vector<int>(*max_element(Xd[i].begin(), Xd[i].end()) + 1);
iota(begin(states[features[i]]), end(states[features[i]]), 0);
}
states[className] = vector<int>(*max_element(yv.begin(), yv.end()) + 1);
iota(begin(states[className]), end(states[className]), 0);
}
void Dataset::load_arff()
{
auto arff = ArffFiles();
arff.load(path + "/" + name + ".arff", className);
// Get Dataset X, y
Xv = arff.getX();
yv = arff.getY();
// Get className & Features
className = arff.getClassName();
auto attributes = arff.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& attribute) { return attribute.first; });
}
void Dataset::load()
{
if (loaded) {
return;
}
if (fileType == CSV) {
load_csv();
} else if (fileType == ARFF) {
load_arff();
}
if (discretize) {
Xd = discretizeDataset(Xv, yv);
computeStates();
}
n_samples = Xv[0].size();
n_features = Xv.size();
loaded = true;
}
void Dataset::buildTensors()
{
if (discretize) {
X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kInt32);
} else {
X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kFloat32);
}
for (int i = 0; i < features.size(); ++i) {
if (discretize) {
X.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
} else {
X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
}
}
y = torch::tensor(yv, torch::kInt32);
}
} }

View File

@@ -1,64 +1,29 @@
#ifndef DATASETS_H #ifndef DATASETS_H
#define DATASETS_H #define DATASETS_H
#include <torch/torch.h> #include "Dataset.h"
#include <map>
#include <vector>
#include <string>
namespace platform { namespace platform {
using namespace std;
enum fileType_t { CSV, ARFF };
class Dataset {
private:
string path;
string name;
fileType_t fileType;
string className;
int n_samples{ 0 }, n_features{ 0 };
vector<string> features;
map<string, vector<int>> states;
bool loaded;
bool discretize;
torch::Tensor X, y;
vector<vector<float>> Xv;
vector<vector<int>> Xd;
vector<int> yv;
void buildTensors();
void load_csv();
void load_arff();
void computeStates();
public:
Dataset(const string& path, const string& name, const string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {};
explicit Dataset(const Dataset&);
string getName();
string getClassName();
vector<string> getFeatures();
map<string, vector<int>> getStates();
pair<vector<vector<float>>&, vector<int>&> getVectors();
pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized();
pair<torch::Tensor&, torch::Tensor&> getTensors();
int getNFeatures();
int getNSamples();
void load();
const bool inline isLoaded() const { return loaded; };
};
class Datasets { class Datasets {
private: private:
string path; std::string path;
fileType_t fileType; fileType_t fileType;
map<string, unique_ptr<Dataset>> datasets; std::string sfileType;
std::map<std::string, std::unique_ptr<Dataset>> datasets;
bool discretize; bool discretize;
void load(); // Loads the list of datasets void load(); // Loads the list of datasets
public: public:
explicit Datasets(const string& path, bool discretize = false, fileType_t fileType = ARFF) : path(path), discretize(discretize), fileType(fileType) { load(); }; explicit Datasets(bool discretize, std::string sfileType) : discretize(discretize), sfileType(sfileType) { load(); };
vector<string> getNames(); std::vector<string> getNames();
vector<string> getFeatures(string name); std::vector<string> getFeatures(const std::string& name) const;
int getNSamples(string name); int getNSamples(const std::string& name) const;
string getClassName(string name); std::string getClassName(const std::string& name) const;
map<string, vector<int>> getStates(string name); int getNClasses(const std::string& name);
pair<vector<vector<float>>&, vector<int>&> getVectors(string name); std::vector<int> getClassesCounts(const std::string& name) const;
pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized(string name); std::map<std::string, std::vector<int>> getStates(const std::string& name) const;
pair<torch::Tensor&, torch::Tensor&> getTensors(string name); std::pair<std::vector<std::vector<float>>&, std::vector<int>&> getVectors(const std::string& name);
bool isDataset(const string& name); std::pair<std::vector<std::vector<int>>&, std::vector<int>&> getVectorsDiscretized(const std::string& name);
std::pair<torch::Tensor&, torch::Tensor&> getTensors(const std::string& name);
bool isDataset(const std::string& name) const;
void loadDataset(const std::string& name) const;
}; };
}; };

View File

@@ -4,22 +4,15 @@
#include <map> #include <map>
#include <fstream> #include <fstream>
#include <sstream> #include <sstream>
#include "platformUtils.h" #include <algorithm>
#include <iostream>
#include "Utils.h"
//#include "Dataset.h"
namespace platform { namespace platform {
class DotEnv { class DotEnv {
private: private:
std::map<std::string, std::string> env; std::map<std::string, std::string> env;
std::string trim(const std::string& str)
{
std::string result = str;
result.erase(result.begin(), std::find_if(result.begin(), result.end(), [](int ch) {
return !std::isspace(ch);
}));
result.erase(std::find_if(result.rbegin(), result.rend(), [](int ch) {
return !std::isspace(ch);
}).base(), result.end());
return result;
}
public: public:
DotEnv() DotEnv()
{ {
@@ -43,7 +36,7 @@ namespace platform {
} }
std::string get(const std::string& key) std::string get(const std::string& key)
{ {
return env[key]; return env.at(key);
} }
std::vector<int> getSeeds() std::vector<int> getSeeds()
{ {

168
src/Platform/ExcelFile.cc Normal file
View File

@@ -0,0 +1,168 @@
#include "ExcelFile.h"
namespace platform {
// Builds an ExcelFile that is not attached to any workbook or worksheet yet.
// Both handles are explicitly nulled so an unattached object never carries
// indeterminate pointers; the report defaults are then applied.
ExcelFile::ExcelFile() : workbook(nullptr), worksheet(nullptr)
{
    setDefault();
}
// Builds an ExcelFile bound to an existing workbook.
// No worksheet is selected yet, so the worksheet handle is explicitly nulled
// instead of being left indeterminate; the report defaults are then applied.
ExcelFile::ExcelFile(lxw_workbook* workbook) : workbook(workbook), worksheet(nullptr)
{
    setDefault();
}
ExcelFile::ExcelFile(lxw_workbook* workbook, lxw_worksheet* worksheet) : workbook(workbook), worksheet(worksheet)
{
setDefault();
}
// Resets the layout state shared by every constructor.
void ExcelFile::setDefault()
{
    // Writing position: start at the first row.
    row = 0;
    // Font size used for the report body.
    normalSize = 14;
    // Background colours: title/header cells, then the two alternating row colours.
    colorTitle = 0xB1A0C7;
    colorEven = 0xFDE9D9;
    colorOdd = 0xDCE6F1;
}
// Returns the workbook handle this object writes to (whatever was supplied at
// construction time).
lxw_workbook* ExcelFile::getWorkbook()
{
    return this->workbook;
}
// Sets the document properties (title, subject, author, ...) of the workbook.
//
// title: text stored as the document title.
//
// `title` is received by value, so its own buffer already is a private,
// writable, null-terminated copy that lives until this function returns. That
// removes the need for the former variable-length array + strcpy, which is not
// standard C++ and needed <cstring>.
// NOTE(review): like the stack buffer it replaces, this relies on
// workbook_set_properties not keeping the pointers after it returns - confirm
// against the libxlsxwriter documentation.
void ExcelFile::setProperties(std::string title)
{
    // Value-initialisation zeroes every field that is not set explicitly below.
    lxw_doc_properties properties{};
    properties.title = title.data();
    properties.subject = const_cast<char*>("Machine learning results");
    properties.author = const_cast<char*>("Ricardo Montañana Gómez");
    properties.manager = const_cast<char*>("Dr. J. A. Gámez, Dr. J. M. Puerta");
    properties.company = const_cast<char*>("UCLM");
    properties.comments = const_cast<char*>("Created with libxlsxwriter and c++");
    workbook_set_properties(workbook, &properties);
}
// Resolves a style name to the registered format to use for the current row.
//
// style: base style name; an empty name means "no format".
// Returns nullptr for an empty name; otherwise the format registered as
//         "<style>_odd" / "<style>_even" (chosen from the parity of the member
//         `row`), falling back to the format registered under the plain name.
// Throws std::invalid_argument when neither the suffixed nor the plain name is
//         registered.
//
// Plain lookups are used instead of catching std::out_of_range, so styles that
// only exist without a suffix do not pay for an exception on every call.
lxw_format* ExcelFile::efectiveStyle(const std::string& style)
{
    if (style.empty()) {
        return nullptr;
    }
    const std::string suffix = row % 2 ? "_odd" : "_even";
    auto found = styles.find(style + suffix);
    if (found == styles.end()) {
        // No row-parity variant: try the plain name.
        found = styles.find(style);
    }
    if (found == styles.end()) {
        throw std::invalid_argument("Style " + style + " not found");
    }
    return found->second;
}
void ExcelFile::writeString(int row, int col, const std::string& text, const std::string& style)
{
worksheet_write_string(worksheet, row, col, text.c_str(), efectiveStyle(style));
}
void ExcelFile::writeInt(int row, int col, const int number, const std::string& style)
{
worksheet_write_number(worksheet, row, col, number, efectiveStyle(style));
}
void ExcelFile::writeDouble(int row, int col, const double number, const std::string& style)
{
worksheet_write_number(worksheet, row, col, number, efectiveStyle(style));
}
// Sets the background colour of `style` according to the `odd` flag.
// NOTE(review): odd == true selects colorEven and odd == false selects
// colorOdd; this looks swapped but is kept as is - confirm it is intended.
void ExcelFile::addColor(lxw_format* style, bool odd)
{
    uint32_t background;
    if (odd) {
        background = colorEven;
    } else {
        background = colorOdd;
    }
    format_set_bg_color(style, lxw_color_t(background));
}
void ExcelFile::createStyle(const std::string& name, lxw_format* style, bool odd)
{
addColor(style, odd);
if (name == "textCentered") {
format_set_align(style, LXW_ALIGN_CENTER);
format_set_font_size(style, normalSize);
format_set_border(style, LXW_BORDER_THIN);
} else if (name == "text") {
format_set_font_size(style, normalSize);
format_set_border(style, LXW_BORDER_THIN);
} else if (name == "bodyHeader") {
format_set_bold(style);
format_set_font_size(style, normalSize);
format_set_align(style, LXW_ALIGN_CENTER);
format_set_align(style, LXW_ALIGN_VERTICAL_CENTER);
format_set_border(style, LXW_BORDER_THIN);
format_set_bg_color(style, lxw_color_t(colorTitle));
} else if (name == "result") {
format_set_font_size(style, normalSize);
format_set_border(style, LXW_BORDER_THIN);
format_set_num_format(style, "0.0000000");
} else if (name == "time") {
format_set_font_size(style, normalSize);
format_set_border(style, LXW_BORDER_THIN);
format_set_num_format(style, "#,##0.000000");
} else if (name == "ints") {
format_set_font_size(style, normalSize);
format_set_num_format(style, "###,##0");
format_set_border(style, LXW_BORDER_THIN);
} else if (name == "floats") {
format_set_border(style, LXW_BORDER_THIN);
format_set_font_size(style, normalSize);
format_set_num_format(style, "#,##0.00");
}
}
void ExcelFile::createFormats()
{
auto styleNames = { "text", "textCentered", "bodyHeader", "result", "time", "ints", "floats" };
lxw_format* style;
for (std::string name : styleNames) {
lxw_format* style = workbook_add_format(workbook);
style = workbook_add_format(workbook);
createStyle(name, style, true);
styles[name + "_odd"] = style;
style = workbook_add_format(workbook);
createStyle(name, style, false);
styles[name + "_even"] = style;
}
// Header 1st line
lxw_format* headerFirst = workbook_add_format(workbook);
format_set_bold(headerFirst);
format_set_font_size(headerFirst, 18);
format_set_align(headerFirst, LXW_ALIGN_CENTER);
format_set_align(headerFirst, LXW_ALIGN_VERTICAL_CENTER);
format_set_border(headerFirst, LXW_BORDER_THIN);
format_set_bg_color(headerFirst, lxw_color_t(colorTitle));
// Header rest
lxw_format* headerRest = workbook_add_format(workbook);
format_set_bold(headerRest);
format_set_align(headerRest, LXW_ALIGN_CENTER);
format_set_font_size(headerRest, 16);
format_set_align(headerRest, LXW_ALIGN_VERTICAL_CENTER);
format_set_border(headerRest, LXW_BORDER_THIN);
format_set_bg_color(headerRest, lxw_color_t(colorOdd));
// Header small
lxw_format* headerSmall = workbook_add_format(workbook);
format_set_bold(headerSmall);
format_set_align(headerSmall, LXW_ALIGN_LEFT);
format_set_font_size(headerSmall, 12);
format_set_border(headerSmall, LXW_BORDER_THIN);
format_set_align(headerSmall, LXW_ALIGN_VERTICAL_CENTER);
format_set_bg_color(headerSmall, lxw_color_t(colorOdd));
// Summary style
lxw_format* summaryStyle = workbook_add_format(workbook);
format_set_bold(summaryStyle);
format_set_font_size(summaryStyle, 16);
format_set_border(summaryStyle, LXW_BORDER_THIN);
format_set_align(summaryStyle, LXW_ALIGN_VERTICAL_CENTER);
styles["headerFirst"] = headerFirst;
styles["headerRest"] = headerRest;
styles["headerSmall"] = headerSmall;
styles["summaryStyle"] = summaryStyle;
}
}

43
src/Platform/ExcelFile.h Normal file
View File

@@ -0,0 +1,43 @@
#ifndef EXCELFILE_H
#define EXCELFILE_H
#include <locale>
#include <string>
#include <map>
#include "xlsxwriter.h"
namespace platform {
// Number punctuation facet: ',' as decimal point, '.' as thousands separator,
// digits grouped in threes. Install it in a std::locale to format numbers that
// way. The members are marked `override` so a signature mismatch with
// std::numpunct becomes a compile error instead of silently not overriding.
struct separated : std::numpunct<char> {
    // Character printed between the integer and the fractional part.
    char do_decimal_point() const override { return ','; }
    // Character printed between digit groups.
    char do_thousands_sep() const override { return '.'; }
    // A single group size of 3, repeated for all remaining digits.
    std::string do_grouping() const override { return "\03"; }
};
// Helper around a libxlsxwriter workbook/worksheet pair: keeps the named cell
// formats and offers typed cell writers. The writing helpers and the state are
// protected, i.e. meant to be used from derived classes.
class ExcelFile {
public:
    // Unattached object: no workbook and no worksheet.
    ExcelFile();
    // Attaches to an existing workbook; no worksheet is selected.
    ExcelFile(lxw_workbook* workbook);
    // Attaches to an existing workbook and one of its worksheets.
    ExcelFile(lxw_workbook* workbook, lxw_worksheet* worksheet);
    // Returns the workbook handle in use.
    lxw_workbook* getWorkbook();
protected:
    // Sets the document properties of the workbook, using `title` as its title.
    void setProperties(std::string title);
    // Cell writers: `style` is a name resolved through efectiveStyle(); the
    // default empty name writes the cell without a format.
    void writeString(int row, int col, const std::string& text, const std::string& style = "");
    void writeInt(int row, int col, const int number, const std::string& style = "");
    void writeDouble(int row, int col, const double number, const std::string& style = "");
    // Creates all the formats and registers them in `styles`.
    void createFormats();
    // Configures `style` as the body format called `name`.
    void createStyle(const std::string& name, lxw_format* style, bool odd);
    // Sets the background colour of `style` according to `odd`.
    void addColor(lxw_format* style, bool odd);
    // Maps a style name to its registered format, using the parity of `row`.
    lxw_format* efectiveStyle(const std::string& name);
    // Handles default to null so an unattached object never holds
    // indeterminate pointers.
    lxw_workbook* workbook{ nullptr };
    lxw_worksheet* worksheet{ nullptr };
    std::map<std::string, lxw_format*> styles; // registered formats by name
    int row; // row whose parity selects the "_odd"/"_even" format variant
    int normalSize; //font size for report body
    uint32_t colorTitle; // background colour of title/header cells
    uint32_t colorOdd; // first alternating background colour
    uint32_t colorEven; // second alternating background colour
private:
    // Applies the default font size, row and colours; called by every constructor.
    void setDefault();
};
}
#endif // !EXCELFILE_H

View File

@@ -1,11 +1,12 @@
#include <fstream>
#include "Experiment.h" #include "Experiment.h"
#include "Datasets.h" #include "Datasets.h"
#include "Models.h" #include "Models.h"
#include "Report.h" #include "ReportConsole.h"
#include "Paths.h"
namespace platform { namespace platform {
using json = nlohmann::json; using json = nlohmann::json;
string get_date() std::string get_date()
{ {
time_t rawtime; time_t rawtime;
tm* timeinfo; tm* timeinfo;
@@ -15,7 +16,7 @@ namespace platform {
oss << std::put_time(timeinfo, "%Y-%m-%d"); oss << std::put_time(timeinfo, "%Y-%m-%d");
return oss.str(); return oss.str();
} }
string get_time() std::string get_time()
{ {
time_t rawtime; time_t rawtime;
tm* timeinfo; tm* timeinfo;
@@ -25,9 +26,9 @@ namespace platform {
oss << std::put_time(timeinfo, "%H:%M:%S"); oss << std::put_time(timeinfo, "%H:%M:%S");
return oss.str(); return oss.str();
} }
string Experiment::get_file_name() std::string Experiment::get_file_name()
{ {
string result = "results_" + score_name + "_" + model + "_" + platform + "_" + get_date() + "_" + get_time() + "_" + (stratified ? "1" : "0") + ".json"; std::string result = "results_" + score_name + "_" + model + "_" + platform + "_" + get_date() + "_" + get_time() + "_" + (stratified ? "1" : "0") + ".json";
return result; return result;
} }
@@ -79,7 +80,7 @@ namespace platform {
} }
return result; return result;
} }
void Experiment::save(const string& path) void Experiment::save(const std::string& path)
{ {
json data = build_json(); json data = build_json();
ofstream file(path + "/" + get_file_name()); ofstream file(path + "/" + get_file_name());
@@ -90,40 +91,64 @@ namespace platform {
void Experiment::report() void Experiment::report()
{ {
json data = build_json(); json data = build_json();
Report report(data); ReportConsole report(data);
report.show(); report.show();
} }
void Experiment::show() void Experiment::show()
{ {
json data = build_json(); json data = build_json();
cout << data.dump(4) << endl; std::cout << data.dump(4) << std::endl;
} }
void Experiment::go(vector<string> filesToProcess, const string& path) void Experiment::go(std::vector<std::string> filesToProcess, bool quiet)
{ {
cout << "*** Starting experiment: " << title << " ***" << endl; std::cout << "*** Starting experiment: " << title << " ***" << std::endl;
for (auto fileName : filesToProcess) { for (auto fileName : filesToProcess) {
cout << "- " << setw(20) << left << fileName << " " << right << flush; std::cout << "- " << setw(20) << left << fileName << " " << right << flush;
cross_validation(path, fileName); cross_validation(fileName, quiet);
cout << endl; std::cout << std::endl;
} }
} }
void Experiment::cross_validation(const string& path, const string& fileName) std::string getColor(bayesnet::status_t status)
{ {
auto datasets = platform::Datasets(path, discretized, platform::ARFF); switch (status) {
case bayesnet::NORMAL:
return Colors::GREEN();
case bayesnet::WARNING:
return Colors::YELLOW();
case bayesnet::ERROR:
return Colors::RED();
default:
return Colors::RESET();
}
}
void showProgress(int fold, const std::string& color, const std::string& phase)
{
std::string prefix = phase == "a" ? "" : "\b\b\b\b";
std::cout << prefix << color << fold << Colors::RESET() << "(" << color << phase << Colors::RESET() << ")" << flush;
}
void Experiment::cross_validation(const std::string& fileName, bool quiet)
{
auto datasets = Datasets(discretized, Paths::datasets());
// Get dataset // Get dataset
auto [X, y] = datasets.getTensors(fileName); auto [X, y] = datasets.getTensors(fileName);
auto states = datasets.getStates(fileName); auto states = datasets.getStates(fileName);
auto features = datasets.getFeatures(fileName); auto features = datasets.getFeatures(fileName);
auto samples = datasets.getNSamples(fileName); auto samples = datasets.getNSamples(fileName);
auto className = datasets.getClassName(fileName); auto className = datasets.getClassName(fileName);
cout << " (" << setw(5) << samples << "," << setw(3) << features.size() << ") " << flush; if (!quiet) {
std::cout << " (" << setw(5) << samples << "," << setw(3) << features.size() << ") " << flush;
}
// Prepare Result // Prepare Result
auto result = Result(); auto result = Result();
auto [values, counts] = at::_unique(y); auto [values, counts] = at::_unique(y);
result.setSamples(X.size(1)).setFeatures(X.size(0)).setClasses(values.size(0)); result.setSamples(X.size(1)).setFeatures(X.size(0)).setClasses(values.size(0));
result.setHyperparameters(hyperparameters.get(fileName));
// Initialize results std::vectors
int nResults = nfolds * static_cast<int>(randomSeeds.size()); int nResults = nfolds * static_cast<int>(randomSeeds.size());
auto accuracy_test = torch::zeros({ nResults }, torch::kFloat64); auto accuracy_test = torch::zeros({ nResults }, torch::kFloat64);
auto accuracy_train = torch::zeros({ nResults }, torch::kFloat64); auto accuracy_train = torch::zeros({ nResults }, torch::kFloat64);
@@ -135,7 +160,8 @@ namespace platform {
Timer train_timer, test_timer; Timer train_timer, test_timer;
int item = 0; int item = 0;
for (auto seed : randomSeeds) { for (auto seed : randomSeeds) {
cout << "(" << seed << ") doing Fold: " << flush; if (!quiet)
std::cout << "(" << seed << ") doing Fold: " << flush;
Fold* fold; Fold* fold;
if (stratified) if (stratified)
fold = new StratifiedKFold(nfolds, y, seed); fold = new StratifiedKFold(nfolds, y, seed);
@@ -144,6 +170,10 @@ namespace platform {
for (int nfold = 0; nfold < nfolds; nfold++) { for (int nfold = 0; nfold < nfolds; nfold++) {
auto clf = Models::instance()->create(model); auto clf = Models::instance()->create(model);
setModelVersion(clf->getVersion()); setModelVersion(clf->getVersion());
auto valid = clf->getValidHyperparameters();
hyperparameters.check(valid, fileName);
clf->setHyperparameters(hyperparameters.get(fileName));
// Split train - test dataset
train_timer.start(); train_timer.start();
auto [train, test] = fold->getFold(nfold); auto [train, test] = fold->getFold(nfold);
auto train_t = torch::tensor(train); auto train_t = torch::tensor(train);
@@ -152,31 +182,43 @@ namespace platform {
auto y_train = y.index({ train_t }); auto y_train = y.index({ train_t });
auto X_test = X.index({ "...", test_t }); auto X_test = X.index({ "...", test_t });
auto y_test = y.index({ test_t }); auto y_test = y.index({ test_t });
cout << nfold + 1 << ", " << flush; if (!quiet)
showProgress(nfold + 1, getColor(clf->getStatus()), "a");
// Train model
clf->fit(X_train, y_train, features, className, states); clf->fit(X_train, y_train, features, className, states);
if (!quiet)
showProgress(nfold + 1, getColor(clf->getStatus()), "b");
nodes[item] = clf->getNumberOfNodes(); nodes[item] = clf->getNumberOfNodes();
edges[item] = clf->getNumberOfEdges(); edges[item] = clf->getNumberOfEdges();
num_states[item] = clf->getNumberOfStates(); num_states[item] = clf->getNumberOfStates();
train_time[item] = train_timer.getDuration(); train_time[item] = train_timer.getDuration();
// Score train
auto accuracy_train_value = clf->score(X_train, y_train); auto accuracy_train_value = clf->score(X_train, y_train);
// Test model
if (!quiet)
showProgress(nfold + 1, getColor(clf->getStatus()), "c");
test_timer.start(); test_timer.start();
auto accuracy_test_value = clf->score(X_test, y_test); auto accuracy_test_value = clf->score(X_test, y_test);
test_time[item] = test_timer.getDuration(); test_time[item] = test_timer.getDuration();
accuracy_train[item] = accuracy_train_value; accuracy_train[item] = accuracy_train_value;
accuracy_test[item] = accuracy_test_value; accuracy_test[item] = accuracy_test_value;
// Store results and times in vector if (!quiet)
std::cout << "\b\b\b, " << flush;
// Store results and times in std::vector
result.addScoreTrain(accuracy_train_value); result.addScoreTrain(accuracy_train_value);
result.addScoreTest(accuracy_test_value); result.addScoreTest(accuracy_test_value);
result.addTimeTrain(train_time[item].item<double>()); result.addTimeTrain(train_time[item].item<double>());
result.addTimeTest(test_time[item].item<double>()); result.addTimeTest(test_time[item].item<double>());
item++; item++;
} }
cout << "end. " << flush; if (!quiet)
std::cout << "end. " << flush;
delete fold; delete fold;
} }
result.setScoreTest(torch::mean(accuracy_test).item<double>()).setScoreTrain(torch::mean(accuracy_train).item<double>()); result.setScoreTest(torch::mean(accuracy_test).item<double>()).setScoreTrain(torch::mean(accuracy_train).item<double>());
result.setScoreTestStd(torch::std(accuracy_test).item<double>()).setScoreTrainStd(torch::std(accuracy_train).item<double>()); result.setScoreTestStd(torch::std(accuracy_test).item<double>()).setScoreTrainStd(torch::std(accuracy_train).item<double>());
result.setTrainTime(torch::mean(train_time).item<double>()).setTestTime(torch::mean(test_time).item<double>()); result.setTrainTime(torch::mean(train_time).item<double>()).setTestTime(torch::mean(test_time).item<double>());
result.setTestTimeStd(torch::std(test_time).item<double>()).setTrainTimeStd(torch::std(train_time).item<double>());
result.setNodes(torch::mean(nodes).item<double>()).setLeaves(torch::mean(edges).item<double>()).setDepth(torch::mean(num_states).item<double>()); result.setNodes(torch::mean(nodes).item<double>()).setLeaves(torch::mean(edges).item<double>()).setDepth(torch::mean(num_states).item<double>());
result.setDataset(fileName); result.setDataset(fileName);
addResult(result); addResult(result);

View File

@@ -3,41 +3,28 @@
#include <torch/torch.h> #include <torch/torch.h>
#include <nlohmann/json.hpp> #include <nlohmann/json.hpp>
#include <string> #include <string>
#include <chrono>
#include "Folding.h" #include "Folding.h"
#include "BaseClassifier.h" #include "BaseClassifier.h"
#include "HyperParameters.h"
#include "TAN.h" #include "TAN.h"
#include "KDB.h" #include "KDB.h"
#include "AODE.h" #include "AODE.h"
#include "Timer.h"
using namespace std;
namespace platform { namespace platform {
using json = nlohmann::json; using json = nlohmann::json;
class Timer {
private:
chrono::high_resolution_clock::time_point begin;
public:
Timer() = default;
~Timer() = default;
void start() { begin = chrono::high_resolution_clock::now(); }
double getDuration()
{
chrono::high_resolution_clock::time_point end = chrono::high_resolution_clock::now();
chrono::duration<double> time_span = chrono::duration_cast<chrono::duration<double>>(end - begin);
return time_span.count();
}
};
class Result { class Result {
private: private:
string dataset, hyperparameters, model_version; std::string dataset, model_version;
json hyperparameters;
int samples{ 0 }, features{ 0 }, classes{ 0 }; int samples{ 0 }, features{ 0 }, classes{ 0 };
double score_train{ 0 }, score_test{ 0 }, score_train_std{ 0 }, score_test_std{ 0 }, train_time{ 0 }, train_time_std{ 0 }, test_time{ 0 }, test_time_std{ 0 }; double score_train{ 0 }, score_test{ 0 }, score_train_std{ 0 }, score_test_std{ 0 }, train_time{ 0 }, train_time_std{ 0 }, test_time{ 0 }, test_time_std{ 0 };
float nodes{ 0 }, leaves{ 0 }, depth{ 0 }; float nodes{ 0 }, leaves{ 0 }, depth{ 0 };
vector<double> scores_train, scores_test, times_train, times_test; std::vector<double> scores_train, scores_test, times_train, times_test;
public: public:
Result() = default; Result() = default;
Result& setDataset(const string& dataset) { this->dataset = dataset; return *this; } Result& setDataset(const std::string& dataset) { this->dataset = dataset; return *this; }
Result& setHyperparameters(const string& hyperparameters) { this->hyperparameters = hyperparameters; return *this; } Result& setHyperparameters(const json& hyperparameters) { this->hyperparameters = hyperparameters; return *this; }
Result& setSamples(int samples) { this->samples = samples; return *this; } Result& setSamples(int samples) { this->samples = samples; return *this; }
Result& setFeatures(int features) { this->features = features; return *this; } Result& setFeatures(int features) { this->features = features; return *this; }
Result& setClasses(int classes) { this->classes = classes; return *this; } Result& setClasses(int classes) { this->classes = classes; return *this; }
@@ -58,8 +45,8 @@ namespace platform {
Result& addTimeTest(double time) { times_test.push_back(time); return *this; } Result& addTimeTest(double time) { times_test.push_back(time); return *this; }
const float get_score_train() const { return score_train; } const float get_score_train() const { return score_train; }
float get_score_test() { return score_test; } float get_score_test() { return score_test; }
const string& getDataset() const { return dataset; } const std::string& getDataset() const { return dataset; }
const string& getHyperparameters() const { return hyperparameters; } const json& getHyperparameters() const { return hyperparameters; }
const int getSamples() const { return samples; } const int getSamples() const { return samples; }
const int getFeatures() const { return features; } const int getFeatures() const { return features; }
const int getClasses() const { return classes; } const int getClasses() const { return classes; }
@@ -74,41 +61,43 @@ namespace platform {
const float getNodes() const { return nodes; } const float getNodes() const { return nodes; }
const float getLeaves() const { return leaves; } const float getLeaves() const { return leaves; }
const float getDepth() const { return depth; } const float getDepth() const { return depth; }
const vector<double>& getScoresTrain() const { return scores_train; } const std::vector<double>& getScoresTrain() const { return scores_train; }
const vector<double>& getScoresTest() const { return scores_test; } const std::vector<double>& getScoresTest() const { return scores_test; }
const vector<double>& getTimesTrain() const { return times_train; } const std::vector<double>& getTimesTrain() const { return times_train; }
const vector<double>& getTimesTest() const { return times_test; } const std::vector<double>& getTimesTest() const { return times_test; }
}; };
class Experiment { class Experiment {
private:
string title, model, platform, score_name, model_version, language_version, language;
bool discretized{ false }, stratified{ false };
vector<Result> results;
vector<int> randomSeeds;
int nfolds{ 0 };
float duration{ 0 };
json build_json();
public: public:
Experiment() = default; Experiment() = default;
Experiment& setTitle(const string& title) { this->title = title; return *this; } Experiment& setTitle(const std::string& title) { this->title = title; return *this; }
Experiment& setModel(const string& model) { this->model = model; return *this; } Experiment& setModel(const std::string& model) { this->model = model; return *this; }
Experiment& setPlatform(const string& platform) { this->platform = platform; return *this; } Experiment& setPlatform(const std::string& platform) { this->platform = platform; return *this; }
Experiment& setScoreName(const string& score_name) { this->score_name = score_name; return *this; } Experiment& setScoreName(const std::string& score_name) { this->score_name = score_name; return *this; }
Experiment& setModelVersion(const string& model_version) { this->model_version = model_version; return *this; } Experiment& setModelVersion(const std::string& model_version) { this->model_version = model_version; return *this; }
Experiment& setLanguage(const string& language) { this->language = language; return *this; } Experiment& setLanguage(const std::string& language) { this->language = language; return *this; }
Experiment& setLanguageVersion(const string& language_version) { this->language_version = language_version; return *this; } Experiment& setLanguageVersion(const std::string& language_version) { this->language_version = language_version; return *this; }
Experiment& setDiscretized(bool discretized) { this->discretized = discretized; return *this; } Experiment& setDiscretized(bool discretized) { this->discretized = discretized; return *this; }
Experiment& setStratified(bool stratified) { this->stratified = stratified; return *this; } Experiment& setStratified(bool stratified) { this->stratified = stratified; return *this; }
Experiment& setNFolds(int nfolds) { this->nfolds = nfolds; return *this; } Experiment& setNFolds(int nfolds) { this->nfolds = nfolds; return *this; }
Experiment& addResult(Result result) { results.push_back(result); return *this; } Experiment& addResult(Result result) { results.push_back(result); return *this; }
Experiment& addRandomSeed(int randomSeed) { randomSeeds.push_back(randomSeed); return *this; } Experiment& addRandomSeed(int randomSeed) { randomSeeds.push_back(randomSeed); return *this; }
Experiment& setDuration(float duration) { this->duration = duration; return *this; } Experiment& setDuration(float duration) { this->duration = duration; return *this; }
string get_file_name(); Experiment& setHyperparameters(const HyperParameters& hyperparameters_) { this->hyperparameters = hyperparameters_; return *this; }
void save(const string& path); std::string get_file_name();
void cross_validation(const string& path, const string& fileName); void save(const std::string& path);
void go(vector<string> filesToProcess, const string& path); void cross_validation(const std::string& fileName, bool quiet);
void go(std::vector<std::string> filesToProcess, bool quiet);
void show(); void show();
void report(); void report();
private:
std::string title, model, platform, score_name, model_version, language_version, language;
bool discretized{ false }, stratified{ false };
std::vector<Result> results;
std::vector<int> randomSeeds;
HyperParameters hyperparameters;
int nfolds{ 0 };
float duration{ 0 };
json build_json();
}; };
} }
#endif #endif

View File

@@ -1,95 +1,104 @@
#include "Folding.h" #include "Folding.h"
#include <algorithm> #include <algorithm>
#include <map> #include <map>
Fold::Fold(int k, int n, int seed) : k(k), n(n), seed(seed) namespace platform {
{ Fold::Fold(int k, int n, int seed) : k(k), n(n), seed(seed)
random_device rd; {
random_seed = default_random_engine(seed == -1 ? rd() : seed); std::random_device rd;
srand(seed == -1 ? time(0) : seed); random_seed = std::default_random_engine(seed == -1 ? rd() : seed);
} std::srand(seed == -1 ? time(0) : seed);
KFold::KFold(int k, int n, int seed) : Fold(k, n, seed), indices(vector<int>(n))
{
iota(begin(indices), end(indices), 0); // fill with 0, 1, ..., n - 1
shuffle(indices.begin(), indices.end(), random_seed);
}
pair<vector<int>, vector<int>> KFold::getFold(int nFold)
{
if (nFold >= k || nFold < 0) {
throw out_of_range("nFold (" + to_string(nFold) + ") must be less than k (" + to_string(k) + ")");
} }
int nTest = n / k; KFold::KFold(int k, int n, int seed) : Fold(k, n, seed), indices(std::vector<int>(n))
auto train = vector<int>(); {
auto test = vector<int>(); std::iota(begin(indices), end(indices), 0); // fill with 0, 1, ..., n - 1
for (int i = 0; i < n; i++) {
if (i >= nTest * nFold && i < nTest * (nFold + 1)) {
test.push_back(indices[i]);
} else {
train.push_back(indices[i]);
}
}
return { train, test };
}
StratifiedKFold::StratifiedKFold(int k, torch::Tensor& y, int seed) : Fold(k, y.numel(), seed)
{
n = y.numel();
this->y = vector<int>(y.data_ptr<int>(), y.data_ptr<int>() + n);
build();
}
StratifiedKFold::StratifiedKFold(int k, const vector<int>& y, int seed)
: Fold(k, y.size(), seed)
{
this->y = y;
n = y.size();
build();
}
void StratifiedKFold::build()
{
stratified_indices = vector<vector<int>>(k);
int fold_size = n / k;
// Compute class counts and indices
auto class_indices = map<int, vector<int>>();
vector<int> class_counts(*max_element(y.begin(), y.end()) + 1, 0);
for (auto i = 0; i < n; ++i) {
class_counts[y[i]]++;
class_indices[y[i]].push_back(i);
}
// Shuffle class indices
for (auto& [cls, indices] : class_indices) {
shuffle(indices.begin(), indices.end(), random_seed); shuffle(indices.begin(), indices.end(), random_seed);
} }
// Assign indices to folds std::pair<std::vector<int>, std::vector<int>> KFold::getFold(int nFold)
for (auto label = 0; label < class_counts.size(); ++label) { {
auto num_samples_to_take = class_counts[label] / k; if (nFold >= k || nFold < 0) {
if (num_samples_to_take == 0) throw std::out_of_range("nFold (" + std::to_string(nFold) + ") must be less than k (" + std::to_string(k) + ")");
continue;
auto remainder_samples_to_take = class_counts[label] % k;
for (auto fold = 0; fold < k; ++fold) {
auto it = next(class_indices[label].begin(), num_samples_to_take);
move(class_indices[label].begin(), it, back_inserter(stratified_indices[fold])); // ##
class_indices[label].erase(class_indices[label].begin(), it);
} }
while (remainder_samples_to_take > 0) { int nTest = n / k;
int fold = (rand() % static_cast<int>(k)); auto train = std::vector<int>();
if (stratified_indices[fold].size() == fold_size + 1) { auto test = std::vector<int>();
for (int i = 0; i < n; i++) {
if (i >= nTest * nFold && i < nTest * (nFold + 1)) {
test.push_back(indices[i]);
} else {
train.push_back(indices[i]);
}
}
return { train, test };
}
StratifiedKFold::StratifiedKFold(int k, torch::Tensor& y, int seed) : Fold(k, y.numel(), seed)
{
n = y.numel();
this->y = std::vector<int>(y.data_ptr<int>(), y.data_ptr<int>() + n);
build();
}
StratifiedKFold::StratifiedKFold(int k, const std::vector<int>& y, int seed)
: Fold(k, y.size(), seed)
{
this->y = y;
n = y.size();
build();
}
void StratifiedKFold::build()
{
stratified_indices = std::vector<std::vector<int>>(k);
int fold_size = n / k;
// Compute class counts and indices
auto class_indices = std::map<int, std::vector<int>>();
std::vector<int> class_counts(*max_element(y.begin(), y.end()) + 1, 0);
for (auto i = 0; i < n; ++i) {
class_counts[y[i]]++;
class_indices[y[i]].push_back(i);
}
// Shuffle class indices
for (auto& [cls, indices] : class_indices) {
shuffle(indices.begin(), indices.end(), random_seed);
}
// Assign indices to folds
for (auto label = 0; label < class_counts.size(); ++label) {
auto num_samples_to_take = class_counts.at(label) / k;
if (num_samples_to_take == 0) {
std::cerr << "Warning! The number of samples in class " << label << " (" << class_counts.at(label)
<< ") is less than the number of folds (" << k << ")." << std::endl;
faulty = true;
continue; continue;
} }
auto it = next(class_indices[label].begin(), 1); auto remainder_samples_to_take = class_counts[label] % k;
stratified_indices[fold].push_back(*class_indices[label].begin()); for (auto fold = 0; fold < k; ++fold) {
class_indices[label].erase(class_indices[label].begin(), it); auto it = next(class_indices[label].begin(), num_samples_to_take);
remainder_samples_to_take--; move(class_indices[label].begin(), it, back_inserter(stratified_indices[fold])); // ##
class_indices[label].erase(class_indices[label].begin(), it);
}
auto chosen = std::vector<bool>(k, false);
while (remainder_samples_to_take > 0) {
int fold = (rand() % static_cast<int>(k));
if (chosen.at(fold)) {
continue;
}
chosen[fold] = true;
auto it = next(class_indices[label].begin(), 1);
stratified_indices[fold].push_back(*class_indices[label].begin());
class_indices[label].erase(class_indices[label].begin(), it);
remainder_samples_to_take--;
}
} }
} }
} std::pair<std::vector<int>, std::vector<int>> StratifiedKFold::getFold(int nFold)
pair<vector<int>, vector<int>> StratifiedKFold::getFold(int nFold) {
{ if (nFold >= k || nFold < 0) {
if (nFold >= k || nFold < 0) { throw std::out_of_range("nFold (" + std::to_string(nFold) + ") must be less than k (" + std::to_string(k) + ")");
throw out_of_range("nFold (" + to_string(nFold) + ") must be less than k (" + to_string(k) + ")"); }
std::vector<int> test_indices = stratified_indices[nFold];
std::vector<int> train_indices;
for (int i = 0; i < k; ++i) {
if (i == nFold) continue;
train_indices.insert(train_indices.end(), stratified_indices[i].begin(), stratified_indices[i].end());
}
return { train_indices, test_indices };
} }
vector<int> test_indices = stratified_indices[nFold];
vector<int> train_indices;
for (int i = 0; i < k; ++i) {
if (i == nFold) continue;
train_indices.insert(train_indices.end(), stratified_indices[i].begin(), stratified_indices[i].end());
}
return { train_indices, test_indices };
} }

View File

@@ -3,35 +3,37 @@
#include <torch/torch.h> #include <torch/torch.h>
#include <vector> #include <vector>
#include <random> #include <random>
using namespace std; namespace platform {
class Fold {
class Fold { protected:
protected: int k;
int k; int n;
int n; int seed;
int seed; std::default_random_engine random_seed;
default_random_engine random_seed; public:
public: Fold(int k, int n, int seed = -1);
Fold(int k, int n, int seed = -1); virtual std::pair<std::vector<int>, std::vector<int>> getFold(int nFold) = 0;
virtual pair<vector<int>, vector<int>> getFold(int nFold) = 0; virtual ~Fold() = default;
virtual ~Fold() = default; int getNumberOfFolds() { return k; }
int getNumberOfFolds() { return k; } };
}; class KFold : public Fold {
class KFold : public Fold { private:
private: std::vector<int> indices;
vector<int> indices; public:
public: KFold(int k, int n, int seed = -1);
KFold(int k, int n, int seed = -1); std::pair<std::vector<int>, std::vector<int>> getFold(int nFold) override;
pair<vector<int>, vector<int>> getFold(int nFold) override; };
}; class StratifiedKFold : public Fold {
class StratifiedKFold : public Fold { private:
private: std::vector<int> y;
vector<int> y; std::vector<std::vector<int>> stratified_indices;
vector<vector<int>> stratified_indices; void build();
void build(); bool faulty = false; // Only true if the number of samples of any class is less than the number of folds.
public: public:
StratifiedKFold(int k, const vector<int>& y, int seed = -1); StratifiedKFold(int k, const std::vector<int>& y, int seed = -1);
StratifiedKFold(int k, torch::Tensor& y, int seed = -1); StratifiedKFold(int k, torch::Tensor& y, int seed = -1);
pair<vector<int>, vector<int>> getFold(int nFold) override; std::pair<std::vector<int>, std::vector<int>> getFold(int nFold) override;
}; bool isFaulty() { return faulty; }
};
}
#endif #endif

55
src/Platform/GridData.cc Normal file
View File

@@ -0,0 +1,55 @@
#include "GridData.h"
#include <fstream>
namespace platform {
// Reads the hyperparameter grid definition from the JSON file `fileName`.
// Throws std::invalid_argument when the file cannot be opened.
GridData::GridData(const std::string& fileName)
{
    std::ifstream gridFile(fileName);
    if (!gridFile.is_open()) {
        throw std::invalid_argument("Unable to open input file. [" + fileName + "]");
    }
    grid = json::parse(gridFile);
}
int GridData::computeNumCombinations(const json& line)
{
int numCombinations = 1;
for (const auto& item : line.items()) {
numCombinations *= item.value().size();
}
return numCombinations;
}
// Adds up the number of combinations produced by each entry of the grid.
int GridData::getNumCombinations()
{
    int total = 0;
    for (auto it = grid.begin(); it != grid.end(); ++it) {
        total += computeNumCombinations(*it);
    }
    return total;
}
// Recursively expands the key/value-list pairs in [index, last) into every
// possible assignment, appending each finished assignment to `output`.
// `currentCombination` carries the assignments chosen so far and is returned
// unchanged to the caller.
json GridData::generateCombinations(json::iterator index, const json::iterator last, std::vector<json>& output, json currentCombination)
{
    if (index != last) {
        // The position of the next key is the same for every value tried at this level.
        json::iterator following = index;
        ++following;
        const auto& name = index.key();
        for (const auto& candidate : index.value()) {
            json extended = currentCombination;
            extended[name] = candidate;
            generateCombinations(following, last, output, extended);
        }
    } else {
        // Every key has been assigned: store the finished combination.
        output.push_back(currentCombination);
    }
    return currentCombination;
}
std::vector<json> GridData::getGrid()
{
auto result = std::vector<json>();
for (json line : grid) {
generateCombinations(line.begin(), line.end(), result, json({}));
}
return result;
}
} /* namespace platform */

22
src/Platform/GridData.h Normal file
View File

@@ -0,0 +1,22 @@
#ifndef GRIDDATA_H
#define GRIDDATA_H
#include <string>
#include <vector>
#include <map>
#include <nlohmann/json.hpp>
namespace platform {
using json = nlohmann::json;
// Holds a hyperparameter grid read from a JSON file and expands it into the
// individual hyperparameter combinations.
class GridData {
public:
// Parses `fileName` as JSON; throws std::invalid_argument if it cannot be opened.
explicit GridData(const std::string& fileName);
~GridData() = default;
// Returns every combination generated from each entry of the grid.
std::vector<json> getGrid();
// Returns the total number of combinations over all entries of the grid.
int getNumCombinations();
private:
// Recursive helper of getGrid(): expands [index, last) and appends results to `output`.
json generateCombinations(json::iterator index, const json::iterator last, std::vector<json>& output, json currentCombination);
// Product of the sizes of all values of a single grid entry.
int computeNumCombinations(const json& line);
// Parsed content of the input file.
json grid;
};
} /* namespace platform */
#endif /* GRIDDATA_H */

130
src/Platform/GridSearch.cc Normal file
View File

@@ -0,0 +1,130 @@
#include <fstream>
#include <iomanip>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <torch/torch.h>
#include "GridSearch.h"
#include "Models.h"
#include "Paths.h"
#include "Folding.h"
#include "Colors.h"
namespace platform {
// Copies `config` and derives the grid input/output file names from the
// configured path and model name.
GridSearch::GridSearch(struct ConfigGrid& config) : config(config)
{
    const std::string stem = config.path + "grid_" + config.model;
    this->config.input_file = stem + "_input.json";
    this->config.output_file = stem + "_output.json";
}
// Prints the "(num/total) " progress counter for the current combination.
// For every combination after the first, the previous counter and fold
// progress are first erased with backspaces.
//   num   - 1-based index of the combination being processed
//   total - total number of combinations
//   color - escape sequence used to colour the counter
void showProgressComb(const int num, const int total, const std::string& color)
{
    // Field width = number of decimal digits of total. Counted on the integer
    // itself: log(total) / log(10) truncates to one digit too few for exact
    // powers of ten (e.g. 1000) and is undefined for total <= 0.
    const int spaces = static_cast<int>(std::to_string(total > 0 ? total : 1).size());
    // Number of characters to wipe; the constant 37 comes from the original code
    // (presumably the width of the per-seed fold progress text - TODO confirm).
    const int magic = 37 + 2 * spaces;
    const std::string prefix = num == 1 ? "" : std::string(magic, '\b') + std::string(magic + 1, ' ') + std::string(magic + 1, '\b');
    std::cout << prefix << color << "(" << std::setw(spaces) << num << "/" << std::setw(spaces) << total << ") " << Colors::RESET() << std::flush;
}
// Prints "<fold>(<phase>)" in the given colour. For any phase other than "a"
// four backspaces are emitted first so the previous marker is overwritten.
void showProgressFold(int fold, const std::string& color, const std::string& phase)
{
    std::string eraser;
    if (phase != "a") {
        eraser = "\b\b\b\b";
    }
    std::cout << eraser << color << fold << Colors::RESET();
    std::cout << "(" << color << phase << Colors::RESET() << ")" << std::flush;
}
// Maps a classifier status to the colour used to display it:
// NORMAL -> green, WARNING -> yellow, ERROR -> red, anything else -> reset.
std::string getColor(bayesnet::status_t status)
{
    if (status == bayesnet::NORMAL) {
        return Colors::GREEN();
    }
    if (status == bayesnet::WARNING) {
        return Colors::YELLOW();
    }
    if (status == bayesnet::ERROR) {
        return Colors::RED();
    }
    return Colors::RESET();
}
double GridSearch::processFile(std::string fileName, Datasets& datasets, HyperParameters& hyperparameters)
{
// Get dataset
auto [X, y] = datasets.getTensors(fileName);
auto states = datasets.getStates(fileName);
auto features = datasets.getFeatures(fileName);
auto samples = datasets.getNSamples(fileName);
auto className = datasets.getClassName(fileName);
double totalScore = 0.0;
int numItems = 0;
for (const auto& seed : config.seeds) {
if (!config.quiet)
std::cout << "(" << seed << ") doing Fold: " << flush;
Fold* fold;
if (config.stratified)
fold = new StratifiedKFold(config.n_folds, y, seed);
else
fold = new KFold(config.n_folds, y.size(0), seed);
double bestScore = 0.0;
for (int nfold = 0; nfold < config.n_folds; nfold++) {
auto clf = Models::instance()->create(config.model);
clf->setHyperparameters(hyperparameters.get(fileName));
auto [train, test] = fold->getFold(nfold);
auto train_t = torch::tensor(train);
auto test_t = torch::tensor(test);
auto X_train = X.index({ "...", train_t });
auto y_train = y.index({ train_t });
auto X_test = X.index({ "...", test_t });
auto y_test = y.index({ test_t });
// Train model
if (!config.quiet)
showProgressFold(nfold + 1, getColor(clf->getStatus()), "a");
clf->fit(X_train, y_train, features, className, states);
// Test model
if (!config.quiet)
showProgressFold(nfold + 1, getColor(clf->getStatus()), "b");
totalScore += clf->score(X_test, y_test);
numItems++;
if (!config.quiet)
std::cout << "\b\b\b, \b" << flush;
}
delete fold;
}
return numItems == 0 ? 0.0 : totalScore / numItems;
}
// Runs the grid search: for every dataset, every hyperparameter combination
// from the input grid is evaluated with processFile(); the best score and the
// combination that produced it are stored in `results`, which is then saved.
void GridSearch::go()
{
    // Load datasets
    auto datasets = Datasets(config.discretize, Paths::datasets());
    // Create model
    std::cout << "***************** Starting Gridsearch *****************" << std::endl;
    std::cout << "input file=" << config.input_file << std::endl;
    auto grid = GridData(config.input_file);
    auto totalComb = grid.getNumCombinations();
    std::cout << "* Doing " << totalComb << " combinations for each dataset/seed/fold" << std::endl;
    // Generate hyperparameters grid & run gridsearch.
    // The expanded grid and the dataset names do not change during the search,
    // so they are computed once instead of once per dataset/combination.
    const auto combinations = grid.getGrid();
    const auto datasetNames = datasets.getNames();
    // Check each combination of hyperparameters for each dataset and each seed
    for (const auto& dataset : datasetNames) {
        if (!config.quiet)
            std::cout << "- " << std::setw(20) << std::left << dataset << " " << std::right << std::flush;
        int num = 0;
        double bestScore = 0.0;
        json bestHyperparameters;
        for (const auto& hyperparam_line : combinations) {
            if (!config.quiet)
                showProgressComb(++num, totalComb, Colors::CYAN());
            auto hyperparameters = platform::HyperParameters(datasetNames, hyperparam_line);
            double score = processFile(dataset, datasets, hyperparameters);
            if (score > bestScore) {
                bestScore = score;
                bestHyperparameters = hyperparam_line;
            }
        }
        if (!config.quiet) {
            std::cout << "end." << " Score: " << std::setw(9) << std::setprecision(7) << std::fixed
                << bestScore << " [" << bestHyperparameters.dump() << "]" << std::endl;
        }
        results[dataset]["score"] = bestScore;
        results[dataset]["hyperparameters"] = bestHyperparameters;
    }
    // Save results
    save();
    std::cout << "***************** Ending Gridsearch *******************" << std::endl;
}
// Writes `results` to config.output_file as JSON indented by 4 spaces.
// Throws std::runtime_error if the output file cannot be opened, so the
// results of a search are never lost silently.
void GridSearch::save() const
{
    std::ofstream file(config.output_file);
    if (!file.is_open()) {
        throw std::runtime_error("Unable to open output file. [" + config.output_file + "]");
    }
    file << results.dump(4);
    file.close();
}
} /* namespace platform */

36
src/Platform/GridSearch.h Normal file
View File

@@ -0,0 +1,36 @@
#ifndef GRIDSEARCH_H
#define GRIDSEARCH_H
#include <string>
#include <vector>
#include <nlohmann/json.hpp>
#include "Datasets.h"
#include "HyperParameters.h"
#include "GridData.h"
namespace platform {
using json = nlohmann::json;
// Settings of a grid search run. Scalar members carry default initialisers so
// a default-constructed ConfigGrid never holds indeterminate values; the
// struct is still an aggregate.
struct ConfigGrid {
    std::string model;        // model name; used to create the classifier and to build the grid file names
    std::string score;        // score name
    std::string path;         // prefix of the grid input/output file names
    std::string input_file;   // grid definition file (set by GridSearch)
    std::string output_file;  // results file (set by GridSearch)
    bool quiet{ false };      // true suppresses progress output
    bool discretize{ false }; // passed to Datasets when loading
    bool stratified{ false }; // true selects StratifiedKFold, false KFold
    int n_folds{ 0 };         // number of folds per seed
    std::vector<int> seeds;   // random seeds; one cross validation per seed
};
// Runs a hyperparameter grid search for one model over all datasets and
// stores the best score and hyperparameters found for each dataset.
class GridSearch {
public:
// Copies `config` and derives the input/output file names from its path and model.
explicit GridSearch(struct ConfigGrid& config);
// Executes the whole search and saves the results.
void go();
// Writes the collected results to config.output_file as JSON.
void save() const;
~GridSearch() = default;
private:
// Mean score of one hyperparameter combination on one dataset over all seeds and folds.
double processFile(std::string fileName, Datasets& datasets, HyperParameters& hyperparameters);
// Best score and hyperparameters, keyed by dataset name.
json results;
// Private copy of the configuration passed to the constructor.
struct ConfigGrid config;
};
} /* namespace platform */
#endif /* GRIDSEARCH_H */

View File

@@ -0,0 +1,55 @@
#include "HyperParameters.h"
#include <fstream>
#include <sstream>
#include <iostream>
namespace platform {
// Assigns the same set of hyperparameters to every dataset in `datasets`.
HyperParameters::HyperParameters(const std::vector<std::string>& datasets, const json& hyperparameters_)
{
    for (auto it = datasets.begin(); it != datasets.end(); ++it) {
        hyperparameters[*it] = hyperparameters_;
    }
}
// Joins `strings` into a single string with `delim` between consecutive
// elements (no leading or trailing delimiter). An empty vector yields "".
// Original idea: https://www.techiedelight.com/implode-a-vector-of-strings-into-a-comma-separated-string-in-cpp/
std::string join(std::vector<std::string> const& strings, std::string delim)
{
    std::string joined;
    for (std::size_t i = 0; i < strings.size(); ++i) {
        // The delimiter goes before every element except the first, so none is left dangling at the end.
        if (i != 0) {
            joined += delim;
        }
        joined += strings[i];
    }
    return joined;
}
// Loads per-dataset hyperparameters from the JSON file `hyperparameters_file`.
// Datasets missing from the file get an empty set of hyperparameters and a
// warning is written to std::cerr.
// Throws std::runtime_error if the file cannot be opened.
HyperParameters::HyperParameters(const std::vector<std::string>& datasets, const std::string& hyperparameters_file)
{
    std::ifstream input(hyperparameters_file);
    if (!input.is_open()) {
        throw std::runtime_error("File " + hyperparameters_file + " not found");
    }
    // The file must contain valid json
    json parsed = json::parse(input);
    for (const auto& name : datasets) {
        if (parsed.contains(name)) {
            hyperparameters[name] = parsed[name].get<json>();
        } else {
            std::cerr << "*Warning: Dataset " << name << " not found in hyperparameters file" << " assuming default hyperparameters" << std::endl;
            hyperparameters[name] = json({});
        }
    }
}
void HyperParameters::check(const std::vector<std::string>& valid, const std::string& fileName)
{
json result = hyperparameters.at(fileName);
for (const auto& item : result.items()) {
if (find(valid.begin(), valid.end(), item.key()) == valid.end()) {
throw std::invalid_argument("Hyperparameter " + item.key() + " is not valid. Passed Hyperparameters are: "
+ result.dump(4) + "\n Valid hyperparameters are: {" + join(valid, ",") + "}");
}
}
}
// Returns a copy of the hyperparameters stored for dataset `fileName`.
// std::map::at throws std::out_of_range for an unknown dataset.
json HyperParameters::get(const std::string& fileName)
{
    const json& stored = hyperparameters.at(fileName);
    return stored;
}
} /* namespace platform */

View File

@@ -0,0 +1,23 @@
#ifndef HYPERPARAMETERS_H
#define HYPERPARAMETERS_H
#include <string>
#include <map>
#include <vector>
#include <nlohmann/json.hpp>
namespace platform {
using json = nlohmann::json;
// Stores one set of hyperparameters (a json value) per dataset name.
class HyperParameters {
public:
HyperParameters() = default;
// Assigns the same hyperparameters to every dataset in `datasets`.
explicit HyperParameters(const std::vector<std::string>& datasets, const json& hyperparameters_);
// Reads per-dataset hyperparameters from a JSON file; datasets absent from the file get an empty set.
explicit HyperParameters(const std::vector<std::string>& datasets, const std::string& hyperparameters_file);
~HyperParameters() = default;
// True when the hyperparameters stored under `key` are not empty; throws std::out_of_range for an unknown key.
bool notEmpty(const std::string& key) const { return !hyperparameters.at(key).empty(); }
// Throws std::invalid_argument if a hyperparameter stored for `fileName` is not listed in `valid`.
void check(const std::vector<std::string>& valid, const std::string& fileName);
// Returns a copy of the hyperparameters stored for `fileName`.
json get(const std::string& fileName);
private:
// Hyperparameters keyed by dataset name.
std::map<std::string, json> hyperparameters;
};
} /* namespace platform */
#endif /* HYPERPARAMETERS_H */

View File

@@ -0,0 +1,213 @@
#include "ManageResults.h"
#include "CommandParser.h"
#include <filesystem>
#include <tuple>
#include "Colors.h"
#include "CLocale.h"
#include "Paths.h"
#include "ReportConsole.h"
#include "ReportExcel.h"
namespace platform {
ManageResults::ManageResults(int numFiles, const std::string& model, const std::string& score, bool complete, bool partial, bool compare) :
numFiles{ numFiles }, complete{ complete }, partial{ partial }, compare{ compare }, results(Results(Paths::results(), model, score, complete, partial))
{
indexList = true;
openExcel = false;
workbook = NULL;
if (numFiles == 0) {
this->numFiles = results.size();
}
}
// Entry point: lists the results sorted by date, runs the interactive menu
// and closes the Excel workbook if a report was written to it.
void ManageResults::doMenu()
{
    if (!results.empty()) {
        results.sortDate();
        list();
        menu();
        if (openExcel) {
            workbook_close(workbook);
        }
        std::cout << Colors::RESET() << "Done!" << std::endl;
    } else {
        std::cout << Colors::MAGENTA() << "No results found!" << Colors::RESET() << std::endl;
    }
}
// Prints the table of results: a title line, the column headers and up to
// `numFiles` rows in alternating colours.
void ManageResults::list()
{
    auto temp = ConfigLocale();
    std::string suffix;
    if (numFiles != results.size()) {
        suffix = " of " + std::to_string(results.size());
    }
    std::stringstream oss;
    oss << "Results on screen: " << numFiles << suffix;
    const std::string title = oss.str();
    std::cout << Colors::GREEN() << title << std::endl;
    std::cout << std::string(title.size(), '-') << std::endl;
    if (complete) {
        std::cout << Colors::MAGENTA() << "Only listing complete results" << std::endl;
    }
    if (partial) {
        std::cout << Colors::MAGENTA() << "Only listing partial results" << std::endl;
    }
    int maxModel = results.maxModelSize();
    std::cout << Colors::GREEN() << " # Date " << std::setw(maxModel) << std::left << "Model" << " Score Name Score C/P Duration Title" << std::endl;
    std::cout << "=== ========== " << std::string(maxModel, '=') << " =========== =========== === ========= =============================================================" << std::endl;
    auto row = 0;
    for (auto& result : results) {
        // Even rows are blue, odd rows cyan.
        auto color = (row % 2 == 0) ? Colors::BLUE() : Colors::CYAN();
        std::cout << color << std::setw(3) << std::fixed << std::right << row << " ";
        ++row;
        std::cout << result.to_string(maxModel) << std::endl;
        if (row == numFiles) {
            break;
        }
    }
}
// Asks the user to confirm `intent` (e.g. "delete", "hide") on `fileName`.
// The question is repeated until a single 'y' or 'n' (either case) is typed.
// Returns true for 'y'. For 'n', or when the input stream ends or fails,
// prints "Not done!" and returns false.
bool ManageResults::confirmAction(const std::string& intent, const std::string& fileName) const
{
    const std::string color = intent == "delete" ? Colors::RED() : Colors::YELLOW();
    bool confirmed = false;
    while (true) {
        std::cout << color << "Really want to " << intent << " " << fileName << "? (y/n): ";
        std::string line;
        if (!getline(std::cin, line)) {
            // End of input or stream error: no answer will ever arrive, so
            // take it as "no" instead of asking forever.
            break;
        }
        if (line.size() != 1) {
            continue;
        }
        // tolower requires a value representable as unsigned char.
        const int answer = tolower(static_cast<unsigned char>(line[0]));
        if (answer == 'y') {
            confirmed = true;
            break;
        }
        if (answer == 'n') {
            break;
        }
    }
    if (!confirmed) {
        std::cout << "Not done!" << std::endl;
    }
    return confirmed;
}
// Shows the report of the result at position `index`, either on the console
// or as a new sheet of the Excel workbook (which is then remembered so it can
// be closed when the menu ends).
void ManageResults::report(const int index, const bool excelReport)
{
    std::cout << Colors::YELLOW() << "Reporting " << results.at(index).getFilename() << std::endl;
    auto data = results.at(index).load();
    if (!excelReport) {
        ReportConsole reporter(data, compare);
        reporter.show();
        return;
    }
    ReportExcel reporter(data, compare, workbook);
    reporter.show();
    openExcel = true;
    workbook = reporter.getWorkbook();
    std::cout << "Adding sheet to " << Paths::excel() + Paths::excelResults() << std::endl;
}
// Shows, on the console, the single dataset entry `idx` of the result at
// position `index`.
void ManageResults::showIndex(const int index, const int idx)
{
    auto data = results.at(index).load();
    const auto shownFile = results.at(index).getFilename();
    std::cout << Colors::YELLOW() << "Showing " << shownFile << std::endl;
    ReportConsole reporter(data, compare, idx);
    reporter.show();
}
// Asks for a sorting field and sorts the results accordingly:
// 'd' date, 's' score, 'u' duration, 'm' model. An empty answer leaves the
// order untouched; anything else prints "Invalid option".
void ManageResults::sortList()
{
    std::cout << Colors::YELLOW() << "Choose sorting field (date='d', score='s', duration='u', model='m'): ";
    std::string answer;
    getline(std::cin, answer);
    if (answer.empty()) {
        return;
    }
    // Answers longer than one character map to '\0', which matches no field.
    const char field = answer.size() == 1 ? answer[0] : '\0';
    if (field == 'd') {
        results.sortDate();
    } else if (field == 's') {
        results.sortScore();
    } else if (field == 'u') {
        results.sortDuration();
    } else if (field == 'm') {
        results.sortModel();
    } else {
        std::cout << "Invalid option" << std::endl;
    }
}
void ManageResults::menu()
{
char option;
int index, subIndex;
bool finished = false;
std::string filename;
// tuple<Option, digit, requires value>
std::vector<std::tuple<std::string, char, bool>> mainOptions = {
{"quit", 'q', false},
{"list", 'l', false},
{"delete", 'd', true},
{"hide", 'h', true},
{"sort", 's', false},
{"report", 'r', true},
{"excel", 'e', true}
};
std::vector<std::tuple<std::string, char, bool>> listOptions = {
{"report", 'r', true},
{"list", 'l', false},
{"quit", 'q', false}
};
auto parser = CommandParser();
while (!finished) {
if (indexList) {
std::tie(option, index) = parser.parse(Colors::GREEN(), mainOptions, 'r', numFiles - 1);
} else {
std::tie(option, subIndex) = parser.parse(Colors::MAGENTA(), listOptions, 'r', results.at(index).load()["results"].size() - 1);
}
switch (option) {
case 'q':
finished = true;
break;
case 'l':
list();
indexList = true;
break;
case 'd':
filename = results.at(index).getFilename();
if (!confirmAction("delete", filename))
break;
std::cout << "Deleting " << filename << std::endl;
results.deleteResult(index);
std::cout << "File: " + filename + " deleted!" << std::endl;
list();
break;
case 'h':
filename = results.at(index).getFilename();
if (!confirmAction("hide", filename))
break;
filename = results.at(index).getFilename();
std::cout << "Hiding " << filename << std::endl;
results.hideResult(index, Paths::hiddenResults());
std::cout << "File: " + filename + " hidden! (moved to " << Paths::hiddenResults() << ")" << std::endl;
list();
break;
case 's':
sortList();
list();
break;
case 'r':
if (indexList) {
report(index, false);
indexList = false;
} else {
showIndex(index, subIndex);
}
break;
case 'e':
report(index, true);
break;
}
}
}
} /* namespace platform */

View File

@@ -0,0 +1,31 @@
#ifndef MANAGE_RESULTS_H
#define MANAGE_RESULTS_H
#include "Results.h"
#include "xlsxwriter.h"
namespace platform {
// Interactive console tool to list, sort, report, hide and delete stored results.
class ManageResults {
public:
// numFiles == 0 means "show all results found".
ManageResults(int numFiles, const std::string& model, const std::string& score, bool complete, bool partial, bool compare);
~ManageResults() = default;
// Lists the results and runs the interactive menu.
void doMenu();
private:
// Prints the table of results.
void list();
// Asks the user to confirm `intent` on `fileName`; true when the answer is yes.
bool confirmAction(const std::string& intent, const std::string& fileName) const;
// Reports result `index` on the console or in the Excel workbook.
void report(const int index, const bool excelReport);
// Shows dataset entry `idx` of result `index` on the console.
void showIndex(const int index, const int idx);
// Asks for a sorting field and sorts the results.
void sortList();
// Command loop.
void menu();
// Number of results shown on screen.
int numFiles;
// True while the menu operates on the list of results, false while inside a report.
bool indexList;
// True once a report has been written to the Excel workbook.
bool openExcel;
bool complete;
bool partial;
bool compare;
Results results;
// Excel workbook shared by all Excel reports of this session; closed in doMenu().
lxw_workbook* workbook;
};
}
#endif /* MANAGE_RESULTS_H */

View File

@@ -1,6 +1,5 @@
#include "Models.h" #include "Models.h"
namespace platform { namespace platform {
using namespace std;
// Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory // Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory
Models* Models::factory = nullptr;; Models* Models::factory = nullptr;;
Models* Models::instance() Models* Models::instance()
@@ -10,13 +9,13 @@ namespace platform {
factory = new Models(); factory = new Models();
return factory; return factory;
} }
void Models::registerFactoryFunction(const string& name, void Models::registerFactoryFunction(const std::string& name,
function<bayesnet::BaseClassifier* (void)> classFactoryFunction) function<bayesnet::BaseClassifier* (void)> classFactoryFunction)
{ {
// register the class factory function // register the class factory function
functionRegistry[name] = classFactoryFunction; functionRegistry[name] = classFactoryFunction;
} }
shared_ptr<bayesnet::BaseClassifier> Models::create(const string& name) shared_ptr<bayesnet::BaseClassifier> Models::create(const std::string& name)
{ {
bayesnet::BaseClassifier* instance = nullptr; bayesnet::BaseClassifier* instance = nullptr;
@@ -26,27 +25,26 @@ namespace platform {
instance = it->second(); instance = it->second();
// wrap instance in a shared ptr and return // wrap instance in a shared ptr and return
if (instance != nullptr) if (instance != nullptr)
return shared_ptr<bayesnet::BaseClassifier>(instance); return unique_ptr<bayesnet::BaseClassifier>(instance);
else else
return nullptr; return nullptr;
} }
vector<string> Models::getNames() std::vector<std::string> Models::getNames()
{ {
vector<string> names; std::vector<std::string> names;
transform(functionRegistry.begin(), functionRegistry.end(), back_inserter(names), transform(functionRegistry.begin(), functionRegistry.end(), back_inserter(names),
[](const pair<string, function<bayesnet::BaseClassifier* (void)>>& pair) { return pair.first; }); [](const pair<std::string, function<bayesnet::BaseClassifier* (void)>>& pair) { return pair.first; });
return names; return names;
} }
string Models::toString() std::string Models::tostring()
{ {
string result = ""; std::string result = "";
for (const auto& pair : functionRegistry) { for (const auto& pair : functionRegistry) {
result += pair.first + ", "; result += pair.first + ", ";
} }
return "{" + result.substr(0, result.size() - 2) + "}"; return "{" + result.substr(0, result.size() - 2) + "}";
} }
Registrar::Registrar(const std::string& name, function<bayesnet::BaseClassifier* (void)> classFactoryFunction)
Registrar::Registrar(const string& name, function<bayesnet::BaseClassifier* (void)> classFactoryFunction)
{ {
// register the class factory function // register the class factory function
Models::instance()->registerFactoryFunction(name, classFactoryFunction); Models::instance()->registerFactoryFunction(name, classFactoryFunction);

View File

@@ -10,10 +10,15 @@
#include "KDBLd.h" #include "KDBLd.h"
#include "SPODELd.h" #include "SPODELd.h"
#include "AODELd.h" #include "AODELd.h"
#include "BoostAODE.h"
#include "STree.h"
#include "ODTE.h"
#include "SVC.h"
#include "RandomForest.h"
namespace platform { namespace platform {
class Models { class Models {
private: private:
map<string, function<bayesnet::BaseClassifier* (void)>> functionRegistry; map<std::string, function<bayesnet::BaseClassifier* (void)>> functionRegistry;
static Models* factory; //singleton static Models* factory; //singleton
Models() {}; Models() {};
public: public:
@@ -21,16 +26,16 @@ namespace platform {
void operator=(const Models&) = delete; void operator=(const Models&) = delete;
// Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory // Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory
static Models* instance(); static Models* instance();
shared_ptr<bayesnet::BaseClassifier> create(const string& name); shared_ptr<bayesnet::BaseClassifier> create(const std::string& name);
void registerFactoryFunction(const string& name, void registerFactoryFunction(const std::string& name,
function<bayesnet::BaseClassifier* (void)> classFactoryFunction); function<bayesnet::BaseClassifier* (void)> classFactoryFunction);
vector<string> getNames(); std::vector<string> getNames();
string toString(); std::string tostring();
}; };
class Registrar { class Registrar {
public: public:
Registrar(const string& className, function<bayesnet::BaseClassifier* (void)> classFactoryFunction); Registrar(const std::string& className, function<bayesnet::BaseClassifier* (void)> classFactoryFunction);
}; };
} }
#endif #endif

31
src/Platform/Paths.h Normal file
View File

@@ -0,0 +1,31 @@
#ifndef PATHS_H
#define PATHS_H
#include <string>
#include <filesystem>
#include "DotEnv.h"
namespace platform {
// Central place for the directories and file names used by the platform.
class Paths {
public:
    // Directory holding the result files.
    static std::string results() { return "results/"; }
    // Directory where hidden results are moved to.
    static std::string hiddenResults() { return "hidden_results/"; }
    // Directory of the Excel reports.
    static std::string excel() { return "excel/"; }
    // Directory of the grid search files.
    static std::string grid() { return "grid/"; }
    // Datasets location, read from the "source_data" entry of the environment file.
    static std::string datasets()
    {
        return platform::DotEnv().get("source_data");
    }
    // Creates directory `path` if it does not exist.
    // Throws std::runtime_error when the directory cannot be created.
    static void createPath(const std::string& path)
    {
        try {
            std::filesystem::create_directory(path);
        }
        catch (const std::exception&) {
            throw std::runtime_error("Could not create directory " + path);
        }
    }
    // File name of the Excel workbook that collects the reports.
    static std::string excelResults() { return "some_results.xlsx"; }
};
}
#endif

View File

@@ -1,66 +0,0 @@
#include "Report.h"
namespace platform {
// Formats `text` as one line of the report header: "* " + text, padded with
// spaces up to MAXL columns and closed with "*\n". Text too long to fit is
// emitted without padding instead of making the padding count wrap around.
string headerLine(const string& text)
{
    const int used = static_cast<int>(text.length()) + 3;
    const int width = static_cast<int>(MAXL);
    const int n = width > used ? width - used : 0;
    return "* " + text + string(n, ' ') + "*\n";
}
// Renders the array stored in data[key] as "[a, b, c]".
string Report::fromVector(const string& key)
{
    string joined;
    bool first = true;
    for (auto& item : data[key]) {
        if (!first) {
            joined += ", ";
        }
        joined += to_string(item);
        first = false;
    }
    return "[" + joined + "]";
}
// Renders the elements of the json array `data` as "[a, b, c]".
string fVector(const json& data)
{
    string joined;
    bool first = true;
    for (const auto& item : data) {
        if (!first) {
            joined += ", ";
        }
        joined += to_string(item);
        first = false;
    }
    return "[" + joined + "]";
}
// Prints the whole report: the header block followed by the per-dataset body.
void Report::show()
{
header();
body();
}
// Prints the report header framed by lines of MAXL asterisks: model and
// version, folds and seeds, date/time, title, stratification, duration,
// platform and score name, all read from `data`.
void Report::header()
{
cout << string(MAXL, '*') << endl;
cout << headerLine("Report " + data["model"].get<string>() + " ver. " + data["version"].get<string>() + " with " + to_string(data["folds"].get<int>()) + " Folds cross validation and " + to_string(data["seeds"].size()) + " random seeds. " + data["date"].get<string>() + " " + data["time"].get<string>());
cout << headerLine(data["title"].get<string>());
cout << headerLine("Random seeds: " + fromVector("seeds") + " Stratified: " + (data["stratified"].get<bool>() ? "True" : "False"));
// Duration is shown both in seconds and converted to hours.
cout << headerLine("Execution took " + to_string(data["duration"].get<float>()) + " seconds, " + to_string(data["duration"].get<float>() / 3600) + " hours, on " + data["platform"].get<string>());
cout << headerLine("Score is " + data["score_name"].get<string>());
cout << string(MAXL, '*') << endl;
cout << endl;
}
// Prints the column headers and then, for every entry of data["results"],
// one summary row followed by a framed block with the per-fold train/test
// scores and times.
void Report::body()
{
cout << "Dataset Sampl. Feat. Cls Nodes Edges States Score Time Hyperparameters" << endl;
cout << "============================== ====== ===== === ======= ======= ======= =============== ================= ===============" << endl;
for (const auto& r : data["results"]) {
cout << setw(30) << left << r["dataset"].get<string>() << " ";
cout << setw(6) << right << r["samples"].get<int>() << " ";
cout << setw(5) << right << r["features"].get<int>() << " ";
cout << setw(3) << right << r["classes"].get<int>() << " ";
// The Nodes / Edges / States columns are filled from the "nodes", "leaves" and "depth" fields.
cout << setw(7) << setprecision(2) << fixed << r["nodes"].get<float>() << " ";
cout << setw(7) << setprecision(2) << fixed << r["leaves"].get<float>() << " ";
cout << setw(7) << setprecision(2) << fixed << r["depth"].get<float>() << " ";
// Score and time are printed as value±standard deviation.
cout << setw(8) << right << setprecision(6) << fixed << r["score_test"].get<double>() << "±" << setw(6) << setprecision(4) << fixed << r["score_test_std"].get<double>() << " ";
cout << setw(10) << right << setprecision(6) << fixed << r["test_time"].get<double>() << "±" << setw(6) << setprecision(4) << fixed << r["test_time_std"].get<double>() << " ";
cout << " " << r["hyperparameters"].get<string>();
cout << endl;
cout << string(MAXL, '*') << endl;
cout << headerLine("Train scores: " + fVector(r["scores_train"]));
cout << headerLine("Test scores: " + fVector(r["scores_test"]));
cout << headerLine("Train times: " + fVector(r["times_train"]));
cout << headerLine("Test times: " + fVector(r["times_test"]));
cout << string(MAXL, '*') << endl;
}
}
}

Some files were not shown because too many files have changed in this diff Show More