Compare commits

315 Commits

Author SHA1 Message Date
4116699b01 Add sort results by title 2025-08-06 12:08:08 +02:00
b7f4651e2c Fix result name overlapping in simultaneous experiments 2025-08-04 13:10:48 +02:00
e8b35d4c5e Fix tests 2025-08-04 12:29:22 +02:00
34a0719a16 Add mdlp3, mdlp4 and mdlp5 2025-07-27 19:22:03 +02:00
72e228f367 Remove PyClassifiers 2025-07-23 00:41:36 +02:00
c3c580a611 Fix ExcelFile warning 2025-07-22 00:24:25 +02:00
515455695b Add CMAKE_POSITION_INDEPENDENT_CODE to CMakeLists 2025-07-21 18:19:50 -04:00
f68d216150 Fix tensor utils ambiguous call 2025-07-21 11:40:16 +02:00
b990684581 Refactor TensorUtils to a unique header file 2025-07-21 11:10:26 +02:00
5fd0ef692d Fix typo in conanfile 2025-07-19 23:21:09 +02:00
dfcdadbf38 Merge branch 'main' of ssh://gitea.rmontanana.es:6422/rmontanana/Platform 2025-07-19 23:20:00 +02:00
613f4b6813 Update Requirements 2025-07-19 23:19:54 +02:00
dc324fe5f7 Add Seed to note in experiment 2025-07-08 18:50:48 +02:00
9816896240 Complete the conan integration 2025-07-04 10:20:59 +02:00
a3f765ce3c Fix compilation errors and enhance Makefile 2025-07-03 10:41:16 +02:00
3d814a79c6 Begin conan integration 2025-07-03 01:40:30 +02:00
1ef7ca6180 Merge pull request 'Integrate libraries with vcpkg' (#6) from vcpkg into main
Reviewed-on: #6
2025-07-02 17:39:44 +00:00
9448a971e8 fix vcpkg.json 2025-06-27 20:25:41 +02:00
24cef7496d Optimize AdaBoostPredict and default 100 estimators 2025-06-18 18:28:54 +02:00
a1a6d3d612 Optimize AdaBoost buildModel 2025-06-18 18:15:19 +02:00
dda9740e83 Test AdaBoost fine but unoptimized 2025-06-18 18:03:19 +02:00
41afa1b888 Enhance predictProbaSample 2025-06-18 17:33:56 +02:00
4e18dc87be Fix predict_proba in AdaBoost 2025-06-18 14:18:15 +02:00
56af1a5f85 AdaBoost still missing predict_proba 2025-06-18 13:59:23 +02:00
415a7ae608 Begin AdaBoost integration 2025-06-18 11:27:11 +02:00
023d5613b4 Add DecisionTree with tests 2025-06-17 13:48:11 +02:00
8c413a1eb0 Begin to add AdaBoost implementation 2025-06-16 00:11:51 +02:00
3b158e9fc1 Add AdaBoost 2025-06-15 12:07:12 +02:00
514968a082 Open excel file automatically when generated 2025-05-28 17:37:53 +02:00
dcde8c01be Add std to screen output 2025-05-28 10:53:29 +02:00
a6b6efce95 Remove unneeded output in Statistics 2025-05-25 10:41:36 +02:00
473d194dde Complete integration of Wilcoxon test 2025-05-24 12:59:28 +02:00
a56ec98ef9 Add Wilcoxon Test 2025-05-21 11:51:04 +02:00
70d8022926 Refactor postHoc 2025-05-17 18:12:57 +02:00
f5107abea7 Add comment in Statistics 2025-05-14 14:02:53 +02:00
e64e281b63 Return AUC 0.5 if nPos==0 || nNeg==0 2025-05-14 13:15:33 +02:00
b639a2d79a Fix folder param in b_manage 2025-05-14 12:51:56 +02:00
d6603dd638 Add folder parameter to best, grid and main 2025-05-14 11:46:15 +02:00
321e2a2f28 Add folder to manage 2025-05-13 14:09:25 +02:00
36c72491e7 Add folder to b_best 2025-05-13 13:50:07 +02:00
aa19ab6c21 Option to use BayesNet local or vcpkg in CMakeLists 2025-05-09 19:16:17 +02:00
16b4923851 Complete configuration xlsxwriter is still with the old config 2025-05-09 11:10:27 +02:00
b1965c8ae5 Add vcpkg config files 2025-05-09 10:54:27 +02:00
7d3a2dd713 Remove modules 2025-05-08 17:15:42 +02:00
50fde9521b Update last commit badge in README 2025-04-22 11:16:27 +00:00
cd2f47c58b Merge pull request 'Including XA1DE model' (#5) from XA1DE into main
Reviewed-on: #5
2025-03-20 14:58:37 +00:00
facf6f6ddd Fix GridBase to eliminate unneeded GridData 2025-03-20 15:54:13 +01:00
c9ab88e475 Update models and remove normalize weights in XA1DE 2025-03-17 13:28:35 +01:00
c2a4e3e64e Add XSPnDE n=2 2025-03-13 11:00:21 +01:00
664a6a5aeb Add XBAODE & XSPODE from bayesnet 2025-03-09 19:20:51 +01:00
ae7b89b134 tolerance <- 3 2025-03-08 18:07:56 +01:00
9c1852c6c3 First working version 2025-03-08 14:20:27 +01:00
7a23782b05 Add XSpode submodel 2025-03-07 18:34:16 +01:00
b2002d341c Create Xaode2 and add initializer factor in predict 2025-03-03 12:38:05 +01:00
9a8b960ce8 Remove unneeded commented code 2025-03-03 11:29:57 +01:00
7bc8633ed1 Enhance result 2025-03-03 10:56:20 +01:00
11155463b9 Fix predict_proba_spode 2025-03-02 21:41:21 +01:00
12e69a7f53 Add Prior probability to predict
Fix predict_spode
2025-03-01 20:29:45 +01:00
c127cb670a Fix predict_proba_spode mistake 2025-02-27 20:45:28 +01:00
610c2a6a4a Continue refactoring 2025-02-27 11:37:30 +01:00
2dcd073299 Refactor Xaode 2025-02-27 10:08:27 +01:00
f51d5b5e40 Continue refactoring 2025-02-27 09:57:40 +01:00
4e3043b2d1 Fix XA1DE integration 2025-02-27 09:23:47 +01:00
b055065e59 Fix predict_proba declaration 2025-02-26 21:08:33 +01:00
0d1e4b3c6f Continue refactoring 2025-02-26 21:03:01 +01:00
1a688f90b4 Complete refactor of XA1DE & XBAODE with new ExpClf class 2025-02-26 16:55:04 +01:00
c63baf419f Add log and fix some mistakes in integration 2025-02-25 20:35:13 +01:00
de7cf091be Add open excel file on b_manage termination 2025-02-25 13:41:06 +01:00
475a819a87 Continue integration into trainModel 2025-02-25 11:03:53 +01:00
ce6e192a33 Include BoostAODE trainModel method in XBAODE fit method 2025-02-24 10:27:24 +01:00
5daf7cbd69 Create XBAODE classifier 2025-02-23 19:44:13 +01:00
1b26de1e38 Set use_threads true as default for XA1DE 2025-02-23 18:54:55 +01:00
d3de429f2c Add room for nodes, depth and edges on screen report 2025-02-19 16:05:21 +01:00
f48864a415 Fix back button in manage
Fix sort datasets in b_main when --datasets is used
2025-02-19 13:32:07 +01:00
c1531dba2a Complete XA1DE integration 2025-02-19 11:40:33 +01:00
5556fbab03 Complete integration with memory failure 2025-02-18 22:57:02 +01:00
ac89cefab3 Add conversion methods 2025-02-18 12:07:56 +01:00
14dd8ebb66 First compilation 2025-02-18 11:04:24 +01:00
bd5ba14f04 Begin model inclusion 2025-02-18 10:48:46 +01:00
17728212c1 Ignore case in datasets sorting 2025-02-17 20:01:06 +01:00
86b4558f9d Add 1 char to b_list datasets headers 2025-02-17 19:44:23 +01:00
505edc79ac Fix sample issue 2025-02-04 18:53:23 +01:00
73a4b3d5e5 Add changeModel to b_manage 2025-02-04 17:34:00 +01:00
cbe8f4c79c Fix status length output in b_main 2025-02-01 21:42:56 +01:00
0d08a526fa Add score to b_main output 2025-01-30 17:36:45 +01:00
d0706da887 Fix sort order in bgrid report 2025-01-21 20:38:07 +01:00
07e3cc9599 Fix errors in grid Experiment 2025-01-19 13:51:51 +01:00
2a9652b450 Fix b_main order of datasets if --datasets parameter used 2025-01-18 20:31:58 +01:00
3397d0962f Refactor arguments management for Experimentation 2025-01-18 18:26:34 +01:00
7aaf6d1bf8 Add conditional saveResults to GridExperiment 2025-01-18 13:09:45 +01:00
eb430a84c4 Fix dataset name order in grid experiment 2025-01-17 16:58:39 +01:00
d0e65348e0 Complete b_grid experiment 2025-01-17 13:56:19 +01:00
c1d5dd74e3 Continue with grid experiment 2025-01-17 10:39:56 +01:00
9a9a9fb17a Continue grid Experiment 2025-01-14 22:04:23 +01:00
386faf960e Refactor grid classes and add summary of tasks at the end 2025-01-14 18:53:11 +01:00
28894004c8 Fix time output in b_main 2025-01-08 20:45:08 +01:00
ae41975fb4 Add nominal or index dataset name in tex output 2025-01-08 17:18:32 +01:00
0e475e4488 Sort datasets on input 2025-01-08 11:05:22 +01:00
909cec712c Complete schema validation 2025-01-07 18:24:55 +01:00
4901bb1f32 Add json results format validation 2025-01-07 11:58:18 +01:00
0318dcf8e5 Continue with grid_experiment refactor 2024-12-21 14:18:47 +01:00
1cc19a7b19 Refactor mpi classes 2024-12-20 19:10:17 +01:00
f88944de36 Add grid base class and static class 2024-12-20 18:54:08 +01:00
1a336a094e Refactor gridsearch and begin gridexperiment 2024-12-20 17:36:43 +01:00
8705adf3ee Begin b_grid experiment 2024-12-20 12:51:33 +01:00
017cb8a0dc Fix smoothing problem in gridsearch 2024-12-18 11:17:04 +01:00
e966c880e6 Refactor gridsearch output 2024-12-17 10:49:58 +01:00
70ea32dc9a Update folding library 2024-12-14 20:23:31 +01:00
ba455bb934 Rename config.h to config_platform.h 2024-12-13 19:57:05 +01:00
a65955248a Add mdlp as dependency 2024-12-13 10:28:27 +01:00
84930b0537 Remove lib/mdlp folder 2024-12-13 10:11:45 +01:00
10c65f44a0 Add mdlp library dependency 2024-12-13 09:55:37 +01:00
6d112f01e7 Remove external library dependency 2024-12-13 09:49:46 +01:00
401296293b Add header to b_main time 2024-12-11 23:18:20 +01:00
9566ae4cf6 Fix gridsearch discretize_algo mistake 2024-12-11 12:45:16 +01:00
55187ee521 Add time to experiment seed 2024-12-11 10:05:24 +01:00
68ea06d129 Fix fimdlp library includes 2024-11-20 21:19:35 +01:00
6c1d1d0d32 Remove mdlp files 2024-11-20 21:14:42 +01:00
b0853d169b Remove mdlp submodule 2024-11-20 21:14:19 +01:00
26f8e07774 Remove Python 3.11 only requirement 2024-11-20 20:21:39 +01:00
315dfb104f Add train test time to report console 2024-10-25 09:53:31 +02:00
381f226d53 Fix pm code in tex bestresults 2024-10-15 10:32:28 +02:00
ea13835701 Add Markdown best results output 2024-10-07 18:08:42 +02:00
d75468cf78 Replace Nº with # in output labels 2024-09-28 22:55:11 +02:00
c58bd9d60d add score name to best results excel file name 2024-09-28 18:58:49 +02:00
148a3b831a Add missing \ to results.tex 2024-09-03 12:57:22 +02:00
69063badbb Fix status error in holm.tex 2024-09-03 12:54:09 +02:00
6ae2b2182a Complete Tex output with Holm test 2024-09-03 12:43:50 +02:00
4dbd76df55 Continue TeX output 2024-09-02 20:30:47 +02:00
4545f76667 Begin adding TeX output to b_best -m any command 2024-09-02 18:14:53 +02:00
8372987dae Update sample to last library version 2024-08-31 12:41:11 +02:00
d72943c749 Fix hyperparams mistake 2024-08-07 10:52:04 +02:00
800246acd2 Accept nested hyperparameters in b_main 2024-08-04 17:19:31 +02:00
0ea967dd9d Support b_main with best hyperparameters 2024-08-02 19:10:25 +02:00
97abec8b69 Fix hide result error 2024-08-02 12:02:11 +02:00
17c9522e77 Add support to old format results 2024-07-25 17:06:31 +02:00
45af550cf9 Change time showed in report 2024-07-24 18:40:59 +02:00
5d5f49777e Fix wrong columns message 2024-07-16 11:30:28 +02:00
540a8ea06d Refactor update rows 2024-07-16 10:33:44 +02:00
1924c4392b Adapt screen to resized window 2024-07-16 10:25:15 +02:00
f2556a30af Add screen width control in b_manage 2024-07-15 18:06:39 +02:00
2f2ed00ca1 Add roc-auc-ovr as score to b_main 2024-07-14 12:48:33 +02:00
28f6a0d7a7 RocAuc refactor to speed up binary classif. problems 2024-07-13 16:54:34 +02:00
028522f180 Add AUC to reportConsole 2024-07-12 17:41:23 +02:00
84adf13a79 Add AUC computing in Experiment and store in result 2024-07-12 17:23:03 +02:00
26dfe6d056 Add Graphs to results
Add bin5..bin10 q & u discretizers algos
Fix trouble in computing states
Update mdlp to 2.0.0
2024-07-11 11:23:20 +02:00
3acc34e4c6 Fix title mistake in b_main 2024-06-17 19:07:15 +02:00
8f92b74260 Change Constant smooth type 2024-06-14 10:16:32 +02:00
3d900f8c81 Update models versions 2024-06-13 12:30:31 +02:00
e628d80f4c Experiment working with smoothing and disc-algo 2024-06-11 13:52:26 +02:00
0f06f8971e Change default smooth type in Experiment 2024-06-10 15:50:54 +02:00
f800772149 Add new hyperparameters validation in b_main 2024-06-10 10:16:07 +02:00
b8a8ddaf8c Add smooth strategy to hyperparameter in b_main
Add smooth strategy to reports
2024-06-09 20:46:14 +02:00
90555489ff Add discretiz_algo to b_main as hyperparameter 2024-06-09 11:35:50 +02:00
080f3cee34 Add discretization algo to reports 2024-06-09 01:11:56 +02:00
643633e6dd fit discretizer only with train data 2024-06-09 00:50:55 +02:00
361c51d864 Add traintest split in gridsearch 2024-06-07 11:05:59 +02:00
5dd3deca1a Add discretiz algorithm management to b_main & Dataset 2024-06-07 09:00:51 +02:00
2202a81782 Add discretization algo to result 2024-06-06 18:33:01 +02:00
c4f4e332f6 Add parsing to DotEnv 2024-06-06 17:55:39 +02:00
a7ec930fa0 Add numeric features management to Dataset 2024-06-06 13:03:57 +02:00
6858b3d89a Remove model selection from b_best and b_list 2024-06-03 17:09:45 +02:00
5fb176d78a Add message of the file saved in b_main 2024-05-29 20:52:25 +02:00
f5d5c35002 Add generate-fold-files to b_main 2024-05-28 10:52:08 +02:00
b34af13eea Add new Files library 2024-05-26 17:27:42 +02:00
e3a06264a9 Remove old Files library 2024-05-26 17:25:36 +02:00
df82f82e88 Add F column to b_best in excel 2024-05-21 08:45:17 +02:00
886dde7a06 Fix various classification reports in the same excel book 2024-05-19 18:53:55 +02:00
88468434e7 Add color and fix format in classification report in excel 2024-05-19 11:12:31 +02:00
ad5c3319bd Complete excel classification report 2024-05-18 22:59:37 +02:00
594adb0534 Begin classification report in excel 2024-05-18 21:37:34 +02:00
b9e0c92334 Move ResultsDatasetConsole to results folder 2024-05-18 18:41:17 +02:00
25bd7a42c6 Replace pragma once with ifndef 2024-05-18 13:00:13 +02:00
c165a4bdda Fix refactor of static aggregate method 2024-05-17 23:38:21 +02:00
49a36904dc Refactor aggregate score to a constructor 2024-05-17 22:52:13 +02:00
577351eda5 put using json=nlohmann:ordered_json under namespace platform 2024-05-17 18:32:01 +02:00
a3c4bde460 Fix problem with num of classes in pyclassifiers experiments 2024-05-17 14:05:09 +02:00
696c0564a7 Add BoostA2DE model and fix some report errors 2024-05-17 01:25:27 +02:00
30a6d5e60d Complete ReportConsole with classification report 2024-05-14 13:22:13 +02:00
f8f3ca28dc Fix colors of classification report 2024-05-14 12:06:08 +02:00
5c190d7c66 Add train classification report 2024-05-14 11:45:54 +02:00
99c9c6731f Add colors to confusion matrix and classification report 2024-05-14 00:41:29 +02:00
8d20545fd2 Add Confusion Matrix to console report 2024-05-13 10:40:25 +02:00
2b480cdcb7 Merge pull request 'Fix json key automatic ordering error when creating Score from json' (#4) from temp into main
Reviewed-on: #4
2024-05-12 16:36:08 +00:00
ebaddf1a6c Fix json key automatic ordering error when creating Score from json 2024-05-12 18:23:48 +02:00
07a2efb298 Show classification report in b_manage 2024-05-12 12:52:22 +02:00
f88b223c46 Update libraries 2024-05-12 12:26:49 +02:00
69b9609154 Add labels to confusion_matrices in results 2024-05-10 17:12:11 +02:00
6d4117d188 Add Classification report to end of experiment if only one dataset is tested 2024-05-10 14:11:51 +02:00
ec0268c514 Add confusion matrix to json results
Add Aggregate method to Scores
2024-05-10 13:42:38 +02:00
dd94fd51f7 Add json constructor to Scores 2024-05-10 11:35:07 +02:00
009ed037b8 Add Scores class and TestsScores 2024-05-10 00:51:21 +02:00
6d1b78ada7 Remove trace message from report 2024-05-09 17:09:03 +02:00
3882ebd6e4 Add SPnDE & A2DE models 2024-05-05 19:53:14 +02:00
423242d280 Add logo to README 2024-05-02 11:36:58 +02:00
b9381aa453 Fix json keys in ReportExcelCompared 2024-05-01 11:53:21 +02:00
33cfb78554 Fix Nodes, Leaves, Depth vs Nodes, Edges, States headers in reports 2024-04-21 11:05:12 +02:00
1caa39c071 Add env to enable test data 2024-04-19 10:02:59 +02:00
018c94bfe6 add platform filter to b_manage 2024-04-18 15:43:39 +02:00
a54d6b8716 Fix paginator error when deleting in b_manage 2024-04-17 12:57:57 +02:00
6cde09d81e Change launch parameters 2024-04-17 11:36:21 +02:00
7be95d889d Fix some output mistakes in b_manage experiments list 2024-04-17 11:35:43 +02:00
42d61c6fc4 Add datasets-file to b_main 2024-04-15 18:14:21 +02:00
e5e947779f Add datasets hyperparameter to b_main 2024-04-15 17:34:37 +02:00
ad168d13ba Add stratified and discretize to b_manage list 2024-04-11 11:45:43 +02:00
78b8a8ae66 Add platform to b_manage, fix report after experiment 2024-04-11 10:54:18 +02:00
7ed9073d15 Add ascending/descending sort to b_manage 2024-04-10 19:42:40 +02:00
ee93789ca3 Fix CMakeLists PyClassifier install folder 2024-04-10 13:34:48 +02:00
375ed437ed Find BayesNet and PyClassifiers in $HOME/lib folder 2024-04-10 00:53:39 +02:00
5ec7fe8d00 Show model version in b_main 2024-04-09 23:20:19 +02:00
72ea62f783 Update main CMakeLists 2024-04-06 21:15:51 +02:00
4b91f2bde0 Update vscode c++ configuration 2024-04-05 23:10:27 +02:00
3bc51cb7b0 Add pagination to detail result
Add version of libraries info to header
2024-04-04 00:14:21 +02:00
cf83d1f8f4 Add tests for libraries required versions 2024-04-03 20:51:21 +02:00
0dd10bcbe4 Fix some console report formats 2024-04-02 10:23:32 +02:00
622b36b2c7 Fix divide by 0 error in excel compared 2024-03-23 22:25:09 +01:00
ea29a96ca1 hide make buildr command 2024-03-21 11:30:03 +01:00
673a41fc4d fix b_main dataset selection 2024-03-19 17:37:32 +01:00
634ea36169 Add optimization to compile flags in Release 2024-03-18 14:00:34 +01:00
20fef5b6b3 Add excel to experiment view in b_manage 2024-03-18 10:21:28 +01:00
7cf864c3f3 Fix report after experiment 2024-03-18 10:10:48 +01:00
4a0fa33917 Remove indexList variable in ManageScreen 2024-03-17 13:08:07 +01:00
d47da27571 Complete pagination of result report 2024-03-17 11:26:26 +01:00
faccb09c43 Begin result report pagination 2024-03-17 02:07:10 +01:00
fa4f47ff35 Create Base class for paged reports 2024-03-17 01:22:50 +01:00
106a36109e Refactor report folder 2024-03-17 00:06:00 +01:00
37eba57765 Rename ManageResults -> ManageScreen 2024-03-16 23:44:21 +01:00
67487ffce1 shorten dataset name to maximum length 2024-03-16 23:37:37 +01:00
9c11dee019 Complete Datasets in b_manage 2024-03-16 22:39:25 +01:00
58ae2c7690 Complete file output in ResultsDataset & ReportDataset 2024-03-16 17:05:26 +01:00
fa366a4c22 Convert DatasetsConsole & ResultsDatasetConsole to string output 2024-03-16 13:48:49 +01:00
b9af086c29 Refactor library folders
Add paginators per output type in b_manage
2024-03-16 12:02:24 +01:00
6a285b149b Fix report and showindex header in bmanage 2024-03-16 01:24:47 +01:00
ad402ac21e ReportConsole to string 2024-03-16 01:16:00 +01:00
38978aa7b7 Add message of Excel file created in b_manage 2024-03-15 19:54:03 +01:00
3691363b8e Parsing errors go to status in b_manage 2024-03-15 19:28:37 +01:00
fe24aa0b3e Change header color to white in b_manage 2024-03-15 14:04:16 +01:00
175e0eb591 Fix some status issue in b_manage 2024-03-15 12:45:08 +01:00
1912d17498 Add status to b_manage 2024-03-15 11:31:56 +01:00
54249e5304 Add different header colors in b_manage 2024-03-15 00:24:16 +01:00
d7f92c9682 Refactor colors in b_manage 2024-03-15 00:18:30 +01:00
00bb7f4680 Adjust sizes in b_manage 2024-03-14 23:52:33 +01:00
bf5dabb169 Add pagination to b_manage 2024-03-14 23:41:05 +01:00
cdf339856a Fix b_manage error if no results were present 2024-03-13 17:56:44 +01:00
3ceea5677c Remove odd variable in some sources 2024-03-12 13:35:07 +01:00
260fd122eb Fix number in header of b_manage 2024-03-12 13:27:22 +01:00
eff0be1c1c Add apply number of lines in terminal in b_manage 2024-03-12 13:23:30 +01:00
0ade72a37a Permit partial results comparison 2024-03-12 00:24:36 +01:00
72cda3784a Add bold max score per model in b_list results 2024-03-11 17:02:58 +01:00
52d689666a Update License & Readme 2024-03-11 10:21:40 +01:00
26e87c9cb1 Merge pull request 'list_results' (#3) from list_results into main
Reviewed-on: #3
2024-03-11 08:54:01 +00:00
03cd6e5a51 Complete b_list results 2024-03-10 20:12:13 +01:00
cd9ff89b52 Add results to b_list 2024-03-10 18:02:03 +01:00
05d05e25c2 Add make example command 2024-03-10 13:25:55 +01:00
5cd6e3d1a5 Rename tests from cc to cpp 2024-03-10 13:04:02 +01:00
d9e9356d92 Rename all from *.cc to *.cpp 2024-03-10 13:03:37 +01:00
0010c840d1 Replace #define ... with pragma once 2024-03-10 12:50:35 +01:00
51f32113c0 Add model argument validation in b_best 2024-03-10 12:31:13 +01:00
b3b3d9f1b9 Add command results to b_list
Rename tostring -> toString in models
Add datasets names to b_main command help - validation
2024-03-10 12:16:02 +01:00
4c847fc3f6 Add model selection to b_best to filter results 2024-03-09 20:19:27 +01:00
7e4ee0a9a9 Refactor to accept new Library structure 2024-03-08 22:20:13 +01:00
b7398db9b1 Update CMake to work in Linux 2024-03-08 13:21:25 +01:00
0a9bd0d9c4 Update sample 2024-03-08 12:49:21 +01:00
7a3adaf4a9 Remove source bayesnet & pyclassifiers libraries dependency 2024-03-08 12:30:04 +01:00
5c4efa08db Add # models to ReportExcelCompared 2024-03-07 11:40:36 +01:00
576016bbd9 Merge pull request 'Create an excel report with two complete results compared in b_manage' (#2) from report_compared into main
Reviewed-on: #2
2024-03-06 12:17:30 +00:00
e26b3c0970 Add fixed header to Delta 2024-03-06 11:22:43 +01:00
183cf12300 Refactor column count and header 2024-03-06 10:35:42 +01:00
4eb08cd281 Complete sheet with totals 2024-03-06 01:26:51 +01:00
4f5f629124 Create class ReportExcelCompared 2024-03-05 23:44:19 +01:00
df011f7e6b Update second menu color in b_manage 2024-03-02 18:24:36 +01:00
42648f3125 Add info to README.md 2024-03-01 19:03:16 +01:00
d2832ed2b3 Add back to submenu in b_manage 2024-03-01 11:20:49 +01:00
ec323d86ab Refactor datasetsExcel 2024-02-29 19:05:20 +01:00
e4a6575722 Fix block header in b_list excel 2024-02-29 18:21:15 +01:00
67f1feb71f Merge pull request 'refactor_folders' (#1) from refactor_folders into main
Reviewed-on: #1
2024-02-29 16:29:31 +00:00
23c3bed667 Complete excel in b_list 2024-02-29 16:14:01 +01:00
b68d520726 Fix excel constructor 2024-02-29 13:20:37 +01:00
c69dc08134 Begin b_list excel 2024-02-29 12:53:11 +01:00
9a26baec47 fix meaning message format in b_main 2024-02-25 23:07:20 +01:00
82f2c36621 Update to include BayesNet 1.0.3 2024-02-25 18:25:10 +01:00
731e03681a Remove unneeded variable in b_list 2024-02-19 10:02:36 +01:00
643038fd19 Refactor notes position in Excel reports 2024-02-17 20:31:24 +01:00
7d92876f06 Complete PartialResult refactoring 2024-02-17 20:01:09 +01:00
53dafa3404 Refactor Result & PartialResult classes
Add title modification to b_manage
2024-02-17 19:09:43 +01:00
a1c7dbfea1 Remove space from header in b_list 2024-02-16 09:46:22 +01:00
581a8652cc Add missing sstream header to BestResults 2024-02-14 19:06:10 +01:00
4df1094340 Add .env sample file to project 2024-02-14 10:32:53 +01:00
45d0886adb Update version number 2024-02-13 10:51:56 +01:00
d996496f87 Add PyClassifiers submodule to update BayesNet 2024-02-12 14:49:37 +01:00
ab03d1de49 Removed submodule lib/PyClassifiers 2024-02-12 14:42:19 +01:00
9d44ea4cf2 Update BayesNet library 2024-02-12 14:25:37 +01:00
4b5d2b4f82 Enhance notes format in Excel Report 2024-02-10 10:37:51 +01:00
52d2004915 Add enhanced notes info and add notes to excel 2024-02-10 01:02:22 +01:00
3f3c14e8fc Add classifier notes to console & excel report 2024-02-09 18:08:08 +01:00
0907906ef6 Some lint suggestions refactoring 2024-02-08 17:27:19 +01:00
b490d406a2 Add node count to sample 2024-02-06 09:48:09 +01:00
5993ece4fd Add default score to b_best
Add doxyfile config
2024-02-05 16:36:28 +01:00
c9dc378f98 refactor max length compute in bestResults 2024-02-01 11:53:11 +01:00
d7174e930b Remove build & report parameters from b_best 2024-02-01 11:08:27 +01:00
e336d39cfb Add --no-train-score to b_main 2024-01-30 13:11:16 +01:00
7dbef9fc36 Add discretiz info to console report 2024-01-30 12:55:09 +01:00
889668bf00 Enhance b_main experiment output 2024-01-29 18:50:53 +01:00
a220b847d4 Fix column width in excel friedman test 2024-01-19 12:40:24 +01:00
25a6975b02 Fix excel output of friedman test 2024-01-19 11:25:36 +01:00
69bb930e3e Update PyClassifiers library 2024-01-18 10:26:22 +01:00
24666a3a16 Add number to b_list 2024-01-17 10:47:20 +01:00
210ce4a255 fix best report with incomplete data 2024-01-16 18:18:42 +01:00
5e1d59acdb Fix b_main output with multiple seeds 2024-01-16 10:42:24 +01:00
2b20d0315c Add b_main support to grid_output files 2024-01-15 11:53:34 +01:00
ecce7955f8 Add export command to b_grid 2024-01-15 11:26:39 +01:00
6660e8b6ce Separate commands from modules in folders 2024-01-10 12:31:22 +01:00
d145e71909 Fix tests 2024-01-09 18:44:41 +01:00
2fd83e940a Complete correct compilation with libraries 2024-01-09 18:35:27 +01:00
7a116bb0db Update submodules commands in Makefile 2024-01-09 17:57:41 +01:00
179 changed files with 16050 additions and 3864 deletions


@@ -4,8 +4,8 @@ diagrams:
Platform:
type: class
glob:
- src/Platform/*.cc
- src/Command/*.cc
- src/*.cpp
- src/modules/*.cpp
using_namespace: platform
include:
namespaces:
@@ -17,7 +17,7 @@ diagrams:
sequence:
type: sequence
glob:
- src/Command/b_main.cc
- src/b_main.cpp
combine_free_functions_into_file_participants: true
using_namespace:
- std
@@ -25,7 +25,6 @@ diagrams:
- platform
include:
paths:
- src/Command
- src/Platform
- src
start_from:
- function: main(int,const char **)

.env.example Normal file

@@ -0,0 +1,16 @@
experiment=discretiz
score=accuracy
platform=um790Linux
n_folds=5
stratified=0
model=TAN
source_data=Arff
seeds=[271]
discretize=0
ignore_nan=0
nodes=Nodes
leaves=Edges
depth=States
fit_features=0
framework=bulma
margin=0.1

.gitignore vendored

@@ -38,3 +38,10 @@ cmake-build*/**
.idea
puml/**
.vscode/settings.json
*.dot
diagrams/html/**
diagrams/latex/**
.cache
vcpkg_installed
.claude/settings.local.json
CMakeUserPresets.json

.gitmodules vendored

@@ -1,15 +0,0 @@
[submodule "lib/catch2"]
path = lib/catch2
url = https://github.com/catchorg/Catch2.git
[submodule "lib/argparse"]
path = lib/argparse
url = https://github.com/p-ranav/argparse
[submodule "lib/json"]
path = lib/json
url = https://github.com/nlohmann/json
[submodule "lib/libxlsxwriter"]
path = lib/libxlsxwriter
url = https://github.com/jmcnamara/libxlsxwriter.git
[submodule "lib/mdlp"]
path = lib/mdlp
url = https://github.com/rmontanana/mdlp


@@ -11,7 +11,18 @@
],
"cStandard": "c17",
"cppStandard": "c++17",
"compileCommands": "${workspaceFolder}/cmake-build-release/compile_commands.json"
"compileCommands": "${workspaceFolder}/cmake-build-release/compile_commands.json",
"configurationProvider": "ms-vscode.cmake-tools"
},
{
"name": "Linux",
"includePath": [
"${workspaceFolder}/**"
],
"defines": [],
"cStandard": "c17",
"cppStandard": "c++17",
"configurationProvider": "ms-vscode.cmake-tools"
}
],
"version": 4

.vscode/launch.json vendored

@@ -2,9 +2,9 @@
"version": "0.2.0",
"configurations": [
{
"name": "sample",
"type": "lldb",
"request": "launch",
"name": "sample",
"program": "${workspaceFolder}/build_debug/sample/BayesNetSample",
"args": [
"-d",
@@ -14,15 +14,15 @@
"-s",
"271",
"-p",
"/Users/rmontanana/Code/discretizbench/datasets/",
"${workspaceFolder}/../discretizbench/datasets/",
],
//"cwd": "${workspaceFolder}/build/sample/",
},
{
"name": "experimentPy",
"type": "lldb",
"request": "launch",
"name": "experimentPy",
"program": "${workspaceFolder}/build_debug/src/Platform/b_main",
"program": "${workspaceFolder}/build_debug/src/b_main",
"args": [
"-m",
"STree",
@@ -36,10 +36,10 @@
"cwd": "${workspaceFolder}/../discretizbench",
},
{
"name": "gridsearch",
"type": "lldb",
"request": "launch",
"name": "gridsearch",
"program": "${workspaceFolder}/build_debug/src/Platform/b_grid",
"program": "${workspaceFolder}/build_debug/src/b_grid",
"args": [
"-m",
"KDB",
@@ -52,41 +52,41 @@
"cwd": "${workspaceFolder}/../discretizbench",
},
{
"name": "experimentBayes",
"type": "lldb",
"request": "launch",
"name": "experimentBayes",
"program": "${workspaceFolder}/build_debug/src/Platform/b_main",
"program": "${workspaceFolder}/build_debug/src/b_main",
"args": [
"-m",
"TAN",
"--stratified",
"--discretize",
"-d",
"iris",
"glass",
"--hyperparameters",
"{\"repeatSparent\": true, \"maxModels\": 12}"
"{\"block_update\": true}"
],
"cwd": "/home/rmontanana/Code/discretizbench",
},
{
"name": "best",
"type": "lldb",
"request": "launch",
"name": "best",
"program": "${workspaceFolder}/build_debug/src/Platform/b_best",
"program": "${workspaceFolder}/build_debug/src/b_best",
"args": [
"-m",
"BoostAODE",
"-s",
"accuracy",
"--build",
"--excel"
],
"cwd": "${workspaceFolder}/../discretizbench",
},
{
"name": "manage",
"type": "lldb",
"request": "launch",
"name": "manage",
"program": "${workspaceFolder}/build_debug/src/Platform/b_manage",
"program": "${workspaceFolder}/build_debug/src/b_manage",
"args": [
"-n",
"20"
@@ -94,24 +94,29 @@
"cwd": "${workspaceFolder}/../discretizbench",
},
{
"name": "list",
"type": "lldb",
"request": "launch",
"name": "list",
"program": "${workspaceFolder}/build_debug/src/Platform/b_list",
"args": [],
"program": "${workspaceFolder}/build_debug/src/b_list",
"args": [
"results",
"-d",
"mfeat-morphological"
],
//"cwd": "/Users/rmontanana/Code/discretizbench",
"cwd": "${workspaceFolder}/../discretizbench",
},
{
"name": "test",
"type": "lldb",
"request": "launch",
"name": "test",
"program": "${workspaceFolder}/build_debug/tests/unit_tests",
"program": "${workspaceFolder}/build_debug/tests/unit_tests_platform",
"args": [
"-c=\"Metrics Test\"",
"[Scores]",
// "-c=\"Metrics Test\"",
// "-s",
],
"cwd": "${workspaceFolder}/build/tests",
"cwd": "${workspaceFolder}/build_debug/tests",
},
{
"name": "Build & debug active file",

CHANGELOG.md Normal file

@@ -0,0 +1,93 @@
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Changed
- **BREAKING**: Migrated dependency management from vcpkg to Conan
- Updated build system to use Conan toolchain files instead of vcpkg
- Updated `make init` command to use `conan install` instead of `vcpkg install`
- Modified CMakeLists.txt to use Conan's find_package mechanism
- Updated documentation in CLAUDE.md to reflect Conan usage
### Added
- `conanfile.py` - Conan recipe for dependency management with all required dependencies
- CMakeUserPresets.json (generated by Conan)
- Support for Conan build profiles (Release/Debug)
### Removed
- `vcpkg.json` - vcpkg manifest file
- `vcpkg-configuration.json` - vcpkg registry configuration
- vcpkg toolchain dependency in build system
### Notes
- The migration maintains compatibility with existing make targets and workflow
- All dependencies now managed through Conan package manager
## [1.1.0] - 2025-07-02
### Added
- **AdaBoost Implementation**: Complete multi-class SAMME AdaBoost classifier with optimization
- Optimized AdaBoostPredict with 100 estimators as default
- Enhanced predictProbaSample functionality
- Full predict_proba support for probabilistic predictions
- **Decision Tree Classifier**: New base classifier implementation with comprehensive tests
- **XA1DE Model Family**: Extended Averaged One-Dependence Estimators
- XA1DE, XBAODE, XSPODE variants with threading support
- Complete integration with memory optimization
- Prior probability computation in prediction
- **Wilcoxon Statistical Test**: Statistical significance testing for model comparison
- **Folder Management**: Enhanced file organization with folder parameter support across tools
- Added folder parameter to b_best, b_grid, b_main, and b_manage
- **vcpkg Integration**: Package management system integration (now migrated to Conan)
### Enhanced
- **Grid Search System**: Complete refactoring with MPI parallelization
- Grid experiment functionality with conditional result saving
- Fixed smoothing problems and dataset ordering
- Enhanced reporting and summary generation
- **Excel Reporting**: Advanced Excel export capabilities
- ReportExcelCompared class for side-by-side result comparison
- Enhanced formatting with colors and fixed headers
- Automatic file opening after generation
- **Results Management**: Comprehensive result handling and validation
- JSON schema validation for result format integrity
- Improved console reporting with classification reports
- Pagination support for large result sets
- **Statistical Analysis**: Enhanced statistical testing and reporting
- AUC (Area Under Curve) computation and reporting
- Confusion matrix generation and visualization
- Classification reports with color coding
### Performance Improvements
- Optimized AdaBoost training and prediction algorithms
- Enhanced memory management in XA1DE implementations
- Improved discretization algorithms with MDLP integration
- Faster ROC-AUC computation for binary classification problems
### Developer Experience
- **Testing Framework**: Comprehensive test suite with Catch2
- **Build System**: Streamlined CMake configuration with dependency management
- **Documentation**: Enhanced project documentation and build instructions
- **Code Quality**: Refactored codebase with improved error handling and logging
### Bug Fixes
- Fixed predict_proba implementations across multiple classifiers
- Resolved grid search dataset ordering issues
- Fixed Excel report formatting and column width problems
- Corrected time output formatting in various tools
- Fixed memory leaks and stability issues in model implementations
## [1.0.0] - 2024-01-09
### Initial Release
- **Core Framework**: Machine learning experimentation platform for Bayesian Networks
- **Basic Classifiers**: Initial set of Bayesian network classifiers
- **Experiment Management**: Basic experiment orchestration and result storage
- **Dataset Support**: ARFF file format support with discretization
- **Build System**: CMake-based build system with external library integration
- **Command Line Tools**: Initial versions of b_main, b_best, b_list utilities

CLAUDE.md Normal file

@@ -0,0 +1,139 @@
# CLAUDE.md
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
## Project Overview
Platform is a C++ machine learning framework for running experiments with Bayesian Networks and other classifiers. It supports both research-focused experimental classifiers and production-ready models through a unified interface.
## Build System
The project uses CMake with Make as the primary build system:
- **Release build**: `make release` (creates `build_Release/` directory)
- **Debug build**: `make debug` (creates `build_Debug/` directory with testing and coverage enabled)
- **Install binaries**: `make install` (copies executables to `~/bin` by default)
- **Clean project**: `make clean` (removes build directories)
- **Initialize dependencies**: `make init` (runs conan install for both Release and Debug)
### Testing
- **Run tests**: `make test` (builds debug version and runs all tests)
- **Coverage report**: `make coverage` (runs tests and generates coverage with gcovr)
- **Single test with options**: `make test opt="-s"` (verbose) or `make test opt="-c='Test Name'"` (specific test)
### Build Targets
Main executables (built from `src/commands/`):
- `b_main`: Main experiment runner
- `b_grid`: Grid search over hyperparameters
- `b_best`: Best results analysis and comparison
- `b_list`: Dataset listing and properties
- `b_manage`: Results management interface
- `b_results`: Results processing
## Dependencies
The project uses Conan for package management with these key dependencies:
- **libtorch**: PyTorch C++ backend for tensor operations
- **nlohmann_json**: JSON processing
- **catch2**: Unit testing framework
- **cli11**: Command-line argument parsing (replacement for argparse)
Custom dependencies (not available in ConanCenter):
- **fimdlp**: MDLP discretization library (needs manual integration)
- **folding**: Cross-validation utilities (needs manual integration)
- **arff-files**: ARFF dataset file handling (needs manual integration)
External dependencies (managed separately):
- **BayesNet**: Core Bayesian network classifiers (from `../lib/`)
- **PyClassifiers**: Python classifier wrappers (from `../lib/`)
- **MPI**: Message Passing Interface for parallel processing
- **Boost**: Python integration and utilities
**Note**: Some dependencies (fimdlp, folding, arff-files) are not available in ConanCenter and need to be:
- Built as custom Conan packages, or
- Integrated using CMake FetchContent (a sketch follows this list), or
- Built separately and found via find_package
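For the FetchContent route, a minimal sketch could look like the following; it assumes the fimdlp sources build with plain CMake, reuses the repository URL listed in the old `.gitmodules`, and the tag and target names are illustrative only:
```cmake
# Hedged sketch: pull fimdlp at configure time instead of through Conan.
include(FetchContent)
FetchContent_Declare(fimdlp
    GIT_REPOSITORY https://github.com/rmontanana/mdlp
    GIT_TAG        main              # illustrative; pin a real tag or commit
)
FetchContent_MakeAvailable(fimdlp)
# target_link_libraries(b_main PRIVATE fimdlp)  # adjust target/library names as needed
```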
## Architecture
### Core Components
**Experiment Framework** (`src/main/`):
- `Experiment.cpp/h`: Main experiment orchestration
- `Models.cpp/h`: Classifier factory and registration system
- `Scores.cpp/h`: Performance metrics calculation
- `HyperParameters.cpp/h`: Parameter management
- `ArgumentsExperiment.cpp/h`: Command-line argument handling
**Data Handling** (`src/common/`):
- `Dataset.cpp/h`: Individual dataset representation
- `Datasets.cpp/h`: Dataset collection management
- `Discretization.cpp/h`: Data discretization utilities
**Classifiers** (`src/experimental_clfs/`):
- `AdaBoost.cpp/h`: Multi-class SAMME AdaBoost implementation
- `DecisionTree.cpp/h`: Decision tree base classifier
- `XA1DE.cpp/h`: Extended AODE variants
- Experimental implementations of Bayesian network classifiers
**Grid Search** (`src/grid/`):
- `GridSearch.cpp/h`: Hyperparameter optimization
- `GridExperiment.cpp/h`: Grid search experiment management
- Uses MPI for parallel hyperparameter evaluation
**Results & Reporting** (`src/results/`, `src/reports/`):
- JSON-based result storage with schema validation
- Excel export capabilities via libxlsxwriter
- Console and paginated result display
### Model Registration System
The framework uses a factory pattern with automatic registration:
- All classifiers inherit from `bayesnet::BaseClassifier`
- Registration happens in `src/main/modelRegister.h`
- Factory creates instances by string name via `Models::create()`
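A minimal sketch of this kind of string-keyed factory is shown below; `Models::create()` and `bayesnet::BaseClassifier` come from the description above, while `Registrar`, `Models::add()` and the map layout are hypothetical illustrations, not the actual contents of `modelRegister.h`:
```cpp
#include <functional>
#include <map>
#include <memory>
#include <stdexcept>
#include <string>

namespace platform {
    // Stand-in for bayesnet::BaseClassifier in this sketch.
    class BaseClassifier {
    public:
        virtual ~BaseClassifier() = default;
    };
    class Models {
    public:
        using Builder = std::function<std::unique_ptr<BaseClassifier>()>;
        // Register a builder under a model name (hypothetical helper).
        static void add(const std::string& name, Builder builder) { registry()[name] = std::move(builder); }
        // Create an instance by string name, as described above.
        static std::unique_ptr<BaseClassifier> create(const std::string& name)
        {
            auto it = registry().find(name);
            if (it == registry().end()) throw std::invalid_argument("Unknown model: " + name);
            return it->second();
        }
    private:
        static std::map<std::string, Builder>& registry()
        {
            static std::map<std::string, Builder> builders;
            return builders;
        }
    };
    // A registrar object lets each classifier register itself at static-initialization time.
    struct Registrar {
        Registrar(const std::string& name, Models::Builder builder) { Models::add(name, std::move(builder)); }
    };
}
// Usage sketch: static platform::Registrar tanReg("TAN", [] { return std::make_unique<TAN>(); });
```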
## Configuration
**Environment Configuration** (`.env` file):
- `experiment`: Experiment name/type
- `n_folds`: Cross-validation folds (default: 5)
- `seeds`: Random seeds for reproducibility
- `model`: Default classifier name
- `score`: Primary evaluation metric
- `platform`: System identifier for results
**Grid Search Configuration**:
- `grid_<model_name>_input.json`: Hyperparameter search space
- `grid_<model_name>_output.json`: Search results
## Data Format
**Dataset Requirements**:
- ARFF format files in `datasets/` directory
- `all.txt` file listing datasets: `<name>,<class_name>,<real_features>`
- Supports both discrete and continuous features
- Automatic discretization available via MDLP
**Experimental Data**:
- Results stored in JSON format with versioned schemas
- Test data in `tests/data/` for unit testing
- Sample datasets: iris, diabetes, ecoli, glass, etc.
## Development Workflow
1. **Setup**: Run `make init` to install dependencies via Conan
2. **Development**: Use `make debug` for development builds with testing
3. **Testing**: Run `make test` after changes
4. **Release**: Use `make release` for optimized builds
5. **Experiments**: Use `.env` configuration and run `b_main` with appropriate flags
## Key Features
- **Multi-threaded**: Uses MPI for parallel grid search and experiments
- **Cross-platform**: Supports Linux and macOS via vcpkg
- **Extensible**: Easy classifier registration and integration
- **Research-focused**: Designed for machine learning experimentation
- **Visualization**: DOT graph generation for decision trees and networks


@@ -1,95 +1,99 @@
cmake_minimum_required(VERSION 3.20)
project(Platform
VERSION 1.0.0
VERSION 1.1.1
DESCRIPTION "Platform to run Experiments with classifiers."
HOMEPAGE_URL "https://github.com/rmontanana/platform"
LANGUAGES CXX
)
if (CODE_COVERAGE AND NOT ENABLE_TESTING)
MESSAGE(FATAL_ERROR "Code coverage requires testing enabled")
endif (CODE_COVERAGE AND NOT ENABLE_TESTING)
find_package(Torch REQUIRED)
if (POLICY CMP0135)
cmake_policy(SET CMP0135 NEW)
endif ()
# Global CMake variables
# ----------------------
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
# Options
# -------
option(ENABLE_CLANG_TIDY "Enable to add clang tidy." OFF)
option(ENABLE_TESTING "Unit testing build" OFF)
option(CODE_COVERAGE "Collect coverage from test library" OFF)
# CMakes modules
# --------------
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH})
# MPI
find_package(MPI REQUIRED)
message("MPI_CXX_LIBRARIES=${MPI_CXX_LIBRARIES}")
message("MPI_CXX_INCLUDE_DIRS=${MPI_CXX_INCLUDE_DIRS}")
# Boost Library
cmake_policy(SET CMP0135 NEW)
cmake_policy(SET CMP0167 NEW) # For FindBoost
set(Boost_USE_STATIC_LIBS OFF)
set(Boost_USE_MULTITHREADED ON)
set(Boost_USE_STATIC_RUNTIME OFF)
find_package(Boost 1.66.0 REQUIRED COMPONENTS python3 numpy3)
# # Python
find_package(Python3 REQUIRED COMPONENTS Development)
# # Boost Python
# find_package(boost_python${Python3_VERSION_MAJOR}${Python3_VERSION_MINOR} CONFIG REQUIRED COMPONENTS python${Python3_VERSION_MAJOR}${Python3_VERSION_MINOR})
# # target_link_libraries(MyTarget PRIVATE Boost::python${Python3_VERSION_MAJOR}${Python3_VERSION_MINOR})
if(Boost_FOUND)
message("Boost_INCLUDE_DIRS=${Boost_INCLUDE_DIRS}")
message("Boost_LIBRARIES=${Boost_LIBRARIES}")
message("Boost_VERSION=${Boost_VERSION}")
include_directories(${Boost_INCLUDE_DIRS})
endif()
# Python
find_package(Python3 3.11...3.11.9 COMPONENTS Interpreter Development REQUIRED)
message("Python3_LIBRARIES=${Python3_LIBRARIES}")
# CMakes modules
# --------------
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH})
include(AddGitSubmodule)
if (CODE_COVERAGE)
enable_testing()
include(CodeCoverage)
MESSAGE("Code coverage enabled")
set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage -O0 -g")
SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage")
endif (CODE_COVERAGE)
if (ENABLE_CLANG_TIDY)
include(StaticAnalyzers) # clang-tidy
endif (ENABLE_CLANG_TIDY)
# External libraries - dependencies of BayesNet
# External libraries - dependencies of Platform
# ---------------------------------------------
add_git_submodule("lib/PyClassifiers")
add_git_submodule("lib/argparse")
find_library(XLSXWRITER_LIB NAMES libxlsxwriter.dylib libxlsxwriter.so PATHS ${Platform_SOURCE_DIR}/lib/libxlsxwriter/lib)
message("XLSXWRITER_LIB=${XLSXWRITER_LIB}")
find_package(nlohmann_json CONFIG REQUIRED)
find_package(argparse CONFIG REQUIRED)
find_package(Torch CONFIG REQUIRED)
find_package(arff-files CONFIG REQUIRED)
find_package(fimdlp CONFIG REQUIRED)
find_package(folding CONFIG REQUIRED)
find_package(bayesnet CONFIG REQUIRED)
# find_package(pyclassifiers CONFIG REQUIRED)
find_package(libxlsxwriter CONFIG REQUIRED)
find_package(Boost REQUIRED COMPONENTS python)
# Subdirectories
# --------------
## Configure test data path
cmake_path(SET TEST_DATA_PATH "${CMAKE_CURRENT_SOURCE_DIR}/tests/data")
configure_file(src/common/SourceData.h.in "${CMAKE_BINARY_DIR}/configured_files/include/SourceData.h")
add_subdirectory(config)
add_subdirectory(src/Platform)
add_subdirectory(sample)
file(GLOB Platform_SOURCES CONFIGURE_DEPENDS ${Platform_SOURCE_DIR}/src/Platform/*.cc)
add_subdirectory(src)
# add_subdirectory(sample)
file(GLOB Platform_SOURCES CONFIGURE_DEPENDS ${Platform_SOURCE_DIR}/src/*.cpp)
# Testing
# -------
if (ENABLE_TESTING)
MESSAGE("Testing enabled")
if (NOT TARGET Catch2::Catch2)
add_git_submodule("lib/catch2")
endif (NOT TARGET Catch2::Catch2)
set(CMAKE_CXX_FLAGS_DEBUG " ${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage -O0 -g")
enable_testing()
find_package(Catch2 CONFIG REQUIRED)
set(CODE_COVERAGE ON)
include(CTest)
add_subdirectory(tests)
endif (ENABLE_TESTING)
if (CODE_COVERAGE)
MESSAGE("Code coverage enabled")
include(CodeCoverage)
SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage")
endif (CODE_COVERAGE)

Doxyfile Normal file (2830 lines; diff suppressed because it is too large)


@@ -1,6 +1,6 @@
MIT License
Copyright (c) 2024 rmontanana
Copyright (c) 2024 Ricardo Montañana Gómez
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

Makefile

@@ -1,12 +1,18 @@
SHELL := /bin/bash
.DEFAULT_GOAL := help
.PHONY: coverage setup help build test clean debug release submodules buildr buildd install dependency testp testb clang-uml
.PHONY: init clean coverage setup help build test clean debug release buildr buildd install dependency testp testb clang-uml example
f_release = build_release
f_debug = build_debug
app_targets = b_best b_list b_main b_manage b_grid
test_targets = unit_tests_bayesnet unit_tests_platform
n_procs = -j 16
f_release = build_Release
f_debug = build_Debug
app_targets = b_best b_list b_main b_manage b_grid b_results
test_targets = unit_tests_platform
# Set the number of parallel jobs to the number of available processors minus 7
CPUS := $(shell getconf _NPROCESSORS_ONLN 2>/dev/null \
|| nproc --all 2>/dev/null \
|| sysctl -n hw.ncpu)
# --- Your desired job count: CPUs - 7, but never less than 1 --------------
JOBS := $(shell n=$(CPUS); [ $${n} -gt 7 ] && echo $$((n-7)) || echo 1)
define ClearTests
@for t in $(test_targets); do \
@@ -21,12 +27,43 @@ define ClearTests
fi ;
endef
define build_target
@echo ">>> Building the project for $(1)..."
@if [ -d $(2) ]; then rm -fr $(2); fi
@conan install . --build=missing -of $(2) -s build_type=$(1)
@cmake -S . -B $(2) -DCMAKE_TOOLCHAIN_FILE=$(2)/build/$(1)/generators/conan_toolchain.cmake -DCMAKE_BUILD_TYPE=$(1) -D$(3)
@echo ">>> Will build using $(JOBS) parallel jobs"
echo ">>> Done"
endef
submodules: ## Update submodules
@git submodule update --init --recursive
@git submodule update --remote --merge
@git submodule foreach git pull origin master
define compile_target
@echo ">>> Compiling for $(1)..."
if [ "$(3)" != "" ]; then \
target="-t$(3)"; \
else \
target=""; \
fi
@cmake --build $(2) --config $(1) --parallel $(JOBS) $(target)
@echo ">>> Done"
endef
init: ## Initialize the project installing dependencies
@echo ">>> Installing dependencies with Conan"
@conan install . --output-folder=build --build=missing -s build_type=Release
@conan install . --output-folder=build_debug --build=missing -s build_type=Debug
@echo ">>> Done"
clean: ## Clean the project
@echo ">>> Cleaning the project..."
@if test -f CMakeCache.txt ; then echo "- Deleting CMakeCache.txt"; rm -f CMakeCache.txt; fi
@for folder in $(f_release) $(f_debug) build build_debug install_test ; do \
if test -d "$$folder" ; then \
echo "- Deleting $$folder folder" ; \
rm -rf "$$folder"; \
fi; \
done
$(call ClearTests)
@echo ">>> Done";
setup: ## Install dependencies for tests and coverage
@if [ "$(shell uname)" = "Darwin" ]; then \
brew install gcovr; \
@@ -39,13 +76,15 @@ setup: ## Install dependencies for tests and coverage
dest ?= ${HOME}/bin
install: ## Copy binary files to bin folder
@echo "Destination folder: $(dest)"
make buildr
@make buildr
@echo "*******************************************"
@echo ">>> Copying files to $(dest)"
@echo "*******************************************"
@for item in $(app_targets); do \
echo ">>> Copying $$item" ; \
cp $(f_release)/src/Platform/$$item $(dest) ; \
cp $(f_release)/src/$$item $(dest) || { \
echo "*** Error copying $$item" ; \
} ; \
done
dependency: ## Create a dependency graph diagram of the project (build/dependency.png)
@@ -54,38 +93,26 @@ dependency: ## Create a dependency graph diagram of the project (build/dependenc
cd $(f_debug) && cmake .. --graphviz=dependency.dot && dot -Tpng dependency.dot -o dependency.png
buildd: ## Build the debug targets
cmake --build $(f_debug) -t $(app_targets) BayesNetSample $(n_procs)
@$(call compile_target,"Debug","$(f_debug)")
buildr: ## Build the release targets
cmake --build $(f_release) -t $(app_targets) BayesNetSample $(n_procs)
clean: ## Clean the tests info
@echo ">>> Cleaning Debug BayesNet tests...";
$(call ClearTests)
@echo ">>> Done";
@$(call compile_target,"Release","$(f_release)")
clang-uml: ## Create uml class and sequence diagrams
clang-uml -p --add-compile-flag -I /usr/lib/gcc/x86_64-redhat-linux/8/include/
debug: ## Build a debug version of the project
@echo ">>> Building Debug BayesNet...";
@if [ -d ./$(f_debug) ]; then rm -rf ./$(f_debug); fi
@mkdir $(f_debug);
@cmake -S . -B $(f_debug) -D CMAKE_BUILD_TYPE=Debug -D ENABLE_TESTING=ON -D CODE_COVERAGE=ON
@echo ">>> Done";
debug: ## Build a debug version of the project with Conan
@$(call build_target,"Debug","$(f_debug)", "ENABLE_TESTING=ON")
release: ## Build a Release version of the project with Conan
@$(call build_target,"Release","$(f_release)", "ENABLE_TESTING=OFF")
release: ## Build a Release version of the project
@echo ">>> Building Release BayesNet...";
@if [ -d ./$(f_release) ]; then rm -rf ./$(f_release); fi
@mkdir $(f_release);
@cmake -S . -B $(f_release) -D CMAKE_BUILD_TYPE=Release
@echo ">>> Done";
opt = ""
test: ## Run tests (opt="-s") to verbose output the tests, (opt="-c='Test Maximum Spanning Tree'") to run only that section
@echo ">>> Running BayesNet & Platform tests...";
@$(MAKE) clean
@cmake --build $(f_debug) -t $(test_targets) $(n_procs)
@echo ">>> Running Platform tests...";
@$(MAKE) debug
@$(call compile_target, "Debug", "$(f_debug)", $(test_targets))
@for t in $(test_targets); do \
if [ -f $(f_debug)/tests/$$t ]; then \
cd $(f_debug)/tests ; \
@@ -94,33 +121,24 @@ test: ## Run tests (opt="-s") to verbose output the tests, (opt="-c='Test Maximu
done
@echo ">>> Done";
opt = ""
testp: ## Run platform tests (opt="-s") to verbose output the tests, (opt="-c='Stratified Fold Test'") to run only that section
@echo ">>> Running Platform tests...";
@$(MAKE) clean
@cmake --build $(f_debug) --target unit_tests_platform $(n_procs)
@if [ -f $(f_debug)/tests/unit_tests_platform ]; then cd $(f_debug)/tests ; ./unit_tests_platform $(opt) ; fi ;
fname = iris
example: ## Build sample
@echo ">>> Building Sample...";
@cmake --build $(f_release) -t sample
$(f_release)/sample/PlatformSample --model BoostAODE --dataset $(fname) --discretize --stratified
@echo ">>> Done";
opt = ""
testb: ## Run BayesNet tests (opt="-s") to verbose output the tests, (opt="-c='Test Maximum Spanning Tree'") to run only that section
@echo ">>> Running BayesNet tests...";
@$(MAKE) clean
@cmake --build $(f_debug) --target unit_tests_bayesnet $(n_procs)
@if [ -f $(f_debug)/tests/unit_tests_bayesnet ]; then cd $(f_debug)/tests ; ./unit_tests_bayesnet $(opt) ; fi ;
@echo ">>> Done";
coverage: ## Run tests and generate coverage report (build/index.html)
@echo ">>> Building tests with coverage...";
@echo ">>> Building tests with coverage..."
@$(MAKE) test
@cd $(f_debug) ; \
gcovr --config ../gcovr.cfg tests ;
@echo ">>> Done";
@gcovr $(f_debug)/tests
@echo ">>> Done";
help: ## Show help message
@IFS=$$'\n' ; \
help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
help_lines=(`grep -Fh "##" $(MAKEFILE_LIST) | grep -Fv fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
printf "%s\n\n" "Usage: make [task]"; \
printf "%-20s %s\n" "task" "help" ; \
printf "%-20s %s\n" "------" "----" ; \


@@ -1,16 +1,15 @@
# Platform
# <img src="logo.png" alt="logo" width="50"/> Platform
Platform to run Bayesian Networks and Machine Learning Classifiers experiments.
# Platform
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
![C++](https://img.shields.io/badge/c++-%2300599C.svg?style=flat&logo=c%2B%2B&logoColor=white)
[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](<https://opensource.org/licenses/MIT>)
[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/rmontanana/Platform)
![Gitea Last Commit](https://img.shields.io/gitea/last-commit/rmontanana/platform?gitea_url=https://gitea.rmontanana.es&logo=gitea)
Platform to run Bayesian Networks and Machine Learning Classifiers experiments.
## 0. Setup
Before compiling BayesNet.
Before compiling Platform.
### Miniconda
@@ -22,11 +21,18 @@ In Linux sometimes the library libstdc++ is mistaken from the miniconda installa
libstdc++.so.6: version `GLIBCXX_3.4.32' not found (required by b_xxxx)
```
The solution is to erase the libstdc++ library from the miniconda installation:
The solution is to erase the libstdc++ library from the miniconda installation and no further compilation is needed.
### MPI
In Linux just install openmpi & openmpi-devel packages. Only if cmake can't find openmpi installation (like in Oracle Linux) set the following variable:
In Linux just install openmpi & openmpi-devel packages.
```bash
source /etc/profile.d/modules.sh
module load mpi/openmpi-x86_64
```
If cmake can't find openmpi installation (like in Oracle Linux) set the following variable:
```bash
export MPI_HOME="/usr/lib64/openmpi"
@@ -35,7 +41,7 @@ export MPI_HOME="/usr/lib64/openmpi"
In Mac OS X, install mpich with brew and if cmake doesn't find it, edit mpicxx wrapper to remove the ",-commons,use_dylibs" from final_ldflags
```bash
vi /opt/homebrew/bin/mpicx
vi /opt/homebrew/bin/mpicxx
```
### boost library
@@ -86,4 +92,64 @@ make release
make debug
```
## 1. Introduction
### Configuration
The configuration file is named .env and it should be located in the folder where the experiments should be run. In the root folder of the project there is a file named .env.example that can be used as a template.
## 1. Commands
### b_list
List all the datasets and their properties. The datasets are located in the _datasets_ folder under the experiments root folder. A special file called all.txt with the names of the datasets has to be created. This all.txt file is built with lines of the form:
<name>,<class_name>,<real_features>
where <real_features> can be either the word _all_ or a list of numbers separated by commas, i.e. [0,3,6,7]
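For illustration, a hypothetical all.txt could look like this (the class attribute names are assumptions for sample datasets mentioned elsewhere in this repository):
```
iris,class,all
glass,Type,all
ecoli,class,[0,3,6,7]
```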
### b_grid
Run a grid search over the parameters of the classifiers. The parameters are defined in the file _grid.txt_ located in the grid folder of the experiments. The file has to be created with the following format:
```json
{
"all": [
<set of hyperparams>, ...
],
"<dataset_name>": [
<specific set of hyperparams for <dataset_name>>, ...
],
}
```
The file has to be named _grid_<model_name>_input.json_
As a result it builds a file named _grid_<model_name>_output.json_ with the results of the grid search.
The computation is done in parallel using MPI.
![b_grid](img/bgrid.gif)
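As an illustration, the search could be launched under MPI with something like the following; the process count is arbitrary and, apart from -m, the exact flags are assumptions:
```bash
mpirun -n 4 b_grid -m BoostAODE
```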
### b_main
Run the main experiment. There are several hyperparameters that can be set on the command line (an example invocation follows this list):
- -d, -\-dataset <dataset_name> : Name of the dataset to run the experiment with. If no dataset is specified the experiment will run with all the datasets in the all.txt file.
- -m, -\-model <classifier_name> : Name of the classifier to run the experiment with (i.e. BoostAODE, TAN, Odte, etc.).
- -\-discretize: Discretize the dataset before running the experiment.
- -\-stratified: Use stratified cross validation.
- -\-folds <folds>: Number of folds for cross validation (optional, default value is in .env file).
- -s, -\-seeds <seed>: Seeds for the random number generator (optional, default values are in .env file).
- -\-no-train-score: Do not calculate the train score (optional); this is useful when the dataset is big and the training score is not needed.
- -\-hyperparameters <hyperparameters>: Hyperparameters for the experiment in json format.
- -\-hyper-file <hyperparameters_file>: File with the hyperparameters for the experiment in json format. This file uses the output format of the b_grid command.
- -\-title <title_text>: Title of the experiment (optional if only one dataset is specified).
- -\-quiet: Don't display detailed progress and result of the experiment.
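As an example, a typical invocation combining the flags above might be (model, dataset, fold count and seed are illustrative values taken from the sample configuration):
```bash
b_main -m BoostAODE -d iris --discretize --stratified --folds 5 -s 271 --no-train-score
```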
### b_manage
Manage the results of the experiments.
### b_best
Get and optionally compare the best results of the experiments. The results can be stored in an MS Excel file.
![b_best](img/bbest.gif)


@@ -137,7 +137,7 @@
include(CMakeParseArguments)
option(CODE_COVERAGE_VERBOSE "Verbose information" FALSE)
option(CODE_COVERAGE_VERBOSE "Verbose information" TRUE)
# Check prereqs
find_program( GCOV_PATH gcov )
@@ -160,7 +160,11 @@ foreach(LANG ${LANGUAGES})
endif()
elseif(NOT "${CMAKE_${LANG}_COMPILER_ID}" MATCHES "GNU"
AND NOT "${CMAKE_${LANG}_COMPILER_ID}" MATCHES "(LLVM)?[Ff]lang")
message(FATAL_ERROR "Compiler is not GNU or Flang! Aborting...")
if ("${LANG}" MATCHES "CUDA")
message(STATUS "Ignoring CUDA")
else()
message(FATAL_ERROR "Compiler is not GNU or Flang! Aborting...")
endif()
endif()
endforeach()

conanfile.py Normal file

@@ -0,0 +1,42 @@
from conan import ConanFile
from conan.tools.cmake import CMakeToolchain, CMakeDeps, cmake_layout
class PlatformConan(ConanFile):
name = "platform"
version = "1.1.0"
# Binary configuration
settings = "os", "compiler", "build_type", "arch"
# Sources are located in the same place as this recipe, copy them to the recipe
exports_sources = "CMakeLists.txt", "src/*", "tests/*", "config/*", "cmake/*"
def requirements(self):
# Core dependencies from vcpkg.json
self.requires("argparse/3.2")
self.requires("libtorch/2.7.1")
self.requires("nlohmann_json/3.11.3")
self.requires("folding/1.1.2")
self.requires("fimdlp/2.1.1")
self.requires("arff-files/1.2.1")
self.requires("bayesnet/1.2.1")
# self.requires("pyclassifiers/1.0.3")
self.requires("libxlsxwriter/1.2.2")
def build_requirements(self):
self.tool_requires("cmake/[>=3.30]")
self.test_requires("catch2/3.8.1")
def layout(self):
cmake_layout(self)
def generate(self):
deps = CMakeDeps(self)
deps.generate()
tc = CMakeToolchain(self)
tc.generate()
def configure(self):
# C++20 requirement
self.settings.compiler.cppstd = "20"

View File

@@ -1,4 +1,4 @@
configure_file(
"config.h.in"
"${CMAKE_BINARY_DIR}/configured_files/include/config.h" ESCAPE_QUOTES
"${CMAKE_BINARY_DIR}/configured_files/include/config_platform.h" ESCAPE_QUOTES
)

View File

@@ -1,14 +1,11 @@
#pragma once
#ifndef PLATFORM_H
#define PLATFORM_H
#include <string>
#include <string_view>
#define PROJECT_VERSION_MAJOR @PROJECT_VERSION_MAJOR @
#define PROJECT_VERSION_MINOR @PROJECT_VERSION_MINOR @
#define PROJECT_VERSION_PATCH @PROJECT_VERSION_PATCH @
static constexpr std::string_view project_name = "@PROJECT_NAME@";
static constexpr std::string_view project_version = "@PROJECT_VERSION@";
static constexpr std::string_view project_description = "@PROJECT_DESCRIPTION@";
static constexpr std::string_view git_sha = "@GIT_SHA@";
static constexpr std::string_view data_path = "@Platform_SOURCE_DIR@/tests/data/";
static constexpr std::string_view platform_project_name = "@PROJECT_NAME@";
static constexpr std::string_view platform_project_version = "@PROJECT_VERSION@";
static constexpr std::string_view platform_project_description = "@PROJECT_DESCRIPTION@";
static constexpr std::string_view platform_git_sha = "@GIT_SHA@";
static constexpr std::string_view platform_data_path = "@Platform_SOURCE_DIR@/tests/data/";
#endif

View File

@@ -1,4 +1,4 @@
filter = src/
exclude-directories = build/lib/
exclude-directories = build_debug/lib/
print-summary = yes
sort-percentage = yes

View File

@@ -1,31 +0,0 @@
[submodule "lib/mdlp"]
path = lib/mdlp
url = https://github.com/rmontanana/mdlp
main = main
update = merge
[submodule "lib/catch2"]
path = lib/catch2
main = v2.x
update = merge
url = https://github.com/catchorg/Catch2.git
[submodule "lib/argparse"]
path = lib/argparse
url = https://github.com/p-ranav/argparse
master = master
update = merge
[submodule "lib/json"]
path = lib/json
url = https://github.com/nlohmann/json.git
master = master
update = merge
[submodule "lib/libxlsxwriter"]
path = lib/libxlsxwriter
url = https://github.com/jmcnamara/libxlsxwriter.git
main = main
update = merge
[submodule "lib/PyClassifiers"]
path = lib/PyClassifiers
url = https://github.com/rmontanana/PyClassifiers
[submodule "lib/folding"]
path = lib/folding
url = https://github.com/rmontanana/Folding

BIN img/bbest.gif: new binary file (1.9 MiB), not shown.
BIN img/bgrid.gif: new binary file (349 KiB), not shown.
BIN img/blist.gif: new binary file (1.7 MiB), not shown.
BIN img/bmain.gif: new binary file (3.3 MiB), not shown.
BIN img/bmanage.gif: new binary file (8.7 MiB), not shown.

View File

@@ -1,168 +0,0 @@
#include "ArffFiles.h"
#include <fstream>
#include <sstream>
#include <map>
#include <iostream>
ArffFiles::ArffFiles() = default;
std::vector<std::string> ArffFiles::getLines() const
{
return lines;
}
unsigned long int ArffFiles::getSize() const
{
return lines.size();
}
std::vector<std::pair<std::string, std::string>> ArffFiles::getAttributes() const
{
return attributes;
}
std::string ArffFiles::getClassName() const
{
return className;
}
std::string ArffFiles::getClassType() const
{
return classType;
}
std::vector<std::vector<float>>& ArffFiles::getX()
{
return X;
}
std::vector<int>& ArffFiles::getY()
{
return y;
}
void ArffFiles::loadCommon(std::string fileName)
{
std::ifstream file(fileName);
if (!file.is_open()) {
throw std::invalid_argument("Unable to open file");
}
std::string line;
std::string keyword;
std::string attribute;
std::string type;
std::string type_w;
while (getline(file, line)) {
if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
continue;
}
if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) {
std::stringstream ss(line);
ss >> keyword >> attribute;
type = "";
while (ss >> type_w)
type += type_w + " ";
attributes.emplace_back(trim(attribute), trim(type));
continue;
}
if (line[0] == '@') {
continue;
}
lines.push_back(line);
}
file.close();
if (attributes.empty())
throw std::invalid_argument("No attributes found");
}
void ArffFiles::load(const std::string& fileName, bool classLast)
{
int labelIndex;
loadCommon(fileName);
if (classLast) {
className = std::get<0>(attributes.back());
classType = std::get<1>(attributes.back());
attributes.pop_back();
labelIndex = static_cast<int>(attributes.size());
} else {
className = std::get<0>(attributes.front());
classType = std::get<1>(attributes.front());
attributes.erase(attributes.begin());
labelIndex = 0;
}
generateDataset(labelIndex);
}
void ArffFiles::load(const std::string& fileName, const std::string& name)
{
int labelIndex;
loadCommon(fileName);
bool found = false;
for (int i = 0; i < attributes.size(); ++i) {
if (attributes[i].first == name) {
className = std::get<0>(attributes[i]);
classType = std::get<1>(attributes[i]);
attributes.erase(attributes.begin() + i);
labelIndex = i;
found = true;
break;
}
}
if (!found) {
throw std::invalid_argument("Class name not found");
}
generateDataset(labelIndex);
}
void ArffFiles::generateDataset(int labelIndex)
{
X = std::vector<std::vector<float>>(attributes.size(), std::vector<float>(lines.size()));
auto yy = std::vector<std::string>(lines.size(), "");
auto removeLines = std::vector<int>(); // Lines with missing values
for (size_t i = 0; i < lines.size(); i++) {
std::stringstream ss(lines[i]);
std::string value;
int pos = 0;
int xIndex = 0;
while (getline(ss, value, ',')) {
if (pos++ == labelIndex) {
yy[i] = value;
} else {
if (value == "?") {
X[xIndex++][i] = -1;
removeLines.push_back(i);
} else
X[xIndex++][i] = stof(value);
}
}
}
for (auto i : removeLines) {
yy.erase(yy.begin() + i);
for (auto& x : X) {
x.erase(x.begin() + i);
}
}
y = factorize(yy);
}
std::string ArffFiles::trim(const std::string& source)
{
std::string s(source);
s.erase(0, s.find_first_not_of(" '\n\r\t"));
s.erase(s.find_last_not_of(" '\n\r\t") + 1);
return s;
}
std::vector<int> ArffFiles::factorize(const std::vector<std::string>& labels_t)
{
std::vector<int> yy;
yy.reserve(labels_t.size());
std::map<std::string, int> labelMap;
int i = 0;
for (const std::string& label : labels_t) {
if (labelMap.find(label) == labelMap.end()) {
labelMap[label] = i++;
}
yy.push_back(labelMap[label]);
}
return yy;
}

View File

@@ -1,32 +0,0 @@
#ifndef ARFFFILES_H
#define ARFFFILES_H
#include <string>
#include <vector>
class ArffFiles {
private:
std::vector<std::string> lines;
std::vector<std::pair<std::string, std::string>> attributes;
std::string className;
std::string classType;
std::vector<std::vector<float>> X;
std::vector<int> y;
void generateDataset(int);
void loadCommon(std::string);
public:
ArffFiles();
void load(const std::string&, bool = true);
void load(const std::string&, const std::string&);
std::vector<std::string> getLines() const;
unsigned long int getSize() const;
std::string getClassName() const;
std::string getClassType() const;
static std::string trim(const std::string&);
std::vector<std::vector<float>>& getX();
std::vector<int>& getY();
std::vector<std::pair<std::string, std::string>> getAttributes() const;
static std::vector<int> factorize(const std::vector<std::string>& labels_t);
};
#endif

View File

@@ -1 +0,0 @@
add_library(ArffFiles ArffFiles.cc)

Submodule lib/argparse deleted from 69dabd88a8

BIN logo.png: new binary file (874 KiB), not shown.

14
remove_submodules.sh Normal file
View File

@@ -0,0 +1,14 @@
git config --file .gitmodules --get-regexp path | awk '{ print $2 }' | while read line; do
echo "Removing $line"
# Deinit the submodule
git submodule deinit -f "$line"
# Remove the submodule from the working tree
git rm -f "$line"
# Remove the submodule from .git/modules
rm -rf ".git/modules/$line"
done
# Remove the .gitmodules file
git rm -f .gitmodules

View File

@@ -1,11 +1,11 @@
include_directories(
${Platform_SOURCE_DIR}/src/Platform
${Platform_SOURCE_DIR}/src/PyClassifiers
${TORCH_INCLUDE_DIRS}
${Platform_SOURCE_DIR}/src/common
${Platform_SOURCE_DIR}/src/main
${Python3_INCLUDE_DIRS}
${Platform_SOURCE_DIR}/lib/Files
${Platform_SOURCE_DIR}/lib/mdlp
${Platform_SOURCE_DIR}/lib/argparse/include
${Platform_SOURCE_DIR}/lib/json/include
${CMAKE_BINARY_DIR}/configured_files/include
${PyClassifiers_INCLUDE_DIRS}
${bayesnet_INCLUDE_DIRS}
)
add_executable(PlatformSample sample.cc ${Platform_SOURCE_DIR}/src/Platform/Models.cc)
target_link_libraries(PlatformSample Platform ArffFiles mdlp "${TORCH_LIBRARIES}" PyWrap)
add_executable(PlatformSample sample.cpp ${Platform_SOURCE_DIR}/src/main/Models.cpp)
target_link_libraries(PlatformSample "${PyClassifiers}" "${BayesNet}" fimdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} ${Boost_LIBRARIES})

View File

@@ -1,236 +0,0 @@
#include <iostream>
#include <torch/torch.h>
#include <string>
#include <map>
#include <argparse/argparse.hpp>
#include <nlohmann/json.hpp>
#include "ArffFiles.h"
#include "BayesMetrics.h"
#include "CPPFImdlp.h"
#include "folding.hpp"
#include "Models.h"
#include "modelRegister.h"
#include <fstream>
#include "config.h"
const std::string PATH = { data_path.begin(), data_path.end() };
pair<std::vector<mdlp::labels_t>, map<std::string, int>> discretize(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y, std::vector<std::string> features)
{
std::vector<mdlp::labels_t>Xd;
map<std::string, int> maxes;
auto fimdlp = mdlp::CPPFImdlp();
for (int i = 0; i < X.size(); i++) {
fimdlp.fit(X[i], y);
mdlp::labels_t& xd = fimdlp.transform(X[i]);
maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
Xd.push_back(xd);
}
return { Xd, maxes };
}
bool file_exists(const std::string& name)
{
if (FILE* file = fopen(name.c_str(), "r")) {
fclose(file);
return true;
} else {
return false;
}
}
pair<std::vector<std::vector<int>>, std::vector<int>> extract_indices(std::vector<int> indices, std::vector<std::vector<int>> X, std::vector<int> y)
{
std::vector<std::vector<int>> Xr; // nxm
std::vector<int> yr;
for (int col = 0; col < X.size(); ++col) {
Xr.push_back(std::vector<int>());
}
for (auto index : indices) {
for (int col = 0; col < X.size(); ++col) {
Xr[col].push_back(X[col][index]);
}
yr.push_back(y[index]);
}
return { Xr, yr };
}
int main(int argc, char** argv)
{
map<std::string, bool> datasets = {
{"diabetes", true},
{"ecoli", true},
{"glass", true},
{"iris", true},
{"kdd_JapaneseVowels", false},
{"letter", true},
{"liver-disorders", true},
{"mfeat-factors", true},
};
auto valid_datasets = std::vector<std::string>();
transform(datasets.begin(), datasets.end(), back_inserter(valid_datasets),
[](const pair<std::string, bool>& pair) { return pair.first; });
argparse::ArgumentParser program("BayesNetSample");
program.add_argument("-d", "--dataset")
.help("Dataset file name")
.action([valid_datasets](const std::string& value) {
if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) {
return value;
}
throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}");
}
);
program.add_argument("-p", "--path")
.help(" folder where the data files are located, default")
.default_value(std::string{ PATH }
);
program.add_argument("-m", "--model")
.help("Model to use " + platform::Models::instance()->tostring())
.action([](const std::string& value) {
static const std::vector<std::string> choices = platform::Models::instance()->getNames();
if (find(choices.begin(), choices.end(), value) != choices.end()) {
return value;
}
throw runtime_error("Model must be one of " + platform::Models::instance()->tostring());
}
);
program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true);
program.add_argument("--dumpcpt").help("Dump CPT Tables").default_value(false).implicit_value(true);
program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true);
program.add_argument("--tensors").help("Use tensors to store samples").default_value(false).implicit_value(true);
program.add_argument("-f", "--folds").help("Number of folds").default_value(5).scan<'i', int>().action([](const std::string& value) {
try {
auto k = stoi(value);
if (k < 2) {
throw runtime_error("Number of folds must be greater than 1");
}
return k;
}
catch (const runtime_error& err) {
throw runtime_error(err.what());
}
catch (...) {
throw runtime_error("Number of folds must be an integer");
}});
program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>();
bool class_last, stratified, tensors, dump_cpt;
std::string model_name, file_name, path, complete_file_name;
int nFolds, seed;
try {
program.parse_args(argc, argv);
file_name = program.get<std::string>("dataset");
path = program.get<std::string>("path");
model_name = program.get<std::string>("model");
complete_file_name = path + file_name + ".arff";
stratified = program.get<bool>("stratified");
tensors = program.get<bool>("tensors");
nFolds = program.get<int>("folds");
seed = program.get<int>("seed");
dump_cpt = program.get<bool>("dumpcpt");
class_last = datasets[file_name];
if (!file_exists(complete_file_name)) {
throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist");
}
}
catch (const exception& err) {
cerr << err.what() << std::endl;
cerr << program;
exit(1);
}
/*
* Begin Processing
*/
auto handler = ArffFiles();
handler.load(complete_file_name, class_last);
// Get Dataset X, y
std::vector<mdlp::samples_t>& X = handler.getX();
mdlp::labels_t& y = handler.getY();
// Get className & Features
auto className = handler.getClassName();
std::vector<std::string> features;
auto attributes = handler.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features),
[](const pair<std::string, std::string>& item) { return item.first; });
// Discretize Dataset
auto [Xd, maxes] = discretize(X, y, features);
maxes[className] = *max_element(y.begin(), y.end()) + 1;
map<std::string, std::vector<int>> states;
for (auto feature : features) {
states[feature] = std::vector<int>(maxes[feature]);
}
states[className] = std::vector<int>(maxes[className]);
auto clf = platform::Models::instance()->create(model_name);
clf->fit(Xd, y, features, className, states);
if (dump_cpt) {
std::cout << "--- CPT Tables ---" << std::endl;
clf->dump_cpt();
}
auto lines = clf->show();
for (auto line : lines) {
std::cout << line << std::endl;
}
std::cout << "--- Topological Order ---" << std::endl;
auto order = clf->topological_order();
for (auto name : order) {
std::cout << name << ", ";
}
std::cout << "end." << std::endl;
auto score = clf->score(Xd, y);
std::cout << "Score: " << score << std::endl;
auto graph = clf->graph();
auto dot_file = model_name + "_" + file_name;
ofstream file(dot_file + ".dot");
file << graph;
file.close();
std::cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << std::endl;
std::cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << std::endl;
std::string stratified_string = stratified ? " Stratified" : "";
std::cout << nFolds << " Folds" << stratified_string << " Cross validation" << std::endl;
std::cout << "==========================================" << std::endl;
torch::Tensor Xt = torch::zeros({ static_cast<int>(Xd.size()), static_cast<int>(Xd[0].size()) }, torch::kInt32);
torch::Tensor yt = torch::tensor(y, torch::kInt32);
for (int i = 0; i < features.size(); ++i) {
Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
}
float total_score = 0, total_score_train = 0, score_train, score_test;
platform::Fold* fold;
if (stratified)
fold = new folding::StratifiedKFold(nFolds, y, seed);
else
fold = new folding::KFold(nFolds, y.size(), seed);
for (auto i = 0; i < nFolds; ++i) {
auto [train, test] = fold->getFold(i);
std::cout << "Fold: " << i + 1 << std::endl;
if (tensors) {
auto ttrain = torch::tensor(train, torch::kInt64);
auto ttest = torch::tensor(test, torch::kInt64);
torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain);
torch::Tensor ytraint = yt.index({ ttrain });
torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest);
torch::Tensor ytestt = yt.index({ ttest });
clf->fit(Xtraint, ytraint, features, className, states);
auto temp = clf->predict(Xtraint);
score_train = clf->score(Xtraint, ytraint);
score_test = clf->score(Xtestt, ytestt);
} else {
auto [Xtrain, ytrain] = extract_indices(train, Xd, y);
auto [Xtest, ytest] = extract_indices(test, Xd, y);
clf->fit(Xtrain, ytrain, features, className, states);
score_train = clf->score(Xtrain, ytrain);
score_test = clf->score(Xtest, ytest);
}
if (dump_cpt) {
std::cout << "--- CPT Tables ---" << std::endl;
clf->dump_cpt();
}
total_score_train += score_train;
total_score += score_test;
std::cout << "Score Train: " << score_train << std::endl;
std::cout << "Score Test : " << score_test << std::endl;
std::cout << "-------------------------------------------------------------------------------" << std::endl;
}
std::cout << "**********************************************************************************" << std::endl;
std::cout << "Average Score Train: " << total_score_train / nFolds << std::endl;
std::cout << "Average Score Test : " << total_score / nFolds << std::endl;return 0;
}

279
sample/sample.cpp Normal file
View File

@@ -0,0 +1,279 @@
#include <iostream>
#include <string>
#include <map>
#include <fstream>
#include <torch/torch.h>
#include <argparse/argparse.hpp>
#include <nlohmann/json.hpp>
#include <ArffFiles.hpp>
#include <fimdlp/CPPFImdlp.h>
#include <folding.hpp>
#include <bayesnet/utils/BayesMetrics.h>
#include <bayesnet/classifiers/SPODE.h>
#include "Models.h"
#include "modelRegister.h"
#include "config_platform.h"
const std::string PATH = { platform_data_path.begin(), platform_data_path.end() };
pair<std::vector<mdlp::labels_t>, map<std::string, int>> discretize(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y, std::vector<std::string> features)
{
std::vector<mdlp::labels_t>Xd;
map<std::string, int> maxes;
auto fimdlp = mdlp::CPPFImdlp();
for (int i = 0; i < X.size(); i++) {
fimdlp.fit(X[i], y);
mdlp::labels_t& xd = fimdlp.transform(X[i]);
maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
Xd.push_back(xd);
}
return { Xd, maxes };
}
bool file_exists(const std::string& name)
{
if (FILE* file = fopen(name.c_str(), "r")) {
fclose(file);
return true;
} else {
return false;
}
}
pair<std::vector<std::vector<int>>, std::vector<int>> extract_indices(std::vector<int> indices, std::vector<std::vector<int>> X, std::vector<int> y)
{
std::vector<std::vector<int>> Xr; // nxm
std::vector<int> yr;
for (int col = 0; col < X.size(); ++col) {
Xr.push_back(std::vector<int>());
}
for (auto index : indices) {
for (int col = 0; col < X.size(); ++col) {
Xr[col].push_back(X[col][index]);
}
yr.push_back(y[index]);
}
return { Xr, yr };
}
int main(int argc, char** argv)
{
map<std::string, bool> datasets = {
{"diabetes", true},
{"ecoli", true},
{"glass", true},
{"iris", true},
{"kdd_JapaneseVowels", false},
{"letter", true},
{"liver-disorders", true},
{"mfeat-factors", true},
};
auto valid_datasets = std::vector<std::string>();
transform(datasets.begin(), datasets.end(), back_inserter(valid_datasets),
[](const pair<std::string, bool>& pair) { return pair.first; });
argparse::ArgumentParser program("PlatformSample");
program.add_argument("-d", "--dataset")
.help("Dataset file name")
.action([valid_datasets](const std::string& value) {
if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) {
return value;
}
throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}");
}
);
program.add_argument("-p", "--path")
.help(" folder where the data files are located, default")
.default_value(std::string{ PATH }
);
program.add_argument("-m", "--model")
.help("Model to use " + platform::Models::instance()->toString())
.action([](const std::string& value) {
static const std::vector<std::string> choices = platform::Models::instance()->getNames();
if (find(choices.begin(), choices.end(), value) != choices.end()) {
return value;
}
throw runtime_error("Model must be one of " + platform::Models::instance()->toString());
}
);
program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true);
program.add_argument("--dumpcpt").help("Dump CPT Tables").default_value(false).implicit_value(true);
program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true);
program.add_argument("--tensors").help("Use tensors to store samples").default_value(false).implicit_value(true);
program.add_argument("-f", "--folds").help("Number of folds").default_value(5).scan<'i', int>().action([](const std::string& value) {
try {
auto k = stoi(value);
if (k < 2) {
throw runtime_error("Number of folds must be greater than 1");
}
return k;
}
catch (const runtime_error& err) {
throw runtime_error(err.what());
}
catch (...) {
throw runtime_error("Number of folds must be an integer");
}});
program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>();
bool class_last, stratified, tensors, dump_cpt;
std::string model_name, file_name, path, complete_file_name;
int nFolds, seed;
try {
program.parse_args(argc, argv);
file_name = program.get<std::string>("dataset");
path = program.get<std::string>("path");
model_name = program.get<std::string>("model");
complete_file_name = path + file_name + ".arff";
stratified = program.get<bool>("stratified");
tensors = program.get<bool>("tensors");
nFolds = program.get<int>("folds");
seed = program.get<int>("seed");
dump_cpt = program.get<bool>("dumpcpt");
class_last = datasets[file_name];
if (!file_exists(complete_file_name)) {
throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist");
}
}
catch (const exception& err) {
cerr << err.what() << std::endl;
cerr << program;
exit(1);
}
/*
* Begin Processing
*/
auto handler = ArffFiles();
handler.load(complete_file_name, class_last);
// Get Dataset X, y
std::vector<mdlp::samples_t>& X = handler.getX();
mdlp::labels_t& y = handler.getY();
// Get className & Features
auto className = handler.getClassName();
std::vector<std::string> features;
auto attributes = handler.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features),
[](const pair<std::string, std::string>& item) { return item.first; });
// Discretize Dataset
auto [Xd, maxes] = discretize(X, y, features);
maxes[className] = *max_element(y.begin(), y.end()) + 1;
map<std::string, std::vector<int>> states;
for (auto feature : features) {
states[feature] = std::vector<int>(maxes[feature]);
}
states[className] = std::vector<int>(maxes[className]);
// Output the states
std::cout << std::string(80, '-') << std::endl;
std::cout << "States" << std::endl;
for (auto feature : features) {
std::cout << feature << ": " << states[feature].size() << std::endl;
}
std::cout << std::string(80, '-') << std::endl;
//auto clf = platform::Models::instance()->create("SPODE");
auto clf = bayesnet::SPODE(2);
bayesnet::Smoothing_t smoothing = bayesnet::Smoothing_t::ORIGINAL;
clf.fit(Xd, y, features, className, states, smoothing);
if (dump_cpt) {
std::cout << "--- CPT Tables ---" << std::endl;
std::cout << clf.dump_cpt();
}
std::cout << "--- Datos predicción ---" << std::endl;
std::cout << "Orden de variables: " << std::endl;
for (auto feature : features) {
std::cout << feature << ", ";
}
std::cout << std::endl;
std::cout << "X[0]: ";
for (int i = 0; i < Xd.size(); ++i) {
std::cout << Xd[i][0] << ", ";
}
std::cout << std::endl;
std::cout << std::string(80, '-') << std::endl;
auto lines = clf.show();
for (auto line : lines) {
std::cout << line << std::endl;
}
std::cout << "--- Topological Order ---" << std::endl;
auto order = clf.topological_order();
for (auto name : order) {
std::cout << name << ", ";
}
auto predict_proba = clf.predict_proba(Xd);
std::cout << "Instances predict_proba: ";
for (int i = 0; i < predict_proba.size(); i++) {
std::cout << "Instance " << i << ": ";
for (int j = 0; j < 4; ++j) {
std::cout << Xd[j][i] << ", ";
}
std::cout << ": ";
for (auto score : predict_proba[i]) {
std::cout << score << ", ";
}
std::cout << std::endl;
}
// std::cout << std::endl;
// std::cout << "end." << std::endl;
// auto score = clf->score(Xd, y);
// std::cout << "Score: " << score << std::endl;
// auto graph = clf->graph();
// auto dot_file = model_name + "_" + file_name;
// ofstream file(dot_file + ".dot");
// file << graph;
// file.close();
// std::cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << std::endl;
// std::cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << std::endl;
// std::string stratified_string = stratified ? " Stratified" : "";
// std::cout << nFolds << " Folds" << stratified_string << " Cross validation" << std::endl;
// std::cout << "==========================================" << std::endl;
// torch::Tensor Xt = torch::zeros({ static_cast<int>(Xd.size()), static_cast<int>(Xd[0].size()) }, torch::kInt32);
// torch::Tensor yt = torch::tensor(y, torch::kInt32);
// for (int i = 0; i < features.size(); ++i) {
// Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
// }
// float total_score = 0, total_score_train = 0, score_train, score_test;
// folding::Fold* fold;
// double nodes = 0.0;
// if (stratified)
// fold = new folding::StratifiedKFold(nFolds, y, seed);
// else
// fold = new folding::KFold(nFolds, y.size(), seed);
// for (auto i = 0; i < nFolds; ++i) {
// auto [train, test] = fold->getFold(i);
// std::cout << "Fold: " << i + 1 << std::endl;
// if (tensors) {
// auto ttrain = torch::tensor(train, torch::kInt64);
// auto ttest = torch::tensor(test, torch::kInt64);
// torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain);
// torch::Tensor ytraint = yt.index({ ttrain });
// torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest);
// torch::Tensor ytestt = yt.index({ ttest });
// clf->fit(Xtraint, ytraint, features, className, states, smoothing);
// auto temp = clf->predict(Xtraint);
// score_train = clf->score(Xtraint, ytraint);
// score_test = clf->score(Xtestt, ytestt);
// } else {
// auto [Xtrain, ytrain] = extract_indices(train, Xd, y);
// auto [Xtest, ytest] = extract_indices(test, Xd, y);
// clf->fit(Xtrain, ytrain, features, className, states, smoothing);
// std::cout << "Nodes: " << clf->getNumberOfNodes() << std::endl;
// nodes += clf->getNumberOfNodes();
// score_train = clf->score(Xtrain, ytrain);
// score_test = clf->score(Xtest, ytest);
// }
// // if (dump_cpt) {
// // std::cout << "--- CPT Tables ---" << std::endl;
// // std::cout << clf->dump_cpt();
// // }
// total_score_train += score_train;
// total_score += score_test;
// std::cout << "Score Train: " << score_train << std::endl;
// std::cout << "Score Test : " << score_test << std::endl;
// std::cout << "-------------------------------------------------------------------------------" << std::endl;
// }
// std::cout << "Nodes: " << nodes / nFolds << std::endl;
// std::cout << "**********************************************************************************" << std::endl;
// std::cout << "Average Score Train: " << total_score_train / nFolds << std::endl;
// std::cout << "Average Score Test : " << total_score / nFolds << std::endl;return 0;
}

80
src/CMakeLists.txt Normal file
View File

@@ -0,0 +1,80 @@
include_directories(
${Python3_INCLUDE_DIRS}
${MPI_CXX_INCLUDE_DIRS}
${CMAKE_BINARY_DIR}/configured_files/include
${Platform_SOURCE_DIR}/src
)
# b_best
add_executable(
b_best commands/b_best.cpp best/Statistics.cpp
best/BestResultsExcel.cpp best/BestResultsTex.cpp best/BestResultsMd.cpp best/BestResults.cpp
common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
main/Models.cpp main/Scores.cpp
reports/ReportExcel.cpp reports/ReportBase.cpp reports/ExcelFile.cpp
results/Result.cpp
experimental_clfs/XA1DE.cpp
experimental_clfs/ExpClf.cpp
experimental_clfs/DecisionTree.cpp
experimental_clfs/AdaBoost.cpp
)
target_link_libraries(b_best Boost::boost bayesnet::bayesnet argparse::argparse fimdlp::fimdlp ${Python3_LIBRARIES} torch::torch Boost::python Boost::numpy libxlsxwriter::libxlsxwriter)
# b_grid
set(grid_sources GridSearch.cpp GridData.cpp GridExperiment.cpp GridBase.cpp )
list(TRANSFORM grid_sources PREPEND grid/)
add_executable(b_grid commands/b_grid.cpp ${grid_sources}
common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
main/HyperParameters.cpp main/Models.cpp main/Experiment.cpp main/Scores.cpp main/ArgumentsExperiment.cpp
reports/ReportConsole.cpp reports/ReportBase.cpp
results/Result.cpp
experimental_clfs/XA1DE.cpp
experimental_clfs/ExpClf.cpp
experimental_clfs/DecisionTree.cpp
experimental_clfs/AdaBoost.cpp
)
target_link_libraries(b_grid ${MPI_CXX_LIBRARIES} bayesnet::bayesnet argparse::argparse fimdlp::fimdlp ${Python3_LIBRARIES} torch::torch Boost::python Boost::numpy)
# b_list
add_executable(b_list commands/b_list.cpp
common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
main/Models.cpp main/Scores.cpp
reports/ReportExcel.cpp reports/ExcelFile.cpp reports/ReportBase.cpp reports/DatasetsExcel.cpp reports/DatasetsConsole.cpp reports/ReportsPaged.cpp
results/Result.cpp results/ResultsDatasetExcel.cpp results/ResultsDataset.cpp results/ResultsDatasetConsole.cpp
experimental_clfs/XA1DE.cpp
experimental_clfs/ExpClf.cpp
experimental_clfs/DecisionTree.cpp
experimental_clfs/AdaBoost.cpp
)
target_link_libraries(b_list bayesnet::bayesnet argparse::argparse fimdlp::fimdlp ${Python3_LIBRARIES} torch::torch Boost::python Boost::numpy libxlsxwriter::libxlsxwriter)
# b_main
set(main_sources Experiment.cpp Models.cpp HyperParameters.cpp Scores.cpp ArgumentsExperiment.cpp)
list(TRANSFORM main_sources PREPEND main/)
add_executable(b_main commands/b_main.cpp ${main_sources}
common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
reports/ReportConsole.cpp reports/ReportBase.cpp
results/Result.cpp
experimental_clfs/XA1DE.cpp
experimental_clfs/ExpClf.cpp
experimental_clfs/ExpClf.cpp
experimental_clfs/DecisionTree.cpp
experimental_clfs/AdaBoost.cpp
)
target_link_libraries(b_main PRIVATE nlohmann_json::nlohmann_json bayesnet::bayesnet argparse::argparse fimdlp::fimdlp ${Python3_LIBRARIES} torch::torch Boost::python Boost::numpy)
# b_manage
set(manage_sources ManageScreen.cpp OptionsMenu.cpp ResultsManager.cpp)
list(TRANSFORM manage_sources PREPEND manage/)
add_executable(
b_manage commands/b_manage.cpp ${manage_sources}
common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
reports/ReportConsole.cpp reports/ReportExcel.cpp reports/ReportExcelCompared.cpp reports/ReportBase.cpp reports/ExcelFile.cpp reports/DatasetsConsole.cpp reports/ReportsPaged.cpp
results/Result.cpp results/ResultsDataset.cpp results/ResultsDatasetConsole.cpp
main/Scores.cpp
)
target_link_libraries(b_manage torch::torch libxlsxwriter::libxlsxwriter fimdlp::fimdlp bayesnet::bayesnet argparse::argparse)
# b_results
add_executable(b_results commands/b_results.cpp)
target_link_libraries(b_results torch::torch libxlsxwriter::libxlsxwriter fimdlp::fimdlp bayesnet::bayesnet argparse::argparse)

View File

@@ -1,28 +0,0 @@
include_directories(
${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/src/BayesNet
${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/folding
${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/mdlp
${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/json/include
${Platform_SOURCE_DIR}/lib/PyClassifiers/src/PyClassifiers
${Platform_SOURCE_DIR}/src/Platform
${Platform_SOURCE_DIR}/lib/Files
${Platform_SOURCE_DIR}/lib/mdlp
${Platform_SOURCE_DIR}/lib/argparse/include
${Platform_SOURCE_DIR}/lib/json/include
${Platform_SOURCE_DIR}/lib/libxlsxwriter/include
${Python3_INCLUDE_DIRS}
${MPI_CXX_INCLUDE_DIRS}
${CMAKE_BINARY_DIR}/configured_files/include
)
add_executable(b_best b_best.cc BestResults.cc Result.cc Statistics.cc BestResultsExcel.cc ReportExcel.cc ReportBase.cc Datasets.cc Dataset.cc ExcelFile.cc)
add_executable(b_grid b_grid.cc GridSearch.cc GridData.cc HyperParameters.cc Datasets.cc Dataset.cc Models.cc)
add_executable(b_list b_list.cc Datasets.cc Dataset.cc)
add_executable(b_main b_main.cc Experiment.cc Datasets.cc Dataset.cc Models.cc HyperParameters.cc ReportConsole.cc ReportBase.cc)
add_executable(b_manage b_manage.cc Results.cc ManageResults.cc CommandParser.cc Result.cc ReportConsole.cc ReportExcel.cc ReportBase.cc Datasets.cc Dataset.cc ExcelFile.cc)
target_link_libraries(b_best Boost::boost "${XLSXWRITER_LIB}" "${TORCH_LIBRARIES}" ArffFiles mdlp)
target_link_libraries(b_grid PyClassifiers ${MPI_CXX_LIBRARIES})
target_link_libraries(b_list ArffFiles mdlp "${TORCH_LIBRARIES}")
target_link_libraries(b_main PyClassifiers BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}")
target_link_libraries(b_manage "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" ArffFiles mdlp)

View File

@@ -1,15 +0,0 @@
#ifndef COLORS_H
#define COLORS_H
class Colors {
public:
static std::string MAGENTA() { return "\033[1;35m"; }
static std::string BLUE() { return "\033[1;34m"; }
static std::string CYAN() { return "\033[1;36m"; }
static std::string GREEN() { return "\033[1;32m"; }
static std::string YELLOW() { return "\033[1;33m"; }
static std::string RED() { return "\033[1;31m"; }
static std::string WHITE() { return "\033[1;37m"; }
static std::string IBLUE() { return "\033[0;94m"; }
static std::string RESET() { return "\033[0m"; }
};
#endif // COLORS_H

View File

@@ -1,87 +0,0 @@
#include "CommandParser.h"
#include <iostream>
#include <sstream>
#include <algorithm>
#include "Colors.h"
#include "Utils.h"
namespace platform {
void CommandParser::messageError(const std::string& message)
{
std::cout << Colors::RED() << message << Colors::RESET() << std::endl;
}
std::pair<char, int> CommandParser::parse(const std::string& color, const std::vector<std::tuple<std::string, char, bool>>& options, const char defaultCommand, const int maxIndex)
{
bool finished = false;
while (!finished) {
std::stringstream oss;
std::string line;
oss << color << "Choose option (";
bool first = true;
for (auto& option : options) {
if (first) {
first = false;
} else {
oss << ", ";
}
oss << std::get<char>(option) << "=" << std::get<std::string>(option);
}
oss << "): ";
std::cout << oss.str();
getline(std::cin, line);
std::cout << Colors::RESET();
line = trim(line);
if (line.size() == 0)
continue;
if (all_of(line.begin(), line.end(), ::isdigit)) {
command = defaultCommand;
index = stoi(line);
if (index > maxIndex || index < 0) {
messageError("Index out of range");
continue;
}
finished = true;
break;
}
bool found = false;
for (auto& option : options) {
if (line[0] == std::get<char>(option)) {
found = true;
// it's a match
line.erase(line.begin());
line = trim(line);
if (std::get<bool>(option)) {
// The option requires a value
if (line.size() == 0) {
messageError("Option " + std::get<std::string>(option) + " requires a value");
break;
}
try {
index = stoi(line);
if (index > maxIndex || index < 0) {
messageError("Index out of range");
break;
}
}
catch (const std::invalid_argument& ia) {
messageError("Invalid value: " + line);
break;
}
} else {
if (line.size() > 0) {
messageError("option " + std::get<std::string>(option) + " doesn't accept values");
break;
}
}
command = std::get<char>(option);
finished = true;
break;
}
}
if (!found) {
messageError("I don't know " + line);
}
}
return { command, index };
}
} /* namespace platform */

View File

@@ -1,20 +0,0 @@
#ifndef COMMAND_PARSER_H
#define COMMAND_PARSER_H
#include <string>
#include <vector>
#include <tuple>
namespace platform {
class CommandParser {
public:
CommandParser() = default;
std::pair<char, int> parse(const std::string& color, const std::vector<std::tuple<std::string, char, bool>>& options, const char defaultCommand, const int maxIndex);
char getCommand() const { return command; };
int getIndex() const { return index; };
private:
void messageError(const std::string& message);
char command;
int index;
};
} /* namespace platform */
#endif /* COMMAND_PARSER_H */

View File

@@ -1,215 +0,0 @@
#include "Dataset.h"
#include "ArffFiles.h"
#include <fstream>
namespace platform {
Dataset::Dataset(const Dataset& dataset) : path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples), n_features(dataset.n_features), features(dataset.features), states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv), fileType(dataset.fileType)
{
}
std::string Dataset::getName() const
{
return name;
}
std::string Dataset::getClassName() const
{
return className;
}
std::vector<std::string> Dataset::getFeatures() const
{
if (loaded) {
return features;
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
int Dataset::getNFeatures() const
{
if (loaded) {
return n_features;
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
int Dataset::getNSamples() const
{
if (loaded) {
return n_samples;
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
std::map<std::string, std::vector<int>> Dataset::getStates() const
{
if (loaded) {
return states;
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
pair<std::vector<std::vector<float>>&, std::vector<int>&> Dataset::getVectors()
{
if (loaded) {
return { Xv, yv };
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
pair<std::vector<std::vector<int>>&, std::vector<int>&> Dataset::getVectorsDiscretized()
{
if (loaded) {
return { Xd, yv };
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
{
if (loaded) {
buildTensors();
return { X, y };
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
void Dataset::load_csv()
{
ifstream file(path + "/" + name + ".csv");
if (file.is_open()) {
std::string line;
getline(file, line);
std::vector<std::string> tokens = split(line, ',');
features = std::vector<std::string>(tokens.begin(), tokens.end() - 1);
if (className == "-1") {
className = tokens.back();
}
for (auto i = 0; i < features.size(); ++i) {
Xv.push_back(std::vector<float>());
}
while (getline(file, line)) {
tokens = split(line, ',');
for (auto i = 0; i < features.size(); ++i) {
Xv[i].push_back(stof(tokens[i]));
}
yv.push_back(stoi(tokens.back()));
}
file.close();
} else {
throw std::invalid_argument("Unable to open dataset file.");
}
}
void Dataset::computeStates()
{
for (int i = 0; i < features.size(); ++i) {
states[features[i]] = std::vector<int>(*max_element(Xd[i].begin(), Xd[i].end()) + 1);
auto item = states.at(features[i]);
iota(begin(item), end(item), 0);
}
states[className] = std::vector<int>(*max_element(yv.begin(), yv.end()) + 1);
iota(begin(states.at(className)), end(states.at(className)), 0);
}
void Dataset::load_arff()
{
auto arff = ArffFiles();
arff.load(path + "/" + name + ".arff", className);
// Get Dataset X, y
Xv = arff.getX();
yv = arff.getY();
// Get className & Features
className = arff.getClassName();
auto attributes = arff.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& attribute) { return attribute.first; });
}
std::vector<std::string> tokenize(std::string line)
{
std::vector<std::string> tokens;
for (auto i = 0; i < line.size(); ++i) {
if (line[i] == ' ' || line[i] == '\t' || line[i] == '\n') {
std::string token = line.substr(0, i);
tokens.push_back(token);
line.erase(line.begin(), line.begin() + i + 1);
i = 0;
while (line[i] == ' ' || line[i] == '\t' || line[i] == '\n')
line.erase(line.begin(), line.begin() + i + 1);
}
}
if (line.size() > 0) {
tokens.push_back(line);
}
return tokens;
}
void Dataset::load_rdata()
{
ifstream file(path + "/" + name + "_R.dat");
if (file.is_open()) {
std::string line;
getline(file, line);
line = ArffFiles::trim(line);
std::vector<std::string> tokens = tokenize(line);
transform(tokens.begin(), tokens.end() - 1, back_inserter(features), [](const auto& attribute) { return ArffFiles::trim(attribute); });
if (className == "-1") {
className = ArffFiles::trim(tokens.back());
}
for (auto i = 0; i < features.size(); ++i) {
Xv.push_back(std::vector<float>());
}
while (getline(file, line)) {
tokens = tokenize(line);
// We have to skip the first token, which is the instance number.
for (auto i = 1; i < features.size() + 1; ++i) {
const float value = stof(tokens[i]);
Xv[i - 1].push_back(value);
}
yv.push_back(stoi(tokens.back()));
}
file.close();
} else {
throw std::invalid_argument("Unable to open dataset file.");
}
}
void Dataset::load()
{
if (loaded) {
return;
}
if (fileType == CSV) {
load_csv();
} else if (fileType == ARFF) {
load_arff();
} else if (fileType == RDATA) {
load_rdata();
}
if (discretize) {
Xd = discretizeDataset(Xv, yv);
computeStates();
}
n_samples = Xv[0].size();
n_features = Xv.size();
loaded = true;
}
void Dataset::buildTensors()
{
if (discretize) {
X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kInt32);
} else {
X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kFloat32);
}
for (int i = 0; i < features.size(); ++i) {
if (discretize) {
X.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
} else {
X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
}
}
y = torch::tensor(yv, torch::kInt32);
}
std::vector<mdlp::labels_t> Dataset::discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y)
{
std::vector<mdlp::labels_t> Xd;
auto fimdlp = mdlp::CPPFImdlp();
for (int i = 0; i < X.size(); i++) {
fimdlp.fit(X[i], y);
mdlp::labels_t& xd = fimdlp.transform(X[i]);
Xd.push_back(xd);
}
return Xd;
}
}

View File

@@ -1,129 +0,0 @@
#include "Datasets.h"
#include <fstream>
namespace platform {
void Datasets::load()
{
auto sd = SourceData(sfileType);
fileType = sd.getFileType();
path = sd.getPath();
ifstream catalog(path + "all.txt");
if (catalog.is_open()) {
std::string line;
while (getline(catalog, line)) {
if (line.empty() || line[0] == '#') {
continue;
}
std::vector<std::string> tokens = split(line, ',');
std::string name = tokens[0];
std::string className;
if (tokens.size() == 1) {
className = "-1";
} else {
className = tokens[1];
}
datasets[name] = make_unique<Dataset>(path, name, className, discretize, fileType);
}
catalog.close();
} else {
throw std::invalid_argument("Unable to open catalog file. [" + path + "all.txt" + "]");
}
}
std::vector<std::string> Datasets::getNames()
{
std::vector<std::string> result;
transform(datasets.begin(), datasets.end(), back_inserter(result), [](const auto& d) { return d.first; });
return result;
}
std::vector<std::string> Datasets::getFeatures(const std::string& name) const
{
if (datasets.at(name)->isLoaded()) {
return datasets.at(name)->getFeatures();
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
map<std::string, std::vector<int>> Datasets::getStates(const std::string& name) const
{
if (datasets.at(name)->isLoaded()) {
return datasets.at(name)->getStates();
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
void Datasets::loadDataset(const std::string& name) const
{
if (datasets.at(name)->isLoaded()) {
return;
} else {
datasets.at(name)->load();
}
}
std::string Datasets::getClassName(const std::string& name) const
{
if (datasets.at(name)->isLoaded()) {
return datasets.at(name)->getClassName();
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
int Datasets::getNSamples(const std::string& name) const
{
if (datasets.at(name)->isLoaded()) {
return datasets.at(name)->getNSamples();
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
int Datasets::getNClasses(const std::string& name)
{
if (datasets.at(name)->isLoaded()) {
auto className = datasets.at(name)->getClassName();
if (discretize) {
auto states = getStates(name);
return states.at(className).size();
}
auto [Xv, yv] = getVectors(name);
return *std::max_element(yv.begin(), yv.end()) + 1;
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
std::vector<int> Datasets::getClassesCounts(const std::string& name) const
{
if (datasets.at(name)->isLoaded()) {
auto [Xv, yv] = datasets.at(name)->getVectors();
std::vector<int> counts(*std::max_element(yv.begin(), yv.end()) + 1);
for (auto y : yv) {
counts[y]++;
}
return counts;
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
pair<std::vector<std::vector<float>>&, std::vector<int>&> Datasets::getVectors(const std::string& name)
{
if (!datasets[name]->isLoaded()) {
datasets[name]->load();
}
return datasets[name]->getVectors();
}
pair<std::vector<std::vector<int>>&, std::vector<int>&> Datasets::getVectorsDiscretized(const std::string& name)
{
if (!datasets[name]->isLoaded()) {
datasets[name]->load();
}
return datasets[name]->getVectorsDiscretized();
}
pair<torch::Tensor&, torch::Tensor&> Datasets::getTensors(const std::string& name)
{
if (!datasets[name]->isLoaded()) {
datasets[name]->load();
}
return datasets[name]->getTensors();
}
bool Datasets::isDataset(const std::string& name) const
{
return datasets.find(name) != datasets.end();
}
}

View File

@@ -1,30 +0,0 @@
#ifndef DATASETS_H
#define DATASETS_H
#include "Dataset.h"
namespace platform {
class Datasets {
private:
std::string path;
fileType_t fileType;
std::string sfileType;
std::map<std::string, std::unique_ptr<Dataset>> datasets;
bool discretize;
void load(); // Loads the list of datasets
public:
explicit Datasets(bool discretize, std::string sfileType) : discretize(discretize), sfileType(sfileType) { load(); };
std::vector<string> getNames();
std::vector<string> getFeatures(const std::string& name) const;
int getNSamples(const std::string& name) const;
std::string getClassName(const std::string& name) const;
int getNClasses(const std::string& name);
std::vector<int> getClassesCounts(const std::string& name) const;
std::map<std::string, std::vector<int>> getStates(const std::string& name) const;
std::pair<std::vector<std::vector<float>>&, std::vector<int>&> getVectors(const std::string& name);
std::pair<std::vector<std::vector<int>>&, std::vector<int>&> getVectorsDiscretized(const std::string& name);
std::pair<torch::Tensor&, torch::Tensor&> getTensors(const std::string& name);
bool isDataset(const std::string& name) const;
void loadDataset(const std::string& name) const;
};
};
#endif

View File

@@ -1,55 +0,0 @@
#ifndef DOTENV_H
#define DOTENV_H
#include <string>
#include <map>
#include <fstream>
#include <sstream>
#include <algorithm>
#include <iostream>
#include "Utils.h"
//#include "Dataset.h"
namespace platform {
class DotEnv {
private:
std::map<std::string, std::string> env;
public:
DotEnv()
{
std::ifstream file(".env");
if (!file.is_open()) {
std::cerr << "File .env not found" << std::endl;
exit(1);
}
std::string line;
while (std::getline(file, line)) {
line = trim(line);
if (line.empty() || line[0] == '#') {
continue;
}
std::istringstream iss(line);
std::string key, value;
if (std::getline(iss, key, '=') && std::getline(iss, value)) {
env[key] = value;
}
}
}
std::string get(const std::string& key)
{
return env.at(key);
}
std::vector<int> getSeeds()
{
auto seeds = std::vector<int>();
auto seeds_str = env["seeds"];
seeds_str = trim(seeds_str);
seeds_str = seeds_str.substr(1, seeds_str.size() - 2);
auto seeds_str_split = split(seeds_str, ',');
transform(seeds_str_split.begin(), seeds_str_split.end(), back_inserter(seeds), [](const std::string& str) {
return stoi(str);
});
return seeds;
}
};
}
#endif

View File

@@ -1,226 +0,0 @@
#include <fstream>
#include "Experiment.h"
#include "Datasets.h"
#include "Models.h"
#include "ReportConsole.h"
#include "Paths.h"
namespace platform {
using json = nlohmann::json;
std::string get_date()
{
time_t rawtime;
tm* timeinfo;
time(&rawtime);
timeinfo = std::localtime(&rawtime);
std::ostringstream oss;
oss << std::put_time(timeinfo, "%Y-%m-%d");
return oss.str();
}
std::string get_time()
{
time_t rawtime;
tm* timeinfo;
time(&rawtime);
timeinfo = std::localtime(&rawtime);
std::ostringstream oss;
oss << std::put_time(timeinfo, "%H:%M:%S");
return oss.str();
}
std::string Experiment::get_file_name()
{
std::string result = "results_" + score_name + "_" + model + "_" + platform + "_" + get_date() + "_" + get_time() + "_" + (stratified ? "1" : "0") + ".json";
return result;
}
json Experiment::build_json()
{
json result;
result["title"] = title;
result["date"] = get_date();
result["time"] = get_time();
result["model"] = model;
result["version"] = model_version;
result["platform"] = platform;
result["score_name"] = score_name;
result["language"] = language;
result["language_version"] = language_version;
result["discretized"] = discretized;
result["stratified"] = stratified;
result["folds"] = nfolds;
result["seeds"] = randomSeeds;
result["duration"] = duration;
result["results"] = json::array();
for (const auto& r : results) {
json j;
j["dataset"] = r.getDataset();
j["hyperparameters"] = r.getHyperparameters();
j["samples"] = r.getSamples();
j["features"] = r.getFeatures();
j["classes"] = r.getClasses();
j["score_train"] = r.getScoreTrain();
j["score_test"] = r.getScoreTest();
j["score"] = r.getScoreTest();
j["score_std"] = r.getScoreTestStd();
j["score_train_std"] = r.getScoreTrainStd();
j["score_test_std"] = r.getScoreTestStd();
j["train_time"] = r.getTrainTime();
j["train_time_std"] = r.getTrainTimeStd();
j["test_time"] = r.getTestTime();
j["test_time_std"] = r.getTestTimeStd();
j["time"] = r.getTestTime() + r.getTrainTime();
j["time_std"] = r.getTestTimeStd() + r.getTrainTimeStd();
j["scores_train"] = r.getScoresTrain();
j["scores_test"] = r.getScoresTest();
j["times_train"] = r.getTimesTrain();
j["times_test"] = r.getTimesTest();
j["nodes"] = r.getNodes();
j["leaves"] = r.getLeaves();
j["depth"] = r.getDepth();
result["results"].push_back(j);
}
return result;
}
void Experiment::save(const std::string& path)
{
json data = build_json();
ofstream file(path + "/" + get_file_name());
file << data;
file.close();
}
void Experiment::report()
{
json data = build_json();
ReportConsole report(data);
report.show();
}
void Experiment::show()
{
json data = build_json();
std::cout << data.dump(4) << std::endl;
}
void Experiment::go(std::vector<std::string> filesToProcess, bool quiet)
{
std::cout << "*** Starting experiment: " << title << " ***" << std::endl;
for (auto fileName : filesToProcess) {
std::cout << "- " << setw(20) << left << fileName << " " << right << flush;
cross_validation(fileName, quiet);
std::cout << std::endl;
}
}
std::string getColor(bayesnet::status_t status)
{
switch (status) {
case bayesnet::NORMAL:
return Colors::GREEN();
case bayesnet::WARNING:
return Colors::YELLOW();
case bayesnet::ERROR:
return Colors::RED();
default:
return Colors::RESET();
}
}
void showProgress(int fold, const std::string& color, const std::string& phase)
{
std::string prefix = phase == "a" ? "" : "\b\b\b\b";
std::cout << prefix << color << fold << Colors::RESET() << "(" << color << phase << Colors::RESET() << ")" << flush;
}
void Experiment::cross_validation(const std::string& fileName, bool quiet)
{
auto datasets = Datasets(discretized, Paths::datasets());
// Get dataset
auto [X, y] = datasets.getTensors(fileName);
auto states = datasets.getStates(fileName);
auto features = datasets.getFeatures(fileName);
auto samples = datasets.getNSamples(fileName);
auto className = datasets.getClassName(fileName);
if (!quiet) {
std::cout << " (" << setw(5) << samples << "," << setw(3) << features.size() << ") " << flush;
}
// Prepare Result
auto result = Result();
auto [values, counts] = at::_unique(y);
result.setSamples(X.size(1)).setFeatures(X.size(0)).setClasses(values.size(0));
result.setHyperparameters(hyperparameters.get(fileName));
// Initialize results std::vectors
int nResults = nfolds * static_cast<int>(randomSeeds.size());
auto accuracy_test = torch::zeros({ nResults }, torch::kFloat64);
auto accuracy_train = torch::zeros({ nResults }, torch::kFloat64);
auto train_time = torch::zeros({ nResults }, torch::kFloat64);
auto test_time = torch::zeros({ nResults }, torch::kFloat64);
auto nodes = torch::zeros({ nResults }, torch::kFloat64);
auto edges = torch::zeros({ nResults }, torch::kFloat64);
auto num_states = torch::zeros({ nResults }, torch::kFloat64);
Timer train_timer, test_timer;
int item = 0;
for (auto seed : randomSeeds) {
if (!quiet)
std::cout << "(" << seed << ") doing Fold: " << flush;
folding::Fold* fold;
if (stratified)
fold = new folding::StratifiedKFold(nfolds, y, seed);
else
fold = new folding::KFold(nfolds, y.size(0), seed);
for (int nfold = 0; nfold < nfolds; nfold++) {
auto clf = Models::instance()->create(model);
setModelVersion(clf->getVersion());
auto valid = clf->getValidHyperparameters();
hyperparameters.check(valid, fileName);
clf->setHyperparameters(hyperparameters.get(fileName));
// Split train - test dataset
train_timer.start();
auto [train, test] = fold->getFold(nfold);
auto train_t = torch::tensor(train);
auto test_t = torch::tensor(test);
auto X_train = X.index({ "...", train_t });
auto y_train = y.index({ train_t });
auto X_test = X.index({ "...", test_t });
auto y_test = y.index({ test_t });
if (!quiet)
showProgress(nfold + 1, getColor(clf->getStatus()), "a");
// Train model
clf->fit(X_train, y_train, features, className, states);
if (!quiet)
showProgress(nfold + 1, getColor(clf->getStatus()), "b");
nodes[item] = clf->getNumberOfNodes();
edges[item] = clf->getNumberOfEdges();
num_states[item] = clf->getNumberOfStates();
train_time[item] = train_timer.getDuration();
// Score train
auto accuracy_train_value = clf->score(X_train, y_train);
// Test model
if (!quiet)
showProgress(nfold + 1, getColor(clf->getStatus()), "c");
test_timer.start();
auto accuracy_test_value = clf->score(X_test, y_test);
test_time[item] = test_timer.getDuration();
accuracy_train[item] = accuracy_train_value;
accuracy_test[item] = accuracy_test_value;
if (!quiet)
std::cout << "\b\b\b, " << flush;
// Store results and times in std::vector
result.addScoreTrain(accuracy_train_value);
result.addScoreTest(accuracy_test_value);
result.addTimeTrain(train_time[item].item<double>());
result.addTimeTest(test_time[item].item<double>());
item++;
}
if (!quiet)
std::cout << "end. " << flush;
delete fold;
}
result.setScoreTest(torch::mean(accuracy_test).item<double>()).setScoreTrain(torch::mean(accuracy_train).item<double>());
result.setScoreTestStd(torch::std(accuracy_test).item<double>()).setScoreTrainStd(torch::std(accuracy_train).item<double>());
result.setTrainTime(torch::mean(train_time).item<double>()).setTestTime(torch::mean(test_time).item<double>());
result.setTestTimeStd(torch::std(test_time).item<double>()).setTrainTimeStd(torch::std(train_time).item<double>());
result.setNodes(torch::mean(nodes).item<double>()).setLeaves(torch::mean(edges).item<double>()).setDepth(torch::mean(num_states).item<double>());
result.setDataset(fileName);
addResult(result);
}
}

View File

@@ -1,103 +0,0 @@
#ifndef EXPERIMENT_H
#define EXPERIMENT_H
#include <torch/torch.h>
#include <nlohmann/json.hpp>
#include <string>
#include "folding.hpp"
#include "BaseClassifier.h"
#include "HyperParameters.h"
#include "TAN.h"
#include "KDB.h"
#include "AODE.h"
#include "Timer.h"
namespace platform {
using json = nlohmann::json;
class Result {
private:
std::string dataset, model_version;
json hyperparameters;
int samples{ 0 }, features{ 0 }, classes{ 0 };
double score_train{ 0 }, score_test{ 0 }, score_train_std{ 0 }, score_test_std{ 0 }, train_time{ 0 }, train_time_std{ 0 }, test_time{ 0 }, test_time_std{ 0 };
float nodes{ 0 }, leaves{ 0 }, depth{ 0 };
std::vector<double> scores_train, scores_test, times_train, times_test;
public:
Result() = default;
Result& setDataset(const std::string& dataset) { this->dataset = dataset; return *this; }
Result& setHyperparameters(const json& hyperparameters) { this->hyperparameters = hyperparameters; return *this; }
Result& setSamples(int samples) { this->samples = samples; return *this; }
Result& setFeatures(int features) { this->features = features; return *this; }
Result& setClasses(int classes) { this->classes = classes; return *this; }
Result& setScoreTrain(double score) { this->score_train = score; return *this; }
Result& setScoreTest(double score) { this->score_test = score; return *this; }
Result& setScoreTrainStd(double score_std) { this->score_train_std = score_std; return *this; }
Result& setScoreTestStd(double score_std) { this->score_test_std = score_std; return *this; }
Result& setTrainTime(double train_time) { this->train_time = train_time; return *this; }
Result& setTrainTimeStd(double train_time_std) { this->train_time_std = train_time_std; return *this; }
Result& setTestTime(double test_time) { this->test_time = test_time; return *this; }
Result& setTestTimeStd(double test_time_std) { this->test_time_std = test_time_std; return *this; }
Result& setNodes(float nodes) { this->nodes = nodes; return *this; }
Result& setLeaves(float leaves) { this->leaves = leaves; return *this; }
Result& setDepth(float depth) { this->depth = depth; return *this; }
Result& addScoreTrain(double score) { scores_train.push_back(score); return *this; }
Result& addScoreTest(double score) { scores_test.push_back(score); return *this; }
Result& addTimeTrain(double time) { times_train.push_back(time); return *this; }
Result& addTimeTest(double time) { times_test.push_back(time); return *this; }
double get_score_train() const { return score_train; }
double get_score_test() const { return score_test; }
const std::string& getDataset() const { return dataset; }
const json& getHyperparameters() const { return hyperparameters; }
const int getSamples() const { return samples; }
const int getFeatures() const { return features; }
const int getClasses() const { return classes; }
const double getScoreTrain() const { return score_train; }
const double getScoreTest() const { return score_test; }
const double getScoreTrainStd() const { return score_train_std; }
const double getScoreTestStd() const { return score_test_std; }
const double getTrainTime() const { return train_time; }
const double getTrainTimeStd() const { return train_time_std; }
const double getTestTime() const { return test_time; }
const double getTestTimeStd() const { return test_time_std; }
const float getNodes() const { return nodes; }
const float getLeaves() const { return leaves; }
const float getDepth() const { return depth; }
const std::vector<double>& getScoresTrain() const { return scores_train; }
const std::vector<double>& getScoresTest() const { return scores_test; }
const std::vector<double>& getTimesTrain() const { return times_train; }
const std::vector<double>& getTimesTest() const { return times_test; }
};
class Experiment {
public:
Experiment() = default;
Experiment& setTitle(const std::string& title) { this->title = title; return *this; }
Experiment& setModel(const std::string& model) { this->model = model; return *this; }
Experiment& setPlatform(const std::string& platform) { this->platform = platform; return *this; }
Experiment& setScoreName(const std::string& score_name) { this->score_name = score_name; return *this; }
Experiment& setModelVersion(const std::string& model_version) { this->model_version = model_version; return *this; }
Experiment& setLanguage(const std::string& language) { this->language = language; return *this; }
Experiment& setLanguageVersion(const std::string& language_version) { this->language_version = language_version; return *this; }
Experiment& setDiscretized(bool discretized) { this->discretized = discretized; return *this; }
Experiment& setStratified(bool stratified) { this->stratified = stratified; return *this; }
Experiment& setNFolds(int nfolds) { this->nfolds = nfolds; return *this; }
Experiment& addResult(Result result) { results.push_back(result); return *this; }
Experiment& addRandomSeed(int randomSeed) { randomSeeds.push_back(randomSeed); return *this; }
Experiment& setDuration(float duration) { this->duration = duration; return *this; }
Experiment& setHyperparameters(const HyperParameters& hyperparameters_) { this->hyperparameters = hyperparameters_; return *this; }
std::string get_file_name();
void save(const std::string& path);
void cross_validation(const std::string& fileName, bool quiet);
void go(std::vector<std::string> filesToProcess, bool quiet);
void show();
void report();
private:
std::string title, model, platform, score_name, model_version, language_version, language;
bool discretized{ false }, stratified{ false };
std::vector<Result> results;
std::vector<int> randomSeeds;
HyperParameters hyperparameters;
int nfolds{ 0 };
float duration{ 0 };
json build_json();
};
}
#endif
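
The header above exposes both classes as fluent builders that return *this. A short usage sketch, assuming the project headers and their dependencies (libtorch, nlohmann::json, folding.hpp, the classifier headers) are on the include path; the dataset and model names are illustrative:
#include "Experiment.h"

void example()
{
    platform::Result result;
    result.setDataset("iris").setSamples(150).setFeatures(4).setClasses(3);
    result.addScoreTrain(0.97).addScoreTest(0.95).addTimeTrain(0.12).addTimeTest(0.01);

    platform::Experiment experiment;
    experiment.setTitle("Demo run").setModel("TAN").setNFolds(5).setStratified(true)
        .addRandomSeed(271).addResult(result);
    experiment.show(); // declared above; displays the experiment on the console
}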


@@ -1,441 +0,0 @@
#include <iostream>
#include <cstddef>
#include <torch/torch.h>
#include "GridSearch.h"
#include "Models.h"
#include "Paths.h"
#include "folding.hpp"
#include "Colors.h"
namespace platform {
std::string get_date()
{
time_t rawtime;
tm* timeinfo;
time(&rawtime);
timeinfo = std::localtime(&rawtime);
std::ostringstream oss;
oss << std::put_time(timeinfo, "%Y-%m-%d");
return oss.str();
}
std::string get_time()
{
time_t rawtime;
tm* timeinfo;
time(&rawtime);
timeinfo = std::localtime(&rawtime);
std::ostringstream oss;
oss << std::put_time(timeinfo, "%H:%M:%S");
return oss.str();
}
std::string get_color_rank(int rank)
{
auto colors = { Colors::WHITE(), Colors::RED(), Colors::GREEN(), Colors::BLUE(), Colors::MAGENTA(), Colors::CYAN() };
return *(colors.begin() + rank % colors.size());
}
GridSearch::GridSearch(struct ConfigGrid& config) : config(config)
{
}
json GridSearch::loadResults()
{
std::ifstream file(Paths::grid_output(config.model));
if (file.is_open()) {
return json::parse(file);
}
return json();
}
std::vector<std::string> GridSearch::filterDatasets(Datasets& datasets) const
{
// Load datasets
auto datasets_names = datasets.getNames();
if (config.continue_from != NO_CONTINUE()) {
// Continue previous execution:
if (std::find(datasets_names.begin(), datasets_names.end(), config.continue_from) == datasets_names.end()) {
throw std::invalid_argument("Dataset " + config.continue_from + " not found");
}
// Remove datasets already processed
std::vector<string>::iterator it = datasets_names.begin();
while (it != datasets_names.end()) {
if (*it != config.continue_from) {
it = datasets_names.erase(it);
} else {
if (config.only)
++it;
else
break;
}
}
}
// Exclude datasets
for (const auto& name : config.excluded) {
auto dataset = name.get<std::string>();
auto it = std::find(datasets_names.begin(), datasets_names.end(), dataset);
if (it == datasets_names.end()) {
throw std::invalid_argument("Dataset " + dataset + " already excluded or doesn't exist!");
}
datasets_names.erase(it);
}
return datasets_names;
}
json GridSearch::build_tasks_mpi(int rank)
{
auto tasks = json::array();
auto grid = GridData(Paths::grid_input(config.model));
auto datasets = Datasets(false, Paths::datasets());
auto all_datasets = datasets.getNames();
auto datasets_names = filterDatasets(datasets);
for (int idx_dataset = 0; idx_dataset < datasets_names.size(); ++idx_dataset) {
auto dataset = datasets_names[idx_dataset];
for (const auto& seed : config.seeds) {
auto combinations = grid.getGrid(dataset);
for (int n_fold = 0; n_fold < config.n_folds; n_fold++) {
json task = {
{ "dataset", dataset },
{ "idx_dataset", idx_dataset},
{ "seed", seed },
{ "fold", n_fold},
};
tasks.push_back(task);
}
}
}
// Shuffle the array so heavy datasets are spread across the workers
std::mt19937 g{ 271 }; // Use fixed seed to obtain the same shuffle
std::shuffle(tasks.begin(), tasks.end(), g);
std::cout << get_color_rank(rank) << "* Number of tasks: " << tasks.size() << std::endl;
std::cout << "|";
for (int i = 0; i < tasks.size(); ++i) {
std::cout << (i + 1) % 10;
}
std::cout << "|" << std::endl << "|" << std::flush;
return tasks;
}
void process_task_mpi_consumer(struct ConfigGrid& config, struct ConfigMPI& config_mpi, json& tasks, int n_task, Datasets& datasets, Task_Result* result)
{
// initialize
Timer timer;
timer.start();
json task = tasks[n_task];
auto model = config.model;
auto grid = GridData(Paths::grid_input(model));
auto dataset = task["dataset"].get<std::string>();
auto idx_dataset = task["idx_dataset"].get<int>();
auto seed = task["seed"].get<int>();
auto n_fold = task["fold"].get<int>();
bool stratified = config.stratified;
// Generate the hyperparameters combinations
auto combinations = grid.getGrid(dataset);
auto [X, y] = datasets.getTensors(dataset);
auto states = datasets.getStates(dataset);
auto features = datasets.getFeatures(dataset);
auto className = datasets.getClassName(dataset);
//
// Start working on task
//
folding::Fold* fold;
if (stratified)
fold = new folding::StratifiedKFold(config.n_folds, y, seed);
else
fold = new folding::KFold(config.n_folds, y.size(0), seed);
auto [train, test] = fold->getFold(n_fold);
auto train_t = torch::tensor(train);
auto test_t = torch::tensor(test);
auto X_train = X.index({ "...", train_t });
auto y_train = y.index({ train_t });
auto X_test = X.index({ "...", test_t });
auto y_test = y.index({ test_t });
double best_fold_score = 0.0;
int best_idx_combination = -1;
json best_fold_hyper;
for (int idx_combination = 0; idx_combination < combinations.size(); ++idx_combination) {
auto hyperparam_line = combinations[idx_combination];
auto hyperparameters = platform::HyperParameters(datasets.getNames(), hyperparam_line);
folding::Fold* nested_fold;
if (config.stratified)
nested_fold = new folding::StratifiedKFold(config.nested, y_train, seed);
else
nested_fold = new folding::KFold(config.nested, y_train.size(0), seed);
double score = 0.0;
for (int n_nested_fold = 0; n_nested_fold < config.nested; n_nested_fold++) {
// Nested level fold
auto [train_nested, test_nested] = nested_fold->getFold(n_nested_fold);
auto train_nested_t = torch::tensor(train_nested);
auto test_nested_t = torch::tensor(test_nested);
auto X_nested_train = X_train.index({ "...", train_nested_t });
auto y_nested_train = y_train.index({ train_nested_t });
auto X_nested_test = X_train.index({ "...", test_nested_t });
auto y_nested_test = y_train.index({ test_nested_t });
// Build Classifier with selected hyperparameters
auto clf = Models::instance()->create(config.model);
auto valid = clf->getValidHyperparameters();
hyperparameters.check(valid, dataset);
clf->setHyperparameters(hyperparameters.get(dataset));
// Train model
clf->fit(X_nested_train, y_nested_train, features, className, states);
// Test model
score += clf->score(X_nested_test, y_nested_test);
}
delete nested_fold;
score /= config.nested;
if (score > best_fold_score) {
best_fold_score = score;
best_idx_combination = idx_combination;
best_fold_hyper = hyperparam_line;
}
}
delete fold;
// Build Classifier with the best hyperparameters to obtain the best score
auto hyperparameters = platform::HyperParameters(datasets.getNames(), best_fold_hyper);
auto clf = Models::instance()->create(config.model);
auto valid = clf->getValidHyperparameters();
hyperparameters.check(valid, dataset);
clf->setHyperparameters(best_fold_hyper);
clf->fit(X_train, y_train, features, className, states);
best_fold_score = clf->score(X_test, y_test);
// Return the result
result->idx_dataset = task["idx_dataset"].get<int>();
result->idx_combination = best_idx_combination;
result->score = best_fold_score;
result->n_fold = n_fold;
result->time = timer.getDuration();
// Update progress bar
std::cout << get_color_rank(config_mpi.rank) << "*" << std::flush;
}
json store_result(std::vector<std::string>& names, Task_Result& result, json& results)
{
json json_result = {
{ "score", result.score },
{ "combination", result.idx_combination },
{ "fold", result.n_fold },
{ "time", result.time },
{ "dataset", result.idx_dataset }
};
auto name = names[result.idx_dataset];
if (!results.contains(name)) {
results[name] = json::array();
}
results[name].push_back(json_result);
return results;
}
json producer(std::vector<std::string>& names, json& tasks, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result)
{
Task_Result result;
json results;
int num_tasks = tasks.size();
//
// 2a.1 Producer will loop to send all the tasks to the consumers and receive the results
//
for (int i = 0; i < num_tasks; ++i) {
MPI_Status status;
MPI_Recv(&result, 1, MPI_Result, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
if (status.MPI_TAG == TAG_RESULT) {
//Store result
store_result(names, result, results);
}
MPI_Send(&i, 1, MPI_INT, status.MPI_SOURCE, TAG_TASK, MPI_COMM_WORLD);
}
//
// 2a.2 Producer will send the end message to all the consumers
//
for (int i = 0; i < config_mpi.n_procs - 1; ++i) {
MPI_Status status;
MPI_Recv(&result, 1, MPI_Result, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
if (status.MPI_TAG == TAG_RESULT) {
//Store result
store_result(names, result, results);
}
MPI_Send(&i, 1, MPI_INT, status.MPI_SOURCE, TAG_END, MPI_COMM_WORLD);
}
return results;
}
void select_best_results_folds(json& results, json& all_results, std::string& model)
{
Timer timer;
auto grid = GridData(Paths::grid_input(model));
//
// Select the best result of the computed outer folds
//
for (const auto& result : all_results.items()) {
// each entry holds the results of all the outer folds, since each fold was computed as a separate task
double best_score = 0.0;
json best;
for (const auto& result_fold : result.value()) {
double score = result_fold["score"].get<double>();
if (score > best_score) {
best_score = score;
best = result_fold;
}
}
auto dataset = result.key();
auto combinations = grid.getGrid(dataset);
json json_best = {
{ "score", best_score },
{ "hyperparameters", combinations[best["combination"].get<int>()] },
{ "date", get_date() + " " + get_time() },
{ "grid", grid.getInputGrid(dataset) },
{ "duration", timer.translate2String(best["time"].get<double>()) }
};
results[dataset] = json_best;
}
}
void consumer(Datasets& datasets, json& tasks, struct ConfigGrid& config, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result)
{
Task_Result result;
//
// 2b.1 Consumers announce to the producer that they are ready to receive a task
//
MPI_Send(&result, 1, MPI_Result, config_mpi.manager, TAG_QUERY, MPI_COMM_WORLD);
int task;
while (true) {
MPI_Status status;
//
// 2b.2 Consumers receive the task from the producer and process it
//
MPI_Recv(&task, 1, MPI_INT, config_mpi.manager, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
if (status.MPI_TAG == TAG_END) {
break;
}
process_task_mpi_consumer(config, config_mpi, tasks, task, datasets, &result);
//
// 2b.3 Consumers send the result to the producer
//
MPI_Send(&result, 1, MPI_Result, config_mpi.manager, TAG_RESULT, MPI_COMM_WORLD);
}
}
void GridSearch::go(struct ConfigMPI& config_mpi)
{
/*
* Each task is a json object with the following structure:
* {
* "dataset": "dataset_name",
* "idx_dataset": idx_dataset, // used to identify the dataset in the results
* //   this index is relative to the datasets used in the current run, not to the whole collection
* "seed": # of seed to use,
* "fold": # of fold to process
* }
*
* The overall process consists of these steps:
* 0. Create the MPI result type & tasks
* 0.1 Create the MPI result type
* 0.2 Manager creates the tasks
* 1. Manager will broadcast the tasks to all the processes
* 1.1 Broadcast the number of tasks
* 1.2 Broadcast the length of the following string
* 1.3 Broadcast the tasks as a char* string
* 2a. Producer delivers the tasks to the consumers
* 2a.1 Producer will loop to send all the tasks to the consumers and receive the results
* 2a.2 Producer will send the end message to all the consumers
* 2b. Consumers process the tasks and send the results to the producer
* 2b.1 Consumers announce to the producer that they are ready to receive a task
* 2b.2 Consumers receive the task from the producer and process it
* 2b.3 Consumers send the result to the producer
* 3. Manager selects the best scores for each dataset
* 3.1 Loop through all the results obtained from each outer fold (task) and select the best
* 3.2 Save the results
*/
//
// 0.1 Create the MPI result type
//
Task_Result result;
int tasks_size;
MPI_Datatype MPI_Result;
MPI_Datatype type[5] = { MPI_UNSIGNED, MPI_UNSIGNED, MPI_INT, MPI_DOUBLE, MPI_DOUBLE };
int blocklen[5] = { 1, 1, 1, 1, 1 };
MPI_Aint disp[5];
disp[0] = offsetof(Task_Result, idx_dataset);
disp[1] = offsetof(Task_Result, idx_combination);
disp[2] = offsetof(Task_Result, n_fold);
disp[3] = offsetof(Task_Result, score);
disp[4] = offsetof(Task_Result, time);
MPI_Type_create_struct(5, blocklen, disp, type, &MPI_Result);
MPI_Type_commit(&MPI_Result);
//
// 0.2 Manager creates the tasks
//
char* msg;
json tasks;
if (config_mpi.rank == config_mpi.manager) {
timer.start();
tasks = build_tasks_mpi(config_mpi.rank);
auto tasks_str = tasks.dump();
tasks_size = tasks_str.size();
msg = new char[tasks_size + 1];
strcpy(msg, tasks_str.c_str());
}
//
// 1. Manager will broadcast the tasks to all the processes
//
MPI_Bcast(&tasks_size, 1, MPI_INT, config_mpi.manager, MPI_COMM_WORLD);
if (config_mpi.rank != config_mpi.manager) {
msg = new char[tasks_size + 1];
}
MPI_Bcast(msg, tasks_size + 1, MPI_CHAR, config_mpi.manager, MPI_COMM_WORLD);
tasks = json::parse(msg);
delete[] msg;
auto datasets = Datasets(config.discretize, Paths::datasets());
if (config_mpi.rank == config_mpi.manager) {
//
// 2a. Producer delivers the tasks to the consumers
//
auto datasets_names = filterDatasets(datasets);
json all_results = producer(datasets_names, tasks, config_mpi, MPI_Result);
std::cout << get_color_rank(config_mpi.rank) << "|" << std::endl;
//
// 3. Manager selects the best scores for each dataset
//
auto results = initializeResults();
select_best_results_folds(results, all_results, config.model);
//
// 3.2 Save the results
//
save(results);
} else {
//
// 2b. Consumers process the tasks and send the results to the producer
//
consumer(datasets, tasks, config, config_mpi, MPI_Result);
}
}
json GridSearch::initializeResults()
{
// Load previous results if continue is set
json results;
if (config.continue_from != NO_CONTINUE()) {
if (!config.quiet)
std::cout << "* Loading previous results" << std::endl;
try {
std::ifstream file(Paths::grid_output(config.model));
if (file.is_open()) {
results = json::parse(file);
results = results["results"];
}
}
catch (const std::exception& e) {
std::cerr << "* There were no previous results" << std::endl;
std::cerr << "* Initizalizing new results" << std::endl;
results = json();
}
}
return results;
}
void GridSearch::save(json& results)
{
std::ofstream file(Paths::grid_output(config.model));
json output = {
{ "model", config.model },
{ "score", config.score },
{ "discretize", config.discretize },
{ "stratified", config.stratified },
{ "n_folds", config.n_folds },
{ "seeds", config.seeds },
{ "date", get_date() + " " + get_time()},
{ "nested", config.nested},
{ "platform", config.platform },
{ "duration", timer.getDurationString(true)},
{ "results", results }
};
file << output.dump(4);
}
} /* namespace platform */
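
Stripped of MPI and libtorch, the hyperparameter selection in process_task_mpi_consumer boils down to: average each combination's score over the nested (inner) folds and keep the best one. A compact sketch of that loop, with a hypothetical score_fold callback standing in for the fit/score of a single inner fold:
#include <cstddef>
#include <functional>

// score_fold(idx_combination, inner_fold) trains on the inner-train split and
// returns the score on the inner-test split (hypothetical stand-in for fit/score).
std::size_t select_best_combination(std::size_t n_combinations, int nested_folds,
    const std::function<double(std::size_t, int)>& score_fold)
{
    double best_score = 0.0;
    std::size_t best_idx = 0;
    for (std::size_t idx = 0; idx < n_combinations; ++idx) {
        double score = 0.0;
        for (int n = 0; n < nested_folds; ++n)
            score += score_fold(idx, n);
        score /= nested_folds;      // average over the inner folds
        if (score > best_score) {   // keep the best averaged score
            best_score = score;
            best_idx = idx;
        }
    }
    return best_idx;                // the winner is refit on the whole outer-train split afterwards
}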


@@ -1,213 +0,0 @@
#include "ManageResults.h"
#include "CommandParser.h"
#include <filesystem>
#include <tuple>
#include "Colors.h"
#include "CLocale.h"
#include "Paths.h"
#include "ReportConsole.h"
#include "ReportExcel.h"
namespace platform {
ManageResults::ManageResults(int numFiles, const std::string& model, const std::string& score, bool complete, bool partial, bool compare) :
numFiles{ numFiles }, complete{ complete }, partial{ partial }, compare{ compare }, results(Results(Paths::results(), model, score, complete, partial))
{
indexList = true;
openExcel = false;
workbook = NULL;
if (numFiles == 0) {
this->numFiles = results.size();
}
}
void ManageResults::doMenu()
{
if (results.empty()) {
std::cout << Colors::MAGENTA() << "No results found!" << Colors::RESET() << std::endl;
return;
}
results.sortDate();
list();
menu();
if (openExcel) {
workbook_close(workbook);
}
std::cout << Colors::RESET() << "Done!" << std::endl;
}
void ManageResults::list()
{
auto temp = ConfigLocale();
std::string suffix = numFiles != results.size() ? " of " + std::to_string(results.size()) : "";
std::stringstream oss;
oss << "Results on screen: " << numFiles << suffix;
std::cout << Colors::GREEN() << oss.str() << std::endl;
std::cout << std::string(oss.str().size(), '-') << std::endl;
if (complete) {
std::cout << Colors::MAGENTA() << "Only listing complete results" << std::endl;
}
if (partial) {
std::cout << Colors::MAGENTA() << "Only listing partial results" << std::endl;
}
auto i = 0;
int maxModel = results.maxModelSize();
std::cout << Colors::GREEN() << " # Date " << std::setw(maxModel) << std::left << "Model" << " Score Name Score C/P Duration Title" << std::endl;
std::cout << "=== ========== " << std::string(maxModel, '=') << " =========== =========== === ========= =============================================================" << std::endl;
bool odd = true;
for (auto& result : results) {
auto color = odd ? Colors::BLUE() : Colors::CYAN();
std::cout << color << std::setw(3) << std::fixed << std::right << i++ << " ";
std::cout << result.to_string(maxModel) << std::endl;
if (i == numFiles) {
break;
}
odd = !odd;
}
}
bool ManageResults::confirmAction(const std::string& intent, const std::string& fileName) const
{
std::string color;
if (intent == "delete") {
color = Colors::RED();
} else {
color = Colors::YELLOW();
}
std::string line;
bool finished = false;
while (!finished) {
std::cout << color << "Really want to " << intent << " " << fileName << "? (y/n): ";
getline(std::cin, line);
finished = line.size() == 1 && (tolower(line[0]) == 'y' || tolower(line[0]) == 'n');
}
if (tolower(line[0]) == 'y') {
return true;
}
std::cout << "Not done!" << std::endl;
return false;
}
void ManageResults::report(const int index, const bool excelReport)
{
std::cout << Colors::YELLOW() << "Reporting " << results.at(index).getFilename() << std::endl;
auto data = results.at(index).load();
if (excelReport) {
ReportExcel reporter(data, compare, workbook);
reporter.show();
openExcel = true;
workbook = reporter.getWorkbook();
std::cout << "Adding sheet to " << Paths::excel() + Paths::excelResults() << std::endl;
} else {
ReportConsole reporter(data, compare);
reporter.show();
}
}
void ManageResults::showIndex(const int index, const int idx)
{
// Show a dataset result inside a report
auto data = results.at(index).load();
std::cout << Colors::YELLOW() << "Showing " << results.at(index).getFilename() << std::endl;
ReportConsole reporter(data, compare, idx);
reporter.show();
}
void ManageResults::sortList()
{
std::cout << Colors::YELLOW() << "Choose sorting field (date='d', score='s', duration='u', model='m'): ";
std::string line;
char option;
getline(std::cin, line);
if (line.size() == 0)
return;
if (line.size() > 1) {
std::cout << "Invalid option" << std::endl;
return;
}
option = line[0];
switch (option) {
case 'd':
results.sortDate();
break;
case 's':
results.sortScore();
break;
case 'u':
results.sortDuration();
break;
case 'm':
results.sortModel();
break;
default:
std::cout << "Invalid option" << std::endl;
}
}
void ManageResults::menu()
{
char option;
int index, subIndex;
bool finished = false;
std::string filename;
// tuple<Option, digit, requires value>
std::vector<std::tuple<std::string, char, bool>> mainOptions = {
{"quit", 'q', false},
{"list", 'l', false},
{"delete", 'd', true},
{"hide", 'h', true},
{"sort", 's', false},
{"report", 'r', true},
{"excel", 'e', true}
};
std::vector<std::tuple<std::string, char, bool>> listOptions = {
{"report", 'r', true},
{"list", 'l', false},
{"quit", 'q', false}
};
auto parser = CommandParser();
while (!finished) {
if (indexList) {
std::tie(option, index) = parser.parse(Colors::GREEN(), mainOptions, 'r', numFiles - 1);
} else {
std::tie(option, subIndex) = parser.parse(Colors::MAGENTA(), listOptions, 'r', results.at(index).load()["results"].size() - 1);
}
switch (option) {
case 'q':
finished = true;
break;
case 'l':
list();
indexList = true;
break;
case 'd':
filename = results.at(index).getFilename();
if (!confirmAction("delete", filename))
break;
std::cout << "Deleting " << filename << std::endl;
results.deleteResult(index);
std::cout << "File: " + filename + " deleted!" << std::endl;
list();
break;
case 'h':
filename = results.at(index).getFilename();
if (!confirmAction("hide", filename))
break;
filename = results.at(index).getFilename();
std::cout << "Hiding " << filename << std::endl;
results.hideResult(index, Paths::hiddenResults());
std::cout << "File: " + filename + " hidden! (moved to " << Paths::hiddenResults() << ")" << std::endl;
list();
break;
case 's':
sortList();
list();
break;
case 'r':
if (indexList) {
report(index, false);
indexList = false;
} else {
showIndex(index, subIndex);
}
break;
case 'e':
report(index, true);
break;
}
}
}
} /* namespace platform */


@@ -1,31 +0,0 @@
#ifndef MANAGE_RESULTS_H
#define MANAGE_RESULTS_H
#include "Results.h"
#include "xlsxwriter.h"
namespace platform {
class ManageResults {
public:
ManageResults(int numFiles, const std::string& model, const std::string& score, bool complete, bool partial, bool compare);
~ManageResults() = default;
void doMenu();
private:
void list();
bool confirmAction(const std::string& intent, const std::string& fileName) const;
void report(const int index, const bool excelReport);
void showIndex(const int index, const int idx);
void sortList();
void menu();
int numFiles;
bool indexList;
bool openExcel;
bool complete;
bool partial;
bool compare;
Results results;
lxw_workbook* workbook;
};
}
#endif /* MANAGE_RESULTS_H */


@@ -1,41 +0,0 @@
#ifndef MODELS_H
#define MODELS_H
#include <map>
#include "BaseClassifier.h"
#include "AODE.h"
#include "TAN.h"
#include "KDB.h"
#include "SPODE.h"
#include "TANLd.h"
#include "KDBLd.h"
#include "SPODELd.h"
#include "AODELd.h"
#include "BoostAODE.h"
#include "STree.h"
#include "ODTE.h"
#include "SVC.h"
#include "RandomForest.h"
namespace platform {
class Models {
private:
map<std::string, function<bayesnet::BaseClassifier* (void)>> functionRegistry;
static Models* factory; //singleton
Models() {};
public:
Models(Models&) = delete;
void operator=(const Models&) = delete;
// Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory
static Models* instance();
shared_ptr<bayesnet::BaseClassifier> create(const std::string& name);
void registerFactoryFunction(const std::string& name,
function<bayesnet::BaseClassifier* (void)> classFactoryFunction);
std::vector<string> getNames();
std::string tostring();
};
class Registrar {
public:
Registrar(const std::string& className, function<bayesnet::BaseClassifier* (void)> classFactoryFunction);
};
}
#endif
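
Models is a textbook singleton factory, and Registrar points to the usual self-registration idiom: a file-scope Registrar whose constructor calls registerFactoryFunction during static initialization. A small, self-contained illustration of the idiom; the types and names below are invented for the example and are not the project's actual modelRegister.h:
#include <functional>
#include <map>
#include <memory>
#include <string>

struct Base { virtual ~Base() = default; };
struct Demo : Base {};

class Factory {
    std::map<std::string, std::function<Base* ()>> registry;
public:
    static Factory& instance() { static Factory f; return f; }
    void add(const std::string& name, std::function<Base* ()> make) { registry[name] = std::move(make); }
    std::shared_ptr<Base> create(const std::string& name) { return std::shared_ptr<Base>(registry.at(name)()); }
};

struct Registrar {
    Registrar(const std::string& name, std::function<Base* ()> make) { Factory::instance().add(name, std::move(make)); }
};

// Self-registration at static-initialization time, the pattern modelRegister.h presumably follows for each classifier
static Registrar registrarDemo("Demo", []() -> Base* { return new Demo(); });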


@@ -1,39 +0,0 @@
#ifndef PATHS_H
#define PATHS_H
#include <string>
#include <filesystem>
#include "DotEnv.h"
namespace platform {
class Paths {
public:
static std::string results() { return "results/"; }
static std::string hiddenResults() { return "hidden_results/"; }
static std::string excel() { return "excel/"; }
static std::string grid() { return "grid/"; }
static std::string datasets()
{
auto env = platform::DotEnv();
return env.get("source_data");
}
static void createPath(const std::string& path)
{
// Create directory if it does not exist
try {
std::filesystem::create_directory(path);
}
catch (std::exception& e) {
throw std::runtime_error("Could not create directory " + path);
}
}
static std::string excelResults() { return "some_results.xlsx"; }
static std::string grid_input(const std::string& model)
{
return grid() + "grid_" + model + "_input.json";
}
static std::string grid_output(const std::string& model)
{
return grid() + "grid_" + model + "_output.json";
}
};
}
#endif
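
Putting the helpers together: since grid() returns "grid/", a call like Paths::grid_input("TAN") (model name illustrative) resolves to "grid/grid_TAN_input.json" and Paths::grid_output("TAN") to "grid/grid_TAN_output.json", while datasets() takes its location from the source_data entry read through DotEnv.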


@@ -1,114 +0,0 @@
#include <iostream>
#include <sstream>
#include <locale>
#include "ReportConsole.h"
#include "BestScore.h"
#include "CLocale.h"
namespace platform {
std::string ReportConsole::headerLine(const std::string& text, int utf = 0)
{
int n = MAXL - text.length() - 3;
n = n < 0 ? 0 : n;
return "* " + text + std::string(n + utf, ' ') + "*\n";
}
void ReportConsole::header()
{
std::stringstream oss;
std::cout << Colors::MAGENTA() << std::string(MAXL, '*') << std::endl;
std::cout << headerLine(
"Report " + data["model"].get<std::string>() + " ver. " + data["version"].get<std::string>()
+ " with " + std::to_string(data["folds"].get<int>()) + " Folds cross validation and " + std::to_string(data["seeds"].size())
+ " random seeds. " + data["date"].get<std::string>() + " " + data["time"].get<std::string>()
);
std::cout << headerLine(data["title"].get<std::string>());
std::cout << headerLine("Random seeds: " + fromVector("seeds") + " Stratified: " + (data["stratified"].get<bool>() ? "True" : "False"));
oss << "Execution took " << std::setprecision(2) << std::fixed << data["duration"].get<float>()
<< " seconds, " << data["duration"].get<float>() / 3600 << " hours, on " << data["platform"].get<std::string>();
std::cout << headerLine(oss.str());
std::cout << headerLine("Score is " + data["score_name"].get<std::string>());
std::cout << std::string(MAXL, '*') << std::endl;
std::cout << std::endl;
}
void ReportConsole::body()
{
auto tmp = ConfigLocale();
int maxHyper = 15;
int maxDataset = 7;
for (const auto& r : data["results"]) {
maxHyper = std::max(maxHyper, (int)r["hyperparameters"].dump().size());
maxDataset = std::max(maxDataset, (int)r["dataset"].get<std::string>().size());
}
std::cout << Colors::GREEN() << " # " << std::setw(maxDataset) << std::left << "Dataset" << " Sampl. Feat. Cls Nodes Edges States Score Time Hyperparameters" << std::endl;
std::cout << "=== " << std::string(maxDataset, '=') << " ====== ===== === ========= ========= ========= =============== =================== " << std::string(maxHyper, '=') << std::endl;
json lastResult;
double totalScore = 0.0;
bool odd = true;
int index = 0;
for (const auto& r : data["results"]) {
if (selectedIndex != -1 && index != selectedIndex) {
index++;
continue;
}
auto color = odd ? Colors::CYAN() : Colors::BLUE();
std::cout << color;
std::cout << std::setw(3) << std::right << index++ << " ";
std::cout << std::setw(maxDataset) << std::left << r["dataset"].get<std::string>() << " ";
std::cout << std::setw(6) << std::right << r["samples"].get<int>() << " ";
std::cout << std::setw(5) << std::right << r["features"].get<int>() << " ";
std::cout << std::setw(3) << std::right << r["classes"].get<int>() << " ";
std::cout << std::setw(9) << std::setprecision(2) << std::fixed << r["nodes"].get<float>() << " ";
std::cout << std::setw(9) << std::setprecision(2) << std::fixed << r["leaves"].get<float>() << " ";
std::cout << std::setw(9) << std::setprecision(2) << std::fixed << r["depth"].get<float>() << " ";
std::cout << std::setw(8) << std::right << std::setprecision(6) << std::fixed << r["score"].get<double>() << "±" << std::setw(6) << std::setprecision(4) << std::fixed << r["score_std"].get<double>();
const std::string status = compareResult(r["dataset"].get<std::string>(), r["score"].get<double>());
std::cout << status;
std::cout << std::setw(12) << std::right << std::setprecision(6) << std::fixed << r["time"].get<double>() << "±" << std::setw(6) << std::setprecision(4) << std::fixed << r["time_std"].get<double>() << " ";
std::cout << r["hyperparameters"].dump();
std::cout << std::endl;
std::cout << std::flush;
lastResult = r;
totalScore += r["score"].get<double>();
odd = !odd;
}
if (data["results"].size() == 1 || selectedIndex != -1) {
std::cout << std::string(MAXL, '*') << std::endl;
std::cout << headerLine(fVector("Train scores: ", lastResult["scores_train"], 14, 12));
std::cout << headerLine(fVector("Test scores: ", lastResult["scores_test"], 14, 12));
std::cout << headerLine(fVector("Train times: ", lastResult["times_train"], 10, 3));
std::cout << headerLine(fVector("Test times: ", lastResult["times_test"], 10, 3));
std::cout << std::string(MAXL, '*') << std::endl;
} else {
footer(totalScore);
}
}
void ReportConsole::showSummary()
{
for (const auto& item : summary) {
std::stringstream oss;
oss << std::setw(3) << std::left << item.first;
oss << std::setw(3) << std::right << item.second << " ";
oss << std::left << meaning.at(item.first);
std::cout << headerLine(oss.str(), 2);
}
}
void ReportConsole::footer(double totalScore)
{
std::cout << Colors::MAGENTA() << std::string(MAXL, '*') << std::endl;
showSummary();
auto score = data["score_name"].get<std::string>();
auto best = BestScore::getScore(score);
if (best.first != "") {
std::stringstream oss;
oss << score << " compared to " << best.first << " .: " << totalScore / best.second;
std::cout << headerLine(oss.str());
}
if (!getExistBestFile() && compare) {
std::cout << headerLine("*** Best Results File not found. Couldn't compare any result!");
}
std::cout << std::string(MAXL, '*') << std::endl << Colors::RESET();
}
}
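
headerLine pads every banner to the MAXL frame (133 columns, per ReportConsole.h): for headerLine("Score is accuracy") the padding is n = 133 - 17 - 3 = 113, so the output is "* ", the text, 113 spaces and a closing "*", exactly 133 characters wide. The utf argument appears to add compensating spaces when the text contains multi-byte UTF-8 characters, which std::string::length() counts as several bytes; showSummary, for instance, calls headerLine(oss.str(), 2).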


@@ -1,22 +0,0 @@
#ifndef REPORTCONSOLE_H
#define REPORTCONSOLE_H
#include <string>
#include "ReportBase.h"
#include "Colors.h"
namespace platform {
const int MAXL = 133;
class ReportConsole : public ReportBase {
public:
explicit ReportConsole(json data_, bool compare = false, int index = -1) : ReportBase(data_, compare), selectedIndex(index) {};
virtual ~ReportConsole() = default;
private:
int selectedIndex;
std::string headerLine(const std::string& text, int utf);
void header() override;
void body() override;
void footer(double totalScore);
void showSummary() override;
};
};
#endif


@@ -1,180 +0,0 @@
#include <sstream>
#include <locale>
#include "ReportExcel.h"
#include "BestScore.h"
namespace platform {
ReportExcel::ReportExcel(json data_, bool compare, lxw_workbook* workbook, lxw_worksheet* worksheet) : ReportBase(data_, compare), ExcelFile(workbook, worksheet)
{
createFile();
}
void ReportExcel::formatColumns()
{
worksheet_freeze_panes(worksheet, 6, 1);
std::vector<int> columns_sizes = { 22, 10, 9, 7, 12, 12, 12, 12, 12, 3, 15, 12, 23 };
for (int i = 0; i < columns_sizes.size(); ++i) {
worksheet_set_column(worksheet, i, i, columns_sizes.at(i), NULL);
}
}
void ReportExcel::createWorksheet()
{
const std::string name = data["model"].get<std::string>();
std::string suffix = "";
std::string efectiveName;
int num = 1;
// Create a sheet with the name of the model
while (true) {
efectiveName = name + suffix;
if (workbook_get_worksheet_by_name(workbook, efectiveName.c_str())) {
suffix = std::to_string(++num);
} else {
worksheet = workbook_add_worksheet(workbook, efectiveName.c_str());
break;
}
if (num > 100) {
throw std::invalid_argument("Couldn't create sheet " + efectiveName);
}
}
}
void ReportExcel::createFile()
{
if (workbook == NULL) {
workbook = workbook_new((Paths::excel() + Paths::excelResults()).c_str());
}
if (worksheet == NULL) {
createWorksheet();
}
setProperties(data["title"].get<std::string>());
createFormats();
formatColumns();
}
void ReportExcel::closeFile()
{
workbook_close(workbook);
}
void ReportExcel::header()
{
std::locale mylocale(std::cout.getloc(), new separated);
std::locale::global(mylocale);
std::cout.imbue(mylocale);
std::stringstream oss;
std::string message = data["model"].get<std::string>() + " ver. " + data["version"].get<std::string>() + " " +
data["language"].get<std::string>() + " ver. " + data["language_version"].get<std::string>() +
" with " + std::to_string(data["folds"].get<int>()) + " Folds cross validation and " + std::to_string(data["seeds"].size()) +
" random seeds. " + data["date"].get<std::string>() + " " + data["time"].get<std::string>();
worksheet_merge_range(worksheet, 0, 0, 0, 12, message.c_str(), styles["headerFirst"]);
worksheet_merge_range(worksheet, 1, 0, 1, 12, data["title"].get<std::string>().c_str(), styles["headerRest"]);
worksheet_merge_range(worksheet, 2, 0, 3, 0, ("Score is " + data["score_name"].get<std::string>()).c_str(), styles["headerRest"]);
worksheet_merge_range(worksheet, 2, 1, 3, 3, "Execution time", styles["headerRest"]);
oss << std::setprecision(2) << std::fixed << data["duration"].get<float>() << " s";
worksheet_merge_range(worksheet, 2, 4, 2, 5, oss.str().c_str(), styles["headerRest"]);
oss.str("");
oss.clear();
oss << std::setprecision(2) << std::fixed << data["duration"].get<float>() / 3600 << " h";
worksheet_merge_range(worksheet, 3, 4, 3, 5, oss.str().c_str(), styles["headerRest"]);
worksheet_merge_range(worksheet, 2, 6, 3, 7, "Platform", styles["headerRest"]);
worksheet_merge_range(worksheet, 2, 8, 3, 9, data["platform"].get<std::string>().c_str(), styles["headerRest"]);
worksheet_merge_range(worksheet, 2, 10, 2, 12, ("Random seeds: " + fromVector("seeds")).c_str(), styles["headerSmall"]);
oss.str("");
oss.clear();
oss << "Stratified: " << (data["stratified"].get<bool>() ? "True" : "False");
worksheet_merge_range(worksheet, 3, 10, 3, 11, oss.str().c_str(), styles["headerSmall"]);
oss.str("");
oss.clear();
oss << "Discretized: " << (data["discretized"].get<bool>() ? "True" : "False");
worksheet_write_string(worksheet, 3, 12, oss.str().c_str(), styles["headerSmall"]);
}
void ReportExcel::body()
{
auto head = std::vector<std::string>(
{ "Dataset", "Samples", "Features", "Classes", "Nodes", "Edges", "States", "Score", "Score Std.", "St.", "Time",
"Time Std.", "Hyperparameters" });
int col = 0;
for (const auto& item : head) {
writeString(5, col++, item, "bodyHeader");
}
row = 6;
col = 0;
int hypSize = 22;
json lastResult;
double totalScore = 0.0;
std::string hyperparameters;
for (const auto& r : data["results"]) {
writeString(row, col, r["dataset"].get<std::string>(), "text");
writeInt(row, col + 1, r["samples"].get<int>(), "ints");
writeInt(row, col + 2, r["features"].get<int>(), "ints");
writeInt(row, col + 3, r["classes"].get<int>(), "ints");
writeDouble(row, col + 4, r["nodes"].get<float>(), "floats");
writeDouble(row, col + 5, r["leaves"].get<float>(), "floats");
writeDouble(row, col + 6, r["depth"].get<double>(), "floats");
writeDouble(row, col + 7, r["score"].get<double>(), "result");
writeDouble(row, col + 8, r["score_std"].get<double>(), "result");
const std::string status = compareResult(r["dataset"].get<std::string>(), r["score"].get<double>());
writeString(row, col + 9, status, "textCentered");
writeDouble(row, col + 10, r["time"].get<double>(), "time");
writeDouble(row, col + 11, r["time_std"].get<double>(), "time");
hyperparameters = r["hyperparameters"].dump();
if (hyperparameters.size() > hypSize) {
hypSize = hyperparameters.size();
}
writeString(row, col + 12, hyperparameters, "text");
lastResult = r;
totalScore += r["score"].get<double>();
row++;
}
// Set the hyperparameters column width to the longest value found
worksheet_set_column(worksheet, 12, 12, hypSize + 5, NULL);
// Show totals if only one dataset is present in the result
if (data["results"].size() == 1) {
for (const std::string& group : { "scores_train", "scores_test", "times_train", "times_test" }) {
row++;
col = 1;
writeString(row, col, group, "text");
for (double item : lastResult[group]) {
std::string style = group.find("scores") != std::string::npos ? "result" : "time";
writeDouble(row, ++col, item, style);
}
}
// Set width of columns to show those totals completely
worksheet_set_column(worksheet, 1, 1, 12, NULL);
for (int i = 2; i < 7; ++i) {
// setting the whole range in one from-col/to-col call doesn't work here, so set each column individually
worksheet_set_column(worksheet, i, i, 15, NULL);
}
} else {
footer(totalScore, row);
}
}
void ReportExcel::showSummary()
{
for (const auto& item : summary) {
worksheet_write_string(worksheet, row + 2, 1, item.first.c_str(), styles["summaryStyle"]);
worksheet_write_number(worksheet, row + 2, 2, item.second, styles["summaryStyle"]);
worksheet_merge_range(worksheet, row + 2, 3, row + 2, 5, meaning.at(item.first).c_str(), styles["summaryStyle"]);
row += 1;
}
}
void ReportExcel::footer(double totalScore, int row)
{
showSummary();
row += 4 + summary.size();
auto score = data["score_name"].get<std::string>();
auto best = BestScore::getScore(score);
if (best.first != "") {
worksheet_merge_range(worksheet, row, 1, row, 5, (score + " compared to " + best.first + " .:").c_str(), efectiveStyle("text"));
writeDouble(row, 6, totalScore / best.second, "result");
}
if (!getExistBestFile() && compare) {
worksheet_write_string(worksheet, row + 1, 0, "*** Best Results File not found. Couldn't compare any result!", styles["summaryStyle"]);
}
}
}


@@ -1,58 +0,0 @@
#include "Result.h"
#include "BestScore.h"
#include <filesystem>
#include <fstream>
#include <sstream>
#include "Colors.h"
#include "DotEnv.h"
#include "CLocale.h"
namespace platform {
Result::Result(const std::string& path, const std::string& filename)
: path(path)
, filename(filename)
{
auto data = load();
date = data["date"];
score = 0;
for (const auto& result : data["results"]) {
score += result["score"].get<double>();
}
scoreName = data["score_name"];
auto best = BestScore::getScore(scoreName);
if (best.first != "") {
score /= best.second;
}
title = data["title"];
duration = data["duration"];
model = data["model"];
complete = data["results"].size() > 1;
}
json Result::load() const
{
std::ifstream resultData(path + "/" + filename);
if (resultData.is_open()) {
json data = json::parse(resultData);
return data;
}
throw std::invalid_argument("Unable to open result file. [" + path + "/" + filename + "]");
}
std::string Result::to_string(int maxModel) const
{
auto tmp = ConfigLocale();
std::stringstream oss;
double durationShow = duration > 3600 ? duration / 3600 : duration > 60 ? duration / 60 : duration;
std::string durationUnit = duration > 3600 ? "h" : duration > 60 ? "m" : "s";
oss << date << " ";
oss << std::setw(maxModel) << std::left << model << " ";
oss << std::setw(11) << std::left << scoreName << " ";
oss << std::right << std::setw(11) << std::setprecision(7) << std::fixed << score << " ";
auto completeString = isComplete() ? "C" : "P";
oss << std::setw(1) << " " << completeString << " ";
oss << std::setw(7) << std::setprecision(2) << std::fixed << durationShow << " " << durationUnit << " ";
oss << std::setw(50) << std::left << title << " ";
return oss.str();
}
}
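
The duration column in to_string picks its unit with the nested ternaries above: a run of 5400 seconds is shown as 1.50 h (5400 > 3600, so it is divided by 3600), 90 seconds as 1.50 m, and 45 seconds as 45.00 s.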


@@ -1,35 +0,0 @@
#ifndef RESULT_H
#define RESULT_H
#include <map>
#include <vector>
#include <string>
#include <nlohmann/json.hpp>
namespace platform {
using json = nlohmann::json;
class Result {
public:
Result(const std::string& path, const std::string& filename);
json load() const;
std::string to_string(int maxModel) const;
std::string getFilename() const { return filename; };
std::string getDate() const { return date; };
double getScore() const { return score; };
std::string getTitle() const { return title; };
double getDuration() const { return duration; };
std::string getModel() const { return model; };
std::string getScoreName() const { return scoreName; };
bool isComplete() const { return complete; };
private:
std::string path;
std::string filename;
std::string date;
double score;
std::string title;
double duration;
std::string model;
std::string scoreName;
bool complete;
};
};
#endif


@@ -1,74 +0,0 @@
#include "Results.h"
#include <algorithm>
namespace platform {
Results::Results(const std::string& path, const std::string& model, const std::string& score, bool complete, bool partial) :
path(path), model(model), scoreName(score), complete(complete), partial(partial)
{
load();
if (!files.empty()) {
maxModel = (*max_element(files.begin(), files.end(), [](const Result& a, const Result& b) { return a.getModel().size() < b.getModel().size(); })).getModel().size();
} else {
maxModel = 0;
}
};
void Results::load()
{
using std::filesystem::directory_iterator;
for (const auto& file : directory_iterator(path)) {
auto filename = file.path().filename().string();
if (filename.find(".json") != std::string::npos && filename.find("results_") == 0) {
auto result = Result(path, filename);
bool addResult = true;
if (model != "any" && result.getModel() != model || scoreName != "any" && scoreName != result.getScoreName() || complete && !result.isComplete() || partial && result.isComplete())
addResult = false;
if (addResult)
files.push_back(result);
}
}
}
void Results::hideResult(int index, const std::string& pathHidden)
{
auto filename = files.at(index).getFilename();
rename((path + "/" + filename).c_str(), (pathHidden + "/" + filename).c_str());
files.erase(files.begin() + index);
}
void Results::deleteResult(int index)
{
auto filename = files.at(index).getFilename();
remove((path + "/" + filename).c_str());
files.erase(files.begin() + index);
}
int Results::size() const
{
return files.size();
}
void Results::sortDate()
{
sort(files.begin(), files.end(), [](const Result& a, const Result& b) {
return a.getDate() > b.getDate();
});
}
void Results::sortModel()
{
sort(files.begin(), files.end(), [](const Result& a, const Result& b) {
return a.getModel() > b.getModel();
});
}
void Results::sortDuration()
{
sort(files.begin(), files.end(), [](const Result& a, const Result& b) {
return a.getDuration() > b.getDuration();
});
}
void Results::sortScore()
{
sort(files.begin(), files.end(), [](const Result& a, const Result& b) {
return a.getScore() > b.getScore();
});
}
bool Results::empty() const
{
return files.empty();
}
}


@@ -1,63 +0,0 @@
#ifndef STATISTICS_H
#define STATISTICS_H
#include <iostream>
#include <vector>
#include <map>
#include <nlohmann/json.hpp>
using json = nlohmann::json;
namespace platform {
struct WTL {
int win;
int tie;
int loss;
};
struct FriedmanResult {
double statistic;
double criticalValue;
long double pvalue;
bool reject;
};
struct HolmLine {
std::string model;
long double pvalue;
double rank;
WTL wtl;
bool reject;
};
struct HolmResult {
std::string model;
std::vector<HolmLine> holmLines;
};
class Statistics {
public:
Statistics(const std::vector<std::string>& models, const std::vector<std::string>& datasets, const json& data, double significance = 0.05, bool output = true);
bool friedmanTest();
void postHocHolmTest(bool friedmanResult);
FriedmanResult& getFriedmanResult();
HolmResult& getHolmResult();
std::map<std::string, std::map<std::string, float>>& getRanks();
private:
void fit();
void computeRanks();
void computeWTL();
const std::vector<std::string>& models;
const std::vector<std::string>& datasets;
const json& data;
double significance;
bool output;
bool fitted = false;
int nModels = 0;
int nDatasets = 0;
int controlIdx = 0;
std::map<int, WTL> wtl;
std::map<std::string, float> ranks;
int maxModelName = 0;
int maxDatasetName = 0;
FriedmanResult friedmanResult;
HolmResult holmResult;
std::map<std::string, std::map<std::string, float>> ranksModels;
};
}
#endif // !STATISTICS_H
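
For context, the textbook form of the Friedman statistic that friedmanTest() presumably computes, for $k$ models ranked on $N$ datasets with average ranks $\bar{R}_j$, is

$$\chi^2_F = \frac{12N}{k(k+1)} \left[ \sum_{j=1}^{k} \bar{R}_j^2 - \frac{k(k+1)^2}{4} \right]$$

postHocHolmTest() presumably applies Holm's step-down rule: sort the $m = k - 1$ p-values of the comparisons against the control in ascending order and reject $H_{(i)}$ while $p_{(i)} \le \alpha / (m - i + 1)$, with $\alpha$ the significance argument of the constructor (default 0.05). The implementation itself is not part of this header.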


@@ -1,30 +0,0 @@
#ifndef UTILS_H
#define UTILS_H
#include <algorithm>
#include <cctype>
#include <sstream>
#include <string>
#include <vector>
namespace platform {
//static std::vector<std::string> split(const std::string& text, char delimiter);
static std::vector<std::string> split(const std::string& text, char delimiter)
{
std::vector<std::string> result;
std::stringstream ss(text);
std::string token;
while (std::getline(ss, token, delimiter)) {
result.push_back(token);
}
return result;
}
static std::string trim(const std::string& str)
{
std::string result = str;
result.erase(result.begin(), std::find_if(result.begin(), result.end(), [](int ch) {
return !std::isspace(ch);
}));
result.erase(std::find_if(result.rbegin(), result.rend(), [](int ch) {
return !std::isspace(ch);
}).base(), result.end());
return result;
}
}
#endif
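
Both helpers are header-only and easy to exercise in isolation. A tiny usage sketch, assuming the header is reachable as "Utils.h":
#include <cassert>
#include "Utils.h"

int main()
{
    auto parts = platform::split("a,b,c", ',');
    assert(parts.size() == 3 && parts[1] == "b");
    assert(platform::trim("  hello \t") == "hello");
    return 0;
}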


@@ -1,85 +0,0 @@
#include <iostream>
#include <argparse/argparse.hpp>
#include "Paths.h"
#include "BestResults.h"
#include "Colors.h"
#include "config.h"
void manageArguments(argparse::ArgumentParser& program, int argc, char** argv)
{
program.add_argument("-m", "--model").default_value("").help("Filter results of the selected model) (any for all models)");
program.add_argument("-s", "--score").default_value("").help("Filter results of the score name supplied");
program.add_argument("--build").help("build best score results file").default_value(false).implicit_value(true);
program.add_argument("--report").help("report of best score results file").default_value(false).implicit_value(true);
program.add_argument("--friedman").help("Friedman test").default_value(false).implicit_value(true);
program.add_argument("--excel").help("Output to excel").default_value(false).implicit_value(true);
program.add_argument("--level").help("significance level").default_value(0.05).scan<'g', double>().action([](const std::string& value) {
try {
auto k = std::stod(value);
if (k < 0.01 || k > 0.15) {
throw std::runtime_error("Significance level hast to be a number in [0.01, 0.15]");
}
return k;
}
catch (const std::runtime_error& err) {
throw std::runtime_error(err.what());
}
catch (...) {
throw std::runtime_error("Number of folds must be an decimal number");
}});
}
int main(int argc, char** argv)
{
argparse::ArgumentParser program("b_best", { project_version.begin(), project_version.end() });
manageArguments(program, argc, argv);
std::string model, score;
bool build, report, friedman, excel;
double level;
try {
program.parse_args(argc, argv);
model = program.get<std::string>("model");
score = program.get<std::string>("score");
build = program.get<bool>("build");
report = program.get<bool>("report");
friedman = program.get<bool>("friedman");
excel = program.get<bool>("excel");
level = program.get<double>("level");
if (model == "" || score == "") {
throw std::runtime_error("Model and score name must be supplied");
}
if (friedman && model != "any") {
std::cerr << "Friedman test can only be used with all models" << std::endl;
std::cerr << program;
exit(1);
}
if (!report && !build) {
std::cerr << "Either build, report or both, have to be selected to do anything!" << std::endl;
std::cerr << program;
exit(1);
}
}
catch (const std::exception& err) {
std::cerr << err.what() << std::endl;
std::cerr << program;
exit(1);
}
// Generate report
auto results = platform::BestResults(platform::Paths::results(), score, model, friedman, level);
if (build) {
if (model == "any") {
results.buildAll();
} else {
std::string fileName = results.build();
std::cout << Colors::GREEN() << fileName << " created!" << Colors::RESET() << std::endl;
}
}
if (report) {
if (model == "any") {
results.reportAll(excel);
} else {
results.reportSingle(excel);
}
}
return 0;
}
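
Given those checks, a typical invocation is, for example, b_best -m any -s accuracy --build --report, which rebuilds the best-results file for every model and then reports them; --friedman (only accepted together with -m any) adds the Friedman test at the default 0.05 significance level, and --excel writes the report to a spreadsheet as well.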


@@ -1,232 +0,0 @@
#include <iostream>
#include <argparse/argparse.hpp>
#include <map>
#include <nlohmann/json.hpp>
#include <mpi.h>
#include "DotEnv.h"
#include "Models.h"
#include "modelRegister.h"
#include "GridSearch.h"
#include "Paths.h"
#include "Timer.h"
#include "Colors.h"
#include "config.h"
using json = nlohmann::json;
const int MAXL = 133;
void manageArguments(argparse::ArgumentParser& program)
{
auto env = platform::DotEnv();
auto& group = program.add_mutually_exclusive_group(true);
program.add_argument("-m", "--model")
.help("Model to use " + platform::Models::instance()->tostring())
.action([](const std::string& value) {
static const std::vector<std::string> choices = platform::Models::instance()->getNames();
if (find(choices.begin(), choices.end(), value) != choices.end()) {
return value;
}
throw std::runtime_error("Model must be one of " + platform::Models::instance()->tostring());
}
);
group.add_argument("--dump").help("Show the grid combinations").default_value(false).implicit_value(true);
group.add_argument("--report").help("Report the computed hyperparameters").default_value(false).implicit_value(true);
group.add_argument("--compute").help("Perform computation of the grid output hyperparameters").default_value(false).implicit_value(true);
program.add_argument("--discretize").help("Discretize input datasets").default_value((bool)stoi(env.get("discretize"))).implicit_value(true);
program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value((bool)stoi(env.get("stratified"))).implicit_value(true);
program.add_argument("--quiet").help("Don't display detailed progress").default_value(false).implicit_value(true);
program.add_argument("--continue").help("Continue computing from that dataset").default_value(platform::GridSearch::NO_CONTINUE());
program.add_argument("--only").help("Used with continue to compute that dataset only").default_value(false).implicit_value(true);
program.add_argument("--exclude").default_value("[]").help("Datasets to exclude in json format, e.g. [\"dataset1\", \"dataset2\"]");
program.add_argument("--nested").help("Set the double/nested cross validation number of folds").default_value(5).scan<'i', int>().action([](const std::string& value) {
try {
auto k = stoi(value);
if (k < 2) {
throw std::runtime_error("Number of nested folds must be greater than 1");
}
return k;
}
catch (const runtime_error& err) {
throw std::runtime_error(err.what());
}
catch (...) {
throw std::runtime_error("Number of nested folds must be an integer");
}});
program.add_argument("--score").help("Score used in gridsearch").default_value("accuracy");
program.add_argument("-f", "--folds").help("Number of folds").default_value(stoi(env.get("n_folds"))).scan<'i', int>().action([](const std::string& value) {
try {
auto k = stoi(value);
if (k < 2) {
throw std::runtime_error("Number of folds must be greater than 1");
}
return k;
}
catch (const runtime_error& err) {
throw std::runtime_error(err.what());
}
catch (...) {
throw std::runtime_error("Number of folds must be an integer");
}});
auto seed_values = env.getSeeds();
program.add_argument("-s", "--seeds").nargs(1, 10).help("Random seeds. Set to -1 to have pseudo random").scan<'i', int>().default_value(seed_values);
}
void list_dump(std::string& model)
{
auto data = platform::GridData(platform::Paths::grid_input(model));
std::cout << Colors::MAGENTA() << "Listing configuration input file (Grid)" << std::endl << std::endl;
int index = 0;
int max_hyper = 15;
int max_dataset = 7;
auto combinations = data.getGridFile();
for (auto const& item : combinations) {
if (item.first.size() > max_dataset) {
max_dataset = item.first.size();
}
if (item.second.dump().size() > max_hyper) {
max_hyper = item.second.dump().size();
}
}
std::cout << Colors::GREEN() << left << " # " << left << setw(max_dataset) << "Dataset" << " #Com. "
<< setw(max_hyper) << "Hyperparameters" << std::endl;
std::cout << "=== " << string(max_dataset, '=') << " ===== " << string(max_hyper, '=') << std::endl;
bool odd = true;
for (auto const& item : combinations) {
auto color = odd ? Colors::CYAN() : Colors::BLUE();
std::cout << color;
auto num_combinations = data.getNumCombinations(item.first);
std::cout << setw(3) << fixed << right << ++index << left << " " << setw(max_dataset) << item.first
<< " " << setw(5) << right << num_combinations << " " << setw(max_hyper) << item.second.dump() << std::endl;
odd = !odd;
}
std::cout << Colors::RESET() << std::endl;
}
std::string headerLine(const std::string& text, int utf = 0)
{
int n = MAXL - text.length() - 3;
n = n < 0 ? 0 : n;
return "* " + text + std::string(n + utf, ' ') + "*\n";
}
void list_results(json& results, std::string& model)
{
std::cout << Colors::MAGENTA() << std::string(MAXL, '*') << std::endl;
std::cout << headerLine("Listing computed hyperparameters for model " + model);
std::cout << headerLine("Date & time: " + results["date"].get<std::string>() + " Duration: " + results["duration"].get<std::string>());
std::cout << headerLine("Score: " + results["score"].get<std::string>());
std::cout << headerLine(
"Random seeds: " + results["seeds"].dump()
+ " Discretized: " + (results["discretize"].get<bool>() ? "True" : "False")
+ " Stratified: " + (results["stratified"].get<bool>() ? "True" : "False")
+ " #Folds: " + std::to_string(results["n_folds"].get<int>())
+ " Nested: " + (results["nested"].get<int>() == 0 ? "False" : to_string(results["nested"].get<int>()))
);
std::cout << std::string(MAXL, '*') << std::endl;
int spaces = 7;
int hyperparameters_spaces = 15;
for (const auto& item : results["results"].items()) {
auto key = item.key();
auto value = item.value();
if (key.size() > spaces) {
spaces = key.size();
}
if (value["hyperparameters"].dump().size() > hyperparameters_spaces) {
hyperparameters_spaces = value["hyperparameters"].dump().size();
}
}
std::cout << Colors::GREEN() << " # " << left << setw(spaces) << "Dataset" << " " << setw(19) << "Date" << " "
<< "Duration " << setw(8) << "Score" << " " << "Hyperparameters" << std::endl;
std::cout << "=== " << string(spaces, '=') << " " << string(19, '=') << " " << string(8, '=') << " "
<< string(8, '=') << " " << string(hyperparameters_spaces, '=') << std::endl;
bool odd = true;
int index = 0;
for (const auto& item : results["results"].items()) {
auto color = odd ? Colors::CYAN() : Colors::BLUE();
auto value = item.value();
std::cout << color;
std::cout << std::setw(3) << std::right << index++ << " ";
std::cout << left << setw(spaces) << item.key() << " " << value["date"].get<string>()
<< " " << setw(8) << right << value["duration"].get<string>() << " " << setw(8) << setprecision(6)
<< fixed << right << value["score"].get<double>() << " " << value["hyperparameters"].dump() << std::endl;
odd = !odd;
}
std::cout << Colors::RESET() << std::endl;
}
/*
* Main
*/
int main(int argc, char** argv)
{
argparse::ArgumentParser program("b_grid", { project_version.begin(), project_version.end() });
manageArguments(program);
struct platform::ConfigGrid config;
bool dump, compute;
try {
program.parse_args(argc, argv);
config.model = program.get<std::string>("model");
config.score = program.get<std::string>("score");
config.discretize = program.get<bool>("discretize");
config.stratified = program.get<bool>("stratified");
config.n_folds = program.get<int>("folds");
config.quiet = program.get<bool>("quiet");
config.only = program.get<bool>("only");
config.seeds = program.get<std::vector<int>>("seeds");
config.nested = program.get<int>("nested");
config.continue_from = program.get<std::string>("continue");
if (config.continue_from == platform::GridSearch::NO_CONTINUE() && config.only) {
throw std::runtime_error("Cannot use --only without --continue");
}
dump = program.get<bool>("dump");
compute = program.get<bool>("compute");
if (dump && (config.continue_from != platform::GridSearch::NO_CONTINUE() || config.only)) {
throw std::runtime_error("Cannot use --dump with --continue or --only");
}
auto excluded = program.get<std::string>("exclude");
config.excluded = json::parse(excluded);
}
catch (const exception& err) {
cerr << err.what() << std::endl;
cerr << program;
exit(1);
}
/*
* Begin Processing
*/
auto env = platform::DotEnv();
config.platform = env.get("platform");
platform::Paths::createPath(platform::Paths::grid());
auto grid_search = platform::GridSearch(config);
platform::Timer timer;
timer.start();
if (dump) {
list_dump(config.model);
} else {
if (compute) {
struct platform::ConfigMPI mpi_config;
mpi_config.manager = 0; // which process is the manager
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_config.rank);
MPI_Comm_size(MPI_COMM_WORLD, &mpi_config.n_procs);
if (mpi_config.n_procs < 2) {
throw std::runtime_error("Cannot use --compute with less than 2 mpi processes, try mpirun -np 2 ...");
}
grid_search.go(mpi_config);
if (mpi_config.rank == mpi_config.manager) {
auto results = grid_search.loadResults();
list_results(results, config.model);
std::cout << "Process took " << timer.getDurationString() << std::endl;
}
MPI_Finalize();
} else {
// List results
auto results = grid_search.loadResults();
if (results.empty()) {
std::cout << "** No results found" << std::endl;
} else {
list_results(results, config.model);
}
}
}
std::cout << "Done!" << std::endl;
return 0;
}
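For orientation, the compute branch above needs at least two ranks because grid_search.go(mpi_config) follows the usual MPI manager/worker layout: the manager rank hands out grid combinations and gathers the scores while the remaining ranks do the evaluation. The sketch below shows that message choreography only, using the standard MPI C API; the task payloads, tags and round-robin scheduling are illustrative, not the platform's actual protocol.

#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int rank, n_procs;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &n_procs);
    const int manager = 0;
    const int n_tasks = 10; // illustrative number of grid combinations
    if (n_procs < 2) { // same requirement as b_grid --compute
        MPI_Finalize();
        return 1;
    }
    if (rank == manager) {
        // Manager: send each task index to a worker and collect its score
        for (int t = 0; t < n_tasks; ++t) {
            int worker = 1 + t % (n_procs - 1);
            MPI_Send(&t, 1, MPI_INT, worker, 0, MPI_COMM_WORLD);
            double score;
            MPI_Recv(&score, 1, MPI_DOUBLE, worker, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            std::printf("task %d -> %.4f (rank %d)\n", t, score, worker);
        }
        // Tell every worker to stop
        int stop = -1;
        for (int w = 1; w < n_procs; ++w)
            MPI_Send(&stop, 1, MPI_INT, w, 0, MPI_COMM_WORLD);
    } else {
        // Worker: receive task indices until the stop signal arrives
        while (true) {
            int task;
            MPI_Recv(&task, 1, MPI_INT, manager, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            if (task < 0) break;
            double score = 0.5 + task * 0.01; // placeholder for the real cross-validation
            MPI_Send(&score, 1, MPI_DOUBLE, manager, 1, MPI_COMM_WORLD);
        }
    }
    MPI_Finalize();
    return 0;
}

The real implementation presumably keeps all workers busy concurrently; the sequential loop here only keeps the example short.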


@@ -1,56 +0,0 @@
#include <iostream>
#include <locale>
#include "Paths.h"
#include "Colors.h"
#include "Datasets.h"
using namespace std;
const int BALANCE_LENGTH = 75;
struct separated : numpunct<char> {
char do_decimal_point() const { return ','; }
char do_thousands_sep() const { return '.'; }
std::string do_grouping() const { return "\03"; }
};
void outputBalance(const std::string& balance)
{
auto temp = std::string(balance);
while (temp.size() > BALANCE_LENGTH - 1) {
auto part = temp.substr(0, BALANCE_LENGTH);
std::cout << part << std::endl;
std::cout << setw(48) << " ";
temp = temp.substr(BALANCE_LENGTH);
}
std::cout << temp << std::endl;
}
int main(int argc, char** argv)
{
auto data = platform::Datasets(false, platform::Paths::datasets());
locale mylocale(std::cout.getloc(), new separated);
locale::global(mylocale);
std::cout.imbue(mylocale);
std::cout << Colors::GREEN() << "Dataset Sampl. Feat. Cls. Balance" << std::endl;
std::string balanceBars = std::string(BALANCE_LENGTH, '=');
std::cout << "============================== ====== ===== === " << balanceBars << std::endl;
bool odd = true;
for (const auto& dataset : data.getNames()) {
auto color = odd ? Colors::CYAN() : Colors::BLUE();
std::cout << color << setw(30) << left << dataset << " ";
data.loadDataset(dataset);
auto nSamples = data.getNSamples(dataset);
std::cout << setw(6) << right << nSamples << " ";
std::cout << setw(5) << right << data.getFeatures(dataset).size() << " ";
std::cout << setw(3) << right << data.getNClasses(dataset) << " ";
std::stringstream oss;
std::string sep = "";
for (auto number : data.getClassesCounts(dataset)) {
oss << sep << std::setprecision(2) << fixed << (float)number / nSamples * 100.0 << "% (" << number << ")";
sep = " / ";
}
outputBalance(oss.str());
odd = !odd;
}
std::cout << Colors::RESET() << std::endl;
return 0;
}
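As an aside, the separated facet above is the standard numpunct mechanism for getting dot-grouped thousands and a comma decimal point out of iostreams. A self-contained illustration of the same idea:

#include <iostream>
#include <locale>
#include <string>

struct separated : std::numpunct<char> {
    char do_decimal_point() const override { return ','; }
    char do_thousands_sep() const override { return '.'; }
    std::string do_grouping() const override { return "\03"; } // groups of three digits
};

int main()
{
    // The locale takes ownership of the facet; imbue makes std::cout use it.
    std::locale loc(std::cout.getloc(), new separated);
    std::cout.imbue(loc);
    std::cout << 1234567 << std::endl;  // prints 1.234.567
    std::cout << 3.14159 << std::endl;  // prints 3,14159
    return 0;
}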


@@ -1,135 +0,0 @@
#include <iostream>
#include <argparse/argparse.hpp>
#include <nlohmann/json.hpp>
#include "Experiment.h"
#include "Datasets.h"
#include "DotEnv.h"
#include "Models.h"
#include "modelRegister.h"
#include "Paths.h"
#include "config.h"
using json = nlohmann::json;
using namespace std;
void manageArguments(argparse::ArgumentParser& program)
{
auto env = platform::DotEnv();
program.add_argument("-d", "--dataset").default_value("").help("Dataset file name");
program.add_argument("--hyperparameters").default_value("{}").help("Hyperparameters passed to the model in Experiment");
program.add_argument("--hyper-file").default_value("").help("Hyperparameters file name." \
"Mutually exclusive with hyperparameters. This file should contain hyperparameters for each dataset in json format.");
program.add_argument("-m", "--model")
.help("Model to use " + platform::Models::instance()->tostring())
.action([](const std::string& value) {
static const std::vector<std::string> choices = platform::Models::instance()->getNames();
if (find(choices.begin(), choices.end(), value) != choices.end()) {
return value;
}
throw std::runtime_error("Model must be one of " + platform::Models::instance()->tostring());
}
);
program.add_argument("--title").default_value("").help("Experiment title");
program.add_argument("--discretize").help("Discretize input dataset").default_value((bool)stoi(env.get("discretize"))).implicit_value(true);
program.add_argument("--quiet").help("Don't display detailed progress").default_value(false).implicit_value(true);
program.add_argument("--save").help("Save result (always save if no dataset is supplied)").default_value(false).implicit_value(true);
program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value((bool)stoi(env.get("stratified"))).implicit_value(true);
program.add_argument("-f", "--folds").help("Number of folds").default_value(stoi(env.get("n_folds"))).scan<'i', int>().action([](const std::string& value) {
try {
auto k = stoi(value);
if (k < 2) {
throw std::runtime_error("Number of folds must be greater than 1");
}
return k;
}
catch (const runtime_error& err) {
throw std::runtime_error(err.what());
}
catch (...) {
throw std::runtime_error("Number of folds must be an integer");
}});
auto seed_values = env.getSeeds();
program.add_argument("-s", "--seeds").nargs(1, 10).help("Random seeds. Set to -1 to have pseudo random").scan<'i', int>().default_value(seed_values);
}
int main(int argc, char** argv)
{
argparse::ArgumentParser program("b_main", { project_version.begin(), project_version.end() });
manageArguments(program);
std::string file_name, model_name, title, hyperparameters_file;
json hyperparameters_json;
bool discretize_dataset, stratified, saveResults, quiet;
std::vector<int> seeds;
std::vector<std::string> filesToTest;
int n_folds;
try {
program.parse_args(argc, argv);
file_name = program.get<std::string>("dataset");
model_name = program.get<std::string>("model");
discretize_dataset = program.get<bool>("discretize");
stratified = program.get<bool>("stratified");
quiet = program.get<bool>("quiet");
n_folds = program.get<int>("folds");
seeds = program.get<std::vector<int>>("seeds");
auto hyperparameters = program.get<std::string>("hyperparameters");
hyperparameters_json = json::parse(hyperparameters);
hyperparameters_file = program.get<std::string>("hyper-file");
if (hyperparameters_file != "" && hyperparameters != "{}") {
throw runtime_error("hyperparameters and hyper_file are mutually exclusive");
}
title = program.get<std::string>("title");
if (title == "" && file_name == "") {
throw runtime_error("title is mandatory if dataset is not provided");
}
saveResults = program.get<bool>("save");
}
catch (const exception& err) {
cerr << err.what() << std::endl;
cerr << program;
exit(1);
}
auto datasets = platform::Datasets(discretize_dataset, platform::Paths::datasets());
if (file_name != "") {
if (!datasets.isDataset(file_name)) {
cerr << "Dataset " << file_name << " not found" << std::endl;
exit(1);
}
if (title == "") {
title = "Test " + file_name + " " + model_name + " " + to_string(n_folds) + " folds";
}
filesToTest.push_back(file_name);
} else {
filesToTest = datasets.getNames();
saveResults = true;
}
platform::HyperParameters test_hyperparams;
if (hyperparameters_file != "") {
test_hyperparams = platform::HyperParameters(datasets.getNames(), hyperparameters_file);
} else {
test_hyperparams = platform::HyperParameters(datasets.getNames(), hyperparameters_json);
}
/*
* Begin Processing
*/
auto env = platform::DotEnv();
auto experiment = platform::Experiment();
experiment.setTitle(title).setLanguage("cpp").setLanguageVersion("14.0.3");
experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform(env.get("platform"));
experiment.setStratified(stratified).setNFolds(n_folds).setScoreName("accuracy");
experiment.setHyperparameters(test_hyperparams);
for (auto seed : seeds) {
experiment.addRandomSeed(seed);
}
platform::Timer timer;
timer.start();
experiment.go(filesToTest, quiet);
experiment.setDuration(timer.getDuration());
if (saveResults) {
experiment.save(platform::Paths::results());
}
if (!quiet)
experiment.report();
std::cout << "Done!" << std::endl;
return 0;
}


@@ -1,49 +0,0 @@
#include <iostream>
#include <argparse/argparse.hpp>
#include "ManageResults.h"
#include "config.h"
void manageArguments(argparse::ArgumentParser& program, int argc, char** argv)
{
program.add_argument("-n", "--number").default_value(0).help("Number of results to show (0 = all)").scan<'i', int>();
program.add_argument("-m", "--model").default_value("any").help("Filter results of the selected model)");
program.add_argument("-s", "--score").default_value("any").help("Filter results of the score name supplied");
program.add_argument("--complete").help("Show only results with all datasets").default_value(false).implicit_value(true);
program.add_argument("--partial").help("Show only partial results").default_value(false).implicit_value(true);
program.add_argument("--compare").help("Compare with best results").default_value(false).implicit_value(true);
try {
program.parse_args(argc, argv);
auto number = program.get<int>("number");
if (number < 0) {
throw std::runtime_error("Number of results must be greater than or equal to 0");
}
auto model = program.get<std::string>("model");
auto score = program.get<std::string>("score");
auto complete = program.get<bool>("complete");
auto partial = program.get<bool>("partial");
auto compare = program.get<bool>("compare");
}
catch (const std::exception& err) {
std::cerr << err.what() << std::endl;
std::cerr << program;
exit(1);
}
}
int main(int argc, char** argv)
{
auto program = argparse::ArgumentParser("b_manage", { project_version.begin(), project_version.end() });
manageArguments(program, argc, argv);
int number = program.get<int>("number");
std::string model = program.get<std::string>("model");
std::string score = program.get<std::string>("score");
auto complete = program.get<bool>("complete");
auto partial = program.get<bool>("partial");
auto compare = program.get<bool>("compare");
if (complete)
partial = false;
auto manager = platform::ManageResults(number, model, score, complete, partial, compare);
manager.doMenu();
return 0;
}


@@ -1,29 +0,0 @@
#ifndef MODEL_REGISTER_H
#define MODEL_REGISTER_H
static platform::Registrar registrarT("TAN",
[](void) -> bayesnet::BaseClassifier* { return new bayesnet::TAN();});
static platform::Registrar registrarTLD("TANLd",
[](void) -> bayesnet::BaseClassifier* { return new bayesnet::TANLd();});
static platform::Registrar registrarS("SPODE",
[](void) -> bayesnet::BaseClassifier* { return new bayesnet::SPODE(2);});
static platform::Registrar registrarSLD("SPODELd",
[](void) -> bayesnet::BaseClassifier* { return new bayesnet::SPODELd(2);});
static platform::Registrar registrarK("KDB",
[](void) -> bayesnet::BaseClassifier* { return new bayesnet::KDB(2);});
static platform::Registrar registrarKLD("KDBLd",
[](void) -> bayesnet::BaseClassifier* { return new bayesnet::KDBLd(2);});
static platform::Registrar registrarA("AODE",
[](void) -> bayesnet::BaseClassifier* { return new bayesnet::AODE();});
static platform::Registrar registrarALD("AODELd",
[](void) -> bayesnet::BaseClassifier* { return new bayesnet::AODELd();});
static platform::Registrar registrarBA("BoostAODE",
[](void) -> bayesnet::BaseClassifier* { return new bayesnet::BoostAODE();});
static platform::Registrar registrarSt("STree",
[](void) -> bayesnet::BaseClassifier* { return new pywrap::STree();});
static platform::Registrar registrarOdte("Odte",
[](void) -> bayesnet::BaseClassifier* { return new pywrap::ODTE();});
static platform::Registrar registrarSvc("SVC",
[](void) -> bayesnet::BaseClassifier* { return new pywrap::SVC();});
static platform::Registrar registrarRaF("RandomForest",
[](void) -> bayesnet::BaseClassifier* { return new pywrap::RandomForest();});
#endif
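These static Registrar objects implement the usual self-registration (factory) idiom: each constructor stores a name-to-factory mapping in a shared registry at static-initialization time, so the rest of the platform can later instantiate a classifier from its command-line name. The actual platform::Registrar and platform::Models classes are not part of this diff; a minimal sketch of the idea, with hypothetical names, would be:

#include <functional>
#include <map>
#include <memory>
#include <stdexcept>
#include <string>

struct BaseClassifier { virtual ~BaseClassifier() = default; };

class Models { // hypothetical stand-in for platform::Models
public:
    static Models& instance() { static Models m; return m; }
    void add(const std::string& name, std::function<BaseClassifier* ()> factory)
    {
        registry[name] = std::move(factory);
    }
    std::unique_ptr<BaseClassifier> create(const std::string& name) const
    {
        auto it = registry.find(name);
        if (it == registry.end())
            throw std::invalid_argument("Unknown model: " + name);
        return std::unique_ptr<BaseClassifier>(it->second());
    }
private:
    std::map<std::string, std::function<BaseClassifier* ()>> registry;
};

struct Registrar { // each static instance registers one model at program start-up
    Registrar(const std::string& name, std::function<BaseClassifier* ()> factory)
    {
        Models::instance().add(name, std::move(factory));
    }
};

// Usage mirroring the header above (TAN etc. are the real classifiers, not defined here):
// static Registrar registrarT("TAN", []() -> BaseClassifier* { return new TAN(); });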


@@ -4,12 +4,17 @@
#include <iostream>
#include <sstream>
#include <algorithm>
#include "BestResults.h"
#include "Result.h"
#include "Colors.h"
#include "Statistics.h"
#include <cctype>
#include "common/Colors.h"
#include "common/CLocale.h"
#include "common/Paths.h"
#include "common/Utils.h" // compute_std
#include "results/Result.h"
#include "BestResultsExcel.h"
#include "CLocale.h"
#include "BestResultsTex.h"
#include "BestResultsMd.h"
#include "best/Statistics.h"
#include "BestResults.h"
namespace fs = std::filesystem;
@@ -37,41 +42,36 @@ namespace platform {
json bests;
for (const auto& file : files) {
auto result = Result(path, file);
auto data = result.load();
auto data = result.getJson();
for (auto const& item : data.at("results")) {
bool update = false;
// Check if results file contains only one dataset
bool update = true;
auto datasetName = item.at("dataset").get<std::string>();
if (dataset != "any" && dataset != datasetName) {
continue;
}
if (bests.contains(datasetName)) {
if (item.at("score").get<double>() > bests[datasetName].at(0).get<double>()) {
update = true;
if (item.at("score").get<double>() < bests[datasetName].at(0).get<double>()) {
update = false;
}
} else {
update = true;
}
if (update) {
bests[datasetName] = { item.at("score").get<double>(), item.at("hyperparameters"), file };
bests[datasetName] = { item.at("score").get<double>(), item.at("hyperparameters"), file, item.at("score_std").get<double>() };
}
}
}
std::string bestFileName = path + bestResultFile();
if (FILE* fileTest = fopen(bestFileName.c_str(), "r")) {
fclose(fileTest);
std::cout << Colors::MAGENTA() << "File " << bestFileName << " already exists and it shall be overwritten." << Colors::RESET() << std::endl;
if (bests.empty()) {
std::cerr << Colors::MAGENTA() << "No results found for model " << model << " and score " << score << Colors::RESET() << std::endl;
exit(1);
}
std::string bestFileName = path + Paths::bestResultsFile(score, model);
std::ofstream file(bestFileName);
file << bests;
file.close();
return bestFileName;
}
std::string BestResults::bestResultFile()
{
return "best_results_" + score + "_" + model + ".json";
}
std::pair<std::string, std::string> getModelScore(std::string name)
{
// results_accuracy_BoostAODE_MacBookpro16_2023-09-06_12:27:00_1.json
int i = 0;
auto pos = name.find("_");
auto pos2 = name.find("_", pos + 1);
std::string score = name.substr(pos + 1, pos2 - pos - 1);
@@ -93,6 +93,7 @@ namespace platform {
}
}
}
std::sort(files.begin(), files.end());
return files;
}
json BestResults::loadFile(const std::string& fileName)
@@ -121,29 +122,44 @@ namespace platform {
models.insert(fileModel);
}
result = std::vector<std::string>(models.begin(), models.end());
maxModelName = (*max_element(result.begin(), result.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
maxModelName = std::max(minLength, maxModelName);
return result;
}
std::string toLower(std::string data)
{
std::transform(data.begin(), data.end(), data.begin(),
[](unsigned char c) { return std::tolower(c); });
return data;
}
std::vector<std::string> BestResults::getDatasets(json table)
{
std::vector<std::string> datasets;
for (const auto& dataset : table.items()) {
datasets.push_back(dataset.key());
for (const auto& dataset_ : table.items()) {
datasets.push_back(dataset_.key());
}
std::stable_sort(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) {
return toLower(a) < toLower(b);
});
maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
maxDatasetName = std::max(7, maxDatasetName);
return datasets;
}
void BestResults::buildAll()
{
auto models = getModels();
std::cout << "Building best results for model: ";
for (const auto& model : models) {
std::cout << "Building best results for model: " << model << std::endl;
this->model = model;
std::cout << model << ", ";
build();
}
std::cout << "end." << std::endl << std::endl;
model = "any";
}
void BestResults::listFile()
{
std::string bestFileName = path + bestResultFile();
std::string bestFileName = path + Paths::bestResultsFile(score, model);
if (FILE* fileTest = fopen(bestFileName.c_str(), "r")) {
fclose(fileTest);
} else {
@@ -154,7 +170,6 @@ namespace platform {
auto date = ftime_to_string(std::filesystem::last_write_time(bestFileName));
auto data = loadFile(bestFileName);
auto datasets = getDatasets(data);
int maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
int maxFileName = 0;
int maxHyper = 15;
for (auto const& item : data.items()) {
@@ -168,10 +183,9 @@ namespace platform {
std::cout << Colors::GREEN() << " # " << std::setw(maxDatasetName + 1) << std::left << "Dataset" << "Score " << std::setw(maxFileName) << "File" << " Hyperparameters" << std::endl;
std::cout << "=== " << std::string(maxDatasetName, '=') << " =========== " << std::string(maxFileName, '=') << " " << std::string(maxHyper, '=') << std::endl;
auto i = 0;
bool odd = true;
double total = 0;
for (auto const& item : data.items()) {
auto color = odd ? Colors::BLUE() : Colors::CYAN();
auto color = (i % 2) ? Colors::BLUE() : Colors::CYAN();
double value = item.value().at(0).get<double>();
std::cout << color << std::setw(3) << std::fixed << std::right << i++ << " ";
std::cout << std::setw(maxDatasetName) << std::left << item.key() << " ";
@@ -180,10 +194,10 @@ namespace platform {
std::cout << item.value().at(1) << " ";
std::cout << std::endl;
total += value;
odd = !odd;
}
std::cout << Colors::GREEN() << "=== " << std::string(maxDatasetName, '=') << " ===========" << std::endl;
std::cout << std::setw(5 + maxDatasetName) << "Total.................. " << std::setw(11) << std::setprecision(8) << std::fixed << total << std::endl;
std::cout << Colors::GREEN() << " Total" << std::string(maxDatasetName - 5, '.') << " " << std::setw(11) << std::setprecision(8) << std::fixed << total << std::endl;
}
json BestResults::buildTableResults(std::vector<std::string> models)
{
@@ -191,7 +205,7 @@ namespace platform {
auto maxDate = std::filesystem::file_time_type::max();
for (const auto& model : models) {
this->model = model;
std::string bestFileName = path + bestResultFile();
std::string bestFileName = path + Paths::bestResultsFile(score, model);
if (FILE* fileTest = fopen(bestFileName.c_str(), "r")) {
fclose(fileTest);
} else {
@@ -208,13 +222,20 @@ namespace platform {
table["dateTable"] = ftime_to_string(maxDate);
return table;
}
void BestResults::printTableResults(std::vector<std::string> models, json table)
void BestResults::printTableResults(std::vector<std::string> models, json table, bool tex, bool index)
{
std::stringstream oss;
oss << Colors::GREEN() << "Best results for " << score << " as of " << table.at("dateTable").get<std::string>() << std::endl;
std::cout << oss.str();
std::cout << std::string(oss.str().size() - 8, '-') << std::endl;
std::cout << Colors::GREEN() << " # " << std::setw(maxDatasetName + 1) << std::left << std::string("Dataset");
auto bestResultsTex = BestResultsTex(score);
auto bestResultsMd = BestResultsMd();
if (tex) {
bestResultsTex.results_header(models, table.at("dateTable").get<std::string>(), index);
bestResultsMd.results_header(models, table.at("dateTable").get<std::string>());
}
for (const auto& model : models) {
std::cout << std::setw(maxModelName) << std::left << model << " ";
}
@@ -225,21 +246,27 @@ namespace platform {
}
std::cout << std::endl;
auto i = 0;
bool odd = true;
std::map<std::string, double> totals;
std::map<std::string, std::vector<double>> totals;
int nDatasets = table.begin().value().size();
for (const auto& model : models) {
totals[model] = 0.0;
}
auto datasets = getDatasets(table.begin().value());
for (auto const& dataset : datasets) {
auto color = odd ? Colors::BLUE() : Colors::CYAN();
if (tex) {
bestResultsTex.results_body(datasets, table, index);
bestResultsMd.results_body(datasets, table);
}
for (auto const& dataset_ : datasets) {
auto color = (i % 2) ? Colors::BLUE() : Colors::CYAN();
std::cout << color << std::setw(3) << std::fixed << std::right << i++ << " ";
std::cout << std::setw(maxDatasetName) << std::left << dataset << " ";
std::cout << std::setw(maxDatasetName) << std::left << dataset_ << " ";
double maxValue = 0;
// Find out the max value for this dataset
for (const auto& model : models) {
double value = table[model].at(dataset).at(0).get<double>();
double value;
try {
value = table[model].at(dataset_).at(0).get<double>();
}
catch (nlohmann::json_abi_v3_11_3::detail::out_of_range err) {
value = -1.0;
}
if (value > maxValue) {
maxValue = value;
}
@@ -247,34 +274,53 @@ namespace platform {
// Print the row with red colors on max values
for (const auto& model : models) {
std::string efectiveColor = color;
double value = table[model].at(dataset).at(0).get<double>();
double value, std;
try {
value = table[model].at(dataset_).at(0).get<double>();
std = table[model].at(dataset_).at(3).get<double>();
}
catch (nlohmann::json_abi_v3_11_3::detail::out_of_range err) {
value = -1.0;
std = -1.0;
}
if (value == maxValue) {
efectiveColor = Colors::RED();
}
totals[model] += value;
std::cout << efectiveColor << std::setw(maxModelName) << std::setprecision(maxModelName - 2) << std::fixed << value << " ";
if (value == -1) {
std::cout << Colors::YELLOW() << std::setw(maxModelName) << std::right << "N/A" << " ";
} else {
totals[model].push_back(value);
std::cout << efectiveColor << std::setw(maxModelName - 6) << std::setprecision(maxModelName - 8) << std::fixed << value;
std::cout << efectiveColor << "±" << std::setw(5) << std::setprecision(3) << std::fixed << std << " ";
}
}
std::cout << std::endl;
odd = !odd;
}
std::cout << Colors::GREEN() << "=== " << std::string(maxDatasetName, '=') << " ";
for (const auto& model : models) {
std::cout << std::string(maxModelName, '=') << " ";
}
std::cout << std::endl;
std::cout << Colors::GREEN() << std::setw(5 + maxDatasetName) << " Totals...................";
double max = 0.0;
std::cout << Colors::GREEN() << " Average" << std::string(maxDatasetName - 7, '.') << " ";
double max_value = 0.0;
std::string best_model = "";
for (const auto& total : totals) {
if (total.second > max) {
max = total.second;
auto actual = std::reduce(total.second.begin(), total.second.end());
if (actual > max_value) {
max_value = actual;
best_model = total.first;
}
}
if (tex) {
bestResultsTex.results_footer(totals, best_model);
bestResultsMd.results_footer(totals, best_model);
}
for (const auto& model : models) {
std::string efectiveColor = Colors::GREEN();
if (totals[model] == max) {
efectiveColor = Colors::RED();
}
std::cout << efectiveColor << std::right << std::setw(maxModelName) << std::setprecision(maxModelName - 4) << std::fixed << totals[model] << " ";
std::string efectiveColor = model == best_model ? Colors::RED() : Colors::GREEN();
double value = std::reduce(totals[model].begin(), totals[model].end()) / nDatasets;
double std = compute_std(totals[model], value);
std::cout << efectiveColor << std::right << std::setw(maxModelName - 6) << std::setprecision(maxModelName - 8) << std::fixed << value;
std::cout << efectiveColor << "±" << std::setw(5) << std::setprecision(3) << std::fixed << std << " ";
}
std::cout << std::endl;
}
@@ -286,58 +332,53 @@ namespace platform {
// Build the table of results
json table = buildTableResults(models);
std::vector<std::string> datasets = getDatasets(table.begin().value());
BestResultsExcel excel(score, datasets);
excel.reportSingle(model, path + bestResultFile());
messageExcelFile(excel.getFileName());
BestResultsExcel excel_report(path, score, datasets);
excel_report.reportSingle(model, path + Paths::bestResultsFile(score, model));
messageOutputFile("Excel", excel_report.getFileName());
excelFileName = excel_report.getFileName();
}
}
void BestResults::reportAll(bool excel)
void BestResults::reportAll(bool excel, bool tex, bool index)
{
auto models = getModels();
// Build the table of results
json table = buildTableResults(models);
std::vector<std::string> datasets = getDatasets(table.begin().value());
maxModelName = (*max_element(models.begin(), models.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
maxModelName = std::max(12, maxModelName);
maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
maxDatasetName = std::max(25, maxDatasetName);
// Print the table of results
printTableResults(models, table);
printTableResults(models, table, tex, index);
// Compute the Friedman test
std::map<std::string, std::map<std::string, float>> ranksModels;
if (friedman) {
Statistics stats(models, datasets, table, significance);
Statistics stats(score, models, datasets, table, significance);
auto result = stats.friedmanTest();
stats.postHocHolmTest(result);
stats.postHocTest();
stats.postHocTestReport(result, tex);
ranksModels = stats.getRanks();
}
if (tex) {
messageOutputFile("TeX", Paths::tex() + Paths::tex_output());
messageOutputFile("MarkDown", Paths::tex() + Paths::md_output());
if (friedman) {
messageOutputFile("TeX", Paths::tex() + Paths::tex_post_hoc());
messageOutputFile("MarkDown", Paths::tex() + Paths::md_post_hoc());
}
}
if (excel) {
BestResultsExcel excel(score, datasets);
BestResultsExcel excel(path, score, datasets);
excel.reportAll(models, table, ranksModels, friedman, significance);
if (friedman) {
int idx = -1;
double min = 2000;
// Find out the control model
auto totals = std::vector<double>(models.size(), 0.0);
for (const auto& dataset : datasets) {
for (int i = 0; i < models.size(); ++i) {
totals[i] += ranksModels[dataset][models[i]];
}
}
for (int i = 0; i < models.size(); ++i) {
if (totals[i] < min) {
min = totals[i];
idx = i;
}
}
Statistics stats(score, models, datasets, table, significance);
int idx = stats.getControlIdx();
model = models.at(idx);
excel.reportSingle(model, path + bestResultFile());
excel.reportSingle(model, path + Paths::bestResultsFile(score, model));
}
messageExcelFile(excel.getFileName());
messageOutputFile("Excel", excel.getFileName());
excelFileName = excel.getFileName();
}
}
void BestResults::messageExcelFile(const std::string& fileName)
void BestResults::messageOutputFile(const std::string& title, const std::string& fileName)
{
std::cout << Colors::YELLOW() << "** Excel file generated: " << fileName << Colors::RESET() << std::endl;
std::cout << Colors::YELLOW() << "** " << std::setw(8) << std::left << title
<< " file generated: " << fileName << Colors::RESET() << std::endl;
}
}
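The average row above prints mean ± std per model using compute_std from common/Utils.h, which is not included in this diff. A plausible implementation consistent with how it is called (a vector of scores plus their precomputed mean) would be:

#include <cmath>
#include <vector>

// Hypothetical sketch: population standard deviation of `values` around a precomputed mean.
inline double compute_std(const std::vector<double>& values, double mean)
{
    if (values.empty())
        return 0.0;
    double acc = 0.0;
    for (const auto& v : values)
        acc += (v - mean) * (v - mean);
    return std::sqrt(acc / values.size());
}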


@@ -2,35 +2,39 @@
#define BESTRESULTS_H
#include <string>
#include <nlohmann/json.hpp>
using json = nlohmann::json;
namespace platform {
using json = nlohmann::ordered_json;
class BestResults {
public:
explicit BestResults(const std::string& path, const std::string& score, const std::string& model, bool friedman, double significance = 0.05)
: path(path), score(score), model(model), friedman(friedman), significance(significance)
explicit BestResults(const std::string& path, const std::string& score, const std::string& model, const std::string& dataset, bool friedman, double significance = 0.05)
: path(path), score(score), model(model), dataset(dataset), friedman(friedman), significance(significance)
{
}
std::string build();
void reportSingle(bool excel);
void reportAll(bool excel);
void reportAll(bool excel, bool tex, bool index);
void buildAll();
std::string getExcelFileName() const { return excelFileName; }
private:
std::vector<std::string> getModels();
std::vector<std::string> getDatasets(json table);
std::vector<std::string> loadResultFiles();
void messageExcelFile(const std::string& fileName);
void messageOutputFile(const std::string& title, const std::string& fileName);
json buildTableResults(std::vector<std::string> models);
void printTableResults(std::vector<std::string> models, json table);
std::string bestResultFile();
void printTableResults(std::vector<std::string> models, json table, bool tex, bool index);
json loadFile(const std::string& fileName);
void listFile();
std::string path;
std::string score;
std::string model;
std::string dataset;
bool friedman;
double significance;
int maxModelName = 0;
int maxDatasetName = 0;
int minLength = 13; // Minimum length for scores
std::string excelFileName;
};
}
#endif //BESTRESULTS_H
#endif


@@ -1,10 +1,10 @@
#include <sstream>
#include "BestResultsExcel.h"
#include "Paths.h"
#include <map>
#include <nlohmann/json.hpp>
#include "Statistics.h"
#include "ReportExcel.h"
#include "common/Paths.h"
#include "reports/ReportExcel.h"
#include "best/Statistics.h"
#include "BestResultsExcel.h"
namespace platform {
json loadResultData(const std::string& fileName)
@@ -30,9 +30,10 @@ namespace platform {
}
return columnName;
}
BestResultsExcel::BestResultsExcel(const std::string& score, const std::vector<std::string>& datasets) : score(score), datasets(datasets)
BestResultsExcel::BestResultsExcel(const std::string& path, const std::string& score, const std::vector<std::string>& datasets) : path(path), score(score), datasets(datasets)
{
workbook = workbook_new((Paths::excel() + fileName).c_str());
file_name = Paths::bestResultsExcel(score);
workbook = workbook_new(getFileName().c_str());
setProperties("Best Results");
int maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
datasetNameSize = std::max(datasetNameSize, maxDatasetName);
@@ -63,19 +64,21 @@ namespace platform {
json data = loadResultData(fileName);
std::string title = "Best results for " + model;
worksheet_merge_range(worksheet, 0, 0, 0, 4, title.c_str(), styles["headerFirst"]);
worksheet_merge_range(worksheet, 0, 0, 0, 5, title.c_str(), styles["headerFirst"]);
// Body header
row = 3;
int col = 1;
writeString(row, 0, "", "bodyHeader");
writeString(row, 0, "#", "bodyHeader");
writeString(row, 1, "Dataset", "bodyHeader");
writeString(row, 2, "Score", "bodyHeader");
writeString(row, 3, "File", "bodyHeader");
writeString(row, 4, "Hyperparameters", "bodyHeader");
writeString(row, 5, "F", "bodyHeader");
auto i = 0;
std::string hyperparameters;
int hypSize = 22;
std::map<std::string, std::string> files; // map of files imported and their tabs
int numLines = data.size();
for (auto const& item : data.items()) {
row++;
writeInt(row, 0, i++, "ints");
@@ -89,7 +92,7 @@ namespace platform {
catch (const std::out_of_range& oor) {
auto tabName = "table_" + std::to_string(i);
auto worksheetNew = workbook_add_worksheet(workbook, tabName.c_str());
json data = loadResultData(Paths::results() + fileName);
json data = loadResultData(path + fileName);
auto report = ReportExcel(data, false, workbook, worksheetNew);
report.show();
hyperlink = "#table_" + std::to_string(i);
@@ -103,6 +106,8 @@ namespace platform {
hypSize = hyperparameters.size();
}
writeString(row, 4, hyperparameters, "text");
std::string countHyperparameters = "=COUNTIF(e5:e" + std::to_string(numLines + 4) + ", e" + std::to_string(row + 1) + ")";
worksheet_write_formula(worksheet, row, 5, countHyperparameters.c_str(), efectiveStyle("ints"));
}
row++;
// Set Totals
@@ -159,21 +164,20 @@ namespace platform {
addConditionalFormat("max");
footer(false);
if (friedman) {
// Create Sheet with ranks
worksheet = workbook_add_worksheet(workbook, "Ranks");
formatColumns();
header(true);
body(true);
addConditionalFormat("min");
footer(true);
if (score == "accuracy") {
// Create Sheet with ranks
worksheet = workbook_add_worksheet(workbook, "Ranks");
formatColumns();
header(true);
body(true);
addConditionalFormat("min");
footer(true);
}
// Create Sheet with Friedman Test
doFriedman();
}
}
std::string BestResultsExcel::getFileName()
{
return Paths::excel() + fileName;
}
void BestResultsExcel::header(bool ranks)
{
row = 0;
@@ -182,7 +186,7 @@ namespace platform {
// Body header
row = 3;
int col = 1;
writeString(row, 0, "", "bodyHeader");
writeString(row, 0, "#", "bodyHeader");
writeString(row, 1, "Dataset", "bodyHeader");
for (const auto& model : models) {
writeString(row, ++col, model.c_str(), "bodyHeader");
@@ -237,14 +241,15 @@ namespace platform {
for (int i = 0; i < columns_sizes.size(); ++i) {
worksheet_set_column(worksheet, i, i, columns_sizes.at(i), NULL);
}
worksheet_merge_range(worksheet, 0, 0, 0, 1 + models.size(), "Friedman Test", styles["headerFirst"]);
worksheet_merge_range(worksheet, 0, 0, 0, 7, "Friedman Test", styles["headerFirst"]);
row = 2;
Statistics stats(models, datasets, table, significance, false);
Statistics stats(score, models, datasets, table, significance, false); // No output
auto result = stats.friedmanTest();
stats.postHocHolmTest(result);
stats.postHocTest();
stats.postHocTestReport(result, false); // No tex output
auto friedmanResult = stats.getFriedmanResult();
auto holmResult = stats.getHolmResult();
worksheet_merge_range(worksheet, row, 0, row, 1 + models.size(), "Null hypothesis: H0 'There is no significant differences between all the classifiers.'", styles["headerSmall"]);
auto postHocResults = stats.getPostHocResults();
worksheet_merge_range(worksheet, row, 0, row, 7, "Null hypothesis: H0 'There is no significant differences between all the classifiers.'", styles["headerSmall"]);
row += 2;
writeString(row, 1, "Friedman Q", "bodyHeader");
writeDouble(row, 2, friedmanResult.statistic, "bodyHeader");
@@ -258,11 +263,11 @@ namespace platform {
writeDouble(row, 4, significance, "bodyHeader");
writeString(row, 5, friedmanResult.reject ? "Reject H0" : "Accept H0", "bodyHeader");
row += 3;
worksheet_merge_range(worksheet, row, 0, row, 1 + models.size(), "Holm Test", styles["headerFirst"]);
worksheet_merge_range(worksheet, row, 0, row, 7, "Holm Test", styles["headerFirst"]);
row += 2;
worksheet_merge_range(worksheet, row, 0, row, 1 + models.size(), "Null hypothesis: H0 'There is no significant differences between the control model and the other models.'", styles["headerSmall"]);
worksheet_merge_range(worksheet, row, 0, row, 7, "Null hypothesis: H0 'There is no significant differences between the control model and the other models.'", styles["headerSmall"]);
row += 2;
std::string controlModel = "Control Model: " + holmResult.model;
std::string controlModel = "Control Model: " + postHocResults.at(0).model;
worksheet_merge_range(worksheet, row, 1, row, 7, controlModel.c_str(), styles["bodyHeader_odd"]);
row++;
writeString(row, 1, "Model", "bodyHeader");
@@ -274,7 +279,7 @@ namespace platform {
writeString(row, 7, "Reject H0", "bodyHeader");
row++;
bool first = true;
for (const auto& item : holmResult.holmLines) {
for (const auto& item : postHocResults) {
writeString(row, 1, item.model, "text");
if (first) {
// Control model info
@@ -296,5 +301,8 @@ namespace platform {
}
row++;
}
// set column width for the 5th and the 7th column
worksheet_set_column(worksheet, 4, 5, 10, NULL);
worksheet_set_column(worksheet, 6, 7, 10, NULL);
}
}


@@ -1,21 +1,19 @@
#ifndef BESTRESULTS_EXCEL_H
#define BESTRESULTS_EXCEL_H
#include "ExcelFile.h"
#ifndef BESTRESULTSEXCEL_H
#define BESTRESULTSEXCEL_H
#include <vector>
#include <map>
#include <nlohmann/json.hpp>
#include "reports/ExcelFile.h"
using json = nlohmann::json;
namespace platform {
class BestResultsExcel : ExcelFile {
using json = nlohmann::ordered_json;
class BestResultsExcel : public ExcelFile {
public:
BestResultsExcel(const std::string& score, const std::vector<std::string>& datasets);
BestResultsExcel(const std::string& path, const std::string& score, const std::vector<std::string>& datasets);
~BestResultsExcel();
void reportAll(const std::vector<std::string>& models, const json& table, const std::map<std::string, std::map<std::string, float>>& ranks, bool friedman, double significance);
void reportSingle(const std::string& model, const std::string& fileName);
std::string getFileName();
private:
void build();
void header(bool ranks);
@@ -24,7 +22,7 @@ namespace platform {
void formatColumns();
void doFriedman();
void addConditionalFormat(std::string formula);
const std::string fileName = "BestResults.xlsx";
std::string path;
std::string score;
std::vector<std::string> models;
std::vector<std::string> datasets;
@@ -36,4 +34,4 @@ namespace platform {
int datasetNameSize = 25; // Min size of the column
};
}
#endif //BESTRESULTS_EXCEL_H
#endif

src/best/BestResultsMd.cpp (new file, 105 lines)

@@ -0,0 +1,105 @@
#include <iostream>
#include "BestResultsMd.h"
#include "common/Utils.h" // compute_std
namespace platform {
using json = nlohmann::ordered_json;
void BestResultsMd::openMdFile(const std::string& name)
{
handler.open(name);
if (!handler.is_open()) {
std::cerr << "Error opening file " << name << std::endl;
exit(1);
}
}
void BestResultsMd::results_header(const std::vector<std::string>& models, const std::string& date)
{
this->models = models;
auto file_name = Paths::tex() + Paths::md_output();
openMdFile(file_name);
handler << "<!-- This file has been generated by the platform program" << std::endl;
handler << " Date: " << date.c_str() << std::endl;
handler << "" << std::endl;
handler << " Table of results" << std::endl;
handler << "-->" << std::endl;
handler << "| # | Dataset |";
for (const auto& model : models) {
handler << " " << model.c_str() << " |";
}
handler << std::endl;
handler << "|--: | :--- |";
for (const auto& model : models) {
handler << " :---: |";
}
handler << std::endl;
}
void BestResultsMd::results_body(const std::vector<std::string>& datasets, json& table)
{
int i = 0;
for (auto const& dataset : datasets) {
// Find out the max value for this dataset
double max_value = 0;
for (const auto& model : models) {
double value;
try {
value = table[model].at(dataset).at(0).get<double>();
}
catch (nlohmann::json_abi_v3_11_3::detail::out_of_range err) {
value = -1.0;
}
if (value > max_value) {
max_value = value;
}
}
handler << "| " << ++i << " | " << dataset.c_str() << " | ";
for (const auto& model : models) {
double value = table[model].at(dataset).at(0).get<double>();
double std_value = table[model].at(dataset).at(3).get<double>();
const char* bold = value == max_value ? "**" : "";
handler << bold << std::setprecision(4) << std::fixed << value << "±" << std::setprecision(3) << std_value << bold << " | ";
}
handler << std::endl;
}
}
void BestResultsMd::results_footer(const std::map<std::string, std::vector<double>>& totals, const std::string& best_model)
{
handler << "| | **Average Score** | ";
int nDatasets = totals.begin()->second.size();
for (const auto& model : models) {
double value = std::reduce(totals.at(model).begin(), totals.at(model).end()) / nDatasets;
double std_value = compute_std(totals.at(model), value);
const char* bold = model == best_model ? "**" : "";
handler << bold << std::setprecision(4) << std::fixed << value << "±" << std::setprecision(3) << std::fixed << std_value << bold << " | ";
}
handler.close();
}
void BestResultsMd::postHoc_test(std::vector<PostHocLine>& postHocResults, const std::string& kind, const std::string& date)
{
auto file_name = Paths::tex() + Paths::md_post_hoc();
openMdFile(file_name);
handler << "<!-- This file has been generated by the platform program" << std::endl;
handler << " Date: " << date.c_str() << std::endl;
handler << std::endl;
handler << " Post-hoc handler test" << std::endl;
handler << "-->" << std::endl;
handler << "Post-hoc " << kind << " test: H<sub>0</sub>: There is no significant differences between the control model and the other models." << std::endl << std::endl;
handler << "| classifier | pvalue | rank | win | tie | loss | H<sub>0</sub> |" << std::endl;
handler << "| :-- | --: | --: | --:| --: | --: | :--: |" << std::endl;
bool first = true;
for (auto const& line : postHocResults) {
auto textStatus = !line.reject ? "**" : " ";
if (first) {
handler << "| " << line.model << " | - | " << std::fixed << std::setprecision(2) << line.rank << " | - | - | - |" << std::endl;
first = false;
} else {
handler << "| " << line.model << " | " << textStatus << std::scientific << std::setprecision(4) << line.pvalue << textStatus << " |";
handler << std::fixed << std::setprecision(2) << line.rank << " | " << line.wtl.win << " | " << line.wtl.tie << " | " << line.wtl.loss << " |";
handler << (line.reject ? "rejected" : "**accepted**") << " |" << std::endl;
}
}
handler << std::endl;
handler.close();
}
}

src/best/BestResultsMd.h (new file, 24 lines)

@@ -0,0 +1,24 @@
#ifndef BEST_RESULTS_MD_H
#define BEST_RESULTS_MD_H
#include <map>
#include <vector>
#include <nlohmann/json.hpp>
#include "common/Paths.h"
#include "Statistics.h"
namespace platform {
using json = nlohmann::ordered_json;
class BestResultsMd {
public:
BestResultsMd() = default;
~BestResultsMd() = default;
void results_header(const std::vector<std::string>& models, const std::string& date);
void results_body(const std::vector<std::string>& datasets, json& table);
void results_footer(const std::map<std::string, std::vector<double>>& totals, const std::string& best_model);
void postHoc_test(std::vector<PostHocLine>& postHocResults, const std::string& kind, const std::string& date);
private:
void openMdFile(const std::string& name);
std::ofstream handler;
std::vector<std::string> models;
};
}
#endif

src/best/BestResultsTex.cpp (new file, 124 lines)

@@ -0,0 +1,124 @@
#include <iostream>
#include "BestResultsTex.h"
#include "common/Utils.h" // compute_std
namespace platform {
using json = nlohmann::ordered_json;
void BestResultsTex::openTexFile(const std::string& name)
{
handler.open(name);
if (!handler.is_open()) {
std::cerr << "Error opening file " << name << std::endl;
exit(1);
}
}
void BestResultsTex::results_header(const std::vector<std::string>& models, const std::string& date, bool index)
{
this->models = models;
auto file_name = Paths::tex() + Paths::tex_output();
openTexFile(file_name);
handler << "%% This file has been generated by the platform program" << std::endl;
handler << "%% Date: " << date.c_str() << std::endl;
handler << "%%" << std::endl;
handler << "%% Table of results" << std::endl;
handler << "%%" << std::endl;
handler << "\\begin{table}[htbp] " << std::endl;
handler << "\\centering " << std::endl;
handler << "\\tiny " << std::endl;
handler << "\\renewcommand{\\arraystretch }{1.2} " << std::endl;
handler << "\\renewcommand{\\tabcolsep }{0.07cm} " << std::endl;
auto umetric = score;
umetric[0] = toupper(umetric[0]);
handler << "\\caption{" << umetric << " results(mean $\\pm$ std) for all the algorithms and datasets} " << std::endl;
handler << "\\label{tab:results_" << score << "}" << std::endl;
std::string header_dataset_name = index ? "r" : "l";
handler << "\\begin{tabular} {{" << header_dataset_name << std::string(models.size(), 'c').c_str() << "}}" << std::endl;
handler << "\\hline " << std::endl;
handler << "" << std::endl;
for (const auto& model : models) {
handler << "& " << model.c_str();
}
handler << "\\\\" << std::endl;
handler << "\\hline" << std::endl;
}
void BestResultsTex::results_body(const std::vector<std::string>& datasets, json& table, bool index)
{
int i = 0;
for (auto const& dataset : datasets) {
// Find out max value for this dataset
double max_value = 0;
for (const auto& model : models) {
double value;
try {
value = table[model].at(dataset).at(0).get<double>();
}
catch (nlohmann::json_abi_v3_11_3::detail::out_of_range err) {
value = -1.0;
}
if (value > max_value) {
max_value = value;
}
}
if (index)
handler << ++i << " ";
else
handler << dataset << " ";
for (const auto& model : models) {
double value = table[model].at(dataset).at(0).get<double>();
double std_value = table[model].at(dataset).at(3).get<double>();
const char* bold = value == max_value ? "\\bfseries" : "";
handler << "& " << bold << std::setprecision(4) << std::fixed << value << "$\\pm$" << std::setprecision(3) << std_value;
}
handler << "\\\\" << std::endl;
}
}
void BestResultsTex::results_footer(const std::map<std::string, std::vector<double>>& totals, const std::string& best_model)
{
handler << "\\hline" << std::endl;
handler << "Average ";
int nDatasets = totals.begin()->second.size();
for (const auto& model : models) {
double value = std::reduce(totals.at(model).begin(), totals.at(model).end()) / nDatasets;
double std_value = compute_std(totals.at(model), value);
const char* bold = model == best_model ? "\\bfseries" : "";
handler << "& " << bold << std::setprecision(4) << std::fixed << value << "$\\pm$" << std::setprecision(3) << std::fixed << std_value;
}
handler << "\\\\" << std::endl;
handler << "\\hline " << std::endl;
handler << "\\end{tabular}" << std::endl;
handler << "\\end{table}" << std::endl;
handler.close();
}
void BestResultsTex::postHoc_test(std::vector<PostHocLine>& postHocResults, const std::string& kind, const std::string& date)
{
auto file_name = Paths::tex() + Paths::tex_post_hoc();
openTexFile(file_name);
handler << "%% This file has been generated by the platform program" << std::endl;
handler << "%% Date: " << date.c_str() << std::endl;
handler << "%%" << std::endl;
handler << "%% Post-hoc " << kind << " test" << std::endl;
handler << "%%" << std::endl;
handler << "\\begin{table}[htbp]" << std::endl;
handler << "\\centering" << std::endl;
handler << "\\caption{Results of the post-hoc " << kind << " test for the mean " << score << " of the algorithms.}\\label{ tab:tests }" << std::endl;
handler << "\\begin{tabular}{lrrrrr}" << std::endl;
handler << "\\hline" << std::endl;
handler << "classifier & pvalue & rank & win & tie & loss\\\\" << std::endl;
handler << "\\hline" << std::endl;
bool first = true;
for (auto const& line : postHocResults) {
auto textStatus = !line.reject ? "\\bf " : " ";
if (first) {
handler << line.model << " & - & " << std::fixed << std::setprecision(2) << line.rank << " & - & - & - \\\\" << std::endl;
first = false;
} else {
handler << line.model << " & " << textStatus << std::scientific << std::setprecision(4) << line.pvalue << " & ";
handler << std::fixed << std::setprecision(2) << line.rank << " & " << line.wtl.win << " & " << line.wtl.tie << " & " << line.wtl.loss << "\\\\" << std::endl;
}
}
handler << "\\hline " << std::endl;
handler << "\\end{tabular}" << std::endl;
handler << "\\end{table}" << std::endl;
handler.close();
}
}

src/best/BestResultsTex.h (new file, 26 lines)

@@ -0,0 +1,26 @@
#ifndef BEST_RESULTS_TEX_H
#define BEST_RESULTS_TEX_H
#include <map>
#include <vector>
#include <nlohmann/json.hpp>
#include "common/Paths.h"
#include "Statistics.h"
namespace platform {
using json = nlohmann::ordered_json;
class BestResultsTex {
public:
BestResultsTex(const std::string score, bool dataset_name = true) : score{ score }, dataset_name{ dataset_name } {};
~BestResultsTex() = default;
void results_header(const std::vector<std::string>& models, const std::string& date, bool index);
void results_body(const std::vector<std::string>& datasets, json& table, bool index);
void results_footer(const std::map<std::string, std::vector<double>>& totals, const std::string& best_model);
void postHoc_test(std::vector<PostHocLine>& postHocResults, const std::string& kind, const std::string& date);
private:
std::string score;
bool dataset_name;
void openTexFile(const std::string& name);
std::ofstream handler;
std::vector<std::string> models;
};
}
#endif


@@ -3,7 +3,7 @@
#include <string>
#include <map>
#include <utility>
#include "DotEnv.h"
#include "common/DotEnv.h"
namespace platform {
class BestScore {
public:
@@ -24,5 +24,4 @@ namespace platform {
}
};
}
#endif


@@ -1,22 +1,31 @@
#include <sstream>
#include "Statistics.h"
#include "Colors.h"
#include "Symbols.h"
#include <boost/math/distributions/chi_squared.hpp>
#include <boost/math/distributions/normal.hpp>
#include "CLocale.h"
#include "common/Colors.h"
#include "common/Symbols.h"
#include "common/CLocale.h"
#include "BestResultsTex.h"
#include "BestResultsMd.h"
#include "Statistics.h"
#include "WilcoxonTest.hpp"
namespace platform {
Statistics::Statistics(const std::vector<std::string>& models, const std::vector<std::string>& datasets, const json& data, double significance, bool output) :
models(models), datasets(datasets), data(data), significance(significance), output(output)
Statistics::Statistics(const std::string& score, const std::vector<std::string>& models, const std::vector<std::string>& datasets, const json& data, double significance, bool output) :
score(score), models(models), datasets(datasets), data(data), significance(significance), output(output)
{
if (score == "accuracy") {
postHocType = "Holm";
hlen = 85;
} else {
postHocType = "Wilcoxon";
hlen = 88;
}
nModels = models.size();
nDatasets = datasets.size();
auto temp = ConfigLocale();
};
}
void Statistics::fit()
{
if (nModels < 3 || nDatasets < 3) {
@@ -25,9 +34,11 @@ namespace platform {
throw std::runtime_error("Can't make the Friedman test with less than 3 models and/or less than 3 datasets.");
}
ranksModels.clear();
computeRanks();
computeRanks(); // compute greaterAverage and ranks
// Set the control model as the one with the lowest average rank
controlIdx = distance(ranks.begin(), min_element(ranks.begin(), ranks.end(), [](const auto& l, const auto& r) { return l.second < r.second; }));
controlIdx = score == "accuracy" ?
distance(ranks.begin(), min_element(ranks.begin(), ranks.end(), [](const auto& l, const auto& r) { return l.second < r.second; }))
: greaterAverage; // The model with the greater average score
computeWTL();
maxModelName = (*std::max_element(models.begin(), models.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
maxDatasetName = (*std::max_element(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
@@ -64,11 +75,16 @@ namespace platform {
void Statistics::computeRanks()
{
std::map<std::string, float> ranksLine;
std::map<std::string, float> averages;
for (const auto& model : models) {
averages[model] = 0;
}
for (const auto& dataset : datasets) {
std::vector<std::pair<std::string, double>> ranksOrder;
for (const auto& model : models) {
double value = data[model].at(dataset).at(0).get<double>();
ranksOrder.push_back({ model, value });
averages[model] += value;
}
// Assign the ranks
ranksLine = assignRanks(ranksOrder);
@@ -86,10 +102,17 @@ namespace platform {
for (const auto& rank : ranks) {
ranks[rank.first] /= nDatasets;
}
// Average the scores
for (const auto& average : averages) {
averages[average.first] /= nDatasets;
}
// Get the model with the greater average score
greaterAverage = distance(averages.begin(), max_element(averages.begin(), averages.end(), [](const auto& l, const auto& r) { return l.second < r.second; }));
}
void Statistics::computeWTL()
{
// Compute the WTL matrix
const double practical_threshold = 0.0005;
// Compute the WTL matrix (Win Tie Loss)
for (int i = 0; i < nModels; ++i) {
wtl[i] = { 0, 0, 0 };
}
@@ -102,23 +125,85 @@ namespace platform {
continue;
}
double value = data[models[i]].at(item.key()).at(0).get<double>();
if (value < controlValue) {
wtl[i].win++;
} else if (value == controlValue) {
double diff = controlValue - value; // control comparison
if (std::fabs(diff) <= practical_threshold) {
wtl[i].tie++;
} else if (diff < 0) {
wtl[i].win++;
} else {
wtl[i].loss++;
}
}
}
}
void Statistics::postHocHolmTest(bool friedmanResult)
int Statistics::getControlIdx()
{
if (!fitted) {
fit();
}
return controlIdx;
}
void Statistics::postHocTest()
{
if (score == "accuracy") {
postHocHolmTest();
} else {
postHocWilcoxonTest();
}
}
void Statistics::postHocWilcoxonTest()
{
if (!fitted) {
fit();
}
// Reference: Wilcoxon, F. (1945). “Individual Comparisons by Ranking Methods”. Biometrics Bulletin, 1(6), 80-83.
auto wilcoxon = WilcoxonTest(models, datasets, data, significance);
controlIdx = wilcoxon.getControlIdx();
postHocResults = wilcoxon.getPostHocResults();
setResultsOrder();
// Fill the ranks info
for (const auto& item : postHocResults) {
ranks[item.model] = item.rank;
}
Holm_Bonferroni();
restoreResultsOrder();
}
void Statistics::Holm_Bonferroni()
{
// The algorithm need the p-values sorted from the lowest to the highest
// Sort the models by p-value
std::sort(postHocResults.begin(), postHocResults.end(), [](const PostHocLine& a, const PostHocLine& b) {
return a.pvalue < b.pvalue;
});
// Holm adjustment
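// With p-values sorted ascending, adjusted p(i) = max(adjusted p(i-1), min(1, p(i) * (nModels - i)));
// the running max keeps the adjusted sequence monotone (Holm-Bonferroni step-down).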
for (int i = 0; i < postHocResults.size(); ++i) {
auto item = postHocResults.at(i);
double before = i == 0 ? 0.0 : postHocResults.at(i - 1).pvalue;
double p_value = std::min((long double)1.0, item.pvalue * (nModels - i));
p_value = std::max(before, p_value);
postHocResults[i].pvalue = p_value;
}
}
void Statistics::setResultsOrder()
{
int c = 0;
for (auto& item : postHocResults) {
item.idx = c++;
}
}
void Statistics::restoreResultsOrder()
{
// Restore the order of the results
std::sort(postHocResults.begin(), postHocResults.end(), [](const PostHocLine& a, const PostHocLine& b) {
return a.idx < b.idx;
});
}
void Statistics::postHocHolmTest()
{
if (!fitted) {
fit();
}
std::stringstream oss;
// Reference https://link.springer.com/article/10.1007/s44196-022-00083-8
// Post-hoc Holm test
// Calculate the p-value for the models paired with the control model
@@ -126,75 +211,67 @@ namespace platform {
boost::math::normal dist(0.0, 1.0);
double diff = sqrt(nModels * (nModels + 1) / (6.0 * nDatasets));
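// diff is the standard error of a rank difference under the Friedman null (Demšar 2006):
// z = (R_control - R_i) / sqrt(k (k + 1) / (6 N)), with k models and N datasets.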
for (int i = 0; i < nModels; i++) {
PostHocLine line;
line.model = models[i];
line.rank = ranks.at(models[i]);
line.wtl = wtl.at(i);
line.reject = false;
if (i == controlIdx) {
stats[i] = 0.0;
postHocResults.push_back(line);
continue;
}
double z = abs(ranks.at(models[controlIdx]) - ranks.at(models[i])) / diff;
double p_value = (long double)2 * (1 - cdf(dist, z));
stats[i] = p_value;
double z = std::abs(ranks.at(models[controlIdx]) - ranks.at(models[i])) / diff;
line.pvalue = (long double)2 * (1 - cdf(dist, z));
line.reject = (line.pvalue < significance);
postHocResults.push_back(line);
}
// Sort the models by p-value
std::vector<std::pair<int, double>> statsOrder;
for (const auto& stat : stats) {
statsOrder.push_back({ stat.first, stat.second });
}
std::sort(statsOrder.begin(), statsOrder.end(), [](const std::pair<int, double>& a, const std::pair<int, double>& b) {
return a.second < b.second;
std::sort(postHocResults.begin(), postHocResults.end(), [](const PostHocLine& a, const PostHocLine& b) {
return a.rank < b.rank;
});
setResultsOrder();
Holm_Bonferroni();
restoreResultsOrder();
}
// Holm adjustment
for (int i = 0; i < statsOrder.size(); ++i) {
auto item = statsOrder.at(i);
double before = i == 0 ? 0.0 : statsOrder.at(i - 1).second;
double p_value = std::min((double)1.0, item.second * (nModels - i));
p_value = std::max(before, p_value);
statsOrder[i] = { item.first, p_value };
}
holmResult.model = models.at(controlIdx);
void Statistics::postHocTestReport(bool friedmanResult, bool tex)
{
std::stringstream oss;
auto color = friedmanResult ? Colors::CYAN() : Colors::YELLOW();
oss << color;
oss << " *************************************************************************************************************" << std::endl;
oss << " Post-hoc Holm test: H0: 'There is no significant differences between the control model and the other models.'" << std::endl;
oss << " " << std::string(hlen + 25, '*') << std::endl;
oss << " Post-hoc " << postHocType << " test: H0: 'There is no significant differences between the control model and the other models.'" << std::endl;
oss << " Control model: " << models.at(controlIdx) << std::endl;
oss << " " << std::left << std::setw(maxModelName) << std::string("Model") << " p-value rank win tie loss Status" << std::endl;
oss << " " << std::string(maxModelName, '=') << " ============ ========= === === ==== =============" << std::endl;
// sort ranks from lowest to highest
std::vector<std::pair<std::string, float>> ranksOrder;
for (const auto& rank : ranks) {
ranksOrder.push_back({ rank.first, rank.second });
}
std::sort(ranksOrder.begin(), ranksOrder.end(), [](const std::pair<std::string, float>& a, const std::pair<std::string, float>& b) {
return a.second < b.second;
});
// Show the control model info.
oss << " " << Colors::BLUE() << std::left << std::setw(maxModelName) << ranksOrder.at(0).first << " ";
oss << std::setw(12) << " " << std::setprecision(7) << std::fixed << " " << ranksOrder.at(0).second << std::endl;
for (const auto& item : ranksOrder) {
auto idx = distance(models.begin(), find(models.begin(), models.end(), item.first));
double pvalue = 0.0;
for (const auto& stat : statsOrder) {
if (stat.first == idx) {
pvalue = stat.second;
}
}
holmResult.holmLines.push_back({ item.first, pvalue, item.second, wtl.at(idx), pvalue < significance });
if (item.first == models.at(controlIdx)) {
bool first = true;
for (const auto& item : postHocResults) {
if (first) {
oss << " " << Colors::BLUE() << std::left << std::setw(maxModelName) << item.model << " ";
oss << std::setw(12) << " " << std::setprecision(7) << std::fixed << " " << item.rank << std::endl;
first = false;
continue;
}
auto pvalue = item.pvalue;
auto colorStatus = pvalue > significance ? Colors::GREEN() : Colors::MAGENTA();
auto status = pvalue > significance ? Symbols::check_mark : Symbols::cross;
auto textStatus = pvalue > significance ? " accepted H0" : " rejected H0";
oss << " " << colorStatus << std::left << std::setw(maxModelName) << item.first << " ";
oss << std::setprecision(6) << std::scientific << pvalue << std::setprecision(7) << std::fixed << " " << item.second;
oss << " " << std::right << std::setw(3) << wtl.at(idx).win << " " << std::setw(3) << wtl.at(idx).tie << " " << std::setw(4) << wtl.at(idx).loss;
oss << " " << colorStatus << std::left << std::setw(maxModelName) << item.model << " ";
oss << std::setprecision(6) << std::scientific << pvalue << std::setprecision(7) << std::fixed << " " << item.rank;
oss << " " << std::right << std::setw(3) << item.wtl.win << " " << std::setw(3) << item.wtl.tie << " " << std::setw(4) << item.wtl.loss;
oss << " " << status << textStatus << std::endl;
}
oss << color << " *************************************************************************************************************" << std::endl;
oss << color << " " << std::string(hlen + 25, '*') << std::endl;
oss << Colors::RESET();
if (output) {
std::cout << oss.str();
}
if (tex) {
BestResultsTex bestResultsTex(score);
BestResultsMd bestResultsMd;
bestResultsTex.postHoc_test(postHocResults, postHocType, get_date() + " " + get_time());
bestResultsMd.postHoc_test(postHocResults, postHocType, get_date() + " " + get_time());
}
}
bool Statistics::friedmanTest()
{
@@ -205,7 +282,7 @@ namespace platform {
// Friedman test
// Calculate the Friedman statistic
oss << Colors::BLUE() << std::endl;
oss << "***************************************************************************************************************" << std::endl;
oss << std::string(hlen, '*') << std::endl;
oss << Colors::GREEN() << "Friedman test: H0: 'There is no significant differences between all the classifiers.'" << Colors::BLUE() << std::endl;
double degreesOfFreedom = nModels - 1.0;
double sumSquared = 0;
@@ -230,23 +307,11 @@ namespace platform {
oss << Colors::YELLOW() << "The null hypothesis H0 is accepted. Computed p-values will not be significant." << std::endl;
result = false;
}
oss << Colors::BLUE() << "***************************************************************************************************************" << Colors::RESET() << std::endl;
oss << Colors::BLUE() << std::string(hlen, '*') << Colors::RESET() << std::endl;
if (output) {
std::cout << oss.str();
}
friedmanResult = { friedmanQ, criticalValue, p_value, result };
return result;
}
FriedmanResult& Statistics::getFriedmanResult()
{
return friedmanResult;
}
HolmResult& Statistics::getHolmResult()
{
return holmResult;
}
std::map<std::string, std::map<std::string, float>>& Statistics::getRanks()
{
return ranksModels;
}
} // namespace platform

72
src/best/Statistics.h Normal file

@@ -0,0 +1,72 @@
#ifndef STATISTICS_H
#define STATISTICS_H
#include <iostream>
#include <vector>
#include <map>
#include <nlohmann/json.hpp>
namespace platform {
using json = nlohmann::ordered_json;
struct WTL {
uint win;
uint tie;
uint loss;
};
struct FriedmanResult {
double statistic;
double criticalValue;
long double pvalue;
bool reject;
};
struct PostHocLine {
uint idx; // index preserving the original order of the results
std::string model;
long double pvalue;
double rank;
WTL wtl;
bool reject;
};
class Statistics {
public:
Statistics(const std::string& score, const std::vector<std::string>& models, const std::vector<std::string>& datasets, const json& data, double significance = 0.05, bool output = true);
bool friedmanTest();
void postHocTest();
void postHocTestReport(bool friedmanResult, bool tex);
int getControlIdx();
FriedmanResult& getFriedmanResult() { return friedmanResult; }
std::vector<PostHocLine>& getPostHocResults() { return postHocResults; }
std::map<std::string, std::map<std::string, float>>& getRanks() { return ranksModels; } // ranks of the models per dataset
private:
void fit();
void postHocHolmTest();
void postHocWilcoxonTest();
void computeRanks();
void computeWTL();
void Holm_Bonferroni();
void setResultsOrder(); // Record the current order of the results so it can be restored after the Holm-Bonferroni adjustment
void restoreResultsOrder(); // Restore the order of the results after the Holm-Bonferroni adjustment
const std::string& score;
std::string postHocType;
const std::vector<std::string>& models;
const std::vector<std::string>& datasets;
const json& data;
double significance;
bool output;
bool fitted = false;
int nModels = 0;
int nDatasets = 0;
int controlIdx = 0;
int greaterAverage = -1; // The model with the greater average score
std::map<int, WTL> wtl;
std::map<std::string, float> ranks;
int maxModelName = 0;
int maxDatasetName = 0;
int hlen; // length of the line
FriedmanResult friedmanResult;
std::vector<PostHocLine> postHocResults;
std::map<std::string, std::map<std::string, float>> ranksModels;
};
}
#endif

245
src/best/WilcoxonTest.hpp Normal file

@@ -0,0 +1,245 @@
#ifndef BEST_WILCOXON_TEST_HPP
#define BEST_WILCOXON_TEST_HPP
// WilcoxonTest.hpp
// Standalone class for paired Wilcoxon signed-rank post-hoc analysis
// ------------------------------------------------------------------
// * Constructor takes the *already-loaded* nlohmann::json object plus the
// vectors of model and dataset names.
// * Internally selects a control model (highest average AUC) and builds all
// statistics (ranks, W/T/L counts, Wilcoxon p-values).
// * Public API:
// int getControlIdx() const noexcept;
// const std::vector<PostHocLine>& getPostHocResults() const noexcept;
//
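// Minimal usage sketch (hypothetical variable names; models, datasets and data are
// assumed to be the same objects handed to the Statistics class):
//     platform::WilcoxonTest test(models, datasets, data, 0.05);
//     int control = test.getControlIdx();
//     for (const auto& line : test.getPostHocResults()) {
//         // line.model, line.rank, line.pvalue, line.wtl, line.reject ...
//     }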
#include <vector>
#include <string>
#include <cmath>
#include <algorithm>
#include <numeric>
#include <limits>
#include <nlohmann/json.hpp>
#include "Statistics.h"
namespace platform {
class WilcoxonTest {
public:
WilcoxonTest(const std::vector<std::string>& models, const std::vector<std::string>& datasets,
const json& data, double alpha = 0.05) : models_(models), datasets_(datasets), data_(data), alpha_(alpha)
{
buildAUCTable(); // extracts all AUCs into a dense matrix
computeAverageAUCs(); // per-model mean (→ control selection)
computeAverageRanks(); // Friedman-style ranks per model
selectControlModel(); // sets control_idx_
buildPostHocResult(); // fills postHocResult_
}
int getControlIdx() const noexcept { return control_idx_; }
const std::vector<PostHocLine>& getPostHocResults() const noexcept { return postHocResults_; }
private:
//-------------------------------------------------- helper structs ----
// When a value is missing we keep NaN so that ordinary arithmetic still
// works (NaN simply propagates and we can test with std::isnan).
using Matrix = std::vector<std::vector<double>>; // [model][dataset]
//------------------------------------------------- implementation ----
void buildAUCTable()
{
const std::size_t M = models_.size();
const std::size_t D = datasets_.size();
auc_.assign(M, std::vector<double>(D, std::numeric_limits<double>::quiet_NaN()));
for (std::size_t i = 0; i < M; ++i) {
const auto& model = models_[i];
for (std::size_t j = 0; j < D; ++j) {
const auto& ds = datasets_[j];
try {
auc_[i][j] = data_.at(model).at(ds).at(0).get<double>();
}
catch (...) {
// leave as NaN when value missing
}
}
}
}
void computeAverageAUCs()
{
const std::size_t M = models_.size();
avg_auc_.resize(M, std::numeric_limits<double>::quiet_NaN());
for (std::size_t i = 0; i < M; ++i) {
double sum = 0.0;
std::size_t cnt = 0;
for (double v : auc_[i]) {
if (!std::isnan(v)) { sum += v; ++cnt; }
}
avg_auc_[i] = cnt ? sum / cnt : std::numeric_limits<double>::quiet_NaN();
}
}
// Average rank across datasets (1 = best).
void computeAverageRanks()
{
const std::size_t M = models_.size();
const std::size_t D = datasets_.size();
rank_sum_.assign(M, 0.0);
rank_cnt_.assign(M, 0);
const double EPS = 1e-10;
for (std::size_t j = 0; j < D; ++j) {
// Collect present values for this dataset
std::vector<std::pair<double, std::size_t>> vals; // (auc, model_idx)
vals.reserve(M);
for (std::size_t i = 0; i < M; ++i) {
if (!std::isnan(auc_[i][j]))
vals.emplace_back(auc_[i][j], i);
}
if (vals.empty()) continue; // no info for this dataset
// Sort descending (higher AUC better)
std::sort(vals.begin(), vals.end(), [](auto a, auto b) {
return a.first > b.first;
});
// Assign ranks with average for ties
std::size_t k = 0;
while (k < vals.size()) {
std::size_t l = k + 1;
while (l < vals.size() && std::fabs(vals[l].first - vals[k].first) < EPS) ++l;
const double avg_rank = (k + 1 + l) * 0.5; // average of ranks (1-based)
for (std::size_t m = k; m < l; ++m) {
const auto idx = vals[m].second;
rank_sum_[idx] += avg_rank;
++rank_cnt_[idx];
}
k = l;
}
}
// Final average
avg_rank_.resize(M, std::numeric_limits<double>::quiet_NaN());
for (std::size_t i = 0; i < M; ++i) {
avg_rank_[i] = rank_cnt_[i] ? rank_sum_[i] / rank_cnt_[i]
: std::numeric_limits<double>::quiet_NaN();
}
}
void selectControlModel()
{
// pick model with highest average AUC (ties → first)
control_idx_ = 0;
for (std::size_t i = 1; i < avg_auc_.size(); ++i) {
if (avg_auc_[i] > avg_auc_[control_idx_]) control_idx_ = static_cast<int>(i);
}
}
void buildPostHocResult()
{
const std::size_t M = models_.size();
const std::size_t D = datasets_.size();
const std::string& control_name = models_[control_idx_];
const double practical_threshold = 0.0005; // same heuristic as original code
for (std::size_t i = 0; i < M; ++i) {
PostHocLine line;
line.model = models_[i];
line.rank = avg_auc_[i]; // the rank field carries the mean AUC here; results are sorted by it below
WTL wtl = { 0, 0, 0 }; // win, tie, loss
std::vector<double> differences;
differences.reserve(D);
for (std::size_t j = 0; j < D; ++j) {
double auc_control = auc_[control_idx_][j];
double auc_other = auc_[i][j];
if (std::isnan(auc_control) || std::isnan(auc_other)) continue;
double diff = auc_control - auc_other; // control - comparison model
if (std::fabs(diff) <= practical_threshold) {
++wtl.tie;
} else if (diff < 0) {
++wtl.win; // comparison wins
} else {
++wtl.loss; // control wins
}
differences.push_back(diff);
}
line.wtl = wtl;
line.pvalue = differences.empty() ? 1.0L : static_cast<long double>(wilcoxonSignedRankTest(differences));
line.reject = (line.pvalue < alpha_);
postHocResults_.push_back(std::move(line));
}
// Sort results by rank (descending)
std::sort(postHocResults_.begin(), postHocResults_.end(), [](const PostHocLine& a, const PostHocLine& b) {
return a.rank > b.rank;
});
}
// ------------------------------------------------ Wilcoxon (private) --
static double wilcoxonSignedRankTest(const std::vector<double>& diffs)
{
if (diffs.empty()) return 1.0;
// Build |diff| + sign vector (exclude zeros)
struct Node { double absval; int sign; };
std::vector<Node> v;
v.reserve(diffs.size());
for (double d : diffs) {
if (d != 0.0) v.push_back({ std::fabs(d), d > 0 ? 1 : -1 });
}
if (v.empty()) return 1.0;
// Sort by absolute value
std::sort(v.begin(), v.end(), [](const Node& a, const Node& b) { return a.absval < b.absval; });
const double EPS = 1e-10;
const std::size_t n = v.size();
std::vector<double> ranks(n, 0.0);
std::size_t i = 0;
while (i < n) {
std::size_t j = i + 1;
while (j < n && std::fabs(v[j].absval - v[i].absval) < EPS) ++j;
double avg_rank = (i + 1 + j) * 0.5; // 1-based ranks
for (std::size_t k = i; k < j; ++k) ranks[k] = avg_rank;
i = j;
}
double w_plus = 0.0, w_minus = 0.0;
for (std::size_t k = 0; k < n; ++k) {
if (v[k].sign > 0) w_plus += ranks[k];
else w_minus += ranks[k];
}
double w = std::min(w_plus, w_minus);
double mean_w = n * (n + 1) / 4.0;
double sd_w = std::sqrt(n * (n + 1) * (2 * n + 1) / 24.0);
if (sd_w == 0.0) return 1.0; // degenerate (all diffs identical)
double z = (w - mean_w) / sd_w;
double p_two = std::erfc(std::fabs(z) / std::sqrt(2.0)); // 2-sided tail
return p_two;
}
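// Restating the approximation used above: with n non-zero differences and average
// ranks assigned to ties in |diff|,
//     W = min(W+, W-),   mu_W = n (n + 1) / 4,   sigma_W = sqrt( n (n + 1) (2 n + 1) / 24 )
//     z = (W - mu_W) / sigma_W,   p = erfc(|z| / sqrt(2))   (two-sided)
// No continuity correction is applied.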
//-------------------------------------------------------- data ----
std::vector<std::string> models_;
std::vector<std::string> datasets_;
json data_;
double alpha_;
Matrix auc_; // [model][dataset]
std::vector<double> avg_auc_; // mean AUC per model
std::vector<double> avg_rank_; // mean rank per model
std::vector<double> rank_sum_; // helper for ranks
std::vector<int> rank_cnt_; // datasets counted per model
int control_idx_ = -1;
std::vector<PostHocLine> postHocResults_;
};
} // namespace platform
#endif // BEST_WILCOXON_TEST_HPP

91
src/commands/b_best.cpp Normal file

@@ -0,0 +1,91 @@
#include <iostream>
#include <argparse/argparse.hpp>
#include "main/Models.h"
#include "main/modelRegister.h"
#include "common/Paths.h"
#include "common/Colors.h"
#include "common/Utils.h"
#include "best/BestResults.h"
#include "common/DotEnv.h"
#include "config_platform.h"
void manageArguments(argparse::ArgumentParser& program)
{
auto env = platform::DotEnv();
program.add_argument("-m", "--model").help("Model to use or any").default_value("any");
program.add_argument("--folder").help("Results folder to use").default_value(platform::Paths::results());
program.add_argument("-d", "--dataset").default_value("any").help("Filter results of the selected model) (any for all datasets)");
program.add_argument("-s", "--score").default_value(env.get("score")).help("Filter results of the score name supplied");
program.add_argument("--friedman").help("Friedman test").default_value(false).implicit_value(true);
program.add_argument("--excel").help("Output to excel").default_value(false).implicit_value(true);
program.add_argument("--tex").help("Output results to TeX & Markdown files").default_value(false).implicit_value(true);
program.add_argument("--index").help("In tex output show the index of the dataset instead of the name to save space").default_value(false).implicit_value(true);
program.add_argument("--level").help("significance level").default_value(0.05).scan<'g', double>().action([](const std::string& value) {
try {
auto k = std::stod(value);
if (k < 0.01 || k > 0.15) {
throw std::runtime_error("Significance level hast to be a number in [0.01, 0.15]");
}
return k;
}
catch (const std::runtime_error& err) {
throw std::runtime_error(err.what());
}
catch (...) {
throw std::runtime_error("Number of folds must be an decimal number");
}});
}
int main(int argc, char** argv)
{
argparse::ArgumentParser program("b_best", { platform_project_version.begin(), platform_project_version.end() });
manageArguments(program);
std::string model, dataset, score, folder;
bool build, report, friedman, excel, tex, index;
double level;
try {
program.parse_args(argc, argv);
model = program.get<std::string>("model");
folder = program.get<std::string>("folder");
if (folder.back() != '/') {
folder += '/';
}
dataset = program.get<std::string>("dataset");
score = program.get<std::string>("score");
friedman = program.get<bool>("friedman");
excel = program.get<bool>("excel");
tex = program.get<bool>("tex");
index = program.get<bool>("index");
level = program.get<double>("level");
if (model == "" || score == "") {
throw std::runtime_error("Model and score name must be supplied");
}
if (friedman && (model != "any" || dataset != "any")) {
std::cerr << "Friedman test can only be used with all models and all the datasets" << std::endl;
std::cerr << program;
exit(1);
}
}
catch (const std::exception& err) {
std::cerr << err.what() << std::endl;
std::cerr << program;
exit(1);
}
// Generate report
auto results = platform::BestResults(folder, score, model, dataset, friedman, level);
if (model == "any") {
results.buildAll();
results.reportAll(excel, tex, index);
} else {
std::string fileName = results.build();
std::cout << Colors::GREEN() << fileName << " created!" << Colors::RESET() << std::endl;
results.reportSingle(excel);
}
if (excel) {
auto fileName = results.getExcelFileName();
std::cout << "Opening " << fileName << std::endl;
platform::openFile(fileName);
}
std::cout << Colors::RESET();
return 0;
}
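// A hypothetical invocation (flags as defined in manageArguments above; the score
// value depends on the local .env defaults):
//     b_best -m any -s accuracy --friedman --level 0.05 --tex --excel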

318
src/commands/b_grid.cpp Normal file

@@ -0,0 +1,318 @@
#include <iostream>
#include <argparse/argparse.hpp>
#include <map>
#include <nlohmann/json.hpp>
#include <mpi.h>
#include "main/Models.h"
#include "main/ArgumentsExperiment.h"
#include "common/Paths.h"
#include "common/Timer.hpp"
#include "common/Colors.h"
#include "common/DotEnv.h"
#include "grid/GridSearch.h"
#include "grid/GridExperiment.h"
#include "config_platform.h"
using json = nlohmann::ordered_json;
const int MAXL = 133;
void assignModel(argparse::ArgumentParser& parser)
{
auto models = platform::Models::instance();
parser.add_argument("-m", "--model")
.help("Model to use " + models->toString())
.required()
.action([models](const std::string& value) {
static const std::vector<std::string> choices = models->getNames();
if (find(choices.begin(), choices.end(), value) != choices.end()) {
return value;
}
throw std::runtime_error("Model must be one of " + models->toString());
}
);
}
void add_search_args(argparse::ArgumentParser& program)
{
auto env = platform::DotEnv();
program.add_argument("--discretize").help("Discretize input datasets").default_value((bool)stoi(env.get("discretize"))).implicit_value(true);
program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value((bool)stoi(env.get("stratified"))).implicit_value(true);
program.add_argument("--quiet").help("Don't display detailed progress").default_value(false).implicit_value(true);
program.add_argument("--continue").help("Continue computing from that dataset").default_value(platform::GridSearch::NO_CONTINUE());
program.add_argument("--only").help("Used with continue to search with that dataset only").default_value(false).implicit_value(true);
program.add_argument("--exclude").default_value("[]").help("Datasets to exclude in json format, e.g. [\"dataset1\", \"dataset2\"]");
auto valid_choices = env.valid_tokens("smooth_strat");
auto& smooth_arg = program.add_argument("--smooth-strat").help("Smooth strategy used in Bayes Network node initialization. Valid values: " + env.valid_values("smooth_strat")).default_value(env.get("smooth_strat"));
for (auto choice : valid_choices) {
smooth_arg.choices(choice);
}
program.add_argument("--nested").help("Set the double/nested cross validation number of folds").default_value(5).scan<'i', int>().action([](const std::string& value) {
try {
auto k = stoi(value);
if (k < 2) {
throw std::runtime_error("Number of nested folds must be greater than 1");
}
return k;
}
catch (const runtime_error& err) {
throw std::runtime_error(err.what());
}
catch (...) {
throw std::runtime_error("Number of nested folds must be an integer");
}});
program.add_argument("--score").help("Score used in gridsearch").default_value("accuracy");
program.add_argument("-f", "--folds").help("Number of folds").default_value(stoi(env.get("n_folds"))).scan<'i', int>().action([](const std::string& value) {
try {
auto k = stoi(value);
if (k < 2) {
throw std::runtime_error("Number of folds must be greater than 1");
}
return k;
}
catch (const runtime_error& err) {
throw std::runtime_error(err.what());
}
catch (...) {
throw std::runtime_error("Number of folds must be an integer");
}});
auto seed_values = env.getSeeds();
program.add_argument("-s", "--seeds").nargs(1, 10).help("Random seeds. Set to -1 to have pseudo random").scan<'i', int>().default_value(seed_values);
}
std::string headerLine(const std::string& text, int utf = 0)
{
int n = MAXL - text.length() - 3;
n = n < 0 ? 0 : n;
return "* " + text + std::string(n + utf, ' ') + "*\n";
}
void list_dump(std::string& model)
{
auto data = platform::GridData(platform::Paths::grid_input(model));
std::cout << Colors::MAGENTA() << std::string(MAXL, '*') << std::endl;
std::cout << headerLine("Listing configuration input file (Grid)");
std::cout << headerLine("Model: " + model);
std::cout << Colors::MAGENTA() << std::string(MAXL, '*') << std::endl;
int index = 0;
int max_hyper = 15;
int max_dataset = 7;
auto combinations = data.getGridFile();
for (auto const& item : combinations) {
if (item.first.size() > max_dataset) {
max_dataset = item.first.size();
}
for (auto const& [key, value] : item.second.items()) {
if (value.dump().size() > max_hyper) {
max_hyper = value.dump().size();
}
}
}
std::cout << Colors::GREEN() << left << " # " << left << setw(max_dataset) << "Dataset" << " #Com. "
<< setw(max_hyper) << "Hyperparameters" << std::endl;
std::cout << "=== " << string(max_dataset, '=') << " ===== " << string(max_hyper, '=') << std::endl;
int i = 0;
for (auto const& item : combinations) {
auto color = (i++ % 2) ? Colors::CYAN() : Colors::BLUE();
std::cout << color;
auto num_combinations = data.getNumCombinations(item.first);
std::cout << setw(3) << fixed << right << ++index << left << " " << setw(max_dataset) << item.first
<< " " << setw(5) << right << num_combinations << " ";
std::string prefix = "";
for (auto const& [key, value] : item.second.items()) {
std::cout << prefix << setw(max_hyper) << std::left << value.dump() << std::endl;
prefix = string(11 + max_dataset, ' ');
}
}
std::cout << Colors::RESET() << std::endl;
}
void list_results(json& results, std::string& model)
{
std::cout << Colors::MAGENTA() << std::string(MAXL, '*') << std::endl;
std::cout << headerLine("Listing computed hyperparameters for model " + model);
std::cout << headerLine("Date & time: " + results["date"].get<std::string>() + " Duration: " + results["duration"].get<std::string>());
std::cout << headerLine("Score: " + results["score"].get<std::string>());
std::cout << headerLine(
"Random seeds: " + results["seeds"].dump()
+ " Discretized: " + (results["discretize"].get<bool>() ? "True" : "False")
+ " Stratified: " + (results["stratified"].get<bool>() ? "True" : "False")
+ " #Folds: " + std::to_string(results["n_folds"].get<int>())
+ " Nested: " + (results["nested"].get<int>() == 0 ? "False" : to_string(results["nested"].get<int>()))
);
std::cout << std::string(MAXL, '*') << std::endl;
int spaces = 7;
int hyperparameters_spaces = 15;
nlohmann::json temp = results["results"]; // To show in alphabetical order of the dataset
for (const auto& item : temp.items()) {
auto key = item.key();
auto value = item.value();
if (key.size() > spaces) {
spaces = key.size();
}
if (value["hyperparameters"].dump().size() > hyperparameters_spaces) {
hyperparameters_spaces = value["hyperparameters"].dump().size();
}
}
std::cout << Colors::GREEN() << " # " << left << setw(spaces) << "Dataset" << " " << setw(19) << "Date" << " "
<< "Duration " << setw(8) << "Score" << " " << "Hyperparameters" << std::endl;
std::cout << "=== " << string(spaces, '=') << " " << string(19, '=') << " " << string(8, '=') << " "
<< string(8, '=') << " " << string(hyperparameters_spaces, '=') << std::endl;
int index = 0;
for (const auto& item : temp.items()) {
auto color = (index % 2) ? Colors::CYAN() : Colors::BLUE();
auto value = item.value();
std::cout << color;
std::cout << std::setw(3) << std::right << index++ << " ";
std::cout << left << setw(spaces) << item.key() << " " << value["date"].get<string>()
<< " " << setw(8) << right << value["duration"].get<string>() << " " << setw(8) << setprecision(6)
<< fixed << right << value["score"].get<double>() << " " << value["hyperparameters"].dump() << std::endl;
}
std::cout << Colors::RESET() << std::endl;
}
/*
* Main
*/
void dump(argparse::ArgumentParser& program)
{
auto model = program.get<std::string>("model");
list_dump(model);
}
void report(argparse::ArgumentParser& program)
{
// List results
struct platform::ConfigGrid config;
config.model = program.get<std::string>("model");
auto grid_search = platform::GridSearch(config);
auto results = grid_search.loadResults();
if (results.empty()) {
std::cout << "** No results found" << std::endl;
} else {
list_results(results, config.model);
}
}
void search(argparse::ArgumentParser& program)
{
struct platform::ConfigGrid config;
config.model = program.get<std::string>("model");
config.score = program.get<std::string>("score");
config.discretize = program.get<bool>("discretize");
config.stratified = program.get<bool>("stratified");
config.smooth_strategy = program.get<std::string>("smooth-strat");
config.n_folds = program.get<int>("folds");
config.quiet = program.get<bool>("quiet");
config.only = program.get<bool>("only");
config.seeds = program.get<std::vector<int>>("seeds");
config.nested = program.get<int>("nested");
config.continue_from = program.get<std::string>("continue");
if (config.continue_from == platform::GridSearch::NO_CONTINUE() && config.only) {
throw std::runtime_error("Cannot use --only without --continue");
}
auto excluded = program.get<std::string>("exclude");
config.excluded = json::parse(excluded);
platform::Paths::createPath(platform::Paths::grid());
auto grid_search = platform::GridSearch(config);
platform::Timer timer;
timer.start();
struct platform::ConfigMPI mpi_config;
mpi_config.manager = 0; // which process is the manager
MPI_Init(nullptr, nullptr);
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_config.rank);
MPI_Comm_size(MPI_COMM_WORLD, &mpi_config.n_procs);
if (mpi_config.n_procs < 2) {
throw std::runtime_error("Cannot use --search with less than 2 mpi processes, try mpirun -np 2 ...");
}
grid_search.go(mpi_config);
if (mpi_config.rank == mpi_config.manager) {
auto results = grid_search.loadResults();
std::cout << Colors::RESET() << "* Report of the computed hyperparameters" << std::endl;
list_results(results, config.model);
std::cout << "Process took " << timer.getDurationString() << std::endl;
}
MPI_Finalize();
}
void experiment(argparse::ArgumentParser& program)
{
struct platform::ConfigGrid config;
auto arguments = platform::ArgumentsExperiment(program, platform::experiment_t::GRID);
arguments.parse();
auto path_results = arguments.getPathResults();
auto grid_experiment = platform::GridExperiment(arguments, config);
platform::Timer timer;
timer.start();
struct platform::ConfigMPI mpi_config;
mpi_config.manager = 0; // which process is the manager
MPI_Init(nullptr, nullptr);
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_config.rank);
MPI_Comm_size(MPI_COMM_WORLD, &mpi_config.n_procs);
if (mpi_config.n_procs < 2) {
throw std::runtime_error("Cannot use --experiment with less than 2 mpi processes, try mpirun -np 2 ...");
}
grid_experiment.go(mpi_config);
if (mpi_config.rank == mpi_config.manager) {
auto experiment = grid_experiment.getExperiment();
std::cout << "* Report of the computed hyperparameters" << std::endl;
auto duration = timer.getDuration();
experiment.setDuration(duration);
if (grid_experiment.haveToSaveResults()) {
experiment.saveResult(path_results);
}
experiment.report();
std::cout << "Process took " << duration << std::endl;
}
MPI_Finalize();
}
int main(int argc, char** argv)
{
//
// Manage arguments
//
argparse::ArgumentParser program("b_grid", { platform_project_version.begin(), platform_project_version.end() });
// grid dump subparser
argparse::ArgumentParser dump_command("dump");
dump_command.add_description("Dump the combinations of hyperparameters of a model.");
assignModel(dump_command);
// grid report subparser
argparse::ArgumentParser report_command("report");
assignModel(report_command);
report_command.add_description("Report the computed hyperparameters of a model.");
// grid search subparser
argparse::ArgumentParser search_command("search");
search_command.add_description("Search using mpi the hyperparameters of a model.");
assignModel(search_command);
add_search_args(search_command);
// grid experiment subparser
argparse::ArgumentParser experiment_command("experiment");
experiment_command.add_description("Experiment like b_main using mpi.");
auto arguments = platform::ArgumentsExperiment(experiment_command, platform::experiment_t::GRID);
arguments.add_arguments();
program.add_subparser(dump_command);
program.add_subparser(report_command);
program.add_subparser(search_command);
program.add_subparser(experiment_command);
//
// Process options
//
try {
program.parse_args(argc, argv);
bool found = false;
map<std::string, void(*)(argparse::ArgumentParser&)> commands = { {"dump", &dump}, {"report", &report}, {"search", &search}, { "experiment",&experiment } };
for (const auto& command : commands) {
if (program.is_subcommand_used(command.first)) {
std::invoke(command.second, program.at<argparse::ArgumentParser>(command.first));
found = true;
break;
}
}
if (!found) {
throw std::runtime_error("You must specify one of the following commands: dump, experiment, report, search \n");
}
}
catch (const exception& err) {
cerr << err.what() << std::endl;
cerr << program;
exit(1);
}
std::cout << "Done!" << std::endl;
return 0;
}
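// Hypothetical invocations of the subcommands defined above (the model name is a
// placeholder; search and experiment need at least two MPI processes, as enforced above):
//     b_grid dump -m ModelName
//     mpirun -np 4 b_grid search -m ModelName --score accuracy --folds 5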

119
src/commands/b_list.cpp Normal file

@@ -0,0 +1,119 @@
#include <iostream>
#include <locale>
#include <map>
#include <argparse/argparse.hpp>
#include <nlohmann/json.hpp>
#include "main/Models.h"
#include "main/modelRegister.h"
#include "common/Paths.h"
#include "common/Colors.h"
#include "common/Datasets.h"
#include "common/Utils.h"
#include "reports/DatasetsExcel.h"
#include "reports/DatasetsConsole.h"
#include "results/ResultsDatasetConsole.h"
#include "results/ResultsDataset.h"
#include "results/ResultsDatasetExcel.h"
#include "config_platform.h"
void list_datasets(argparse::ArgumentParser& program)
{
auto excel = program.get<bool>("excel");
auto report = platform::DatasetsConsole();
report.report();
std::cout << report.getOutput();
if (excel) {
auto data = report.getData();
auto ereport = new platform::DatasetsExcel();
ereport->report(data);
std::cout << std::endl << Colors::GREEN() << "Output saved in " << ereport->getFileName() << std::endl;
auto fileName = ereport->getExcelFileName();
delete ereport;
std::cout << "Opening " << fileName << std::endl;
platform::openFile(fileName);
}
}
void list_results(argparse::ArgumentParser& program)
{
auto dataset = program.get<string>("dataset");
auto score = program.get<string>("score");
auto model = program.get<string>("model");
auto excel = program.get<bool>("excel");
auto report = platform::ResultsDatasetsConsole();
if (!report.report(dataset, score, model))
return;
std::cout << report.getOutput();
if (excel) {
auto data = report.getData();
auto ereport = new platform::ResultsDatasetExcel();
ereport->report(data);
std::cout << std::endl << Colors::GREEN() << "Output saved in " << ereport->getFileName() << std::endl;
auto fileName = ereport->getExcelFileName();
delete ereport;
std::cout << "Opening " << fileName << std::endl;
platform::openFile(fileName);
}
}
int main(int argc, char** argv)
{
argparse::ArgumentParser program("b_list", { platform_project_version.begin(), platform_project_version.end() });
//
// datasets subparser
//
argparse::ArgumentParser datasets_command("datasets");
datasets_command.add_description("List datasets available in the platform.");
datasets_command.add_argument("--excel").help("Output in Excel format").default_value(false).implicit_value(true);
//
// results subparser
//
argparse::ArgumentParser results_command("results");
results_command.add_description("List the results of a given dataset.");
auto datasets = platform::Datasets(false, platform::Paths::datasets());
results_command.add_argument("-d", "--dataset")
.help("Dataset to use " + datasets.toString())
.required()
.action([](const std::string& value) {
auto datasets = platform::Datasets(false, platform::Paths::datasets());
static const std::vector<std::string> choices = datasets.getNames();
if (find(choices.begin(), choices.end(), value) != choices.end()) {
return value;
}
throw std::runtime_error("Dataset must be one of " + datasets.toString());
}
);
results_command.add_argument("-m", "--model")
.help("Model to use or any")
.default_value("any");
results_command.add_argument("--excel").help("Output in Excel format").default_value(false).implicit_value(true);
results_command.add_argument("-s", "--score").default_value("accuracy").help("Filter results of the score name supplied");
// Add subparsers
program.add_subparser(datasets_command);
program.add_subparser(results_command);
// Parse command line and execute
try {
program.parse_args(argc, argv);
bool found = false;
map<std::string, void(*)(argparse::ArgumentParser&)> commands = { {"datasets", &list_datasets}, {"results", &list_results} };
for (const auto& command : commands) {
if (program.is_subcommand_used(command.first)) {
std::invoke(command.second, program.at<argparse::ArgumentParser>(command.first));
found = true;
break;
}
}
if (!found) {
throw std::runtime_error("You must specify one of the following commands: {datasets, results}\n");
}
}
catch (const exception& err) {
cerr << err.what() << std::endl;
cerr << program;
exit(1);
}
std::cout << Colors::RESET() << std::endl;
return 0;
}
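// Hypothetical invocations matching the two subparsers above (the dataset name is a
// placeholder taken from the catalog):
//     b_list datasets --excel
//     b_list results -d iris -m any -s accuracy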

37
src/commands/b_main.cpp Normal file

@@ -0,0 +1,37 @@
#include <argparse/argparse.hpp>
#include "main/Experiment.h"
#include "main/ArgumentsExperiment.h"
#include "config_platform.h"
using json = nlohmann::ordered_json;
int main(int argc, char** argv)
{
argparse::ArgumentParser program("b_main", { platform_project_version.begin(), platform_project_version.end() });
auto arguments = platform::ArgumentsExperiment(program, platform::experiment_t::NORMAL);
arguments.add_arguments();
arguments.parse_args(argc, argv);
/*
* Begin Processing
*/
// Initialize the experiment class with the command line arguments
auto experiment = arguments.initializedExperiment();
auto path_results = arguments.getPathResults();
platform::Timer timer;
timer.start();
experiment.go();
experiment.setDuration(timer.getDuration());
if (!arguments.isQuiet()) {
// Classification report if only one dataset is tested
experiment.report();
}
if (arguments.haveToSaveResults()) {
experiment.saveResult(path_results);
}
if (arguments.doGraph()) {
experiment.saveGraph();
}
return 0;
}

85
src/commands/b_manage.cpp Normal file

@@ -0,0 +1,85 @@
#include <utility>
#include <iostream>
#include <sys/ioctl.h>
#include "common/Paths.h"
#include <argparse/argparse.hpp>
#include "manage/ManageScreen.h"
#include <signal.h>
#include "config_platform.h"
platform::ManageScreen* manager = nullptr;
void manageArguments(argparse::ArgumentParser& program, int argc, char** argv)
{
program.add_argument("-m", "--model").default_value("any").help("Filter results of the selected model)");
program.add_argument("-s", "--score").default_value("any").help("Filter results of the score name supplied");
program.add_argument("--folder").help("Results folder to use").default_value(platform::Paths::results());
program.add_argument("--platform").default_value("any").help("Filter results of the selected platform");
program.add_argument("--complete").help("Show only results with all datasets").default_value(false).implicit_value(true);
program.add_argument("--partial").help("Show only partial results").default_value(false).implicit_value(true);
program.add_argument("--compare").help("Compare with best results").default_value(false).implicit_value(true);
try {
program.parse_args(argc, argv);
auto platform = program.get<std::string>("platform");
auto model = program.get<std::string>("model");
auto score = program.get<std::string>("score");
auto complete = program.get<bool>("complete");
auto partial = program.get<bool>("partial");
auto compare = program.get<bool>("compare");
}
catch (const std::exception& err) {
std::cerr << err.what() << std::endl;
std::cerr << program;
exit(1);
}
}
std::pair<int, int> numRowsCols()
{
#ifdef TIOCGSIZE
struct ttysize ts;
ioctl(STDIN_FILENO, TIOCGSIZE, &ts);
return { ts.ts_lines, ts.ts_cols };
#elif defined(TIOCGWINSZ)
struct winsize ts;
ioctl(STDIN_FILENO, TIOCGWINSZ, &ts);
return { ts.ws_row, ts.ws_col };
#else
return { 24, 80 }; // assumed default size when neither terminal-size ioctl is available
#endif /* TIOCGSIZE */
}
void handleResize(int sig)
{
auto [rows, cols] = numRowsCols();
manager->updateSize(rows, cols);
}
int main(int argc, char** argv)
{
auto program = argparse::ArgumentParser("b_manage", { platform_project_version.begin(), platform_project_version.end() });
manageArguments(program, argc, argv);
std::string model = program.get<std::string>("model");
std::string path = program.get<std::string>("folder");
if (path.back() != '/') {
path += '/';
}
std::string score = program.get<std::string>("score");
std::string platform = program.get<std::string>("platform");
bool complete = program.get<bool>("complete");
bool partial = program.get<bool>("partial");
bool compare = program.get<bool>("compare");
if (complete)
partial = false;
signal(SIGWINCH, handleResize);
auto [rows, cols] = numRowsCols();
manager = new platform::ManageScreen(path, rows, cols, model, score, platform, complete, partial, compare);
manager->doMenu();
auto fileName = manager->getExcelFileName();
delete manager;
if (!fileName.empty()) {
std::cout << "Opening " << fileName << std::endl;
platform::openFile(fileName);
}
return 0;
}

102
src/commands/b_results.cpp Normal file

@@ -0,0 +1,102 @@
#include <iostream>
#include <filesystem>
#include <fstream>
#include <vector>
#include "nlohmann/json.hpp"
#include "argparse/argparse.hpp"
#include "common/Paths.h"
#include "results/JsonValidator.h"
#include "results/SchemaV1_0.h"
#include "config_platform.h"
using json = nlohmann::json;
namespace fs = std::filesystem;
void header(const std::string& message, int length, const std::string& symbol)
{
std::cout << std::string(length + 11, symbol[0]) << std::endl;
std::cout << symbol << " " << std::setw(length + 7) << std::left << message << " " << symbol << std::endl;
std::cout << std::string(length + 11, symbol[0]) << std::endl;
}
int main(int argc, char* argv[])
{
argparse::ArgumentParser program("b_results", { platform_project_version.begin(), platform_project_version.end() });
program.add_description("Check the results files and optionally fixes them.");
program.add_argument("--fix").help("Fix any errors in results").default_value(false).implicit_value(true);
program.add_argument("--file").help("check only this results file").default_value("");
std::string nameSuffix = "results_";
std::string schemaVersion = "1.0";
bool fix_it = false;
std::string selected_file;
try {
program.parse_args(argc, argv);
fix_it = program.get<bool>("fix");
selected_file = program.get<std::string>("file");
}
catch (const std::exception& err) {
std::cerr << err.what() << std::endl;
std::cerr << program;
exit(1);
}
//
// Determine the files to process
//
std::vector<std::string> result_files;
int max_length = 0;
if (selected_file != "") {
if (!selected_file.starts_with(platform::Paths::results())) {
selected_file = platform::Paths::results() + selected_file;
}
// Only check the selected file
result_files.push_back(selected_file);
max_length = selected_file.length();
} else {
// Load the result files and find the longest file name
for (const auto& entry : fs::directory_iterator(platform::Paths::results())) {
if (entry.is_regular_file() && entry.path().filename().string().starts_with(nameSuffix) && entry.path().filename().string().ends_with(".json")) {
std::string fileName = entry.path().string();
if (fileName.length() > max_length) {
max_length = fileName.length();
}
result_files.push_back(fileName);
}
}
}
//
// Process the results files
//
if (result_files.empty()) {
std::cerr << "Error: No result files found." << std::endl;
return 1;
}
std::string header_message = "Processing " + std::to_string(result_files.size()) + " result files.";
header(header_message, max_length, "*");
platform::JsonValidator validator(platform::SchemaV1_0::schema);
int n_errors = 0;
std::vector<std::string> files_with_errors;
for (const auto& file_name : result_files) {
std::vector<std::string> errors = validator.validate_file(file_name);
if (!errors.empty()) {
n_errors++;
std::cout << std::setw(max_length) << std::left << file_name << ": " << errors.size() << " Errors:" << std::endl;
for (const auto& error : errors) {
std::cout << " - " << error << std::endl;
}
if (fix_it) {
validator.fix_it(file_name);
std::cout << " -> File fixed." << std::endl;
}
files_with_errors.push_back(file_name);
}
}
if (n_errors == 0) {
header("All files are valid.", max_length, "*");
} else {
std::string verb = (fix_it) ? "had" : "have";
std::string msg = std::to_string(n_errors) + " files " + verb + " errors.";
header(msg, max_length, "*");
for (const auto& file_name : files_with_errors) {
std::cout << "- " << file_name << std::endl;
}
}
return 0;
}


@@ -1,5 +1,5 @@
#ifndef LOCALE_H
#define LOCALE_H
#ifndef CLOCALE_H
#define CLOCALE_H
#include <locale>
#include <iostream>
#include <string>
@@ -19,4 +19,4 @@ namespace platform {
}
};
}
#endif
#endif

30
src/common/Colors.h Normal file

@@ -0,0 +1,30 @@
#ifndef COLORS_H
#define COLORS_H
#include <string>
class Colors {
public:
static std::string BLACK() { return "\033[1;30m"; }
static std::string IBLACK() { return "\033[0;90m"; }
static std::string BLUE() { return "\033[1;34m"; }
static std::string IBLUE() { return "\033[0;94m"; }
static std::string CYAN() { return "\033[1;36m"; }
static std::string ICYAN() { return "\033[0;96m"; }
static std::string GREEN() { return "\033[1;32m"; }
static std::string IGREEN() { return "\033[0;92m"; }
static std::string MAGENTA() { return "\033[1;35m"; }
static std::string IMAGENTA() { return "\033[0;95m"; }
static std::string RED() { return "\033[1;31m"; }
static std::string IRED() { return "\033[0;91m"; }
static std::string YELLOW() { return "\033[1;33m"; }
static std::string IYELLOW() { return "\033[0;93m"; }
static std::string WHITE() { return "\033[1;37m"; }
static std::string IWHITE() { return "\033[0;97m"; }
static std::string RESET() { return "\033[0m"; }
static std::string BOLD() { return "\033[1m"; }
static std::string UNDERLINE() { return "\033[4m"; }
static std::string BLINK() { return "\033[5m"; }
static std::string REVERSE() { return "\033[7m"; }
static std::string CONCEALED() { return "\033[8m"; }
static std::string CLRSCR() { return "\033[2J\033[1;1H"; }
};
#endif

278
src/common/Dataset.cpp Normal file

@@ -0,0 +1,278 @@
#include <ArffFiles.hpp>
#include <fstream>
#include "Dataset.h"
namespace platform {
const std::string message_dataset_not_loaded = "Dataset not loaded.";
Dataset::Dataset(const Dataset& dataset) :
path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples),
n_features(dataset.n_features), numericFeatures(dataset.numericFeatures), features(dataset.features),
states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y),
X_train(dataset.X_train), X_test(dataset.X_test), Xv(dataset.Xv), yv(dataset.yv),
fileType(dataset.fileType)
{
}
std::string Dataset::getName() const
{
return name;
}
std::vector<std::string> Dataset::getFeatures() const
{
if (loaded) {
return features;
} else {
throw std::invalid_argument(message_dataset_not_loaded);
}
}
int Dataset::getNFeatures() const
{
if (loaded) {
return n_features;
} else {
throw std::invalid_argument(message_dataset_not_loaded);
}
}
int Dataset::getNSamples() const
{
if (loaded) {
return n_samples;
} else {
throw std::invalid_argument(message_dataset_not_loaded);
}
}
std::string Dataset::getClassName() const
{
return className;
}
int Dataset::getNClasses() const
{
if (loaded) {
return *std::max_element(yv.begin(), yv.end()) + 1;
} else {
throw std::invalid_argument(message_dataset_not_loaded);
}
}
std::vector<std::string> Dataset::getLabels() const
{
// Return the labels factorization result
if (loaded) {
return labels;
} else {
throw std::invalid_argument(message_dataset_not_loaded);
}
}
std::vector<int> Dataset::getClassesCounts() const
{
if (loaded) {
std::vector<int> counts(*std::max_element(yv.begin(), yv.end()) + 1);
for (auto y : yv) {
counts[y]++;
}
return counts;
} else {
throw std::invalid_argument(message_dataset_not_loaded);
}
}
std::map<std::string, std::vector<int>> Dataset::getStates() const
{
if (loaded) {
return states;
} else {
throw std::invalid_argument(message_dataset_not_loaded);
}
}
pair<std::vector<std::vector<float>>&, std::vector<int>&> Dataset::getVectors()
{
if (loaded) {
return { Xv, yv };
} else {
throw std::invalid_argument(message_dataset_not_loaded);
}
}
pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
{
if (loaded) {
return { X, y };
} else {
throw std::invalid_argument(message_dataset_not_loaded);
}
}
void Dataset::load_csv()
{
ifstream file(path + "/" + name + ".csv");
if (!file.is_open()) {
throw std::invalid_argument("Unable to open dataset file.");
}
labels.clear();
std::string line;
getline(file, line);
std::vector<std::string> tokens = split(line, ',');
features = std::vector<std::string>(tokens.begin(), tokens.end() - 1);
if (className == "-1") {
className = tokens.back();
}
for (auto i = 0; i < features.size(); ++i) {
Xv.push_back(std::vector<float>());
}
while (getline(file, line)) {
tokens = split(line, ',');
for (auto i = 0; i < features.size(); ++i) {
Xv[i].push_back(stof(tokens[i]));
}
auto label = trim(tokens.back());
if (find(labels.begin(), labels.end(), label) == labels.end()) {
labels.push_back(label);
}
yv.push_back(stoi(label));
}
file.close();
}
void Dataset::computeStates()
{
for (int i = 0; i < features.size(); ++i) {
auto [max_value, idx] = torch::max(X_train.index({ i, "..." }), 0);
states[features[i]] = std::vector<int>(max_value.item<int>() + 1);
iota(begin(states.at(features[i])), end(states.at(features[i])), 0);
}
auto [max_value, idx] = torch::max(y_train, 0);
states[className] = std::vector<int>(max_value.item<int>() + 1);
iota(begin(states.at(className)), end(states.at(className)), 0);
}
void Dataset::load_arff()
{
auto arff = ArffFiles();
arff.load(path + "/" + name + ".arff", className);
// Get Dataset X, y
Xv = arff.getX();
yv = arff.getY();
// Get className & Features
className = arff.getClassName();
auto attributes = arff.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& attribute) { return attribute.first; });
labels = arff.getLabels();
}
std::vector<std::string> tokenize(std::string line)
{
std::vector<std::string> tokens;
for (auto i = 0; i < line.size(); ++i) {
if (line[i] == ' ' || line[i] == '\t' || line[i] == '\n') {
std::string token = line.substr(0, i);
tokens.push_back(token);
line.erase(line.begin(), line.begin() + i + 1);
i = 0;
while (line[i] == ' ' || line[i] == '\t' || line[i] == '\n')
line.erase(line.begin(), line.begin() + i + 1);
}
}
if (line.size() > 0) {
tokens.push_back(line);
}
return tokens;
}
void Dataset::load_rdata()
{
ifstream file(path + "/" + name + "_R.dat");
if (!file.is_open()) {
throw std::invalid_argument("Unable to open dataset file.");
}
std::string line;
labels.clear();
getline(file, line);
line = ArffFiles::trim(line);
std::vector<std::string> tokens = tokenize(line);
transform(tokens.begin(), tokens.end() - 1, back_inserter(features), [](const auto& attribute) { return ArffFiles::trim(attribute); });
if (className == "-1") {
className = ArffFiles::trim(tokens.back());
}
for (auto i = 0; i < features.size(); ++i) {
Xv.push_back(std::vector<float>());
}
while (getline(file, line)) {
tokens = tokenize(line);
// We have to skip the first token, which is the instance number.
for (auto i = 1; i < features.size() + 1; ++i) {
const float value = stof(tokens[i]);
Xv[i - 1].push_back(value);
}
auto label = trim(tokens.back());
if (find(labels.begin(), labels.end(), label) == labels.end()) {
labels.push_back(label);
}
yv.push_back(stoi(label));
}
file.close();
}
void Dataset::load()
{
if (loaded) {
return;
}
if (fileType == CSV) {
load_csv();
} else if (fileType == ARFF) {
load_arff();
} else if (fileType == RDATA) {
load_rdata();
}
n_samples = Xv[0].size();
n_features = Xv.size();
if (numericFeaturesIdx.size() == 0) {
numericFeatures = std::vector<bool>(n_features, false);
} else {
if (numericFeaturesIdx.at(0) == -1) {
numericFeatures = std::vector<bool>(n_features, true);
} else {
numericFeatures = std::vector<bool>(n_features, false);
for (auto i : numericFeaturesIdx) {
numericFeatures[i] = true;
}
}
}
// Build Tensors
X = torch::zeros({ n_features, n_samples }, torch::kFloat32);
for (int i = 0; i < features.size(); ++i) {
X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
}
y = torch::tensor(yv, torch::kInt32);
loaded = true;
}
std::tuple<torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&> Dataset::getTrainTestTensors(std::vector<int>& train, std::vector<int>& test)
{
if (!loaded) {
throw std::invalid_argument(message_dataset_not_loaded);
}
auto train_t = torch::tensor(train);
int samples_train = train.size();
int samples_test = test.size();
auto test_t = torch::tensor(test);
X_train = X.index({ "...", train_t });
y_train = y.index({ train_t });
X_test = X.index({ "...", test_t });
y_test = y.index({ test_t });
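// The discretizer below is fitted on the training split only and then applied to the
// test split, so no test information leaks into the learned cut points.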
if (discretize) {
auto discretizer = Discretization::instance()->create(discretizer_algorithm);
auto X_train_d = torch::zeros({ n_features, samples_train }, torch::kInt32);
auto X_test_d = torch::zeros({ n_features, samples_test }, torch::kInt32);
for (auto feature = 0; feature < n_features; ++feature) {
if (numericFeatures[feature]) {
auto feature_train = X_train.index({ feature, "..." });
auto feature_test = X_test.index({ feature, "..." });
auto feature_train_disc = discretizer->fit_transform_t(feature_train, y_train);
auto feature_test_disc = discretizer->transform_t(feature_test);
X_train_d.index_put_({ feature, "..." }, feature_train_disc);
X_test_d.index_put_({ feature, "..." }, feature_test_disc);
} else {
X_train_d.index_put_({ feature, "..." }, X_train.index({ feature, "..." }).to(torch::kInt32));
X_test_d.index_put_({ feature, "..." }, X_test.index({ feature, "..." }).to(torch::kInt32));
}
}
X_train = X_train_d;
X_test = X_test_d;
assert(X_train.dtype() == torch::kInt32);
assert(X_test.dtype() == torch::kInt32);
computeStates();
}
assert(y_train.dtype() == torch::kInt32);
assert(y_test.dtype() == torch::kInt32);
return { X_train, X_test, y_train, y_test };
}
}


@@ -4,75 +4,57 @@
#include <map>
#include <vector>
#include <string>
#include "CPPFImdlp.h"
#include <tuple>
#include <common/DiscretizationRegister.h>
#include "Utils.h"
#include "SourceData.h"
namespace platform {
enum fileType_t { CSV, ARFF, RDATA };
class SourceData {
public:
SourceData(std::string source)
{
if (source == "Surcov") {
path = "datasets/";
fileType = CSV;
} else if (source == "Arff") {
path = "datasets/";
fileType = ARFF;
} else if (source == "Tanveer") {
path = "data/";
fileType = RDATA;
} else {
throw std::invalid_argument("Unknown source.");
}
}
std::string getPath()
{
return path;
}
fileType_t getFileType()
{
return fileType;
}
private:
std::string path;
fileType_t fileType;
};
class Dataset {
public:
Dataset(const std::string& path, const std::string& name, const std::string& className, bool discretize, fileType_t fileType, std::vector<int> numericFeaturesIdx, std::string discretizer_algo = "none") :
path(path), name(name), className(className), discretize(discretize),
loaded(false), fileType(fileType), numericFeaturesIdx(numericFeaturesIdx), discretizer_algorithm(discretizer_algo)
{
};
explicit Dataset(const Dataset&);
std::string getName() const;
std::string getClassName() const;
int getNClasses() const;
std::vector<std::string> getLabels() const; // return the labels factorization result
std::vector<int> getClassesCounts() const;
std::vector<string> getFeatures() const;
std::map<std::string, std::vector<int>> getStates() const;
std::pair<vector<std::vector<float>>&, std::vector<int>&> getVectors();
std::pair<torch::Tensor&, torch::Tensor&> getTensors();
std::tuple<torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&> getTrainTestTensors(std::vector<int>& train, std::vector<int>& test);
int getNFeatures() const;
int getNSamples() const;
std::vector<bool>& getNumericFeatures() { return numericFeatures; }
void load();
const bool inline isLoaded() const { return loaded; };
private:
std::string path;
std::string name;
fileType_t fileType;
std::string className;
int n_samples{ 0 }, n_features{ 0 };
std::vector<int> numericFeaturesIdx;
std::string discretizer_algorithm;
std::vector<bool> numericFeatures; // true if feature is numeric
std::vector<std::string> features;
std::vector<std::string> labels;
std::map<std::string, std::vector<int>> states;
bool loaded;
bool discretize;
torch::Tensor X, y;
torch::Tensor X_train, X_test, y_train, y_test;
std::vector<std::vector<float>> Xv;
std::vector<std::vector<int>> Xd;
std::vector<int> yv;
void buildTensors();
void load_csv();
void load_arff();
void load_rdata();
void computeStates();
std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y);
public:
Dataset(const std::string& path, const std::string& name, const std::string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {};
explicit Dataset(const Dataset&);
std::string getName() const;
std::string getClassName() const;
std::vector<string> getFeatures() const;
std::map<std::string, std::vector<int>> getStates() const;
std::pair<vector<std::vector<float>>&, std::vector<int>&> getVectors();
std::pair<vector<std::vector<int>>&, std::vector<int>&> getVectorsDiscretized();
std::pair<torch::Tensor&, torch::Tensor&> getTensors();
int getNFeatures() const;
int getNSamples() const;
void load();
const bool inline isLoaded() const { return loaded; };
};
};
#endif
#endif

105
src/common/Datasets.cpp Normal file

@@ -0,0 +1,105 @@
#include <fstream>
#include<algorithm>
#include "Datasets.h"
#include <nlohmann/json.hpp>
namespace platform {
using json = nlohmann::ordered_json;
const std::string message_dataset_not_loaded = "dataset not loaded.";
Datasets::Datasets(bool discretize, std::string sfileType, std::string discretizer_algorithm) :
discretize(discretize), sfileType(sfileType), discretizer_algorithm(discretizer_algorithm)
{
if ((discretizer_algorithm == "none" || discretizer_algorithm == "") && discretize) {
throw std::runtime_error("Can't discretize without discretization algorithm");
}
load();
}
void Datasets::load()
{
auto sd = SourceData(sfileType);
fileType = sd.getFileType();
path = sd.getPath();
ifstream catalog(path + "all.txt");
std::vector<int> numericFeaturesIdx;
if (!catalog.is_open()) {
throw std::invalid_argument("Unable to open catalog file. [" + path + "all.txt" + "]");
}
std::string line;
std::vector<std::string> sorted_lines;
while (getline(catalog, line)) {
if (line.empty() || line[0] == '#') {
continue;
}
sorted_lines.push_back(line);
}
sort(sorted_lines.begin(), sorted_lines.end(), [](const auto& lhs, const auto& rhs) {
const auto result = mismatch(lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend(), [](const auto& lhs, const auto& rhs) {return tolower(lhs) == tolower(rhs);});
return result.second != rhs.cend() && (result.first == lhs.cend() || tolower(*result.first) < tolower(*result.second));
});
for (const auto& line : sorted_lines) {
std::vector<std::string> tokens = split(line, ';');
std::string name = tokens[0];
std::string className;
numericFeaturesIdx.clear();
int size = tokens.size();
switch (size) {
case 1:
className = "-1";
numericFeaturesIdx.push_back(-1);
break;
case 2:
className = tokens[1];
numericFeaturesIdx.push_back(-1);
break;
case 3:
{
className = tokens[1];
auto numericFeatures = tokens[2];
if (numericFeatures == "all") {
numericFeaturesIdx.push_back(-1);
} else {
if (numericFeatures != "none") {
auto features = json::parse(numericFeatures);
for (auto& f : features) {
numericFeaturesIdx.push_back(f);
}
}
}
}
break;
default:
throw std::invalid_argument("Invalid catalog file format.");
}
datasets[name] = make_unique<Dataset>(path, name, className, discretize, fileType, numericFeaturesIdx, discretizer_algorithm);
}
catalog.close();
}
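// For illustration, catalog lines in all.txt follow the "name;className;numericFeatures"
// layout parsed above (dataset names are placeholders):
//     iris;class;all
//     glass2;Type;none
//     diabetes;Outcome;[0, 1, 5]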
std::vector<std::string> Datasets::getNames()
{
std::vector<std::string> result;
transform(datasets.begin(), datasets.end(), back_inserter(result), [](const auto& d) { return d.first; });
sort(result.begin(), result.end(), [](const auto& lhs, const auto& rhs) {
const auto result = mismatch(lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend(), [](const auto& lhs, const auto& rhs) {return tolower(lhs) == tolower(rhs);});
return result.second != rhs.cend() && (result.first == lhs.cend() || tolower(*result.first) < tolower(*result.second));
});
return result;
}
bool Datasets::isDataset(const std::string& name) const
{
return datasets.find(name) != datasets.end();
}
std::string Datasets::toString() const
{
std::string result;
std::string sep = "";
for (const auto& d : datasets) {
result += sep + d.first;
sep = ", ";
}
return "{" + result + "}";
}
}

22
src/common/Datasets.h Normal file
View File

@@ -0,0 +1,22 @@
#ifndef DATASETS_H
#define DATASETS_H
#include "Dataset.h"
namespace platform {
class Datasets {
public:
explicit Datasets(bool discretize, std::string sfileType, std::string discretizer_algorithm = "none");
std::vector<std::string> getNames();
bool isDataset(const std::string& name) const;
Dataset& getDataset(const std::string& name) const { return *datasets.at(name); }
std::string toString() const;
private:
std::string path;
fileType_t fileType;
std::string sfileType;
std::string discretizer_algorithm;
std::map<std::string, std::unique_ptr<Dataset>> datasets;
bool discretize;
void load(); // Loads the list of datasets
};
};
#endif
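A minimal usage sketch of the catalog API above; the dataset name "iris" and the Arff source layout are assumptions for illustration, not part of this diff:
#include <iostream>
#include "common/Datasets.h"
int main() {
    // Read <source>/all.txt and discretize numeric features with the mdlp algorithm
    auto datasets = platform::Datasets(true, "Arff", "mdlp");
    std::cout << datasets.toString() << std::endl;               // e.g. {adult, iris, ...}
    if (datasets.isDataset("iris")) {
        auto& dt = datasets.getDataset("iris");
        dt.load();
        auto [X, y] = dt.getTensors();                           // references to torch::Tensor
        std::cout << dt.getNFeatures() << " features, " << dt.getNSamples() << " samples" << std::endl;
    }
    return 0;
}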

View File

@@ -0,0 +1,55 @@
#include "Discretization.h"
namespace platform {
// Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory
Discretization* Discretization::factory = nullptr;
Discretization* Discretization::instance()
{
//manages singleton
if (factory == nullptr)
factory = new Discretization();
return factory;
}
void Discretization::registerFactoryFunction(const std::string& name,
std::function<mdlp::Discretizer* (void)> classFactoryFunction)
{
// register the class factory function
functionRegistry[name] = classFactoryFunction;
}
std::shared_ptr<mdlp::Discretizer> Discretization::create(const std::string& name)
{
mdlp::Discretizer* instance = nullptr;
// find name in the registry and call factory method.
auto it = functionRegistry.find(name);
if (it != functionRegistry.end())
instance = it->second();
// wrap instance in a shared_ptr and return
if (instance != nullptr)
return std::shared_ptr<mdlp::Discretizer>(instance);
else
throw std::runtime_error("Discretizer not found: " + name);
}
std::vector<std::string> Discretization::getNames()
{
std::vector<std::string> names;
transform(functionRegistry.begin(), functionRegistry.end(), back_inserter(names),
[](const std::pair<std::string, std::function<mdlp::Discretizer* (void)>>& pair) { return pair.first; });
return names;
}
std::string Discretization::toString()
{
std::string result = "";
std::string sep = "";
for (const auto& pair : functionRegistry) {
result += sep + pair.first;
sep = ", ";
}
return "{" + result + "}";
}
RegistrarDiscretization::RegistrarDiscretization(const std::string& name, std::function<mdlp::Discretizer* (void)> classFactoryFunction)
{
// register the class factory function
Discretization::instance()->registerFactoryFunction(name, classFactoryFunction);
}
}

View File

@@ -0,0 +1,33 @@
#ifndef DISCRETIZATION_H
#define DISCRETIZATION_H
#include <map>
#include <memory>
#include <string>
#include <functional>
#include <vector>
#include <fimdlp/Discretizer.h>
#include <fimdlp/BinDisc.h>
#include <fimdlp/CPPFImdlp.h>
namespace platform {
class Discretization {
public:
Discretization(Discretization&) = delete;
void operator=(const Discretization&) = delete;
// Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory
static Discretization* instance();
std::shared_ptr<mdlp::Discretizer> create(const std::string& name);
void registerFactoryFunction(const std::string& name,
std::function<mdlp::Discretizer* (void)> classFactoryFunction);
std::vector<std::string> getNames();
std::string toString();
private:
std::map<std::string, std::function<mdlp::Discretizer* (void)>> functionRegistry;
static Discretization* factory; //singleton
Discretization() {};
};
class RegistrarDiscretization {
public:
RegistrarDiscretization(const std::string& className, std::function<mdlp::Discretizer* (void)> classFactoryFunction);
};
}
#endif

View File

@@ -0,0 +1,45 @@
#ifndef DISCRETIZATIONREGISTER_H
#define DISCRETIZATIONREGISTER_H
#include <common/Discretization.h>
#include <limits>
static platform::RegistrarDiscretization registrarM("mdlp",
[](void) -> mdlp::Discretizer* { return new mdlp::CPPFImdlp();});
static platform::RegistrarDiscretization registrarM3("mdlp3",
[](void) -> mdlp::Discretizer* { return new mdlp::CPPFImdlp(3, std::numeric_limits<int>::max(), 3);});
static platform::RegistrarDiscretization registrarM4("mdlp4",
[](void) -> mdlp::Discretizer* { return new mdlp::CPPFImdlp(3, std::numeric_limits<int>::max(), 4);});
static platform::RegistrarDiscretization registrarM5("mdlp5",
[](void) -> mdlp::Discretizer* { return new mdlp::CPPFImdlp(3, std::numeric_limits<int>::max(), 5);});
static platform::RegistrarDiscretization registrarBU3("bin3u",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(3, mdlp::strategy_t::UNIFORM);});
static platform::RegistrarDiscretization registrarBQ3("bin3q",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(3, mdlp::strategy_t::QUANTILE);});
static platform::RegistrarDiscretization registrarBU4("bin4u",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(4, mdlp::strategy_t::UNIFORM);});
static platform::RegistrarDiscretization registrarBQ4("bin4q",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(4, mdlp::strategy_t::QUANTILE);});
static platform::RegistrarDiscretization registrarBU5("bin5u",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(5, mdlp::strategy_t::UNIFORM);});
static platform::RegistrarDiscretization registrarBQ5("bin5q",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(5, mdlp::strategy_t::QUANTILE);});
static platform::RegistrarDiscretization registrarBU6("bin6u",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(6, mdlp::strategy_t::UNIFORM);});
static platform::RegistrarDiscretization registrarBQ6("bin6q",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(6, mdlp::strategy_t::QUANTILE);});
static platform::RegistrarDiscretization registrarBU7("bin7u",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(7, mdlp::strategy_t::UNIFORM);});
static platform::RegistrarDiscretization registrarBQ7("bin7q",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(7, mdlp::strategy_t::QUANTILE);});
static platform::RegistrarDiscretization registrarBU8("bin8u",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(8, mdlp::strategy_t::UNIFORM);});
static platform::RegistrarDiscretization registrarBQ8("bin8q",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(8, mdlp::strategy_t::QUANTILE);});
static platform::RegistrarDiscretization registrarBU9("bin9u",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(9, mdlp::strategy_t::UNIFORM);});
static platform::RegistrarDiscretization registrarBQ9("bin9q",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(9, mdlp::strategy_t::QUANTILE);});
static platform::RegistrarDiscretization registrarBU10("bin10u",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(10, mdlp::strategy_t::UNIFORM);});
static platform::RegistrarDiscretization registrarBQ10("bin10q",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(10, mdlp::strategy_t::QUANTILE);});
#endif
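The static registrars above populate the factory during static initialization, so any translation unit that includes this header can look discretizers up by name. A minimal consumption sketch (only the factory calls shown in this diff are used; the Discretizer interface itself comes from fimdlp and is not exercised here):
#include <iostream>
#include <memory>
#include <common/DiscretizationRegister.h>
void list_and_create() {
    auto* factory = platform::Discretization::instance();
    std::cout << factory->toString() << std::endl;    // {bin10q, bin10u, ..., mdlp, mdlp3, ...}
    // create() throws std::runtime_error for unknown names
    std::shared_ptr<mdlp::Discretizer> disc = factory->create("bin5q");
}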

151
src/common/DotEnv.h Normal file
View File

@@ -0,0 +1,151 @@
#ifndef DOTENV_H
#define DOTENV_H
#include <string>
#include <map>
#include <fstream>
#include <sstream>
#include <algorithm>
#include <iostream>
#include "Utils.h"
//#include "Dataset.h"
namespace platform {
class DotEnv {
private:
std::map<std::string, std::string> env;
std::map<std::string, std::vector<std::string>> valid;
public:
DotEnv(bool create = false)
{
valid =
{
{"depth", {"any"}},
{"discretize", {"0", "1"}},
{"discretize_algo", {"mdlp", "mdlp3", "mdlp4", "mdlp5", "bin3u", "bin3q", "bin4u", "bin4q", "bin5q", "bin5u", "bin6q", "bin6u", "bin7q", "bin7u", "bin8q", "bin8u", "bin9q", "bin9u", "bin10q", "bin10u"}},
{"experiment", {"discretiz", "odte", "covid", "Test"}},
{"fit_features", {"0", "1"}},
{"framework", {"bulma", "bootstrap"}},
{"ignore_nan", {"0", "1"}},
{"leaves", {"any"}},
{"margin", {"0.1", "0.2", "0.3"}},
{"model", {"any"}},
{"n_folds", {"5", "10"}},
{"nodes", {"any"}},
{"platform", {"any"}},
{"stratified", {"0", "1"}},
{"score", {"accuracy", "roc-auc-ovr"}},
{"seeds", {"any"}},
{"smooth_strat", {"ORIGINAL", "LAPLACE", "CESTNIK"}},
{"source_data", {"Arff", "Tanveer", "Surcov", "Test"}},
};
if (create) {
// For testing purposes
std::ofstream file(".env");
file << "experiment=Test" << std::endl;
file << "source_data=Test" << std::endl;
file << "margin=0.1" << std::endl;
file << "score=accuracy" << std::endl;
file << "platform=um790Linux" << std::endl;
file << "n_folds=5" << std::endl;
file << "discretize_algo=mdlp" << std::endl;
file << "smooth_strat=ORIGINAL" << std::endl;
file << "stratified=0" << std::endl;
file << "model=TAN" << std::endl;
file << "seeds=[271]" << std::endl;
file << "discretize=0" << std::endl;
file << "ignore_nan=0" << std::endl;
file << "nodes=Nodes" << std::endl;
file << "leaves=Edges" << std::endl;
file << "depth=States" << std::endl;
file << "fit_features=0" << std::endl;
file << "framework=bulma" << std::endl;
file << "margin=0.1" << std::endl;
file.close();
}
std::ifstream file(".env");
if (!file.is_open()) {
std::cerr << "File .env not found" << std::endl;
exit(1);
}
std::string line;
while (std::getline(file, line)) {
line = trim(line);
if (line.empty() || line[0] == '#') {
continue;
}
std::istringstream iss(line);
std::string key, value;
if (std::getline(iss, key, '=') && std::getline(iss, value)) {
key = trim(key);
value = trim(value);
parse(key, value);
env[key] = value;
}
}
parseEnv();
}
void parse(const std::string& key, const std::string& value)
{
if (valid.find(key) == valid.end()) {
std::cerr << "Invalid key in .env: " << key << std::endl;
exit(1);
}
if (valid[key].front() == "any") {
return;
}
if (std::find(valid[key].begin(), valid[key].end(), value) == valid[key].end()) {
std::cerr << "Invalid value in .env: " << key << " = " << value << std::endl;
exit(1);
}
}
std::vector<std::string> valid_tokens(const std::string& key)
{
if (valid.find(key) == valid.end()) {
return {};
}
return valid.at(key);
}
std::string valid_values(const std::string& key)
{
std::string valid_values = "{", sep = "";
if (valid.find(key) == valid.end()) {
return "{}";
}
for (const auto& value : valid.at(key)) {
valid_values += sep + value;
sep = ", ";
}
return valid_values + "}";
}
void parseEnv()
{
for (auto& [key, values] : valid) {
if (env.find(key) == env.end()) {
std::cerr << "Key not found in .env: " << key << ", valid values: " << valid_values(key) << std::endl;
exit(1);
}
}
}
std::string get(const std::string& key)
{
if (env.find(key) == env.end()) {
std::cerr << "Key not found in .env: " << key << std::endl;
exit(1);
}
return env.at(key);
}
std::vector<int> getSeeds()
{
auto seeds = std::vector<int>();
auto seeds_str = env["seeds"];
seeds_str = trim(seeds_str);
seeds_str = seeds_str.substr(1, seeds_str.size() - 2);
auto seeds_str_split = split(seeds_str, ',');
transform(seeds_str_split.begin(), seeds_str_split.end(), back_inserter(seeds), [](const std::string& str) {
return stoi(str);
});
return seeds;
}
};
}
#endif
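A short sketch of how the .env reader above is consumed; the key values match the testing block written when create == true, everything else is illustrative:
#include <string>
#include <vector>
#include "common/DotEnv.h"
void read_config() {
    platform::DotEnv env;                     // parses ./.env, exits if a key or value is invalid
    std::string model = env.get("model");     // e.g. "TAN"
    std::string score = env.get("score");     // "accuracy" or "roc-auc-ovr"
    std::vector<int> seeds = env.getSeeds();  // "[271]" -> {271}
}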

80
src/common/Paths.h Normal file
View File

@@ -0,0 +1,80 @@
#ifndef PATHS_H
#define PATHS_H
#include <string>
#include <filesystem>
#include "DotEnv.h"
namespace platform {
class Paths {
public:
static std::string createIfNotExists(const std::string& folder)
{
if (!std::filesystem::exists(folder)) {
std::filesystem::create_directory(folder);
}
return folder;
}
static std::string results() { return createIfNotExists("results/"); }
static std::string hiddenResults() { return createIfNotExists("hidden_results/"); }
static std::string excel() { return createIfNotExists("excel/"); }
static std::string grid() { return createIfNotExists("grid/"); }
static std::string graphs() { return createIfNotExists("graphs/"); }
static std::string tex() { return createIfNotExists("tex/"); }
static std::string datasets()
{
auto env = platform::DotEnv();
return env.get("source_data");
}
static std::string experiment_file(const std::string& fileName, bool discretize, bool stratified, int seed, int nfold)
{
std::string disc = discretize ? "_disc_" : "_ndisc_";
std::string strat = stratified ? "strat_" : "nstrat_";
return "datasets_experiment/" + fileName + disc + strat + std::to_string(seed) + "_" + std::to_string(nfold) + ".json";
}
static void createPath(const std::string& path)
{
// Create directory if it does not exist
try {
std::filesystem::create_directory(path);
}
catch (std::exception& e) {
throw std::runtime_error("Could not create directory " + path);
}
}
static std::string bestResultsFile(const std::string& score, const std::string& model)
{
return "best_results_" + score + "_" + model + ".json";
}
static std::string bestResultsExcel(const std::string& score)
{
return "BestResults_" + score + ".xlsx";
}
static std::string excelResults() { return "some_results.xlsx"; }
static std::string excelDatasets() { return "datasets.xlsx"; }
static std::string grid_input(const std::string& model)
{
return grid() + "grid_" + model + "_input.json";
}
static std::string grid_output(const std::string& model)
{
return grid() + "grid_" + model + "_output.json";
}
static std::string tex_output()
{
return "results.tex";
}
static std::string md_output()
{
return "results.md";
}
static std::string tex_post_hoc()
{
return "post_hoc.tex";
}
static std::string md_post_hoc()
{
return "post_hoc.md";
}
};
}
#endif
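A brief sketch of the path helpers above; "TAN" and "iris" are placeholder names:
#include <string>
#include "common/Paths.h"
void build_paths() {
    std::string res = platform::Paths::results();                 // "results/", created if missing
    std::string grid_in = platform::Paths::grid_input("TAN");     // "grid/grid_TAN_input.json"
    std::string exp = platform::Paths::experiment_file("iris", true, false, 271, 0);
    // -> "datasets_experiment/iris_disc_nstrat_271_0.json"
}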

View File

@@ -0,0 +1,38 @@
#ifndef SOURCEDATA_H
#define SOURCEDATA_H
namespace platform {
enum fileType_t { CSV, ARFF, RDATA };
class SourceData {
public:
SourceData(std::string source)
{
if (source == "Surcov") {
path = "datasets/";
fileType = CSV;
} else if (source == "Arff") {
path = "datasets/";
fileType = ARFF;
} else if (source == "Tanveer") {
path = "data/";
fileType = RDATA;
} else if (source == "Test") {
path = "@TEST_DATA_PATH@/";
fileType = ARFF;
} else {
throw std::invalid_argument("Unknown source.");
}
}
std::string getPath()
{
return path;
}
fileType_t getFileType()
{
return fileType;
}
private:
std::string path;
fileType_t fileType;
};
}
#endif

View File

@@ -9,9 +9,13 @@ namespace platform {
inline static const std::string black_star{ "\u2605" };
inline static const std::string cross{ "\u2717" };
inline static const std::string upward_arrow{ "\u27B6" };
inline static const std::string down_arrow{ "\u27B4" };
inline static const std::string downward_arrow{ "\u27B4" };
inline static const std::string up_arrow{ "\u2B06" };
inline static const std::string down_arrow{ "\u2B07" };
inline static const std::string ellipsis{ "\u2026" };
inline static const std::string equal_best{ check_mark };
inline static const std::string better_best{ black_star };
inline static const std::string notebook{ "\U0001F5C8" };
};
}
#endif // !SYMBOLS_H
#endif

106
src/common/TensorUtils.hpp Normal file
View File

@@ -0,0 +1,106 @@
#ifndef TENSORUTILS_HPP
#define TENSORUTILS_HPP
#include <torch/torch.h>
#include <vector>
namespace platform {
class TensorUtils {
public:
template <typename T>
static std::vector<T> tensorToVector(const torch::Tensor& tensor)
{
torch::Tensor contig_tensor = tensor.contiguous();
auto num_elements = contig_tensor.numel();
const T* tensor_data = contig_tensor.data_ptr<T>();
std::vector<T> result(tensor_data, tensor_data + num_elements);
return result;
}
static std::vector<std::vector<int>> to_matrix(const torch::Tensor& X)
{
// Ensure tensor is contiguous in memory
auto X_contig = X.contiguous();
// Access tensor data pointer directly
auto data_ptr = X_contig.data_ptr<int>();
// If you are using int64_t as the data type, use the following lines instead
//auto data_ptr = X_contig.data_ptr<int64_t>();
//std::vector<std::vector<int64_t>> data(X.size(0), std::vector<int64_t>(X.size(1)));
// Prepare output container
std::vector<std::vector<int>> data(X.size(0), std::vector<int>(X.size(1)));
// Fill the 2D vector in a single loop using pointer arithmetic
int rows = X.size(0);
int cols = X.size(1);
for (int i = 0; i < rows; ++i) {
std::copy(data_ptr + i * cols, data_ptr + (i + 1) * cols, data[i].begin());
}
return data;
}
template <typename T>
static std::vector<T> to_vector(const torch::Tensor& y)
{
// Ensure the tensor is contiguous in memory
auto y_contig = y.contiguous();
// Access data pointer
auto data_ptr = y_contig.data_ptr<T>();
// Prepare output container
std::vector<T> data(y.size(0));
// Copy data efficiently
std::copy(data_ptr, data_ptr + y.size(0), data.begin());
return data;
}
static torch::Tensor to_matrix(const std::vector<std::vector<int>>& data)
{
if (data.empty()) return torch::empty({ 0, 0 }, torch::kInt64);
size_t rows = data.size();
size_t cols = data[0].size();
torch::Tensor tensor = torch::empty({ static_cast<long>(rows), static_cast<long>(cols) }, torch::kInt64);
for (size_t i = 0; i < rows; ++i) {
for (size_t j = 0; j < cols; ++j) {
tensor.index_put_({static_cast<int64_t>(i), static_cast<int64_t>(j)}, torch::scalar_tensor(data[i][j]));
}
}
return tensor;
}
};
static void dumpVector(const std::vector<std::vector<int>>& vec, const std::string& name)
{
std::cout << name << ": " << std::endl;
for (const auto& row : vec) {
std::cout << "[";
for (const auto& val : row) {
std::cout << val << " ";
}
std::cout << "]" << std::endl;
}
std::cout << std::endl;
}
static void dumpTensor(const torch::Tensor& tensor, const std::string& name)
{
std::cout << name << ": " << std::endl;
for (auto i = 0; i < tensor.size(0); i++) {
std::cout << "[";
for (auto j = 0; j < tensor.size(1); j++) {
std::cout << tensor[i][j].item<int>() << " ";
}
std::cout << "]" << std::endl;
}
std::cout << std::endl;
}
static void dumpTensorV(const torch::Tensor& tensor, const std::string& name)
{
std::cout << name << ": " << std::endl;
std::cout << "[";
for (int i = 0; i < tensor.size(0); i++) {
std::cout << tensor[i].item<int>() << " ";
}
std::cout << "]" << std::endl;
}
}
#endif // TENSORUTILS_HPP
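A small round-trip sketch for the helpers above. Note the dtype asymmetry visible in the code: to_matrix(const torch::Tensor&) reads int32 data, while to_matrix(const std::vector<std::vector<int>>&) builds an int64 tensor, so the types below are chosen to match each overload:
#include <vector>
#include "common/TensorUtils.hpp"
void convert() {
    torch::Tensor y = torch::tensor({1, 0, 1}, torch::kInt32);
    std::vector<int> y_vec = platform::TensorUtils::to_vector<int>(y);   // {1, 0, 1}
    torch::Tensor X = torch::zeros({2, 3}, torch::kInt32);
    auto rows = platform::TensorUtils::to_matrix(X);                     // 2x3 std::vector<std::vector<int>>
    torch::Tensor X64 = platform::TensorUtils::to_matrix(rows);          // back to a (kInt64) tensor
    platform::dumpTensorV(y, "y");                                       // prints y: [1 0 1 ]
}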

View File

@@ -40,4 +40,4 @@ namespace platform {
}
};
} /* namespace platform */
#endif /* TIMER_H */
#endif

129
src/common/Utils.h Normal file
View File

@@ -0,0 +1,129 @@
#ifndef UTILS_H
#define UTILS_H
#include <unistd.h>
#include <sstream>
#include <string>
#include <vector>
#include <algorithm>
#include <cstdlib>
#include <cmath>
#include <ctime>
#include <iomanip>
#include <string.h>
extern char** environ;
namespace platform {
static std::string trim(const std::string& str)
{
std::string result = str;
result.erase(result.begin(), std::find_if(result.begin(), result.end(), [](int ch) {
return !std::isspace(ch);
}));
result.erase(std::find_if(result.rbegin(), result.rend(), [](int ch) {
return !std::isspace(ch);
}).base(), result.end());
return result;
}
static std::vector<std::string> split(const std::string& text, char delimiter)
{
std::vector<std::string> result;
std::stringstream ss(text);
std::string token;
while (std::getline(ss, token, delimiter)) {
result.push_back(trim(token));
}
return result;
}
inline double compute_std(std::vector<double> values, double mean)
{
// Compute the standard deviation of the values
double sum = 0.0;
for (const auto& value : values) {
sum += std::pow(value - mean, 2);
}
double variance = sum / values.size();
return std::sqrt(variance);
}
inline std::string get_date()
{
time_t rawtime;
tm* timeinfo;
time(&rawtime);
timeinfo = std::localtime(&rawtime);
std::ostringstream oss;
oss << std::put_time(timeinfo, "%Y-%m-%d");
return oss.str();
}
inline std::string get_time()
{
time_t rawtime;
tm* timeinfo;
time(&rawtime);
timeinfo = std::localtime(&rawtime);
std::ostringstream oss;
oss << std::put_time(timeinfo, "%H:%M:%S");
return oss.str();
}
static void openFile(const std::string& fileName)
{
// #ifdef __APPLE__
// // macOS uses the "open" command
// std::string command = "open";
// #elif defined(__linux__)
// // Linux typically uses "xdg-open"
// std::string command = "xdg-open";
// #else
// // For other OSes, do nothing or handle differently
// std::cerr << "Unsupported platform." << std::endl;
// return;
// #endif
// execlp(command.c_str(), command.c_str(), fileName.c_str(), NULL);
#ifdef __APPLE__
const char* tool = "/usr/bin/open";
#elif defined(__linux__)
const char* tool = "/usr/bin/xdg-open";
#else
std::cerr << "Unsupported platform." << std::endl;
return;
#endif
// We'll build an argv array for execve:
std::vector<char*> argv;
argv.push_back(const_cast<char*>(tool)); // argv[0]
argv.push_back(const_cast<char*>(fileName.c_str())); // argv[1]
argv.push_back(nullptr);
// Make a new environment array, skipping BASH_FUNC_ variables
std::vector<std::string> filteredEnv;
for (char** env = environ; *env != nullptr; ++env) {
// *env is a string like "NAME=VALUE"
// We want to skip those starting with "BASH_FUNC_"
if (strncmp(*env, "BASH_FUNC_", 10) == 0) {
// skip it
continue;
}
filteredEnv.push_back(*env);
}
// Convert filteredEnv into a char* array
std::vector<char*> envp;
for (auto& var : filteredEnv) {
envp.push_back(const_cast<char*>(var.c_str()));
}
envp.push_back(nullptr);
// Now call execve with the cleaned environment
// NOTE: You may need a full path to the tool if it's not in PATH, or use which() logic
// For now, let's assume "open" or "xdg-open" is found in the default PATH:
execve(tool, argv.data(), envp.data());
// If we reach here, execve failed
perror("execve failed");
// This would terminate your current process if it's not in a child
// Usually you'd do something like:
_exit(EXIT_FAILURE);
}
}
#endif
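A short sketch of the string and statistics helpers above; the numbers are illustrative:
#include <vector>
#include "common/Utils.h"
void helpers() {
    auto tokens = platform::split("glass; Type ; [0,1,2]", ';');   // {"glass", "Type", "[0,1,2]"}
    auto clean = platform::trim("  accuracy  ");                   // "accuracy"
    std::vector<double> v{ 0.90, 0.92, 0.94 };
    double mean = (v[0] + v[1] + v[2]) / 3.0;                      // 0.92
    double stdev = platform::compute_std(v, mean);                 // population std dev, ~0.0163
}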

View File

@@ -0,0 +1,492 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************
#include "AdaBoost.h"
#include "DecisionTree.h"
#include <cmath>
#include <algorithm>
#include <numeric>
#include <sstream>
#include <iomanip>
#include "common/TensorUtils.hpp"
// Conditional debug macro for performance-critical sections
#define DEBUG_LOG(condition, ...) \
do { \
if (__builtin_expect((condition), 0)) { \
std::cout << __VA_ARGS__ << std::endl; \
} \
} while(0)
namespace bayesnet {
AdaBoost::AdaBoost(int n_estimators, int max_depth)
: Ensemble(true), n_estimators(n_estimators), base_max_depth(max_depth), n(0), n_classes(0)
{
validHyperparameters = { "n_estimators", "base_max_depth" };
}
// Optimized version of buildModel - replaces the previous implementation in AdaBoost.cpp:
void AdaBoost::buildModel(const torch::Tensor& weights)
{
// Initialize variables
models.clear();
alphas.clear();
training_errors.clear();
// Initialize n (number of features) and n_classes
n = dataset.size(0) - 1; // Exclude the label row
n_classes = states[className].size();
// Initialize sample weights uniformly
int n_samples = dataset.size(1);
sample_weights = torch::ones({ n_samples }) / n_samples;
// If initial weights are provided, incorporate them
if (weights.defined() && weights.numel() > 0) {
if (weights.size(0) != n_samples) {
throw std::runtime_error("weights must have the same length as number of samples");
}
sample_weights = weights.clone();
normalizeWeights();
}
// Conditional debug information (only when debug is enabled)
DEBUG_LOG(debug, "Starting AdaBoost training with " << n_estimators << " estimators\n"
<< "Number of classes: " << n_classes << "\n"
<< "Number of features: " << n << "\n"
<< "Number of samples: " << n_samples);
// Pre-compute random guess error threshold
const double random_guess_error = 1.0 - (1.0 / static_cast<double>(n_classes));
// Main AdaBoost training loop (SAMME algorithm)
for (int iter = 0; iter < n_estimators; ++iter) {
// Train base estimator with current sample weights
auto estimator = trainBaseEstimator(sample_weights);
// Calculate weighted error
double weighted_error = calculateWeightedError(estimator.get(), sample_weights);
training_errors.push_back(weighted_error);
// According to SAMME, we need error < random_guess_error
if (weighted_error >= random_guess_error) {
DEBUG_LOG(debug, "Error >= random guess (" << random_guess_error << "), stopping");
// If only one estimator and it's worse than random, keep it with zero weight
if (models.empty()) {
models.push_back(std::move(estimator));
alphas.push_back(0.0);
}
break; // Stop boosting
}
// Check for perfect classification BEFORE calculating alpha
if (weighted_error <= 1e-10) {
DEBUG_LOG(debug, "Perfect classification achieved (error=" << weighted_error << ")");
// For perfect classification, use a large but finite alpha
double alpha = 10.0 + std::log(static_cast<double>(n_classes - 1));
// Store the estimator and its weight
models.push_back(std::move(estimator));
alphas.push_back(alpha);
DEBUG_LOG(debug, "Iteration " << iter << ":\n"
<< " Weighted error: " << weighted_error << "\n"
<< " Alpha (finite): " << alpha << "\n"
<< " Random guess error: " << random_guess_error);
break; // Stop training as we have a perfect classifier
}
// Calculate alpha (estimator weight) using SAMME formula
// alpha = log((1 - err) / err) + log(K - 1)
// Clamp weighted_error to avoid division by zero and infinite alpha
double clamped_error = std::max(1e-15, std::min(1.0 - 1e-15, weighted_error));
double alpha = std::log((1.0 - clamped_error) / clamped_error) +
std::log(static_cast<double>(n_classes - 1));
// Clamp alpha to reasonable bounds to avoid numerical issues
alpha = std::max(-10.0, std::min(10.0, alpha));
// Store the estimator and its weight
models.push_back(std::move(estimator));
alphas.push_back(alpha);
// Update sample weights (only if this is not the last iteration)
if (iter < n_estimators - 1) {
updateSampleWeights(models.back().get(), alpha);
normalizeWeights();
}
DEBUG_LOG(debug, "Iteration " << iter << ":\n"
<< " Weighted error: " << weighted_error << "\n"
<< " Alpha: " << alpha << "\n"
<< " Random guess error: " << random_guess_error);
}
// Set the number of models actually trained
n_models = models.size();
DEBUG_LOG(debug, "AdaBoost training completed with " << n_models << " models");
}
void AdaBoost::trainModel(const torch::Tensor& weights, const Smoothing_t smoothing)
{
// Call buildModel which does the actual training
buildModel(weights);
fitted = true;
}
std::unique_ptr<Classifier> AdaBoost::trainBaseEstimator(const torch::Tensor& weights)
{
// Create a decision tree with specified max depth
auto tree = std::make_unique<DecisionTree>(base_max_depth);
// Ensure weights are properly normalized
auto normalized_weights = weights / weights.sum();
// Fit the tree with the current sample weights
tree->fit(dataset, features, className, states, normalized_weights, Smoothing_t::NONE);
return tree;
}
double AdaBoost::calculateWeightedError(Classifier* estimator, const torch::Tensor& weights)
{
// Get features and labels from dataset (avoid repeated indexing)
auto X = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), torch::indexing::Slice() });
auto y_true = dataset.index({ -1, torch::indexing::Slice() });
// Get predictions from the estimator
auto y_pred = estimator->predict(X);
// Vectorized error calculation using PyTorch operations
auto incorrect = (y_pred != y_true).to(torch::kDouble);
// Direct dot product for weighted error (more efficient than sum)
double weighted_error = torch::dot(incorrect, weights).item<double>();
// Clamp to valid range in one operation
return std::clamp(weighted_error, 1e-15, 1.0 - 1e-15);
}
void AdaBoost::updateSampleWeights(Classifier* estimator, double alpha)
{
// Get predictions from the estimator (reuse from calculateWeightedError if possible)
auto X = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), torch::indexing::Slice() });
auto y_true = dataset.index({ -1, torch::indexing::Slice() });
auto y_pred = estimator->predict(X);
// Vectorized weight update using PyTorch operations
auto incorrect = (y_pred != y_true).to(torch::kDouble);
// Single vectorized operation instead of element-wise multiplication
sample_weights *= torch::exp(alpha * incorrect);
// Vectorized clamping for numerical stability
sample_weights = torch::clamp(sample_weights, 1e-15, 1e15);
}
void AdaBoost::normalizeWeights()
{
// Single-pass normalization using PyTorch operations
double sum_weights = torch::sum(sample_weights).item<double>();
if (__builtin_expect(sum_weights <= 0, 0)) {
// Reset to uniform if all weights are zero/negative (rare case)
sample_weights = torch::ones_like(sample_weights) / sample_weights.size(0);
} else {
// Vectorized normalization
sample_weights /= sum_weights;
// Vectorized minimum weight enforcement
sample_weights = torch::clamp_min(sample_weights, 1e-15);
// Renormalize after clamping (if any weights were clamped)
double new_sum = torch::sum(sample_weights).item<double>();
if (new_sum != 1.0) {
sample_weights /= new_sum;
}
}
}
std::vector<std::string> AdaBoost::graph(const std::string& title) const
{
// Create a graph representation of the AdaBoost ensemble
std::vector<std::string> graph_lines;
// Header
graph_lines.push_back("digraph AdaBoost {");
graph_lines.push_back(" rankdir=TB;");
graph_lines.push_back(" node [shape=box];");
if (!title.empty()) {
graph_lines.push_back(" label=\"" + title + "\";");
graph_lines.push_back(" labelloc=t;");
}
// Add input node
graph_lines.push_back(" Input [shape=ellipse, label=\"Input Features\"];");
// Add base estimators
for (size_t i = 0; i < models.size(); ++i) {
std::stringstream ss;
ss << " Estimator" << i << " [label=\"Base Estimator " << i + 1
<< "\\nα = " << std::fixed << std::setprecision(3) << alphas[i] << "\"];";
graph_lines.push_back(ss.str());
// Connect input to estimator
ss.str("");
ss << " Input -> Estimator" << i << ";";
graph_lines.push_back(ss.str());
}
// Add combination node
graph_lines.push_back(" Combination [shape=diamond, label=\"Weighted Vote\"];");
// Connect estimators to combination
for (size_t i = 0; i < models.size(); ++i) {
std::stringstream ss;
ss << " Estimator" << i << " -> Combination;";
graph_lines.push_back(ss.str());
}
// Add output node
graph_lines.push_back(" Output [shape=ellipse, label=\"Final Prediction\"];");
graph_lines.push_back(" Combination -> Output;");
// Close graph
graph_lines.push_back("}");
return graph_lines;
}
void AdaBoost::checkValues() const
{
if (n_estimators <= 0) {
throw std::invalid_argument("n_estimators must be positive");
}
if (base_max_depth <= 0) {
throw std::invalid_argument("base_max_depth must be positive");
}
}
void AdaBoost::setHyperparameters(const nlohmann::json& hyperparameters_)
{
auto hyperparameters = hyperparameters_;
// Set hyperparameters from JSON
auto it = hyperparameters.find("n_estimators");
if (it != hyperparameters.end()) {
n_estimators = it->get<int>();
hyperparameters.erase("n_estimators");
}
it = hyperparameters.find("base_max_depth");
if (it != hyperparameters.end()) {
base_max_depth = it->get<int>();
hyperparameters.erase("base_max_depth");
}
checkValues();
Ensemble::setHyperparameters(hyperparameters);
}
int AdaBoost::predictSample(const torch::Tensor& x) const
{
// Early validation (keep essential checks only)
if (!fitted || models.empty()) {
throw std::runtime_error(CLASSIFIER_NOT_FITTED);
}
// Pre-allocate and reuse memory
static thread_local std::vector<double> class_votes_cache;
if (class_votes_cache.size() != static_cast<size_t>(n_classes)) {
class_votes_cache.resize(n_classes);
}
std::fill(class_votes_cache.begin(), class_votes_cache.end(), 0.0);
// Optimized voting loop - avoid exception handling in hot path
for (size_t i = 0; i < models.size(); ++i) {
double alpha = alphas[i];
if (alpha <= 0 || !std::isfinite(alpha)) continue;
// Direct cast and call - avoid virtual dispatch overhead
int predicted_class = static_cast<DecisionTree*>(models[i].get())->predictSample(x);
// Bounds check with branch prediction hint
if (__builtin_expect(predicted_class >= 0 && predicted_class < n_classes, 1)) {
class_votes_cache[predicted_class] += alpha;
}
}
// Fast argmax using iterators
return std::distance(class_votes_cache.begin(),
std::max_element(class_votes_cache.begin(), class_votes_cache.end()));
}
torch::Tensor AdaBoost::predictProbaSample(const torch::Tensor& x) const
{
// Early validation
if (!fitted || models.empty()) {
throw std::runtime_error(CLASSIFIER_NOT_FITTED);
}
// Use stack allocation for small arrays (typical case: n_classes <= 32)
constexpr int STACK_THRESHOLD = 32;
double stack_votes[STACK_THRESHOLD];
std::vector<double> heap_votes;
double* class_votes;
if (n_classes <= STACK_THRESHOLD) {
class_votes = stack_votes;
std::fill_n(class_votes, n_classes, 0.0);
} else {
heap_votes.resize(n_classes, 0.0);
class_votes = heap_votes.data();
}
double total_votes = 0.0;
// Optimized voting loop
for (size_t i = 0; i < models.size(); ++i) {
double alpha = alphas[i];
if (alpha <= 0 || !std::isfinite(alpha)) continue;
int predicted_class = static_cast<DecisionTree*>(models[i].get())->predictSample(x);
if (__builtin_expect(predicted_class >= 0 && predicted_class < n_classes, 1)) {
class_votes[predicted_class] += alpha;
total_votes += alpha;
}
}
// Direct tensor creation with pre-computed size
torch::Tensor class_probs = torch::empty({ n_classes }, torch::TensorOptions().dtype(torch::kFloat32));
auto probs_accessor = class_probs.accessor<float, 1>();
if (__builtin_expect(total_votes > 0.0, 1)) {
// Vectorized probability calculation
const double inv_total = 1.0 / total_votes;
for (int j = 0; j < n_classes; ++j) {
probs_accessor[j] = static_cast<float>(class_votes[j] * inv_total);
}
} else {
// Uniform distribution fallback
const float uniform_prob = 1.0f / n_classes;
for (int j = 0; j < n_classes; ++j) {
probs_accessor[j] = uniform_prob;
}
}
return class_probs;
}
torch::Tensor AdaBoost::predict_proba(torch::Tensor& X)
{
if (!fitted || models.empty()) {
throw std::runtime_error(CLASSIFIER_NOT_FITTED);
}
// Input validation
if (X.size(0) != n) {
throw std::runtime_error("Input has wrong number of features. Expected " +
std::to_string(n) + " but got " + std::to_string(X.size(0)));
}
const int n_samples = X.size(1);
// Pre-allocate output tensor with correct layout
torch::Tensor probabilities = torch::empty({ n_samples, n_classes },
torch::TensorOptions().dtype(torch::kFloat32));
// Convert to contiguous memory if needed (optimization for memory access)
if (!X.is_contiguous()) {
X = X.contiguous();
}
// Batch processing with memory-efficient sample extraction
for (int i = 0; i < n_samples; ++i) {
// Extract sample without unnecessary copies
auto sample = X.select(1, i);
// Direct assignment to pre-allocated tensor
probabilities[i] = predictProbaSample(sample);
}
return probabilities;
}
std::vector<std::vector<double>> AdaBoost::predict_proba(std::vector<std::vector<int>>& X)
{
const size_t n_samples = X[0].size();
// Pre-allocate result with exact size
std::vector<std::vector<double>> result;
result.reserve(n_samples);
// Avoid repeated allocations
for (size_t i = 0; i < n_samples; ++i) {
result.emplace_back(n_classes, 0.0);
}
// Convert to tensor only once (batch conversion is more efficient)
torch::Tensor X_tensor = platform::TensorUtils::to_matrix(X);
torch::Tensor proba_tensor = predict_proba(X_tensor);
// Optimized tensor-to-vector conversion
auto proba_accessor = proba_tensor.accessor<float, 2>();
for (size_t i = 0; i < n_samples; ++i) {
for (int j = 0; j < n_classes; ++j) {
result[i][j] = static_cast<double>(proba_accessor[i][j]);
}
}
return result;
}
torch::Tensor AdaBoost::predict(torch::Tensor& X)
{
if (!fitted || models.empty()) {
throw std::runtime_error(CLASSIFIER_NOT_FITTED);
}
if (X.size(0) != n) {
throw std::runtime_error("Input has wrong number of features. Expected " +
std::to_string(n) + " but got " + std::to_string(X.size(0)));
}
const int n_samples = X.size(1);
// Pre-allocate with correct dtype
torch::Tensor predictions = torch::empty({ n_samples }, torch::TensorOptions().dtype(torch::kInt32));
auto pred_accessor = predictions.accessor<int32_t, 1>();
// Ensure contiguous memory layout
if (!X.is_contiguous()) {
X = X.contiguous();
}
// Optimized prediction loop
for (int i = 0; i < n_samples; ++i) {
auto sample = X.select(1, i);
pred_accessor[i] = predictSample(sample);
}
return predictions;
}
std::vector<int> AdaBoost::predict(std::vector<std::vector<int>>& X)
{
// Single tensor conversion for batch processing
torch::Tensor X_tensor = platform::TensorUtils::to_matrix(X);
torch::Tensor predictions_tensor = predict(X_tensor);
// Optimized tensor-to-vector conversion
std::vector<int> result = platform::TensorUtils::to_vector<int>(predictions_tensor);
return result;
}
} // namespace bayesnet
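The estimator weight computed in buildModel can be read in isolation; this standalone sketch mirrors the SAMME formula and the clamping used above (it is illustrative, not a function present in this diff):
#include <algorithm>
#include <cmath>
// alpha = log((1 - err) / err) + log(K - 1), with the same numeric guards as buildModel
double samme_alpha(double weighted_error, int n_classes) {
    double err = std::clamp(weighted_error, 1e-15, 1.0 - 1e-15);
    double alpha = std::log((1.0 - err) / err) + std::log(static_cast<double>(n_classes - 1));
    return std::clamp(alpha, -10.0, 10.0);
}
// samme_alpha(0.25, 2) ~ 1.10; samme_alpha(0.25, 4) ~ 2.20 (more classes raise the weight)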

View File

@@ -0,0 +1,81 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************
#ifndef ADABOOST_H
#define ADABOOST_H
#include <vector>
#include <memory>
#include "bayesnet/ensembles/Ensemble.h"
namespace bayesnet {
class AdaBoost : public Ensemble {
public:
explicit AdaBoost(int n_estimators = 100, int max_depth = 1);
virtual ~AdaBoost() = default;
// Override base class methods
std::vector<std::string> graph(const std::string& title = "") const override;
// AdaBoost specific methods
void setNEstimators(int n_estimators) { this->n_estimators = n_estimators; checkValues(); }
int getNEstimators() const { return n_estimators; }
void setBaseMaxDepth(int depth) { this->base_max_depth = depth; checkValues(); }
int getBaseMaxDepth() const { return base_max_depth; }
// Get the weight of each base estimator
std::vector<double> getEstimatorWeights() const { return alphas; }
// Get training errors for each iteration
std::vector<double> getTrainingErrors() const { return training_errors; }
// Override setHyperparameters from BaseClassifier
void setHyperparameters(const nlohmann::json& hyperparameters) override;
torch::Tensor predict(torch::Tensor& X) override;
std::vector<int> predict(std::vector<std::vector<int>>& X) override;
torch::Tensor predict_proba(torch::Tensor& X) override;
std::vector<std::vector<double>> predict_proba(std::vector<std::vector<int>>& X) override;
void setDebug(bool debug) { this->debug = debug; }
protected:
void buildModel(const torch::Tensor& weights) override;
void trainModel(const torch::Tensor& weights, const Smoothing_t smoothing) override;
private:
int n_estimators;
int base_max_depth; // Max depth for base decision trees
std::vector<double> alphas; // Weight of each base estimator
std::vector<double> training_errors; // Training error at each iteration
torch::Tensor sample_weights; // Current sample weights
int n_classes; // Number of classes in the target variable
int n; // Number of features
// Train a single base estimator
std::unique_ptr<Classifier> trainBaseEstimator(const torch::Tensor& weights);
// Calculate weighted error
double calculateWeightedError(Classifier* estimator, const torch::Tensor& weights);
// Update sample weights based on predictions
void updateSampleWeights(Classifier* estimator, double alpha);
// Normalize weights to sum to 1
void normalizeWeights();
// Check if hyperparameters values are valid
void checkValues() const;
// Make predictions for a single sample
int predictSample(const torch::Tensor& x) const;
// Make probabilistic predictions for a single sample
torch::Tensor predictProbaSample(const torch::Tensor& x) const;
bool debug = false; // Enable debug output
};
}
#endif // ADABOOST_H
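A hedged usage sketch for the interface above. The fit(...) call comes from the Ensemble/Classifier base classes and its signature is assumed from the DecisionTree::fit call in AdaBoost.cpp; predict() expects X laid out as features x samples:
#include <map>
#include <string>
#include <vector>
#include "AdaBoost.h"
void train_and_predict(torch::Tensor& dataset, std::vector<std::string>& features,
                       std::string& className, std::map<std::string, std::vector<int>>& states) {
    bayesnet::AdaBoost clf(100, 1);                                    // 100 decision stumps
    clf.setHyperparameters({ {"n_estimators", 50}, {"base_max_depth", 2} });
    // Assumed base-class signature, mirroring the DecisionTree::fit call in AdaBoost.cpp:
    // clf.fit(dataset, features, className, states, weights, bayesnet::Smoothing_t::NONE);
    // After fitting, with X of shape (n_features x n_samples):
    // torch::Tensor y_pred = clf.predict(X);
    // torch::Tensor proba  = clf.predict_proba(X);                    // (n_samples x n_classes)
    std::vector<double> alphas = clf.getEstimatorWeights();
}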

Some files were not shown because too many files have changed in this diff.