Compare commits

270 Commits

Author SHA1 Message Date
c3c580a611 Fix ExcelFile warning 2025-07-22 00:24:25 +02:00
515455695b Add CMAKE_POSITION_INDEPENDENT_CODE to CMakeLists 2025-07-21 18:19:50 -04:00
f68d216150 Fix tensor utils ambiguous call 2025-07-21 11:40:16 +02:00
b990684581 Refactor TensorUtils to a unique header file 2025-07-21 11:10:26 +02:00
5fd0ef692d Fix typo in conanfile 2025-07-19 23:21:09 +02:00
dfcdadbf38 Merge branch 'main' of ssh://gitea.rmontanana.es:6422/rmontanana/Platform 2025-07-19 23:20:00 +02:00
613f4b6813 Update Requirements 2025-07-19 23:19:54 +02:00
dc324fe5f7 Add Seed to note in experiment 2025-07-08 18:50:48 +02:00
9816896240 Complete the conan integration 2025-07-04 10:20:59 +02:00
a3f765ce3c Fix compilation errors and enhance Makefile 2025-07-03 10:41:16 +02:00
3d814a79c6 Begin conan integration 2025-07-03 01:40:30 +02:00
1ef7ca6180 Merge pull request 'Integrate libraries with vcpkg' (#6) from vcpkg into main
Reviewed-on: #6
2025-07-02 17:39:44 +00:00
9448a971e8 fix vcpkg.json 2025-06-27 20:25:41 +02:00
24cef7496d Optimize AdaBoostPredict and default 100 estimators 2025-06-18 18:28:54 +02:00
a1a6d3d612 Optimize AdaBoost buildModel 2025-06-18 18:15:19 +02:00
dda9740e83 Test AdaBoost fine but unoptimized 2025-06-18 18:03:19 +02:00
41afa1b888 Enhance predictProbaSample 2025-06-18 17:33:56 +02:00
4e18dc87be Fix predict_proba in AdaBoost 2025-06-18 14:18:15 +02:00
56af1a5f85 AdaBoost a falta de predict_proba 2025-06-18 13:59:23 +02:00
415a7ae608 Begin AdaBoost integration 2025-06-18 11:27:11 +02:00
023d5613b4 Add DecisionTree with tests 2025-06-17 13:48:11 +02:00
8c413a1eb0 Begin to add AdaBoost implementation 2025-06-16 00:11:51 +02:00
3b158e9fc1 Add AdaBoost 2025-06-15 12:07:12 +02:00
514968a082 Open excel file automatically when generated 2025-05-28 17:37:53 +02:00
dcde8c01be ADd std to screen output 2025-05-28 10:53:29 +02:00
a6b6efce95 Remove uneeded output in Statistics 2025-05-25 10:41:36 +02:00
473d194dde Complete integration of Wilcoxon test 2025-05-24 12:59:28 +02:00
a56ec98ef9 Add Wilcoxon Test 2025-05-21 11:51:04 +02:00
70d8022926 Refactor postHoc 2025-05-17 18:12:57 +02:00
f5107abea7 Add comment in Statistics 2025-05-14 14:02:53 +02:00
e64e281b63 Return AUC 0.5 if nPos==0 || nNeg==0 2025-05-14 13:15:33 +02:00
b639a2d79a Fix folder param in b_manage 2025-05-14 12:51:56 +02:00
d6603dd638 Add folder parameter to best, grid and main 2025-05-14 11:46:15 +02:00
321e2a2f28 Add folder to manage 2025-05-13 14:09:25 +02:00
36c72491e7 Add folder to b_best 2025-05-13 13:50:07 +02:00
aa19ab6c21 Option to use BayesNet local or vcpkg in CMakeLists 2025-05-09 19:16:17 +02:00
16b4923851 Complete configuration xlsxwriter is still with the old config 2025-05-09 11:10:27 +02:00
b1965c8ae5 Add vcpkg config files 2025-05-09 10:54:27 +02:00
7d3a2dd713 Remove modules 2025-05-08 17:15:42 +02:00
50fde9521b Update last commit badge in README 2025-04-22 11:16:27 +00:00
cd2f47c58b Merge pull request 'Including XA1DE model' (#5) from XA1DE into main
Reviewed-on: #5
2025-03-20 14:58:37 +00:00
facf6f6ddd Fix GridBase to eliminate uneeded GridData 2025-03-20 15:54:13 +01:00
c9ab88e475 Update models and remove normalize weights in XA1DE 2025-03-17 13:28:35 +01:00
c2a4e3e64e Add XSPnDE n=2 2025-03-13 11:00:21 +01:00
664a6a5aeb Add XBAODE & XSPODE from bayesnet 2025-03-09 19:20:51 +01:00
ae7b89b134 tolerance <- 3 2025-03-08 18:07:56 +01:00
9c1852c6c3 First working version 2025-03-08 14:20:27 +01:00
7a23782b05 Add XSpode submodel 2025-03-07 18:34:16 +01:00
b2002d341c Create Xaode2 and add initializer factor in predict 2025-03-03 12:38:05 +01:00
9a8b960ce8 Remove uneeded commented code 2025-03-03 11:29:57 +01:00
7bc8633ed1 Enhance result 2025-03-03 10:56:20 +01:00
11155463b9 Fix predict_proba_spode 2025-03-02 21:41:21 +01:00
12e69a7f53 Add Prior probability to predict
Fix predict_spode
2025-03-01 20:29:45 +01:00
c127cb670a Fix predict_proba_spode mistake 2025-02-27 20:45:28 +01:00
610c2a6a4a Continue refactoring 2025-02-27 11:37:30 +01:00
2dcd073299 Refactor Xaode 2025-02-27 10:08:27 +01:00
f51d5b5e40 Continue refactoring 2025-02-27 09:57:40 +01:00
4e3043b2d1 Fix XA1DE integration 2025-02-27 09:23:47 +01:00
b055065e59 Fix predict_proba declaration 2025-02-26 21:08:33 +01:00
0d1e4b3c6f Continue refactoring 2025-02-26 21:03:01 +01:00
1a688f90b4 Complete refactor of XA1DE & XBAODE with new ExpClf class 2025-02-26 16:55:04 +01:00
c63baf419f Add log and fix some mistakes in integration 2025-02-25 20:35:13 +01:00
de7cf091be Add open excel file on b_manage termination 2025-02-25 13:41:06 +01:00
475a819a87 Continue integration into trainModel 2025-02-25 11:03:53 +01:00
ce6e192a33 Include BoostAODE trainModel method in XBAODE fit method 2025-02-24 10:27:24 +01:00
5daf7cbd69 Create XBAODE classifier 2025-02-23 19:44:13 +01:00
1b26de1e38 Set use_threads true as default for XA1DE 2025-02-23 18:54:55 +01:00
d3de429f2c Add room for nodes, depth and edges on screen report 2025-02-19 16:05:21 +01:00
f48864a415 Fix back button in manage
Fix sort datasets in b_main when --datasets is used
2025-02-19 13:32:07 +01:00
c1531dba2a Complete XA1DE integration 2025-02-19 11:40:33 +01:00
5556fbab03 Complete integration with memory failure 2025-02-18 22:57:02 +01:00
ac89cefab3 Add conversion methods 2025-02-18 12:07:56 +01:00
14dd8ebb66 First compilation 2025-02-18 11:04:24 +01:00
bd5ba14f04 Begin model inclusion 2025-02-18 10:48:46 +01:00
17728212c1 Ignore case in datasets sorting 2025-02-17 20:01:06 +01:00
86b4558f9d Add 1 char to b_list datasets headers 2025-02-17 19:44:23 +01:00
505edc79ac Fix sample issue 2025-02-04 18:53:23 +01:00
73a4b3d5e5 Add changeModel to b_manage 2025-02-04 17:34:00 +01:00
cbe8f4c79c Fix status length output in b_main 2025-02-01 21:42:56 +01:00
0d08a526fa Add score to b_main output 2025-01-30 17:36:45 +01:00
d0706da887 Fix sort order in bgrid report 2025-01-21 20:38:07 +01:00
07e3cc9599 Fix errors in grid Experiment 2025-01-19 13:51:51 +01:00
2a9652b450 Fix b_main order of datasets if --datasets parameter used 2025-01-18 20:31:58 +01:00
3397d0962f Refactor arguments management for Experimentation 2025-01-18 18:26:34 +01:00
7aaf6d1bf8 Add conditional saveResults to GridExperiment 2025-01-18 13:09:45 +01:00
eb430a84c4 Fix dataset name order in grid experiment 2025-01-17 16:58:39 +01:00
d0e65348e0 Complete b_grid experiment 2025-01-17 13:56:19 +01:00
c1d5dd74e3 Continue with grid experiment 2025-01-17 10:39:56 +01:00
9a9a9fb17a Continue grid Experiment 2025-01-14 22:04:23 +01:00
386faf960e Refactor grid classes and add summary of tasks at the end 2025-01-14 18:53:11 +01:00
28894004c8 Fix time output in b_main 2025-01-08 20:45:08 +01:00
ae41975fb4 Add nominal or index dataset name in tex output 2025-01-08 17:18:32 +01:00
0e475e4488 Sort datasets on input 2025-01-08 11:05:22 +01:00
909cec712c Complete schema validation 2025-01-07 18:24:55 +01:00
4901bb1f32 Add json results format validation 2025-01-07 11:58:18 +01:00
0318dcf8e5 Continue with grid_experiment refactor 2024-12-21 14:18:47 +01:00
1cc19a7b19 Refactor mpi classes 2024-12-20 19:10:17 +01:00
f88944de36 Add grid base class and static class 2024-12-20 18:54:08 +01:00
1a336a094e Refactor gridsearch and begin gridexperiment 2024-12-20 17:36:43 +01:00
8705adf3ee Begin b_grid experiment 2024-12-20 12:51:33 +01:00
017cb8a0dc Fix smoothing problem in gridsearch 2024-12-18 11:17:04 +01:00
e966c880e6 Refactor gridsearch output 2024-12-17 10:49:58 +01:00
70ea32dc9a Update folding library 2024-12-14 20:23:31 +01:00
ba455bb934 Rename config.h to config_platform.h 2024-12-13 19:57:05 +01:00
a65955248a Add mdlp as dependency 2024-12-13 10:28:27 +01:00
84930b0537 Remove lib/mdlp folder 2024-12-13 10:11:45 +01:00
10c65f44a0 Add mdlp library dependency 2024-12-13 09:55:37 +01:00
6d112f01e7 Remove external library dependency 2024-12-13 09:49:46 +01:00
401296293b Add header to b_main time 2024-12-11 23:18:20 +01:00
9566ae4cf6 Fix gridsearch discretize_algo mistake 2024-12-11 12:45:16 +01:00
55187ee521 Add time to experiment seed 2024-12-11 10:05:24 +01:00
68ea06d129 Fix fimdlp library includes 2024-11-20 21:19:35 +01:00
6c1d1d0d32 Remove mdlp files 2024-11-20 21:14:42 +01:00
b0853d169b Remove mdlp submodule 2024-11-20 21:14:19 +01:00
26f8e07774 Remove Python 3.11 only requirement 2024-11-20 20:21:39 +01:00
315dfb104f Add train test time to report console 2024-10-25 09:53:31 +02:00
381f226d53 Fix pm code in tex bestresults 2024-10-15 10:32:28 +02:00
ea13835701 Add Markdown best results output 2024-10-07 18:08:42 +02:00
d75468cf78 Replace Nº with # in output labels 2024-09-28 22:55:11 +02:00
c58bd9d60d add score name to best results excel file name 2024-09-28 18:58:49 +02:00
148a3b831a Add missing \ to results.tex 2024-09-03 12:57:22 +02:00
69063badbb Fix status error in holm.tex 2024-09-03 12:54:09 +02:00
6ae2b2182a Complete Tex output with Holm test 2024-09-03 12:43:50 +02:00
4dbd76df55 Continue TeX output 2024-09-02 20:30:47 +02:00
4545f76667 Begin adding TeX output to b_best -m any command 2024-09-02 18:14:53 +02:00
8372987dae Update sample to last library version 2024-08-31 12:41:11 +02:00
d72943c749 Fix hyperparams mistake 2024-08-07 10:52:04 +02:00
800246acd2 Accept nested hyperparameters in b_main 2024-08-04 17:19:31 +02:00
0ea967dd9d Support b_main with best hyperparameters 2024-08-02 19:10:25 +02:00
97abec8b69 Fix hide result error 2024-08-02 12:02:11 +02:00
17c9522e77 Add support to old format results 2024-07-25 17:06:31 +02:00
45af550cf9 Change time showed in report 2024-07-24 18:40:59 +02:00
5d5f49777e Fix wrong columns message 2024-07-16 11:30:28 +02:00
540a8ea06d Refactor update rows 2024-07-16 10:33:44 +02:00
1924c4392b Adapt screen to resized window 2024-07-16 10:25:15 +02:00
f2556a30af Add screen width control in b_manage 2024-07-15 18:06:39 +02:00
2f2ed00ca1 Add roc-auc-ovr as score to b_main 2024-07-14 12:48:33 +02:00
28f6a0d7a7 RocAuc refactor to speed up binary classif. problems 2024-07-13 16:54:34 +02:00
028522f180 Add AUC to reportConsole 2024-07-12 17:41:23 +02:00
84adf13a79 Add AUC computing in Experiment and store in result 2024-07-12 17:23:03 +02:00
26dfe6d056 Add Graphs to results
Add bin5..bin10 q & u discretizers algos
Fix trouble in computing states
Update mdlp to 2.0.0
2024-07-11 11:23:20 +02:00
3acc34e4c6 Fix title mistake in b_main 2024-06-17 19:07:15 +02:00
8f92b74260 Change Constant smooth type 2024-06-14 10:16:32 +02:00
3d900f8c81 Update models versions 2024-06-13 12:30:31 +02:00
e628d80f4c Experiment working with smoothing and disc-algo 2024-06-11 13:52:26 +02:00
0f06f8971e Change default smooth type in Experiment 2024-06-10 15:50:54 +02:00
f800772149 Add new hyperparameters validation in b_main 2024-06-10 10:16:07 +02:00
b8a8ddaf8c Add smooth strategy to hyperparameter in b_main
Add smooth strategy to reports
2024-06-09 20:46:14 +02:00
90555489ff Add discretiz_algo to b_main as hyperparameter 2024-06-09 11:35:50 +02:00
080f3cee34 Add discretization algo to reports 2024-06-09 01:11:56 +02:00
643633e6dd fit discretizer only with train data 2024-06-09 00:50:55 +02:00
361c51d864 Add traintest split in gridsearch 2024-06-07 11:05:59 +02:00
5dd3deca1a Add discretiz algorithm management to b_main & Dataset 2024-06-07 09:00:51 +02:00
2202a81782 Add discretization algo to result 2024-06-06 18:33:01 +02:00
c4f4e332f6 Add parsing to DotEnv 2024-06-06 17:55:39 +02:00
a7ec930fa0 Add numeric features management to Dataset 2024-06-06 13:03:57 +02:00
6858b3d89a Remove model selection from b_best and b_list 2024-06-03 17:09:45 +02:00
5fb176d78a Add message of the file saved in b_main 2024-05-29 20:52:25 +02:00
f5d5c35002 Add generate-fold-files to b_main 2024-05-28 10:52:08 +02:00
b34af13eea Add new Files library 2024-05-26 17:27:42 +02:00
e3a06264a9 Remove old Files library 2024-05-26 17:25:36 +02:00
df82f82e88 Add F column to b_best in excel 2024-05-21 08:45:17 +02:00
886dde7a06 Fix various classification reports in the same excel book 2024-05-19 18:53:55 +02:00
88468434e7 Add color and fix format in classification report in excel 2024-05-19 11:12:31 +02:00
ad5c3319bd Complete excel classification report 2024-05-18 22:59:37 +02:00
594adb0534 Begin classification report in excel 2024-05-18 21:37:34 +02:00
b9e0c92334 Move ResultsDatasetConsole to results folder 2024-05-18 18:41:17 +02:00
25bd7a42c6 Replacce pragma once with ifndef 2024-05-18 13:00:13 +02:00
c165a4bdda Fix refactor of static aggregate method 2024-05-17 23:38:21 +02:00
49a36904dc Refactor aggregate score to a constructor 2024-05-17 22:52:13 +02:00
577351eda5 put using json=nlohmann:ordered_json under namespace platform 2024-05-17 18:32:01 +02:00
a3c4bde460 Fix problem with num of classes in pyclassifiers experiments 2024-05-17 14:05:09 +02:00
696c0564a7 Add BoostA2DE model and fix some report errors 2024-05-17 01:25:27 +02:00
30a6d5e60d Complete reporconsole with classification report 2024-05-14 13:22:13 +02:00
f8f3ca28dc Fix colors of classification report 2024-05-14 12:06:08 +02:00
5c190d7c66 Add train classification report 2024-05-14 11:45:54 +02:00
99c9c6731f Add colors to confusion matrix and classification report 2024-05-14 00:41:29 +02:00
8d20545fd2 Git add Confusion Matrix to console report 2024-05-13 10:40:25 +02:00
2b480cdcb7 Merge pull request 'Fix json key automatic ordering error when creating Score from json' (#4) from temp into main
Reviewed-on: #4
2024-05-12 16:36:08 +00:00
ebaddf1a6c Fix json key automatic ordering error when creating Score from json 2024-05-12 18:23:48 +02:00
07a2efb298 Show classification report in b_manage 2024-05-12 12:52:22 +02:00
f88b223c46 Update libraries 2024-05-12 12:26:49 +02:00
69b9609154 Add labels to confusion_matrices in results 2024-05-10 17:12:11 +02:00
6d4117d188 Add Classification report to end of experiment if only one dataset is tested 2024-05-10 14:11:51 +02:00
ec0268c514 Add confusion matrix to json results
Add Aggregate method to Scores
2024-05-10 13:42:38 +02:00
dd94fd51f7 Add json constructor to Scores 2024-05-10 11:35:07 +02:00
009ed037b8 Add Scores class and TestsScores 2024-05-10 00:51:21 +02:00
6d1b78ada7 Remove trace message from report 2024-05-09 17:09:03 +02:00
3882ebd6e4 Add SPnDE & A2DE models 2024-05-05 19:53:14 +02:00
423242d280 Add logo to README 2024-05-02 11:36:58 +02:00
b9381aa453 Fix json keys in ReporExcelCompared 2024-05-01 11:53:21 +02:00
33cfb78554 Fix Nodes, Leaves, Depth vs Nodes, Edges, States headers in reports 2024-04-21 11:05:12 +02:00
1caa39c071 Add env to enable test data 2024-04-19 10:02:59 +02:00
018c94bfe6 add platform filter to b_manage 2024-04-18 15:43:39 +02:00
a54d6b8716 Fix paginator error when deleting in b_manage 2024-04-17 12:57:57 +02:00
6cde09d81e Change launch parameters 2024-04-17 11:36:21 +02:00
7be95d889d Fix some output mistakes in b_manage experiments list 2024-04-17 11:35:43 +02:00
42d61c6fc4 Add datasets-file to b_main 2024-04-15 18:14:21 +02:00
e5e947779f Add datasets hyperparameter to b_main 2024-04-15 17:34:37 +02:00
ad168d13ba Add stratified and discretize to b_manage list 2024-04-11 11:45:43 +02:00
78b8a8ae66 Add platform to b_manage, fix report after experiment 2024-04-11 10:54:18 +02:00
7ed9073d15 Add ascending/descending sort to b_manage 2024-04-10 19:42:40 +02:00
ee93789ca3 Fix CMakeLists PyClassifier install folder 2024-04-10 13:34:48 +02:00
375ed437ed Find BayesNet and PyClassifiers in $HOME/lib folder 2024-04-10 00:53:39 +02:00
5ec7fe8d00 Show model version in b_main 2024-04-09 23:20:19 +02:00
72ea62f783 Update main CMakeLists 2024-04-06 21:15:51 +02:00
4b91f2bde0 Update vscode c++ configuration 2024-04-05 23:10:27 +02:00
3bc51cb7b0 Add pagination to detail result
Add version of libraries info to header
2024-04-04 00:14:21 +02:00
cf83d1f8f4 Add tests for libraries required versions 2024-04-03 20:51:21 +02:00
0dd10bcbe4 Fix some console report formats 2024-04-02 10:23:32 +02:00
622b36b2c7 Fix divide by 0 error in excel compared 2024-03-23 22:25:09 +01:00
ea29a96ca1 hide make buildr command 2024-03-21 11:30:03 +01:00
673a41fc4d fix b_main dataset selection 2024-03-19 17:37:32 +01:00
634ea36169 Add optimization to compile flags in Release 2024-03-18 14:00:34 +01:00
20fef5b6b3 Add excel to experiment view in b_manage 2024-03-18 10:21:28 +01:00
7cf864c3f3 Fix report after experiment 2024-03-18 10:10:48 +01:00
4a0fa33917 Remove indexList variable in ManageScreen 2024-03-17 13:08:07 +01:00
d47da27571 Complete pagination of result report 2024-03-17 11:26:26 +01:00
faccb09c43 Begin result report pagination 2024-03-17 02:07:10 +01:00
fa4f47ff35 Create Base class for paged reports 2024-03-17 01:22:50 +01:00
106a36109e Refactor report folder 2024-03-17 00:06:00 +01:00
37eba57765 Rename ManageResults -> ManageScreen 2024-03-16 23:44:21 +01:00
67487ffce1 shorten dataset name to maximum length 2024-03-16 23:37:37 +01:00
9c11dee019 Complete Datasets in b_manage 2024-03-16 22:39:25 +01:00
58ae2c7690 Complete file output in ResultsDataset & ReportDataset 2024-03-16 17:05:26 +01:00
fa366a4c22 Convert DatasetsConsole & ResultsDatasetConsole to string output 2024-03-16 13:48:49 +01:00
b9af086c29 Refactor library folders
Add paginators per output type in b_manage
2024-03-16 12:02:24 +01:00
6a285b149b Fix report and showindex header in bmanage 2024-03-16 01:24:47 +01:00
ad402ac21e ReportConsole to string 2024-03-16 01:16:00 +01:00
38978aa7b7 Add message of Excel file created in b_manage 2024-03-15 19:54:03 +01:00
3691363b8e Parsing errors to to status in b_manage 2024-03-15 19:28:37 +01:00
fe24aa0b3e Change header color to white in b_manage 2024-03-15 14:04:16 +01:00
175e0eb591 Fix some status issue in b_manage 2024-03-15 12:45:08 +01:00
1912d17498 Add status to b_manage 2024-03-15 11:31:56 +01:00
54249e5304 Add different header colors in b_manage 2024-03-15 00:24:16 +01:00
d7f92c9682 Refactor colors in b_manage 2024-03-15 00:18:30 +01:00
00bb7f4680 Adjust sizes in b_manage 2024-03-14 23:52:33 +01:00
bf5dabb169 Add pagination to b_manage 2024-03-14 23:41:05 +01:00
cdf339856a Fix b_manage error if no results were present 2024-03-13 17:56:44 +01:00
3ceea5677c Remove odd variable in some sources 2024-03-12 13:35:07 +01:00
260fd122eb Fix number in header of b_manage 2024-03-12 13:27:22 +01:00
eff0be1c1c Add apply number of lines in terminal in b_manage 2024-03-12 13:23:30 +01:00
0ade72a37a Permit partial results comparison 2024-03-12 00:24:36 +01:00
72cda3784a Add bold max score per model in b_list results 2024-03-11 17:02:58 +01:00
52d689666a Update License & Readme 2024-03-11 10:21:40 +01:00
26e87c9cb1 Merge pull request 'list_results' (#3) from list_results into main
Reviewed-on: #3
2024-03-11 08:54:01 +00:00
03cd6e5a51 Complete b_list results 2024-03-10 20:12:13 +01:00
cd9ff89b52 Add results to b_list 2024-03-10 18:02:03 +01:00
05d05e25c2 Add make example command 2024-03-10 13:25:55 +01:00
5cd6e3d1a5 Rename tests from cc to cpp 2024-03-10 13:04:02 +01:00
d9e9356d92 Rename all from *.cc to *.cpp 2024-03-10 13:03:37 +01:00
0010c840d1 Replace #define ... with pragma once 2024-03-10 12:50:35 +01:00
51f32113c0 Add model argument validation in b_best 2024-03-10 12:31:13 +01:00
b3b3d9f1b9 Add command results to b_list
Rename tostring -> toString in models
Add datasets names to b_main command help - validation
2024-03-10 12:16:02 +01:00
4c847fc3f6 Add model selection to b_best to filter results 2024-03-09 20:19:27 +01:00
7e4ee0a9a9 Refactor to accept new Library structure 2024-03-08 22:20:13 +01:00
b7398db9b1 Update CMake to work in Linux 2024-03-08 13:21:25 +01:00
0a9bd0d9c4 Update sample 2024-03-08 12:49:21 +01:00
7a3adaf4a9 Remove source bayesnet & pyclassifiers libraries dependency 2024-03-08 12:30:04 +01:00
5c4efa08db Add # models to ReportExcelCompared 2024-03-07 11:40:36 +01:00
576016bbd9 Merge pull request 'Create an excel report with two complete results compared in b_manage' (#2) from report_compared into main
Reviewed-on: #2
2024-03-06 12:17:30 +00:00
e26b3c0970 Add fixed header to Delta 2024-03-06 11:22:43 +01:00
183cf12300 Refactor column count and header 2024-03-06 10:35:42 +01:00
4eb08cd281 Complete sheet with totals 2024-03-06 01:26:51 +01:00
4f5f629124 Create class ReportExcelCompared 2024-03-05 23:44:19 +01:00
df011f7e6b Update second menu color in b_manage 2024-03-02 18:24:36 +01:00
42648f3125 Add info to README.md 2024-03-01 19:03:16 +01:00
d2832ed2b3 Add back to submenu in b_manage 2024-03-01 11:20:49 +01:00
ec323d86ab Refactor datasetsExcel 2024-02-29 19:05:20 +01:00
e4a6575722 Fix block header in b_list excel 2024-02-29 18:21:15 +01:00
170 changed files with 12089 additions and 3154 deletions

View File

@@ -4,8 +4,8 @@ diagrams:
Platform: Platform:
type: class type: class
glob: glob:
- src/*.cc - src/*.cpp
- src/modules/*.cc - src/modules/*.cpp
using_namespace: platform using_namespace: platform
include: include:
namespaces: namespaces:
@@ -17,7 +17,7 @@ diagrams:
sequence: sequence:
type: sequence type: sequence
glob: glob:
- src/b_main.cc - src/b_main.cpp
combine_free_functions_into_file_participants: true combine_free_functions_into_file_participants: true
using_namespace: using_namespace:
- std - std

View File

4
.gitignore vendored
View File

@@ -41,3 +41,7 @@ puml/**
*.dot *.dot
diagrams/html/** diagrams/html/**
diagrams/latex/** diagrams/latex/**
.cache
vcpkg_installed
.claude/settings.local.json
CMakeUserPresets.json

19
.gitmodules vendored
View File

@@ -1,19 +0,0 @@
[submodule "lib/catch2"]
path = lib/catch2
url = https://github.com/catchorg/Catch2.git
[submodule "lib/argparse"]
path = lib/argparse
url = https://github.com/p-ranav/argparse
[submodule "lib/json"]
path = lib/json
url = https://github.com/nlohmann/json
[submodule "lib/libxlsxwriter"]
path = lib/libxlsxwriter
url = https://github.com/jmcnamara/libxlsxwriter.git
[submodule "lib/mdlp"]
path = lib/mdlp
url = https://github.com/rmontanana/mdlp
update = merge
[submodule "lib/PyClassifiers"]
path = lib/PyClassifiers
url = git@github.com:rmontanana/PyClassifiers

View File

@@ -11,7 +11,18 @@
], ],
"cStandard": "c17", "cStandard": "c17",
"cppStandard": "c++17", "cppStandard": "c++17",
"compileCommands": "${workspaceFolder}/cmake-build-release/compile_commands.json" "compileCommands": "${workspaceFolder}/cmake-build-release/compile_commands.json",
"configurationProvider": "ms-vscode.cmake-tools"
},
{
"name": "Linux",
"includePath": [
"${workspaceFolder}/**"
],
"defines": [],
"cStandard": "c17",
"cppStandard": "c++17",
"configurationProvider": "ms-vscode.cmake-tools"
} }
], ],
"version": 4 "version": 4

15
.vscode/launch.json vendored
View File

@@ -62,9 +62,9 @@
"--stratified", "--stratified",
"--discretize", "--discretize",
"-d", "-d",
"iris", "glass",
"--hyperparameters", "--hyperparameters",
"{\"repeatSparent\": true, \"maxModels\": 12}" "{\"block_update\": true}"
], ],
"cwd": "/home/rmontanana/Code/discretizbench", "cwd": "/home/rmontanana/Code/discretizbench",
}, },
@@ -99,7 +99,9 @@
"request": "launch", "request": "launch",
"program": "${workspaceFolder}/build_debug/src/b_list", "program": "${workspaceFolder}/build_debug/src/b_list",
"args": [ "args": [
"--excel" "results",
"-d",
"mfeat-morphological"
], ],
//"cwd": "/Users/rmontanana/Code/discretizbench", //"cwd": "/Users/rmontanana/Code/discretizbench",
"cwd": "${workspaceFolder}/../discretizbench", "cwd": "${workspaceFolder}/../discretizbench",
@@ -108,12 +110,13 @@
"name": "test", "name": "test",
"type": "lldb", "type": "lldb",
"request": "launch", "request": "launch",
"program": "${workspaceFolder}/build_debug/tests/unit_tests", "program": "${workspaceFolder}/build_debug/tests/unit_tests_platform",
"args": [ "args": [
"-c=\"Metrics Test\"", "[Scores]",
// "-c=\"Metrics Test\"",
// "-s", // "-s",
], ],
"cwd": "${workspaceFolder}/build/tests", "cwd": "${workspaceFolder}/build_debug/tests",
}, },
{ {
"name": "Build & debug active file", "name": "Build & debug active file",

93
CHANGELOG.md Normal file
View File

@@ -0,0 +1,93 @@
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Changed
- **BREAKING**: Migrated dependency management from vcpkg to Conan
- Updated build system to use Conan toolchain files instead of vcpkg
- Updated `make init` command to use `conan install` instead of `vcpkg install`
- Modified CMakeLists.txt to use Conan's find_package mechanism
- Updated documentation in CLAUDE.md to reflect Conan usage
### Added
- `conanfile.py` - Conan recipe for dependency management with all required dependencies
- `CMakeUserPresets.json` - generated by Conan
- Support for Conan build profiles (Release/Debug)
### Removed
- `vcpkg.json` - vcpkg manifest file
- `vcpkg-configuration.json` - vcpkg registry configuration
- vcpkg toolchain dependency in build system
### Notes
- The migration maintains compatibility with existing make targets and workflow
- All dependencies now managed through Conan package manager
## [1.1.0] - 2025-07-02
### Added
- **AdaBoost Implementation**: Complete multi-class SAMME AdaBoost classifier with optimization
- Optimized AdaBoostPredict with 100 estimators as default
- Enhanced predictProbaSample functionality
- Full predict_proba support for probabilistic predictions
- **Decision Tree Classifier**: New base classifier implementation with comprehensive tests
- **XA1DE Model Family**: Extended Averaged One-Dependence Estimators
- XA1DE, XBAODE, XSPODE variants with threading support
- Complete integration with memory optimization
- Prior probability computation in prediction
- **Wilcoxon Statistical Test**: Statistical significance testing for model comparison
- **Folder Management**: Enhanced file organization with folder parameter support across tools
- Added folder parameter to b_best, b_grid, b_main, and b_manage
- **vcpkg Integration**: Package management system integration (now migrated to Conan)
### Enhanced
- **Grid Search System**: Complete refactoring with MPI parallelization
- Grid experiment functionality with conditional result saving
- Fixed smoothing problems and dataset ordering
- Enhanced reporting and summary generation
- **Excel Reporting**: Advanced Excel export capabilities
- ReportExcelCompared class for side-by-side result comparison
- Enhanced formatting with colors and fixed headers
- Automatic file opening after generation
- **Results Management**: Comprehensive result handling and validation
- JSON schema validation for result format integrity
- Improved console reporting with classification reports
- Pagination support for large result sets
- **Statistical Analysis**: Enhanced statistical testing and reporting
- AUC (Area Under Curve) computation and reporting
- Confusion matrix generation and visualization
- Classification reports with color coding
### Performance Improvements
- Optimized AdaBoost training and prediction algorithms
- Enhanced memory management in XA1DE implementations
- Improved discretization algorithms with MDLP integration
- Faster ROC-AUC computation for binary classification problems
### Developer Experience
- **Testing Framework**: Comprehensive test suite with Catch2
- **Build System**: Streamlined CMake configuration with dependency management
- **Documentation**: Enhanced project documentation and build instructions
- **Code Quality**: Refactored codebase with improved error handling and logging
### Bug Fixes
- Fixed predict_proba implementations across multiple classifiers
- Resolved grid search dataset ordering issues
- Fixed Excel report formatting and column width problems
- Corrected time output formatting in various tools
- Fixed memory leaks and stability issues in model implementations
## [1.0.0] - 2024-01-09
### Initial Release
- **Core Framework**: Machine learning experimentation platform for Bayesian Networks
- **Basic Classifiers**: Initial set of Bayesian network classifiers
- **Experiment Management**: Basic experiment orchestration and result storage
- **Dataset Support**: ARFF file format support with discretization
- **Build System**: CMake-based build system with external library integration
- **Command Line Tools**: Initial versions of b_main, b_best, b_list utilities

139
CLAUDE.md Normal file
View File

@@ -0,0 +1,139 @@
# CLAUDE.md
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
## Project Overview
Platform is a C++ machine learning framework for running experiments with Bayesian Networks and other classifiers. It supports both research-focused experimental classifiers and production-ready models through a unified interface.
## Build System
The project uses CMake with Make as the primary build system:
- **Release build**: `make release` (creates `build_Release/` directory)
- **Debug build**: `make debug` (creates `build_Debug/` directory with testing and coverage enabled)
- **Install binaries**: `make install` (copies executables to `~/bin` by default)
- **Clean project**: `make clean` (removes build directories)
- **Initialize dependencies**: `make init` (runs conan install for both Release and Debug)
### Testing
- **Run tests**: `make test` (builds debug version and runs all tests)
- **Coverage report**: `make coverage` (runs tests and generates coverage with gcovr)
- **Single test with options**: `make test opt="-s"` (verbose) or `make test opt="-c='Test Name'"` (specific test)
### Build Targets
Main executables (built from `src/commands/`):
- `b_main`: Main experiment runner
- `b_grid`: Grid search over hyperparameters
- `b_best`: Best results analysis and comparison
- `b_list`: Dataset listing and properties
- `b_manage`: Results management interface
- `b_results`: Results processing
## Dependencies
The project uses Conan for package management with these key dependencies:
- **libtorch**: PyTorch C++ backend for tensor operations
- **nlohmann_json**: JSON processing
- **catch2**: Unit testing framework
- **cli11**: Command-line argument parsing (replacement for argparse)
Custom dependencies (not available in ConanCenter):
- **fimdlp**: MDLP discretization library (needs manual integration)
- **folding**: Cross-validation utilities (needs manual integration)
- **arff-files**: ARFF dataset file handling (needs manual integration)
External dependencies (managed separately):
- **BayesNet**: Core Bayesian network classifiers (from `../lib/`)
- **PyClassifiers**: Python classifier wrappers (from `../lib/`)
- **MPI**: Message Passing Interface for parallel processing
- **Boost**: Python integration and utilities
**Note**: Some dependencies (fimdlp, folding, arff-files) are not available in ConanCenter and need to be:
- Built as custom Conan packages, or
- Integrated using CMake FetchContent, or
- Built separately and found via find_package
## Architecture
### Core Components
**Experiment Framework** (`src/main/`):
- `Experiment.cpp/h`: Main experiment orchestration
- `Models.cpp/h`: Classifier factory and registration system
- `Scores.cpp/h`: Performance metrics calculation
- `HyperParameters.cpp/h`: Parameter management
- `ArgumentsExperiment.cpp/h`: Command-line argument handling
**Data Handling** (`src/common/`):
- `Dataset.cpp/h`: Individual dataset representation
- `Datasets.cpp/h`: Dataset collection management
- `Discretization.cpp/h`: Data discretization utilities
**Classifiers** (`src/experimental_clfs/`):
- `AdaBoost.cpp/h`: Multi-class SAMME AdaBoost implementation
- `DecisionTree.cpp/h`: Decision tree base classifier
- `XA1DE.cpp/h`: Extended AODE variants
- Experimental implementations of Bayesian network classifiers
**Grid Search** (`src/grid/`):
- `GridSearch.cpp/h`: Hyperparameter optimization
- `GridExperiment.cpp/h`: Grid search experiment management
- Uses MPI for parallel hyperparameter evaluation
**Results & Reporting** (`src/results/`, `src/reports/`):
- JSON-based result storage with schema validation
- Excel export capabilities via libxlsxwriter
- Console and paginated result display
### Model Registration System
The framework uses a factory pattern with automatic registration:
- All classifiers inherit from `bayesnet::BaseClassifier`
- Registration happens in `src/main/modelRegister.h`
- Factory creates instances by string name via `Models::create()`
## Configuration
**Environment Configuration** (`.env` file):
- `experiment`: Experiment name/type
- `n_folds`: Cross-validation folds (default: 5)
- `seeds`: Random seeds for reproducibility
- `model`: Default classifier name
- `score`: Primary evaluation metric
- `platform`: System identifier for results
**Grid Search Configuration**:
- `grid_<model_name>_input.json`: Hyperparameter search space
- `grid_<model_name>_output.json`: Search results
## Data Format
**Dataset Requirements**:
- ARFF format files in `datasets/` directory
- `all.txt` file listing datasets: `<name>,<class_name>,<real_features>`
- Supports both discrete and continuous features
- Automatic discretization available via MDLP
**Experimental Data**:
- Results stored in JSON format with versioned schemas
- Test data in `tests/data/` for unit testing
- Sample datasets: iris, diabetes, ecoli, glass, etc.
## Development Workflow
1. **Setup**: Run `make init` to install dependencies via Conan
2. **Development**: Use `make debug` for development builds with testing
3. **Testing**: Run `make test` after changes
4. **Release**: Use `make release` for optimized builds
5. **Experiments**: Use `.env` configuration and run `b_main` with appropriate flags
## Key Features
- **Parallel**: Uses MPI for parallel grid search and experiments
- **Cross-platform**: Supports Linux and macOS via Conan
- **Extensible**: Easy classifier registration and integration
- **Research-focused**: Designed for machine learning experimentation
- **Visualization**: DOT graph generation for decision trees and networks

View File

@@ -1,95 +1,99 @@
cmake_minimum_required(VERSION 3.20) cmake_minimum_required(VERSION 3.20)
project(Platform project(Platform
VERSION 1.0.2 VERSION 1.1.0
DESCRIPTION "Platform to run Experiments with classifiers." DESCRIPTION "Platform to run Experiments with classifiers."
HOMEPAGE_URL "https://github.com/rmontanana/platform" HOMEPAGE_URL "https://github.com/rmontanana/platform"
LANGUAGES CXX LANGUAGES CXX
) )
if (CODE_COVERAGE AND NOT ENABLE_TESTING)
MESSAGE(FATAL_ERROR "Code coverage requires testing enabled")
endif (CODE_COVERAGE AND NOT ENABLE_TESTING)
find_package(Torch REQUIRED)
if (POLICY CMP0135)
cmake_policy(SET CMP0135 NEW)
endif ()
# Global CMake variables # Global CMake variables
# ---------------------- # ----------------------
set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
# Options # Options
# ------- # -------
option(ENABLE_CLANG_TIDY "Enable to add clang tidy." OFF)
option(ENABLE_TESTING "Unit testing build" OFF) option(ENABLE_TESTING "Unit testing build" OFF)
option(CODE_COVERAGE "Collect coverage from test library" OFF) option(CODE_COVERAGE "Collect coverage from test library" OFF)
# CMakes modules
# --------------
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH})
# MPI # MPI
find_package(MPI REQUIRED) find_package(MPI REQUIRED)
message("MPI_CXX_LIBRARIES=${MPI_CXX_LIBRARIES}") message("MPI_CXX_LIBRARIES=${MPI_CXX_LIBRARIES}")
message("MPI_CXX_INCLUDE_DIRS=${MPI_CXX_INCLUDE_DIRS}") message("MPI_CXX_INCLUDE_DIRS=${MPI_CXX_INCLUDE_DIRS}")
# Boost Library # Boost Library
cmake_policy(SET CMP0135 NEW)
cmake_policy(SET CMP0167 NEW) # For FindBoost
set(Boost_USE_STATIC_LIBS OFF) set(Boost_USE_STATIC_LIBS OFF)
set(Boost_USE_MULTITHREADED ON) set(Boost_USE_MULTITHREADED ON)
set(Boost_USE_STATIC_RUNTIME OFF) set(Boost_USE_STATIC_RUNTIME OFF)
find_package(Boost 1.66.0 REQUIRED COMPONENTS python3 numpy3) find_package(Boost 1.66.0 REQUIRED COMPONENTS python3 numpy3)
# # Python
find_package(Python3 REQUIRED COMPONENTS Development)
# # Boost Python
# find_package(boost_python${Python3_VERSION_MAJOR}${Python3_VERSION_MINOR} CONFIG REQUIRED COMPONENTS python${Python3_VERSION_MAJOR}${Python3_VERSION_MINOR})
# # target_link_libraries(MyTarget PRIVATE Boost::python${Python3_VERSION_MAJOR}${Python3_VERSION_MINOR})
if(Boost_FOUND) if(Boost_FOUND)
message("Boost_INCLUDE_DIRS=${Boost_INCLUDE_DIRS}") message("Boost_INCLUDE_DIRS=${Boost_INCLUDE_DIRS}")
message("Boost_LIBRARIES=${Boost_LIBRARIES}")
message("Boost_VERSION=${Boost_VERSION}")
include_directories(${Boost_INCLUDE_DIRS}) include_directories(${Boost_INCLUDE_DIRS})
endif() endif()
# Python
find_package(Python3 3.11...3.11.9 COMPONENTS Interpreter Development REQUIRED)
message("Python3_LIBRARIES=${Python3_LIBRARIES}")
# CMakes modules
# --------------
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH})
include(AddGitSubmodule)
if (CODE_COVERAGE)
enable_testing()
include(CodeCoverage)
MESSAGE("Code coverage enabled")
set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage -O0 -g")
SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage")
endif (CODE_COVERAGE)
if (ENABLE_CLANG_TIDY)
include(StaticAnalyzers) # clang-tidy
endif (ENABLE_CLANG_TIDY)
# External libraries - dependencies of Platform # External libraries - dependencies of Platform
# --------------------------------------------- # ---------------------------------------------
add_git_submodule("lib/PyClassifiers") find_package(nlohmann_json CONFIG REQUIRED)
add_git_submodule("lib/argparse") find_package(argparse CONFIG REQUIRED)
find_package(Torch CONFIG REQUIRED)
find_library(XLSXWRITER_LIB NAMES libxlsxwriter.dylib libxlsxwriter.so PATHS ${Platform_SOURCE_DIR}/lib/libxlsxwriter/lib) find_package(arff-files CONFIG REQUIRED)
message("XLSXWRITER_LIB=${XLSXWRITER_LIB}") find_package(fimdlp CONFIG REQUIRED)
find_package(folding CONFIG REQUIRED)
find_package(bayesnet CONFIG REQUIRED)
find_package(pyclassifiers CONFIG REQUIRED)
find_package(libxlsxwriter CONFIG REQUIRED)
find_package(Boost REQUIRED COMPONENTS python)
# Subdirectories # Subdirectories
# -------------- # --------------
## Configure test data path
cmake_path(SET TEST_DATA_PATH "${CMAKE_CURRENT_SOURCE_DIR}/tests/data")
configure_file(src/common/SourceData.h.in "${CMAKE_BINARY_DIR}/configured_files/include/SourceData.h")
add_subdirectory(config) add_subdirectory(config)
add_subdirectory(src) add_subdirectory(src)
add_subdirectory(sample) # add_subdirectory(sample)
file(GLOB Platform_SOURCES CONFIGURE_DEPENDS ${Platform_SOURCE_DIR}/src/*.cc) file(GLOB Platform_SOURCES CONFIGURE_DEPENDS ${Platform_SOURCE_DIR}/src/*.cpp)
# Testing # Testing
# ------- # -------
if (ENABLE_TESTING) if (ENABLE_TESTING)
MESSAGE("Testing enabled") MESSAGE("Testing enabled")
if (NOT TARGET Catch2::Catch2) set(CMAKE_CXX_FLAGS_DEBUG " ${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage -O0 -g")
add_git_submodule("lib/catch2") enable_testing()
endif (NOT TARGET Catch2::Catch2) find_package(Catch2 CONFIG REQUIRED)
set(CODE_COVERAGE ON)
include(CTest) include(CTest)
add_subdirectory(tests) add_subdirectory(tests)
endif (ENABLE_TESTING) endif (ENABLE_TESTING)
if (CODE_COVERAGE)
MESSAGE("Code coverage enabled")
include(CodeCoverage)
SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage")
endif (CODE_COVERAGE)

View File

@@ -976,7 +976,7 @@ INPUT_FILE_ENCODING =
# Note the list of default checked file patterns might differ from the list of # Note the list of default checked file patterns might differ from the list of
# default file extension mappings. # default file extension mappings.
# #
# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # If left blank the following patterns are tested:*.c, *.cpp, *.cxx, *.cpp,
# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
# *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, # *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml,
# *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C # *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C
@@ -984,7 +984,7 @@ INPUT_FILE_ENCODING =
# *.vhdl, *.ucf, *.qsf and *.ice. # *.vhdl, *.ucf, *.qsf and *.ice.
FILE_PATTERNS = *.c \ FILE_PATTERNS = *.c \
*.cc \ *.cpp \
*.cxx \ *.cxx \
*.cpp \ *.cpp \
*.c++ \ *.c++ \

View File

@@ -1,6 +1,6 @@
MIT License MIT License
Copyright (c) 2024 rmontanana Copyright (c) 2024 Ricardo Montañana Gómez
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

102
Makefile
View File

@@ -1,12 +1,18 @@
SHELL := /bin/bash SHELL := /bin/bash
.DEFAULT_GOAL := help .DEFAULT_GOAL := help
.PHONY: coverage setup help build test clean debug release submodules buildr buildd install dependency testp testb clang-uml .PHONY: init clean coverage setup help build test clean debug release buildr buildd install dependency testp testb clang-uml example
f_release = build_release f_release = build_Release
f_debug = build_debug f_debug = build_Debug
app_targets = b_best b_list b_main b_manage b_grid app_targets = b_best b_list b_main b_manage b_grid b_results
test_targets = unit_tests_bayesnet unit_tests_platform test_targets = unit_tests_platform
n_procs = -j 16 # Set the number of parallel jobs to the number of available processors minus 7
CPUS := $(shell getconf _NPROCESSORS_ONLN 2>/dev/null \
|| nproc --all 2>/dev/null \
|| sysctl -n hw.ncpu)
# --- Your desired job count: CPUs - 7, but never less than 1 --------------
JOBS := $(shell n=$(CPUS); [ $${n} -gt 7 ] && echo $$((n-7)) || echo 1)
define ClearTests define ClearTests
@for t in $(test_targets); do \ @for t in $(test_targets); do \
@@ -21,14 +27,43 @@ define ClearTests
fi ; fi ;
endef endef
define build_target
@echo ">>> Building the project for $(1)..."
@if [ -d $(2) ]; then rm -fr $(2); fi
@conan install . --build=missing -of $(2) -s build_type=$(1)
@cmake -S . -B $(2) -DCMAKE_TOOLCHAIN_FILE=$(2)/build/$(1)/generators/conan_toolchain.cmake -DCMAKE_BUILD_TYPE=$(1) -D$(3)
@echo ">>> Will build using $(JOBS) parallel jobs"
echo ">>> Done"
endef
sub-init: ## Initialize submodules define compile_target
@git submodule update --init --recursive @echo ">>> Compiling for $(1)..."
if [ "$(3)" != "" ]; then \
target="-t$(3)"; \
else \
target=""; \
fi
@cmake --build $(2) --config $(1) --parallel $(JOBS) $(target)
@echo ">>> Done"
endef
sub-update: ## Initialize submodules init: ## Initialize the project installing dependencies
@git submodule update --remote --merge @echo ">>> Installing dependencies with Conan"
@git submodule foreach git pull origin master @conan install . --output-folder=build --build=missing -s build_type=Release
@conan install . --output-folder=build_debug --build=missing -s build_type=Debug
@echo ">>> Done"
clean: ## Clean the project
@echo ">>> Cleaning the project..."
@if test -f CMakeCache.txt ; then echo "- Deleting CMakeCache.txt"; rm -f CMakeCache.txt; fi
@for folder in $(f_release) $(f_debug) build build_debug install_test ; do \
if test -d "$$folder" ; then \
echo "- Deleting $$folder folder" ; \
rm -rf "$$folder"; \
fi; \
done
$(call ClearTests)
@echo ">>> Done";
setup: ## Install dependencies for tests and coverage setup: ## Install dependencies for tests and coverage
@if [ "$(shell uname)" = "Darwin" ]; then \ @if [ "$(shell uname)" = "Darwin" ]; then \
brew install gcovr; \ brew install gcovr; \
@@ -41,13 +76,15 @@ setup: ## Install dependencies for tests and coverage
dest ?= ${HOME}/bin dest ?= ${HOME}/bin
install: ## Copy binary files to bin folder install: ## Copy binary files to bin folder
@echo "Destination folder: $(dest)" @echo "Destination folder: $(dest)"
make buildr @make buildr
@echo "*******************************************" @echo "*******************************************"
@echo ">>> Copying files to $(dest)" @echo ">>> Copying files to $(dest)"
@echo "*******************************************" @echo "*******************************************"
@for item in $(app_targets); do \ @for item in $(app_targets); do \
echo ">>> Copying $$item" ; \ echo ">>> Copying $$item" ; \
cp $(f_release)/src/$$item $(dest) ; \ cp $(f_release)/src/$$item $(dest) || { \
echo "*** Error copying $$item" ; \
} ; \
done done
dependency: ## Create a dependency graph diagram of the project (build/dependency.png) dependency: ## Create a dependency graph diagram of the project (build/dependency.png)
@@ -56,38 +93,27 @@ dependency: ## Create a dependency graph diagram of the project (build/dependenc
cd $(f_debug) && cmake .. --graphviz=dependency.dot && dot -Tpng dependency.dot -o dependency.png cd $(f_debug) && cmake .. --graphviz=dependency.dot && dot -Tpng dependency.dot -o dependency.png
buildd: ## Build the debug targets buildd: ## Build the debug targets
cmake --build $(f_debug) -t $(app_targets) PlatformSample $(n_procs) @$(call compile_target,"Debug","$(f_debug)")
buildr: ## Build the release targets buildr: ## Build the release targets
cmake --build $(f_release) -t $(app_targets) $(n_procs) @$(call compile_target,"Release","$(f_release)")
clean: ## Clean the tests info
@echo ">>> Cleaning Debug Platform tests...";
$(call ClearTests)
@echo ">>> Done";
clang-uml: ## Create uml class and sequence diagrams clang-uml: ## Create uml class and sequence diagrams
clang-uml -p --add-compile-flag -I /usr/lib/gcc/x86_64-redhat-linux/8/include/ clang-uml -p --add-compile-flag -I /usr/lib/gcc/x86_64-redhat-linux/8/include/
debug: ## Build a debug version of the project debug: ## Build a debug version of the project with Conan
@echo ">>> Building Debug Platform..."; @$(call build_target,"Debug","$(f_debug)", "ENABLE_TESTING=ON")
@if [ -d ./$(f_debug) ]; then rm -rf ./$(f_debug); fi
@mkdir $(f_debug); release: ## Build a Release version of the project with Conan
@cmake -S . -B $(f_debug) -D CMAKE_BUILD_TYPE=Debug -D ENABLE_TESTING=ON -D CODE_COVERAGE=ON @$(call build_target,"Release","$(f_release)", "ENABLE_TESTING=OFF")
@echo ">>> Done";
release: ## Build a Release version of the project
@echo ">>> Building Release Platform...";
@if [ -d ./$(f_release) ]; then rm -rf ./$(f_release); fi
@mkdir $(f_release);
@cmake -S . -B $(f_release) -D CMAKE_BUILD_TYPE=Release
@echo ">>> Done";
opt = "" opt = ""
test: ## Run tests (opt="-s") to verbose output the tests, (opt="-c='Test Maximum Spanning Tree'") to run only that section test: ## Run tests (opt="-s") to verbose output the tests, (opt="-c='Test Maximum Spanning Tree'") to run only that section
@echo ">>> Running Platform tests..."; @echo ">>> Running Platform tests...";
@$(MAKE) clean @$(MAKE) clean
@cmake --build $(f_debug) -t $(test_targets) $(n_procs) @$(MAKE) debug
@$(call "Compile_target", "Debug", "$(f_debug)", $(test_targets))
@for t in $(test_targets); do \ @for t in $(test_targets); do \
if [ -f $(f_debug)/tests/$$t ]; then \ if [ -f $(f_debug)/tests/$$t ]; then \
cd $(f_debug)/tests ; \ cd $(f_debug)/tests ; \
@@ -96,6 +122,14 @@ test: ## Run tests (opt="-s") to verbose output the tests, (opt="-c='Test Maximu
done done
@echo ">>> Done"; @echo ">>> Done";
fname = iris
example: ## Build sample
@echo ">>> Building Sample...";
@cmake --build $(f_release) -t sample
$(f_release)/sample/PlatformSample --model BoostAODE --dataset $(fname) --discretize --stratified
@echo ">>> Done";
coverage: ## Run tests and generate coverage report (build/index.html) coverage: ## Run tests and generate coverage report (build/index.html)
@echo ">>> Building tests with coverage..." @echo ">>> Building tests with coverage..."
@$(MAKE) test @$(MAKE) test
@@ -105,7 +139,7 @@ coverage: ## Run tests and generate coverage report (build/index.html)
help: ## Show help message help: ## Show help message
@IFS=$$'\n' ; \ @IFS=$$'\n' ; \
help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \ help_lines=(`grep -Fh "##" $(MAKEFILE_LIST) | grep -Fv fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
printf "%s\n\n" "Usage: make [task]"; \ printf "%s\n\n" "Usage: make [task]"; \
printf "%-20s %s\n" "task" "help" ; \ printf "%-20s %s\n" "task" "help" ; \
printf "%-20s %s\n" "------" "----" ; \ printf "%-20s %s\n" "------" "----" ; \

View File

@@ -1,10 +1,9 @@
# Platform # <img src="logo.png" alt="logo" width="50"/> Platform
Platform to run Bayesian Networks and Machine Learning Classifiers experiments. ![C++](https://img.shields.io/badge/c++-%2300599C.svg?style=flat&logo=c%2B%2B&logoColor=white)
[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](<https://opensource.org/licenses/MIT>)
# Platform [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/rmontanana/Platform)
![Gitea Last Commit](https://img.shields.io/gitea/last-commit/rmontanana/platform?gitea_url=https://gitea.rmontanana.es&logo=gitea)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
Platform to run Bayesian Networks and Machine Learning Classifiers experiments. Platform to run Bayesian Networks and Machine Learning Classifiers experiments.
@@ -22,11 +21,18 @@ In Linux sometimes the library libstdc++ is mistaken from the miniconda installa
libstdc++.so.6: version `GLIBCXX_3.4.32' not found (required by b_xxxx) libstdc++.so.6: version `GLIBCXX_3.4.32' not found (required by b_xxxx)
``` ```
The solution is to erase the libstdc++ library from the miniconda installation: The solution is to erase the libstdc++ library from the miniconda installation and no further compilation is needed.
### MPI ### MPI
In Linux just install openmpi & openmpi-devel packages. Only if cmake can't find openmpi installation (like in Oracle Linux) set the following variable: In Linux just install openmpi & openmpi-devel packages.
```bash
source /etc/profile.d/modules.sh
module load mpi/openmpi-x86_64
```
If cmake can't find openmpi installation (like in Oracle Linux) set the following variable:
```bash ```bash
export MPI_HOME="/usr/lib64/openmpi" export MPI_HOME="/usr/lib64/openmpi"
@@ -35,7 +41,7 @@ export MPI_HOME="/usr/lib64/openmpi"
In Mac OS X, install mpich with brew and if cmake doesn't find it, edit mpicxx wrapper to remove the ",-commons,use_dylibs" from final_ldflags In Mac OS X, install mpich with brew and if cmake doesn't find it, edit mpicxx wrapper to remove the ",-commons,use_dylibs" from final_ldflags
```bash ```bash
vi /opt/homebrew/bin/mpicx vi /opt/homebrew/bin/mpicxx
``` ```
### boost library ### boost library
@@ -86,4 +92,64 @@ make release
make debug make debug
``` ```
## 1. Introduction ### Configuration
The configuration file is named .env and it should be located in the folder where the experiments should be run. In the root folder of the project there is a file named .env.example that can be used as a template.
## 1. Commands
### b_list
List all the datasets and their properties. The datasets are located in the _datasets_ folder under the experiments root folder. A special file called all.txt with the names of the datasets has to be created. This all.txt file is built with lines of the form:
<name>,<class_name>,<real_features>
where <real_features> can be either the word _all_ or a list of numbers separated by commas, i.e. [0,3,6,7]
### b_grid
Run a grid search over the parameters of the classifiers. The parameters are defined in the file _grid.txt_ located in the grid folder of the experiments. The file has to be created with the following format:
```json
{
"all": [
<set of hyperparams>, ...
],
"<dataset_name>": [
<specific set of hyperparams for <dataset_name>>, ...
],
}
```
The file has to be named _grid_<model_name>_input.json_
As a result it builds a file named _grid_<model_name>_output.json_ with the results of the grid search.
The computation is done in parallel using MPI.
![b_grid](img/bgrid.gif)
### b_main
Run the main experiment. There are several hyperparameters that can be set on the command line:
- -d, -\-dataset <dataset_name> : Name of the dataset to run the experiment with. If no dataset is specified the experiment will run with all the datasets in the all.txt file.
- -m, -\-model <classifier_name> : Name of the classifier to run the experiment with (i.e. BoostAODE, TAN, Odte, etc.).
- -\-discretize: Discretize the dataset before running the experiment.
- -\-stratified: Use stratified cross validation.
- -\-folds <folds>: Number of folds for cross validation (optional, default value is in .env file).
- -s, -\-seeds <seed>: Seeds for the random number generator (optional, default values are in .env file).
- -\-no-train-score: Do not calculate the train score (optional), this is useful when the dataset is big and the training score is not needed.
- -\-hyperparameters <hyperparameters>: Hyperparameters for the experiment in json format.
- -\-hyper-file <hyperparameters_file>: File with the hyperparameters for the experiment in json format. This file uses the output format of the b_grid command.
- -\-title <title_text>: Title of the experiment (optional if only one dataset is specified).
- -\-quiet: Don't display detailed progress and result of the experiment.
### b_manage
Manage the results of the experiments.
### b_best
Get and optionally compare the best results of the experiments. The results can be stored in an MS Excel file.
![b_best](img/bbest.gif)

View File

@@ -137,7 +137,7 @@
include(CMakeParseArguments) include(CMakeParseArguments)
option(CODE_COVERAGE_VERBOSE "Verbose information" FALSE) option(CODE_COVERAGE_VERBOSE "Verbose information" TRUE)
# Check prereqs # Check prereqs
find_program( GCOV_PATH gcov ) find_program( GCOV_PATH gcov )
@@ -160,8 +160,12 @@ foreach(LANG ${LANGUAGES})
endif() endif()
elseif(NOT "${CMAKE_${LANG}_COMPILER_ID}" MATCHES "GNU" elseif(NOT "${CMAKE_${LANG}_COMPILER_ID}" MATCHES "GNU"
AND NOT "${CMAKE_${LANG}_COMPILER_ID}" MATCHES "(LLVM)?[Ff]lang") AND NOT "${CMAKE_${LANG}_COMPILER_ID}" MATCHES "(LLVM)?[Ff]lang")
if ("${LANG}" MATCHES "CUDA")
message(STATUS "Ignoring CUDA")
else()
message(FATAL_ERROR "Compiler is not GNU or Flang! Aborting...") message(FATAL_ERROR "Compiler is not GNU or Flang! Aborting...")
endif() endif()
endif()
endforeach() endforeach()
set(COVERAGE_COMPILER_FLAGS "-g --coverage" set(COVERAGE_COMPILER_FLAGS "-g --coverage"

42
conanfile.py Normal file
View File

@@ -0,0 +1,42 @@
from conan import ConanFile
from conan.tools.cmake import CMakeToolchain, CMakeDeps, cmake_layout
class PlatformConan(ConanFile):
    """Conan recipe for the Platform project.

    Declares the package identity, the libraries Platform links against,
    the build/test tooling it needs, and generates the CMake integration
    files (dependency configs and toolchain).
    """

    name = "platform"
    version = "1.1.0"

    # Binary configuration
    settings = "os", "compiler", "build_type", "arch"

    # The sources live next to this recipe; export them together with it.
    exports_sources = "CMakeLists.txt", "src/*", "tests/*", "config/*", "cmake/*"

    def requirements(self):
        """Register every library dependency, in declaration order."""
        for reference in (
            "argparse/3.2",
            "libtorch/2.7.1",
            "nlohmann_json/3.11.3",
            "folding/1.1.2",
            "fimdlp/2.1.1",
            "arff-files/1.2.1",
            "bayesnet/1.2.1",
            "pyclassifiers/1.0.3",
            "libxlsxwriter/1.2.2",
        ):
            self.requires(reference)

    def build_requirements(self):
        """Register the build tool (CMake) and the test framework (Catch2)."""
        self.tool_requires("cmake/[>=3.30]")
        self.test_requires("catch2/3.8.1")

    def layout(self):
        """Use the standard CMake folder layout."""
        cmake_layout(self)

    def generate(self):
        """Emit the CMake dependency files first, then the toolchain file."""
        CMakeDeps(self).generate()
        CMakeToolchain(self).generate()

    def configure(self):
        """Set the compiler.cppstd setting to C++20."""
        self.settings.compiler.cppstd = "20"

View File

@@ -1,4 +1,4 @@
configure_file( configure_file(
"config.h.in" "config.h.in"
"${CMAKE_BINARY_DIR}/configured_files/include/config.h" ESCAPE_QUOTES "${CMAKE_BINARY_DIR}/configured_files/include/config_platform.h" ESCAPE_QUOTES
) )

View File

@@ -1,14 +1,11 @@
#pragma once #ifndef PLATFORM_H
#define PLATFORM_H
#include <string> #include <string>
#include <string_view> #include <string_view>
#define PROJECT_VERSION_MAJOR @PROJECT_VERSION_MAJOR @ static constexpr std::string_view platform_project_name = "@PROJECT_NAME@";
#define PROJECT_VERSION_MINOR @PROJECT_VERSION_MINOR @ static constexpr std::string_view platform_project_version = "@PROJECT_VERSION@";
#define PROJECT_VERSION_PATCH @PROJECT_VERSION_PATCH @ static constexpr std::string_view platform_project_description = "@PROJECT_DESCRIPTION@";
static constexpr std::string_view platform_git_sha = "@GIT_SHA@";
static constexpr std::string_view project_name = "@PROJECT_NAME@"; static constexpr std::string_view platform_data_path = "@Platform_SOURCE_DIR@/tests/data/";
static constexpr std::string_view project_version = "@PROJECT_VERSION@"; #endif
static constexpr std::string_view project_description = "@PROJECT_DESCRIPTION@";
static constexpr std::string_view git_sha = "@GIT_SHA@";
static constexpr std::string_view data_path = "@Platform_SOURCE_DIR@/tests/data/";

View File

@@ -1,4 +1,4 @@
filter = src/ filter = src/
exclude-directories = build/lib/ exclude-directories = build_debug/lib/
print-summary = yes print-summary = yes
sort-percentage = yes sort-percentage = yes

View File

@@ -1,31 +0,0 @@
[submodule "lib/mdlp"]
path = lib/mdlp
url = https://github.com/rmontanana/mdlp
main = main
update = merge
[submodule "lib/catch2"]
path = lib/catch2
main = v2.x
update = merge
url = https://github.com/catchorg/Catch2.git
[submodule "lib/argparse"]
path = lib/argparse
url = https://github.com/p-ranav/argparse
master = master
update = merge
[submodule "lib/json"]
path = lib/json
url = https://github.com/nlohmann/json.git
master = master
update = merge
[submodule "lib/libxlsxwriter"]
path = lib/libxlsxwriter
url = https://github.com/jmcnamara/libxlsxwriter.git
main = main
update = merge
[submodule "lib/PyClassifiers"]
path = lib/PyClassifiers
url = https://github.com/rmontanana/PyClassifiers
[submodule "lib/folding"]
path = lib/folding
url = https://github.com/rmontanana/Folding

BIN
img/bbest.gif Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.9 MiB

BIN
img/bgrid.gif Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 349 KiB

BIN
img/blist.gif Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.7 MiB

BIN
img/bmain.gif Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.3 MiB

BIN
img/bmanage.gif Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.7 MiB

View File

@@ -1,168 +0,0 @@
#include "ArffFiles.h"
#include <fstream>
#include <sstream>
#include <map>
#include <iostream>
ArffFiles::ArffFiles() = default;
std::vector<std::string> ArffFiles::getLines() const
{
return lines;
}
unsigned long int ArffFiles::getSize() const
{
return lines.size();
}
std::vector<std::pair<std::string, std::string>> ArffFiles::getAttributes() const
{
return attributes;
}
std::string ArffFiles::getClassName() const
{
return className;
}
std::string ArffFiles::getClassType() const
{
return classType;
}
std::vector<std::vector<float>>& ArffFiles::getX()
{
return X;
}
std::vector<int>& ArffFiles::getY()
{
return y;
}
// Reads an ARFF file and splits it into attribute declarations and raw data lines.
//
// - Empty lines, lines equal to "\r" or " ", and comment lines (starting with '%')
//   are skipped.
// - Lines containing "@attribute" / "@ATTRIBUTE" are parsed as
//   "<keyword> <name> <type...>" and appended to `attributes` (name and type trimmed).
// - Any other line starting with '@' is ignored.
// - Everything else is stored verbatim in `lines`.
//
// Throws std::invalid_argument if the file cannot be opened or if no attribute
// declaration was found.
void ArffFiles::loadCommon(std::string fileName)
{
    std::ifstream input(fileName);
    if (!input.is_open()) {
        throw std::invalid_argument("Unable to open file");
    }
    std::string line;
    // Declared outside the loop on purpose: a failed extraction leaves the
    // previous contents in place, exactly as before.
    std::string keyword;
    std::string attribute;
    while (getline(input, line)) {
        const bool blank = line.empty() || line == "\r" || line == " ";
        if (blank || line[0] == '%') {
            continue;
        }
        const bool declaresAttribute = line.find("@attribute") != std::string::npos
            || line.find("@ATTRIBUTE") != std::string::npos;
        if (declaresAttribute) {
            std::stringstream tokens(line);
            tokens >> keyword >> attribute;
            // The type may span several tokens; join them with single spaces.
            std::string type;
            for (std::string word; tokens >> word;) {
                type += word + " ";
            }
            attributes.emplace_back(trim(attribute), trim(type));
            continue;
        }
        // Remaining '@' directives carry no data.
        if (line[0] != '@') {
            lines.push_back(line);
        }
    }
    input.close();
    if (attributes.empty()) {
        throw std::invalid_argument("No attributes found");
    }
}
// Loads an ARFF file taking the class from the last attribute (classLast == true)
// or from the first one (classLast == false). The chosen attribute is removed
// from `attributes` and the dataset is generated using its column as the label.
void ArffFiles::load(const std::string& fileName, bool classLast)
{
    loadCommon(fileName);
    // loadCommon() throws when there are no attributes, so front()/back() are valid.
    const auto classAttribute = classLast ? attributes.back() : attributes.front();
    className = classAttribute.first;
    classType = classAttribute.second;
    int labelIndex = 0;
    if (classLast) {
        attributes.pop_back();
        // The label column sits right after the remaining attributes.
        labelIndex = static_cast<int>(attributes.size());
    } else {
        attributes.erase(attributes.begin());
    }
    generateDataset(labelIndex);
}
// Loads an ARFF file taking the class from the attribute called `name`.
// The first attribute with that name is removed from `attributes` and its
// position is used as the label column.
// Throws std::invalid_argument if no attribute has that name.
void ArffFiles::load(const std::string& fileName, const std::string& name)
{
    loadCommon(fileName);
    auto match = attributes.begin();
    while (match != attributes.end() && match->first != name) {
        ++match;
    }
    if (match == attributes.end()) {
        throw std::invalid_argument("Class name not found");
    }
    const int labelIndex = static_cast<int>(match - attributes.begin());
    className = match->first;
    classType = match->second;
    attributes.erase(match);
    generateDataset(labelIndex);
}
void ArffFiles::generateDataset(int labelIndex)
{
X = std::vector<std::vector<float>>(attributes.size(), std::vector<float>(lines.size()));
auto yy = std::vector<std::string>(lines.size(), "");
auto removeLines = std::vector<int>(); // Lines with missing values
for (size_t i = 0; i < lines.size(); i++) {
std::stringstream ss(lines[i]);
std::string value;
int pos = 0;
int xIndex = 0;
while (getline(ss, value, ',')) {
if (pos++ == labelIndex) {
yy[i] = value;
} else {
if (value == "?") {
X[xIndex++][i] = -1;
removeLines.push_back(i);
} else
X[xIndex++][i] = stof(value);
}
}
}
for (auto i : removeLines) {
yy.erase(yy.begin() + i);
for (auto& x : X) {
x.erase(x.begin() + i);
}
}
y = factorize(yy);
}
// Returns a copy of `source` without leading and trailing spaces, single
// quotes, newlines, carriage returns and tabs. A string made only of those
// characters yields an empty string.
std::string ArffFiles::trim(const std::string& source)
{
    const char* const strippable = " '\n\r\t";
    const auto first = source.find_first_not_of(strippable);
    if (first == std::string::npos) {
        return std::string();
    }
    const auto last = source.find_last_not_of(strippable);
    return source.substr(first, last - first + 1);
}
// Encodes string labels as consecutive integers, numbered from 0 in order of
// first appearance. Equal labels always map to the same code.
std::vector<int> ArffFiles::factorize(const std::vector<std::string>& labels_t)
{
    std::vector<int> encoded;
    encoded.reserve(labels_t.size());
    std::map<std::string, int> codes;
    for (const std::string& label : labels_t) {
        // insert() is a no-op for known labels; a new label takes the next free
        // code, which equals the number of distinct labels seen so far.
        const auto entry = codes.insert({ label, static_cast<int>(codes.size()) });
        encoded.push_back(entry.first->second);
    }
    return encoded;
}

View File

@@ -1,32 +0,0 @@
#ifndef ARFFFILES_H
#define ARFFFILES_H
#include <string>
#include <vector>
// Reader for ARFF dataset files: keeps the attribute declarations, the raw
// data lines, the numeric feature matrix and the integer-encoded labels.
class ArffFiles {
public:
    ArffFiles();

    // Load a file taking the class from the last (classLast == true) or the
    // first attribute.
    void load(const std::string& fileName, bool classLast = true);
    // Load a file taking the class from the attribute with the given name.
    void load(const std::string& fileName, const std::string& name);

    // Read-only views (returned by copy).
    std::vector<std::string> getLines() const;
    unsigned long int getSize() const;
    std::string getClassName() const;
    std::string getClassType() const;
    std::vector<std::pair<std::string, std::string>> getAttributes() const;

    // Mutable access to the generated dataset.
    std::vector<std::vector<float>>& getX();
    std::vector<int>& getY();

    // Stateless helpers.
    static std::string trim(const std::string& source);
    static std::vector<int> factorize(const std::vector<std::string>& labels_t);

private:
    std::vector<std::string> lines; // raw data lines of the file
    std::vector<std::pair<std::string, std::string>> attributes; // (name, type)
    std::string className;
    std::string classType;
    std::vector<std::vector<float>> X; // features, indexed as X[feature][sample]
    std::vector<int> y; // encoded class labels

    void generateDataset(int labelIndex);
    void loadCommon(std::string fileName);
};
#endif

View File

@@ -1 +0,0 @@
add_library(ArffFiles ArffFiles.cc)

Submodule lib/argparse deleted from 1b3abd9b92

Submodule lib/catch2 deleted from ed6ac8a629

Submodule lib/json deleted from 0457de21cf

Submodule lib/mdlp deleted from 5708dc3de9

BIN
logo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 874 KiB

14
remove_submodules.sh Normal file
View File

@@ -0,0 +1,14 @@
# Remove every git submodule registered in .gitmodules, then .gitmodules itself.
# Run from the repository root.
# `read -r` keeps backslashes in paths intact and `--` stops option parsing, so
# unusual paths are handled verbatim.
git config --file .gitmodules --get-regexp path | awk '{ print $2 }' | while IFS= read -r submodule_path; do
    echo "Removing $submodule_path"
    # Deinit the submodule
    git submodule deinit -f -- "$submodule_path"
    # Remove the submodule from the working tree
    git rm -f -- "$submodule_path"
    # Remove the submodule from .git/modules
    # NOTE(review): assumes the submodule name equals its path — confirm
    rm -rf ".git/modules/$submodule_path"
done
# Remove the .gitmodules file
git rm -f .gitmodules

View File

@@ -1,15 +1,11 @@
include_directories( include_directories(
${TORCH_INCLUDE_DIRS}
${Platform_SOURCE_DIR}/src/common ${Platform_SOURCE_DIR}/src/common
${Platform_SOURCE_DIR}/src/main ${Platform_SOURCE_DIR}/src/main
${Platform_SOURCE_DIR}/lib/PyClassifiers/src
${Python3_INCLUDE_DIRS} ${Python3_INCLUDE_DIRS}
${Platform_SOURCE_DIR}/lib/Files
${Platform_SOURCE_DIR}/lib/argparse/include
${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/src
${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/folding
${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/mdlp
${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/json/include
${CMAKE_BINARY_DIR}/configured_files/include ${CMAKE_BINARY_DIR}/configured_files/include
${PyClassifiers_INCLUDE_DIRS}
${bayesnet_INCLUDE_DIRS}
) )
add_executable(PlatformSample sample.cc ${Platform_SOURCE_DIR}/src/main/Models.cc) add_executable(PlatformSample sample.cpp ${Platform_SOURCE_DIR}/src/main/Models.cpp)
target_link_libraries(PlatformSample PyClassifiers ArffFiles mdlp "${TORCH_LIBRARIES}") target_link_libraries(PlatformSample "${PyClassifiers}" "${BayesNet}" fimdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} ${Boost_LIBRARIES})

View File

@@ -1,240 +0,0 @@
#include <iostream>
#include <torch/torch.h>
#include <string>
#include <map>
#include <argparse/argparse.hpp>
#include <nlohmann/json.hpp>
#include "ArffFiles.h"
#include "BayesMetrics.h"
#include "CPPFImdlp.h"
#include "folding.hpp"
#include "Models.h"
#include "modelRegister.h"
#include <fstream>
#include "config.h"
const std::string PATH = { data_path.begin(), data_path.end() };
/**
 * Discretize every feature of X with a single CPPFImdlp instance.
 *
 * @param X        one vector of samples per feature (X[i] is the i-th feature)
 * @param y        class labels passed to every fit call
 * @param features feature names; features[i] names X[i]
 * @return the discretized features (same order as X) and, per feature name,
 *         the largest discretized value plus one
 */
pair<std::vector<mdlp::labels_t>, map<std::string, int>> discretize(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y, std::vector<std::string> features)
{
    std::vector<mdlp::labels_t> discretized;
    map<std::string, int> cardinality;
    auto discretizer = mdlp::CPPFImdlp();
    for (std::size_t feature_idx = 0; feature_idx < X.size(); ++feature_idx) {
        auto& samples = X[feature_idx];
        discretizer.fit(samples, y);
        // transform() hands back a reference, so the values are copied into
        // `discretized` before the discretizer is fitted again.
        mdlp::labels_t& labels = discretizer.transform(samples);
        const auto highest = *max_element(labels.begin(), labels.end());
        cardinality[features[feature_idx]] = highest + 1;
        discretized.push_back(labels);
    }
    return { discretized, cardinality };
}
/**
 * Tell whether `name` can be opened for reading.
 *
 * @param name path of the file to probe
 * @return true when fopen(name, "r") succeeds, false otherwise
 */
bool file_exists(const std::string& name)
{
    FILE* probe = fopen(name.c_str(), "r");
    if (probe == nullptr) {
        return false;
    }
    // The handle was only needed for the probe itself.
    fclose(probe);
    return true;
}
/**
 * Pick a subset of samples out of a feature-major dataset.
 *
 * The inputs are taken by const reference: the previous by-value signature
 * deep-copied the whole dataset on every call.
 *
 * @param indices sample positions to keep, in output order; every value must
 *                be a valid position in y and in each row of X
 * @param X       dataset indexed as X[feature][sample]
 * @param y       labels indexed as y[sample]
 * @return {Xr, yr} where Xr[feature][k] == X[feature][indices[k]] and
 *         yr[k] == y[indices[k]]
 */
std::pair<std::vector<std::vector<int>>, std::vector<int>> extract_indices(const std::vector<int>& indices, const std::vector<std::vector<int>>& X, const std::vector<int>& y)
{
    std::pair<std::vector<std::vector<int>>, std::vector<int>> selected;
    auto& Xr = selected.first;   // features x selected samples
    auto& yr = selected.second;
    Xr.resize(X.size());
    // The final sizes are known up front, so allocate once per row.
    for (auto& row : Xr) {
        row.reserve(indices.size());
    }
    yr.reserve(indices.size());
    for (const auto index : indices) {
        for (std::size_t col = 0; col < X.size(); ++col) {
            Xr[col].push_back(X[col][index]);
        }
        yr.push_back(y[index]);
    }
    return selected;
}
// Entry point of the PlatformSample program.
// Steps: parse the command line, load <path><dataset>.arff, discretize it, fit the selected
// model on the whole dataset and print its description/score, write a .dot graph file, and
// finally run an nFolds cross validation printing train/test scores per fold and on average.
// Exits with status 1 (after printing the usage) when argument parsing or validation fails.
int main(int argc, char** argv)
{
// Datasets accepted by --dataset. The bool is forwarded to ArffFiles::load as `class_last`
// (presumably: whether the class attribute is the last column -- TODO confirm in ArffFiles).
map<std::string, bool> datasets = {
{"diabetes", true},
{"ecoli", true},
{"glass", true},
{"iris", true},
{"kdd_JapaneseVowels", false},
{"letter", true},
{"liver-disorders", true},
{"mfeat-factors", true},
};
// Names only, used to validate the --dataset value
auto valid_datasets = std::vector<std::string>();
transform(datasets.begin(), datasets.end(), back_inserter(valid_datasets),
[](const pair<std::string, bool>& pair) { return pair.first; });
argparse::ArgumentParser program("PlatformSample");
program.add_argument("-d", "--dataset")
.help("Dataset file name")
.action([valid_datasets](const std::string& value) {
if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) {
return value;
}
throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}");
}
);
program.add_argument("-p", "--path")
.help(" folder where the data files are located, default")
.default_value(std::string{ PATH }
);
// The model name must be one of the names reported by platform::Models
program.add_argument("-m", "--model")
.help("Model to use " + platform::Models::instance()->tostring())
.action([](const std::string& value) {
static const std::vector<std::string> choices = platform::Models::instance()->getNames();
if (find(choices.begin(), choices.end(), value) != choices.end()) {
return value;
}
throw runtime_error("Model must be one of " + platform::Models::instance()->tostring());
}
);
// NOTE(review): --discretize is declared but its value is never read in this function
program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true);
program.add_argument("--dumpcpt").help("Dump CPT Tables").default_value(false).implicit_value(true);
program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true);
program.add_argument("--tensors").help("Use tensors to store samples").default_value(false).implicit_value(true);
// Folds must parse as an integer >= 2; both failure modes are reported as runtime_error
program.add_argument("-f", "--folds").help("Number of folds").default_value(5).scan<'i', int>().action([](const std::string& value) {
try {
auto k = stoi(value);
if (k < 2) {
throw runtime_error("Number of folds must be greater than 1");
}
return k;
}
catch (const runtime_error& err) {
throw runtime_error(err.what());
}
catch (...) {
throw runtime_error("Number of folds must be an integer");
}});
program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>();
bool class_last, stratified, tensors, dump_cpt;
std::string model_name, file_name, path, complete_file_name;
int nFolds, seed;
try {
program.parse_args(argc, argv);
file_name = program.get<std::string>("dataset");
path = program.get<std::string>("path");
model_name = program.get<std::string>("model");
// path and file name are concatenated as-is, so path has to end with a directory separator
complete_file_name = path + file_name + ".arff";
stratified = program.get<bool>("stratified");
tensors = program.get<bool>("tensors");
nFolds = program.get<int>("folds");
seed = program.get<int>("seed");
dump_cpt = program.get<bool>("dumpcpt");
class_last = datasets[file_name];
if (!file_exists(complete_file_name)) {
throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist");
}
}
catch (const exception& err) {
// Any parsing/validation problem: print the message and the usage text, then stop
cerr << err.what() << std::endl;
cerr << program;
exit(1);
}
/*
* Begin Processing
*/
auto handler = ArffFiles();
handler.load(complete_file_name, class_last);
// Get Dataset X, y
std::vector<mdlp::samples_t>& X = handler.getX();
mdlp::labels_t& y = handler.getY();
// Get className & Features
auto className = handler.getClassName();
std::vector<std::string> features;
auto attributes = handler.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features),
[](const pair<std::string, std::string>& item) { return item.first; });
// Discretize Dataset
auto [Xd, maxes] = discretize(X, y, features);
maxes[className] = *max_element(y.begin(), y.end()) + 1;
// states maps each variable name to a vector whose length is its number of discrete values
// (the vector contents are left value-initialized)
map<std::string, std::vector<int>> states;
for (auto feature : features) {
states[feature] = std::vector<int>(maxes[feature]);
}
states[className] = std::vector<int>(maxes[className]);
// Fit once on the complete dataset and report on the resulting model
auto clf = platform::Models::instance()->create(model_name);
clf->fit(Xd, y, features, className, states);
if (dump_cpt) {
std::cout << "--- CPT Tables ---" << std::endl;
clf->dump_cpt();
}
auto lines = clf->show();
for (auto line : lines) {
std::cout << line << std::endl;
}
std::cout << "--- Topological Order ---" << std::endl;
auto order = clf->topological_order();
for (auto name : order) {
std::cout << name << ", ";
}
std::cout << "end." << std::endl;
auto score = clf->score(Xd, y);
std::cout << "Score: " << score << std::endl;
// Save the model graph as <model>_<dataset>.dot in the current directory
auto graph = clf->graph();
auto dot_file = model_name + "_" + file_name;
ofstream file(dot_file + ".dot");
file << graph;
file.close();
std::cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << std::endl;
std::cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << std::endl;
std::string stratified_string = stratified ? " Stratified" : "";
std::cout << nFolds << " Folds" << stratified_string << " Cross validation" << std::endl;
std::cout << "==========================================" << std::endl;
// Tensor copy of the discretized data: one row per feature, one column per sample
torch::Tensor Xt = torch::zeros({ static_cast<int>(Xd.size()), static_cast<int>(Xd[0].size()) }, torch::kInt32);
torch::Tensor yt = torch::tensor(y, torch::kInt32);
for (int i = 0; i < features.size(); ++i) {
Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
}
float total_score = 0, total_score_train = 0, score_train, score_test;
// NOTE(review): fold is allocated with new below and never deleted in this function
folding::Fold* fold;
double nodes = 0.0;
if (stratified)
fold = new folding::StratifiedKFold(nFolds, y, seed);
else
fold = new folding::KFold(nFolds, y.size(), seed);
for (auto i = 0; i < nFolds; ++i) {
auto [train, test] = fold->getFold(i);
std::cout << "Fold: " << i + 1 << std::endl;
if (tensors) {
// Samples are columns of Xt, hence index_select along dimension 1
auto ttrain = torch::tensor(train, torch::kInt64);
auto ttest = torch::tensor(test, torch::kInt64);
torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain);
torch::Tensor ytraint = yt.index({ ttrain });
torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest);
torch::Tensor ytestt = yt.index({ ttest });
clf->fit(Xtraint, ytraint, features, className, states);
// NOTE(review): temp is not used afterwards
auto temp = clf->predict(Xtraint);
score_train = clf->score(Xtraint, ytraint);
score_test = clf->score(Xtestt, ytestt);
} else {
auto [Xtrain, ytrain] = extract_indices(train, Xd, y);
auto [Xtest, ytest] = extract_indices(test, Xd, y);
clf->fit(Xtrain, ytrain, features, className, states);
std::cout << "Nodes: " << clf->getNumberOfNodes() << std::endl;
// NOTE(review): nodes is only accumulated in this branch, so the average printed
// after the loop stays 0 when --tensors is used
nodes += clf->getNumberOfNodes();
score_train = clf->score(Xtrain, ytrain);
score_test = clf->score(Xtest, ytest);
}
if (dump_cpt) {
std::cout << "--- CPT Tables ---" << std::endl;
clf->dump_cpt();
}
total_score_train += score_train;
total_score += score_test;
std::cout << "Score Train: " << score_train << std::endl;
std::cout << "Score Test : " << score_test << std::endl;
std::cout << "-------------------------------------------------------------------------------" << std::endl;
}
// Averages over all folds
std::cout << "Nodes: " << nodes / nFolds << std::endl;
std::cout << "**********************************************************************************" << std::endl;
std::cout << "Average Score Train: " << total_score_train / nFolds << std::endl;
std::cout << "Average Score Test : " << total_score / nFolds << std::endl;return 0;
}

279
sample/sample.cpp Normal file
View File

@@ -0,0 +1,279 @@
#include <iostream>
#include <string>
#include <map>
#include <fstream>
#include <torch/torch.h>
#include <argparse/argparse.hpp>
#include <nlohmann/json.hpp>
#include <ArffFiles.hpp>
#include <fimdlp/CPPFImdlp.h>
#include <folding.hpp>
#include <bayesnet/utils/BayesMetrics.h>
#include <bayesnet/classifiers/SPODE.h>
#include "Models.h"
#include "modelRegister.h"
#include "config_platform.h"
const std::string PATH = { platform_data_path.begin(), platform_data_path.end() };
/**
 * Discretize every feature of X with a single CPPFImdlp instance.
 *
 * @param X        one vector of samples per feature (X[i] is the i-th feature)
 * @param y        class labels passed to every fit call
 * @param features feature names; features[i] names X[i]
 * @return the discretized features (same order as X) and, per feature name,
 *         the largest discretized value plus one
 */
pair<std::vector<mdlp::labels_t>, map<std::string, int>> discretize(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y, std::vector<std::string> features)
{
    std::vector<mdlp::labels_t> discretized;
    map<std::string, int> cardinality;
    auto discretizer = mdlp::CPPFImdlp();
    for (std::size_t feature_idx = 0; feature_idx < X.size(); ++feature_idx) {
        auto& samples = X[feature_idx];
        discretizer.fit(samples, y);
        // transform() hands back a reference, so the values are copied into
        // `discretized` before the discretizer is fitted again.
        mdlp::labels_t& labels = discretizer.transform(samples);
        const auto highest = *max_element(labels.begin(), labels.end());
        cardinality[features[feature_idx]] = highest + 1;
        discretized.push_back(labels);
    }
    return { discretized, cardinality };
}
/**
 * Tell whether `name` can be opened for reading.
 *
 * @param name path of the file to probe
 * @return true when fopen(name, "r") succeeds, false otherwise
 */
bool file_exists(const std::string& name)
{
    FILE* probe = fopen(name.c_str(), "r");
    if (probe == nullptr) {
        return false;
    }
    // The handle was only needed for the probe itself.
    fclose(probe);
    return true;
}
/**
 * Pick a subset of samples out of a feature-major dataset.
 *
 * The inputs are taken by const reference: the previous by-value signature
 * deep-copied the whole dataset on every call.
 *
 * @param indices sample positions to keep, in output order; every value must
 *                be a valid position in y and in each row of X
 * @param X       dataset indexed as X[feature][sample]
 * @param y       labels indexed as y[sample]
 * @return {Xr, yr} where Xr[feature][k] == X[feature][indices[k]] and
 *         yr[k] == y[indices[k]]
 */
std::pair<std::vector<std::vector<int>>, std::vector<int>> extract_indices(const std::vector<int>& indices, const std::vector<std::vector<int>>& X, const std::vector<int>& y)
{
    std::pair<std::vector<std::vector<int>>, std::vector<int>> selected;
    auto& Xr = selected.first;   // features x selected samples
    auto& yr = selected.second;
    Xr.resize(X.size());
    // The final sizes are known up front, so allocate once per row.
    for (auto& row : Xr) {
        row.reserve(indices.size());
    }
    yr.reserve(indices.size());
    for (const auto index : indices) {
        for (std::size_t col = 0; col < X.size(); ++col) {
            Xr[col].push_back(X[col][index]);
        }
        yr.push_back(y[index]);
    }
    return selected;
}
// Entry point of the PlatformSample program.
// Steps: parse the command line, load <path><dataset>.arff, discretize it, fit a
// bayesnet::SPODE(2) classifier on the whole dataset and print its states, CPTs
// (optional), description, topological order and per-instance predict_proba output.
// Exits with status 1 (after printing the usage) when argument parsing or validation fails.
// The cross-validation part of the sample is currently disabled (kept commented out below),
// so --model, --stratified, --tensors, --folds and --seed are parsed but not acted upon.
int main(int argc, char** argv)
{
    // Datasets accepted by --dataset. The bool is forwarded to ArffFiles::load as `class_last`
    // (presumably: whether the class attribute is the last column -- TODO confirm in ArffFiles).
    map<std::string, bool> datasets = {
        {"diabetes", true},
        {"ecoli", true},
        {"glass", true},
        {"iris", true},
        {"kdd_JapaneseVowels", false},
        {"letter", true},
        {"liver-disorders", true},
        {"mfeat-factors", true},
    };
    // Names only, used to validate the --dataset value
    auto valid_datasets = std::vector<std::string>();
    transform(datasets.begin(), datasets.end(), back_inserter(valid_datasets),
        [](const pair<std::string, bool>& pair) { return pair.first; });
    argparse::ArgumentParser program("PlatformSample");
    program.add_argument("-d", "--dataset")
        .help("Dataset file name")
        .action([valid_datasets](const std::string& value) {
        if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) {
            return value;
        }
        throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}");
            }
    );
    program.add_argument("-p", "--path")
        .help(" folder where the data files are located, default")
        .default_value(std::string{ PATH }
    );
    program.add_argument("-m", "--model")
        .help("Model to use " + platform::Models::instance()->toString())
        .action([](const std::string& value) {
        static const std::vector<std::string> choices = platform::Models::instance()->getNames();
        if (find(choices.begin(), choices.end(), value) != choices.end()) {
            return value;
        }
        throw runtime_error("Model must be one of " + platform::Models::instance()->toString());
            }
    );
    program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true);
    program.add_argument("--dumpcpt").help("Dump CPT Tables").default_value(false).implicit_value(true);
    program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true);
    program.add_argument("--tensors").help("Use tensors to store samples").default_value(false).implicit_value(true);
    // Folds must parse as an integer >= 2; both failure modes are reported as runtime_error
    program.add_argument("-f", "--folds").help("Number of folds").default_value(5).scan<'i', int>().action([](const std::string& value) {
        try {
            auto k = stoi(value);
            if (k < 2) {
                throw runtime_error("Number of folds must be greater than 1");
            }
            return k;
        }
        catch (const runtime_error& err) {
            throw runtime_error(err.what());
        }
        catch (...) {
            throw runtime_error("Number of folds must be an integer");
        }});
    program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>();
    bool class_last, stratified, tensors, dump_cpt;
    std::string model_name, file_name, path, complete_file_name;
    int nFolds, seed;
    try {
        program.parse_args(argc, argv);
        file_name = program.get<std::string>("dataset");
        path = program.get<std::string>("path");
        model_name = program.get<std::string>("model");
        // path and file name are concatenated as-is, so path has to end with a directory separator
        complete_file_name = path + file_name + ".arff";
        stratified = program.get<bool>("stratified");
        tensors = program.get<bool>("tensors");
        nFolds = program.get<int>("folds");
        seed = program.get<int>("seed");
        dump_cpt = program.get<bool>("dumpcpt");
        class_last = datasets[file_name];
        if (!file_exists(complete_file_name)) {
            throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist");
        }
    }
    catch (const exception& err) {
        // Any parsing/validation problem: print the message and the usage text, then stop
        cerr << err.what() << std::endl;
        cerr << program;
        exit(1);
    }
    /*
    * Begin Processing
    */
    auto handler = ArffFiles();
    handler.load(complete_file_name, class_last);
    // Get Dataset X, y
    std::vector<mdlp::samples_t>& X = handler.getX();
    mdlp::labels_t& y = handler.getY();
    // Get className & Features
    auto className = handler.getClassName();
    std::vector<std::string> features;
    auto attributes = handler.getAttributes();
    transform(attributes.begin(), attributes.end(), back_inserter(features),
        [](const pair<std::string, std::string>& item) { return item.first; });
    // Discretize Dataset
    auto [Xd, maxes] = discretize(X, y, features);
    maxes[className] = *max_element(y.begin(), y.end()) + 1;
    // states maps each variable name to a vector whose length is its number of discrete values
    map<std::string, std::vector<int>> states;
    for (const auto& feature : features) {
        states[feature] = std::vector<int>(maxes[feature]);
    }
    states[className] = std::vector<int>(maxes[className]);
    // Output the states
    std::cout << std::string(80, '-') << std::endl;
    std::cout << "States" << std::endl;
    for (const auto& feature : features) {
        std::cout << feature << ": " << states[feature].size() << std::endl;
    }
    std::cout << std::string(80, '-') << std::endl;
    // The classifier is hard-coded here; the --model value is validated but not used
    //auto clf = platform::Models::instance()->create("SPODE");
    auto clf = bayesnet::SPODE(2);
    bayesnet::Smoothing_t smoothing = bayesnet::Smoothing_t::ORIGINAL;
    clf.fit(Xd, y, features, className, states, smoothing);
    if (dump_cpt) {
        std::cout << "--- CPT Tables ---" << std::endl;
        std::cout << clf.dump_cpt();
    }
    std::cout << "--- Datos predicción ---" << std::endl;
    std::cout << "Orden de variables: " << std::endl;
    for (const auto& feature : features) {
        std::cout << feature << ", ";
    }
    std::cout << std::endl;
    // First sample of the discretized dataset, one value per feature
    std::cout << "X[0]: ";
    for (std::size_t i = 0; i < Xd.size(); ++i) {
        std::cout << Xd[i][0] << ", ";
    }
    std::cout << std::endl;
    std::cout << std::string(80, '-') << std::endl;
    auto lines = clf.show();
    for (const auto& line : lines) {
        std::cout << line << std::endl;
    }
    std::cout << "--- Topological Order ---" << std::endl;
    auto order = clf.topological_order();
    for (const auto& name : order) {
        std::cout << name << ", ";
    }
    // Terminate the topological-order line so the next header starts on its own line
    std::cout << std::endl;
    auto predict_proba = clf.predict_proba(Xd);
    std::cout << "Instances predict_proba: ";
    for (std::size_t i = 0; i < predict_proba.size(); i++) {
        std::cout << "Instance " << i << ": ";
        // Show at most the first four feature values of the instance; the bound is
        // clamped to the real number of features so small datasets are not read
        // out of range.
        for (std::size_t j = 0; j < Xd.size() && j < 4; ++j) {
            std::cout << Xd[j][i] << ", ";
        }
        std::cout << ": ";
        for (auto score : predict_proba[i]) {
            std::cout << score << ", ";
        }
        std::cout << std::endl;
    }
    // Disabled: whole-dataset score, .dot export and k-fold cross validation
    // std::cout << std::endl;
    // std::cout << "end." << std::endl;
    // auto score = clf->score(Xd, y);
    // std::cout << "Score: " << score << std::endl;
    // auto graph = clf->graph();
    // auto dot_file = model_name + "_" + file_name;
    // ofstream file(dot_file + ".dot");
    // file << graph;
    // file.close();
    // std::cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << std::endl;
    // std::cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << std::endl;
    // std::string stratified_string = stratified ? " Stratified" : "";
    // std::cout << nFolds << " Folds" << stratified_string << " Cross validation" << std::endl;
    // std::cout << "==========================================" << std::endl;
    // torch::Tensor Xt = torch::zeros({ static_cast<int>(Xd.size()), static_cast<int>(Xd[0].size()) }, torch::kInt32);
    // torch::Tensor yt = torch::tensor(y, torch::kInt32);
    // for (int i = 0; i < features.size(); ++i) {
    // Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
    // }
    // float total_score = 0, total_score_train = 0, score_train, score_test;
    // folding::Fold* fold;
    // double nodes = 0.0;
    // if (stratified)
    // fold = new folding::StratifiedKFold(nFolds, y, seed);
    // else
    // fold = new folding::KFold(nFolds, y.size(), seed);
    // for (auto i = 0; i < nFolds; ++i) {
    // auto [train, test] = fold->getFold(i);
    // std::cout << "Fold: " << i + 1 << std::endl;
    // if (tensors) {
    // auto ttrain = torch::tensor(train, torch::kInt64);
    // auto ttest = torch::tensor(test, torch::kInt64);
    // torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain);
    // torch::Tensor ytraint = yt.index({ ttrain });
    // torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest);
    // torch::Tensor ytestt = yt.index({ ttest });
    // clf->fit(Xtraint, ytraint, features, className, states, smoothing);
    // auto temp = clf->predict(Xtraint);
    // score_train = clf->score(Xtraint, ytraint);
    // score_test = clf->score(Xtestt, ytestt);
    // } else {
    // auto [Xtrain, ytrain] = extract_indices(train, Xd, y);
    // auto [Xtest, ytest] = extract_indices(test, Xd, y);
    // clf->fit(Xtrain, ytrain, features, className, states, smoothing);
    // std::cout << "Nodes: " << clf->getNumberOfNodes() << std::endl;
    // nodes += clf->getNumberOfNodes();
    // score_train = clf->score(Xtrain, ytrain);
    // score_test = clf->score(Xtest, ytest);
    // }
    // // if (dump_cpt) {
    // // std::cout << "--- CPT Tables ---" << std::endl;
    // // std::cout << clf->dump_cpt();
    // // }
    // total_score_train += score_train;
    // total_score += score_test;
    // std::cout << "Score Train: " << score_train << std::endl;
    // std::cout << "Score Test : " << score_test << std::endl;
    // std::cout << "-------------------------------------------------------------------------------" << std::endl;
    // }
    // std::cout << "Nodes: " << nodes / nFolds << std::endl;
    // std::cout << "**********************************************************************************" << std::endl;
    // std::cout << "Average Score Train: " << total_score_train / nFolds << std::endl;
    // std::cout << "Average Score Test : " << total_score / nFolds << std::endl;return 0;
}

View File

@@ -1,53 +1,80 @@
include_directories( include_directories(
## Libs
${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/src
${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/folding
${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/mdlp
${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/json/include
${Platform_SOURCE_DIR}/lib/PyClassifiers/src
${Platform_SOURCE_DIR}/lib/Files
${Platform_SOURCE_DIR}/lib/mdlp
${Platform_SOURCE_DIR}/lib/argparse/include
${Platform_SOURCE_DIR}/lib/json/include
${Platform_SOURCE_DIR}/lib/libxlsxwriter/include
${Python3_INCLUDE_DIRS} ${Python3_INCLUDE_DIRS}
${MPI_CXX_INCLUDE_DIRS} ${MPI_CXX_INCLUDE_DIRS}
${CMAKE_BINARY_DIR}/configured_files/include ${CMAKE_BINARY_DIR}/configured_files/include
## Platform ${Platform_SOURCE_DIR}/src
${Platform_SOURCE_DIR}/src/common
${Platform_SOURCE_DIR}/src/best
${Platform_SOURCE_DIR}/src/grid
${Platform_SOURCE_DIR}/src/main
${Platform_SOURCE_DIR}/src/manage
${Platform_SOURCE_DIR}/src/reports
) )
# b_best # b_best
set(best_sources b_best.cc BestResults.cc Statistics.cc BestResultsExcel.cc) add_executable(
list(TRANSFORM best_sources PREPEND best/) b_best commands/b_best.cpp best/Statistics.cpp
add_executable(b_best ${best_sources} main/Result.cc reports/ReportExcel.cc reports/ReportBase.cc reports/ExcelFile.cc common/Datasets.cc common/Dataset.cc) best/BestResultsExcel.cpp best/BestResultsTex.cpp best/BestResultsMd.cpp best/BestResults.cpp
target_link_libraries(b_best Boost::boost "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" ArffFiles mdlp) common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
main/Models.cpp main/Scores.cpp
reports/ReportExcel.cpp reports/ReportBase.cpp reports/ExcelFile.cpp
results/Result.cpp
experimental_clfs/XA1DE.cpp
experimental_clfs/ExpClf.cpp
experimental_clfs/DecisionTree.cpp
experimental_clfs/AdaBoost.cpp
)
target_link_libraries(b_best Boost::boost pyclassifiers::pyclassifiers bayesnet::bayesnet argparse::argparse fimdlp::fimdlp ${Python3_LIBRARIES} torch::torch Boost::python Boost::numpy libxlsxwriter::libxlsxwriter)
# b_grid # b_grid
set(grid_sources b_grid.cc GridSearch.cc GridData.cc) set(grid_sources GridSearch.cpp GridData.cpp GridExperiment.cpp GridBase.cpp )
list(TRANSFORM grid_sources PREPEND grid/) list(TRANSFORM grid_sources PREPEND grid/)
add_executable(b_grid ${grid_sources} main/HyperParameters.cc main/Models.cc common/Datasets.cc common/Dataset.cc) add_executable(b_grid commands/b_grid.cpp ${grid_sources}
target_link_libraries(b_grid PyClassifiers ${MPI_CXX_LIBRARIES} ArffFiles) common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
main/HyperParameters.cpp main/Models.cpp main/Experiment.cpp main/Scores.cpp main/ArgumentsExperiment.cpp
reports/ReportConsole.cpp reports/ReportBase.cpp
results/Result.cpp
experimental_clfs/XA1DE.cpp
experimental_clfs/ExpClf.cpp
experimental_clfs/DecisionTree.cpp
experimental_clfs/AdaBoost.cpp
)
target_link_libraries(b_grid ${MPI_CXX_LIBRARIES} pyclassifiers::pyclassifiers bayesnet::bayesnet argparse::argparse fimdlp::fimdlp ${Python3_LIBRARIES} torch::torch Boost::python Boost::numpy)
# b_list # b_list
set(list_sources b_list.cc DatasetsExcel.cc) add_executable(b_list commands/b_list.cpp
list(TRANSFORM list_sources PREPEND list/) common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
add_executable(b_list ${list_sources} common/Datasets.cc common/Dataset.cc reports/ReportExcel.cc reports/ExcelFile.cc reports/ReportBase.cc) main/Models.cpp main/Scores.cpp
target_link_libraries(b_list "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" ArffFiles mdlp) reports/ReportExcel.cpp reports/ExcelFile.cpp reports/ReportBase.cpp reports/DatasetsExcel.cpp reports/DatasetsConsole.cpp reports/ReportsPaged.cpp
results/Result.cpp results/ResultsDatasetExcel.cpp results/ResultsDataset.cpp results/ResultsDatasetConsole.cpp
experimental_clfs/XA1DE.cpp
experimental_clfs/ExpClf.cpp
experimental_clfs/DecisionTree.cpp
experimental_clfs/AdaBoost.cpp
)
target_link_libraries(b_list pyclassifiers::pyclassifiers bayesnet::bayesnet argparse::argparse fimdlp::fimdlp ${Python3_LIBRARIES} torch::torch Boost::python Boost::numpy libxlsxwriter::libxlsxwriter)
# b_main # b_main
set(main_sources b_main.cc Experiment.cc Models.cc HyperParameters.cc) set(main_sources Experiment.cpp Models.cpp HyperParameters.cpp Scores.cpp ArgumentsExperiment.cpp)
list(TRANSFORM main_sources PREPEND main/) list(TRANSFORM main_sources PREPEND main/)
add_executable(b_main ${main_sources} common/Datasets.cc common/Dataset.cc reports/ReportConsole.cc reports/ReportBase.cc main/Result.cc) add_executable(b_main commands/b_main.cpp ${main_sources}
target_link_libraries(b_main PyClassifiers BayesNet ArffFiles mdlp) common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
reports/ReportConsole.cpp reports/ReportBase.cpp
results/Result.cpp
experimental_clfs/XA1DE.cpp
experimental_clfs/ExpClf.cpp
experimental_clfs/ExpClf.cpp
experimental_clfs/DecisionTree.cpp
experimental_clfs/AdaBoost.cpp
)
target_link_libraries(b_main PRIVATE nlohmann_json::nlohmann_json pyclassifiers::pyclassifiers bayesnet::bayesnet argparse::argparse fimdlp::fimdlp ${Python3_LIBRARIES} torch::torch Boost::python Boost::numpy)
# b_manage # b_manage
set(manage_sources b_manage.cc ManageResults.cc CommandParser.cc Results.cc) set(manage_sources ManageScreen.cpp OptionsMenu.cpp ResultsManager.cpp)
list(TRANSFORM manage_sources PREPEND manage/) list(TRANSFORM manage_sources PREPEND manage/)
add_executable(b_manage ${manage_sources} main/Result.cc reports/ReportConsole.cc reports/ReportExcel.cc reports/ReportBase.cc reports/ExcelFile.cc common/Datasets.cc common/Dataset.cc) add_executable(
target_link_libraries(b_manage "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" ArffFiles mdlp) b_manage commands/b_manage.cpp ${manage_sources}
common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
reports/ReportConsole.cpp reports/ReportExcel.cpp reports/ReportExcelCompared.cpp reports/ReportBase.cpp reports/ExcelFile.cpp reports/DatasetsConsole.cpp reports/ReportsPaged.cpp
results/Result.cpp results/ResultsDataset.cpp results/ResultsDatasetConsole.cpp
main/Scores.cpp
)
target_link_libraries(b_manage torch::torch libxlsxwriter::libxlsxwriter fimdlp::fimdlp bayesnet::bayesnet argparse::argparse)
# b_results
add_executable(b_results commands/b_results.cpp)
target_link_libraries(b_results torch::torch libxlsxwriter::libxlsxwriter fimdlp::fimdlp bayesnet::bayesnet argparse::argparse)

View File

@@ -4,12 +4,17 @@
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include <algorithm> #include <algorithm>
#include "BestResults.h" #include <cctype>
#include "Result.h" #include "common/Colors.h"
#include "Colors.h" #include "common/CLocale.h"
#include "Statistics.h" #include "common/Paths.h"
#include "common/Utils.h" // compute_std
#include "results/Result.h"
#include "BestResultsExcel.h" #include "BestResultsExcel.h"
#include "CLocale.h" #include "BestResultsTex.h"
#include "BestResultsMd.h"
#include "best/Statistics.h"
#include "BestResults.h"
namespace fs = std::filesystem; namespace fs = std::filesystem;
@@ -42,26 +47,29 @@ namespace platform {
for (auto const& item : data.at("results")) { for (auto const& item : data.at("results")) {
bool update = true; bool update = true;
auto datasetName = item.at("dataset").get<std::string>(); auto datasetName = item.at("dataset").get<std::string>();
if (dataset != "any" && dataset != datasetName) {
continue;
}
if (bests.contains(datasetName)) { if (bests.contains(datasetName)) {
if (item.at("score").get<double>() < bests[datasetName].at(0).get<double>()) { if (item.at("score").get<double>() < bests[datasetName].at(0).get<double>()) {
update = false; update = false;
} }
} }
if (update) { if (update) {
bests[datasetName] = { item.at("score").get<double>(), item.at("hyperparameters"), file }; bests[datasetName] = { item.at("score").get<double>(), item.at("hyperparameters"), file, item.at("score_std").get<double>() };
} }
} }
} }
std::string bestFileName = path + bestResultFile(); if (bests.empty()) {
std::cerr << Colors::MAGENTA() << "No results found for model " << model << " and score " << score << Colors::RESET() << std::endl;
exit(1);
}
std::string bestFileName = path + Paths::bestResultsFile(score, model);
std::ofstream file(bestFileName); std::ofstream file(bestFileName);
file << bests; file << bests;
file.close(); file.close();
return bestFileName; return bestFileName;
} }
std::string BestResults::bestResultFile()
{
return "best_results_" + score + "_" + model + ".json";
}
std::pair<std::string, std::string> getModelScore(std::string name) std::pair<std::string, std::string> getModelScore(std::string name)
{ {
// results_accuracy_BoostAODE_MacBookpro16_2023-09-06_12:27:00_1.json // results_accuracy_BoostAODE_MacBookpro16_2023-09-06_12:27:00_1.json
@@ -116,15 +124,24 @@ namespace platform {
} }
result = std::vector<std::string>(models.begin(), models.end()); result = std::vector<std::string>(models.begin(), models.end());
maxModelName = (*max_element(result.begin(), result.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size(); maxModelName = (*max_element(result.begin(), result.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
maxModelName = std::max(12, maxModelName); maxModelName = std::max(minLength, maxModelName);
return result; return result;
} }
std::string toLower(std::string data)
{
std::transform(data.begin(), data.end(), data.begin(),
[](unsigned char c) { return std::tolower(c); });
return data;
}
std::vector<std::string> BestResults::getDatasets(json table) std::vector<std::string> BestResults::getDatasets(json table)
{ {
std::vector<std::string> datasets; std::vector<std::string> datasets;
for (const auto& dataset : table.items()) { for (const auto& dataset_ : table.items()) {
datasets.push_back(dataset.key()); datasets.push_back(dataset_.key());
} }
std::stable_sort(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) {
return toLower(a) < toLower(b);
});
maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size(); maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
maxDatasetName = std::max(7, maxDatasetName); maxDatasetName = std::max(7, maxDatasetName);
return datasets; return datasets;
@@ -143,7 +160,7 @@ namespace platform {
} }
void BestResults::listFile() void BestResults::listFile()
{ {
std::string bestFileName = path + bestResultFile(); std::string bestFileName = path + Paths::bestResultsFile(score, model);
if (FILE* fileTest = fopen(bestFileName.c_str(), "r")) { if (FILE* fileTest = fopen(bestFileName.c_str(), "r")) {
fclose(fileTest); fclose(fileTest);
} else { } else {
@@ -167,10 +184,9 @@ namespace platform {
std::cout << Colors::GREEN() << " # " << std::setw(maxDatasetName + 1) << std::left << "Dataset" << "Score " << std::setw(maxFileName) << "File" << " Hyperparameters" << std::endl; std::cout << Colors::GREEN() << " # " << std::setw(maxDatasetName + 1) << std::left << "Dataset" << "Score " << std::setw(maxFileName) << "File" << " Hyperparameters" << std::endl;
std::cout << "=== " << std::string(maxDatasetName, '=') << " =========== " << std::string(maxFileName, '=') << " " << std::string(maxHyper, '=') << std::endl; std::cout << "=== " << std::string(maxDatasetName, '=') << " =========== " << std::string(maxFileName, '=') << " " << std::string(maxHyper, '=') << std::endl;
auto i = 0; auto i = 0;
bool odd = true;
double total = 0; double total = 0;
for (auto const& item : data.items()) { for (auto const& item : data.items()) {
auto color = odd ? Colors::BLUE() : Colors::CYAN(); auto color = (i % 2) ? Colors::BLUE() : Colors::CYAN();
double value = item.value().at(0).get<double>(); double value = item.value().at(0).get<double>();
std::cout << color << std::setw(3) << std::fixed << std::right << i++ << " "; std::cout << color << std::setw(3) << std::fixed << std::right << i++ << " ";
std::cout << std::setw(maxDatasetName) << std::left << item.key() << " "; std::cout << std::setw(maxDatasetName) << std::left << item.key() << " ";
@@ -179,7 +195,6 @@ namespace platform {
std::cout << item.value().at(1) << " "; std::cout << item.value().at(1) << " ";
std::cout << std::endl; std::cout << std::endl;
total += value; total += value;
odd = !odd;
} }
std::cout << Colors::GREEN() << "=== " << std::string(maxDatasetName, '=') << " ===========" << std::endl; std::cout << Colors::GREEN() << "=== " << std::string(maxDatasetName, '=') << " ===========" << std::endl;
std::cout << Colors::GREEN() << " Total" << std::string(maxDatasetName - 5, '.') << " " << std::setw(11) << std::setprecision(8) << std::fixed << total << std::endl; std::cout << Colors::GREEN() << " Total" << std::string(maxDatasetName - 5, '.') << " " << std::setw(11) << std::setprecision(8) << std::fixed << total << std::endl;
@@ -191,7 +206,7 @@ namespace platform {
auto maxDate = std::filesystem::file_time_type::max(); auto maxDate = std::filesystem::file_time_type::max();
for (const auto& model : models) { for (const auto& model : models) {
this->model = model; this->model = model;
std::string bestFileName = path + bestResultFile(); std::string bestFileName = path + Paths::bestResultsFile(score, model);
if (FILE* fileTest = fopen(bestFileName.c_str(), "r")) { if (FILE* fileTest = fopen(bestFileName.c_str(), "r")) {
fclose(fileTest); fclose(fileTest);
} else { } else {
@@ -208,13 +223,20 @@ namespace platform {
table["dateTable"] = ftime_to_string(maxDate); table["dateTable"] = ftime_to_string(maxDate);
return table; return table;
} }
void BestResults::printTableResults(std::vector<std::string> models, json table)
void BestResults::printTableResults(std::vector<std::string> models, json table, bool tex, bool index)
{ {
std::stringstream oss; std::stringstream oss;
oss << Colors::GREEN() << "Best results for " << score << " as of " << table.at("dateTable").get<std::string>() << std::endl; oss << Colors::GREEN() << "Best results for " << score << " as of " << table.at("dateTable").get<std::string>() << std::endl;
std::cout << oss.str(); std::cout << oss.str();
std::cout << std::string(oss.str().size() - 8, '-') << std::endl; std::cout << std::string(oss.str().size() - 8, '-') << std::endl;
std::cout << Colors::GREEN() << " # " << std::setw(maxDatasetName + 1) << std::left << std::string("Dataset"); std::cout << Colors::GREEN() << " # " << std::setw(maxDatasetName + 1) << std::left << std::string("Dataset");
auto bestResultsTex = BestResultsTex(score);
auto bestResultsMd = BestResultsMd();
if (tex) {
bestResultsTex.results_header(models, table.at("dateTable").get<std::string>(), index);
bestResultsMd.results_header(models, table.at("dateTable").get<std::string>());
}
for (const auto& model : models) { for (const auto& model : models) {
std::cout << std::setw(maxModelName) << std::left << model << " "; std::cout << std::setw(maxModelName) << std::left << model << " ";
} }
@@ -225,23 +247,23 @@ namespace platform {
} }
std::cout << std::endl; std::cout << std::endl;
auto i = 0; auto i = 0;
bool odd = true; std::map<std::string, std::vector<double>> totals;
std::map<std::string, double> totals;
int nDatasets = table.begin().value().size(); int nDatasets = table.begin().value().size();
for (const auto& model : models) {
totals[model] = 0.0;
}
auto datasets = getDatasets(table.begin().value()); auto datasets = getDatasets(table.begin().value());
for (auto const& dataset : datasets) { if (tex) {
auto color = odd ? Colors::BLUE() : Colors::CYAN(); bestResultsTex.results_body(datasets, table, index);
bestResultsMd.results_body(datasets, table);
}
for (auto const& dataset_ : datasets) {
auto color = (i % 2) ? Colors::BLUE() : Colors::CYAN();
std::cout << color << std::setw(3) << std::fixed << std::right << i++ << " "; std::cout << color << std::setw(3) << std::fixed << std::right << i++ << " ";
std::cout << std::setw(maxDatasetName) << std::left << dataset << " "; std::cout << std::setw(maxDatasetName) << std::left << dataset_ << " ";
double maxValue = 0; double maxValue = 0;
// Find out the max value for this dataset // Find out the max value for this dataset
for (const auto& model : models) { for (const auto& model : models) {
double value; double value;
try { try {
value = table[model].at(dataset).at(0).get<double>(); value = table[model].at(dataset_).at(0).get<double>();
} }
catch (nlohmann::json_abi_v3_11_3::detail::out_of_range err) { catch (nlohmann::json_abi_v3_11_3::detail::out_of_range err) {
value = -1.0; value = -1.0;
@@ -253,12 +275,14 @@ namespace platform {
// Print the row with red colors on max values // Print the row with red colors on max values
for (const auto& model : models) { for (const auto& model : models) {
std::string efectiveColor = color; std::string efectiveColor = color;
double value; double value, std;
try { try {
value = table[model].at(dataset).at(0).get<double>(); value = table[model].at(dataset_).at(0).get<double>();
std = table[model].at(dataset_).at(3).get<double>();
} }
catch (nlohmann::json_abi_v3_11_3::detail::out_of_range err) { catch (nlohmann::json_abi_v3_11_3::detail::out_of_range err) {
value = -1.0; value = -1.0;
std = -1.0;
} }
if (value == maxValue) { if (value == maxValue) {
efectiveColor = Colors::RED(); efectiveColor = Colors::RED();
@@ -266,31 +290,38 @@ namespace platform {
if (value == -1) { if (value == -1) {
std::cout << Colors::YELLOW() << std::setw(maxModelName) << std::right << "N/A" << " "; std::cout << Colors::YELLOW() << std::setw(maxModelName) << std::right << "N/A" << " ";
} else { } else {
totals[model] += value; totals[model].push_back(value);
std::cout << efectiveColor << std::setw(maxModelName) << std::setprecision(maxModelName - 2) << std::fixed << value << " "; std::cout << efectiveColor << std::setw(maxModelName - 6) << std::setprecision(maxModelName - 8) << std::fixed << value;
std::cout << efectiveColor << "±" << std::setw(5) << std::setprecision(3) << std::fixed << std << " ";
} }
} }
std::cout << std::endl; std::cout << std::endl;
odd = !odd;
} }
std::cout << Colors::GREEN() << "=== " << std::string(maxDatasetName, '=') << " "; std::cout << Colors::GREEN() << "=== " << std::string(maxDatasetName, '=') << " ";
for (const auto& model : models) { for (const auto& model : models) {
std::cout << std::string(maxModelName, '=') << " "; std::cout << std::string(maxModelName, '=') << " ";
} }
std::cout << std::endl; std::cout << std::endl;
std::cout << Colors::GREEN() << " Totals" << std::string(maxDatasetName - 6, '.') << " "; std::cout << Colors::GREEN() << " Average" << std::string(maxDatasetName - 7, '.') << " ";
double max_value = 0.0; double max_value = 0.0;
std::string best_model = "";
for (const auto& total : totals) { for (const auto& total : totals) {
if (total.second > max_value) { auto actual = std::reduce(total.second.begin(), total.second.end());
max_value = total.second; if (actual > max_value) {
max_value = actual;
best_model = total.first;
} }
} }
if (tex) {
bestResultsTex.results_footer(totals, best_model);
bestResultsMd.results_footer(totals, best_model);
}
for (const auto& model : models) { for (const auto& model : models) {
std::string efectiveColor = Colors::GREEN(); std::string efectiveColor = model == best_model ? Colors::RED() : Colors::GREEN();
if (totals[model] == max_value) { double value = std::reduce(totals[model].begin(), totals[model].end()) / nDatasets;
efectiveColor = Colors::RED(); double std = compute_std(totals[model], value);
} std::cout << efectiveColor << std::right << std::setw(maxModelName - 6) << std::setprecision(maxModelName - 8) << std::fixed << value;
std::cout << efectiveColor << std::right << std::setw(maxModelName) << std::setprecision(maxModelName - 4) << std::fixed << totals[model] << " "; std::cout << efectiveColor << "±" << std::setw(5) << std::setprecision(3) << std::fixed << std << " ";
} }
std::cout << std::endl; std::cout << std::endl;
} }
@@ -302,54 +333,53 @@ namespace platform {
// Build the table of results // Build the table of results
json table = buildTableResults(models); json table = buildTableResults(models);
std::vector<std::string> datasets = getDatasets(table.begin().value()); std::vector<std::string> datasets = getDatasets(table.begin().value());
BestResultsExcel excel_report(score, datasets); BestResultsExcel excel_report(path, score, datasets);
excel_report.reportSingle(model, path + bestResultFile()); excel_report.reportSingle(model, path + Paths::bestResultsFile(score, model));
messageExcelFile(excel_report.getFileName()); messageOutputFile("Excel", excel_report.getFileName());
excelFileName = excel_report.getFileName();
} }
} }
void BestResults::reportAll(bool excel) void BestResults::reportAll(bool excel, bool tex, bool index)
{ {
auto models = getModels(); auto models = getModels();
// Build the table of results // Build the table of results
json table = buildTableResults(models); json table = buildTableResults(models);
std::vector<std::string> datasets = getDatasets(table.begin().value()); std::vector<std::string> datasets = getDatasets(table.begin().value());
// Print the table of results // Print the table of results
printTableResults(models, table); printTableResults(models, table, tex, index);
// Compute the Friedman test // Compute the Friedman test
std::map<std::string, std::map<std::string, float>> ranksModels; std::map<std::string, std::map<std::string, float>> ranksModels;
if (friedman) { if (friedman) {
Statistics stats(models, datasets, table, significance); Statistics stats(score, models, datasets, table, significance);
auto result = stats.friedmanTest(); auto result = stats.friedmanTest();
stats.postHocHolmTest(result); stats.postHocTest();
stats.postHocTestReport(result, tex);
ranksModels = stats.getRanks(); ranksModels = stats.getRanks();
} }
if (tex) {
messageOutputFile("TeX", Paths::tex() + Paths::tex_output());
messageOutputFile("MarkDown", Paths::tex() + Paths::md_output());
if (friedman) {
messageOutputFile("TeX", Paths::tex() + Paths::tex_post_hoc());
messageOutputFile("MarkDown", Paths::tex() + Paths::md_post_hoc());
}
}
if (excel) { if (excel) {
BestResultsExcel excel(score, datasets); BestResultsExcel excel(path, score, datasets);
excel.reportAll(models, table, ranksModels, friedman, significance); excel.reportAll(models, table, ranksModels, friedman, significance);
if (friedman) { if (friedman) {
int idx = -1; Statistics stats(score, models, datasets, table, significance);
double min = 2000; int idx = stats.getControlIdx();
// Find out the control model
auto totals = std::vector<double>(models.size(), 0.0);
for (const auto& dataset : datasets) {
for (int i = 0; i < models.size(); ++i) {
totals[i] += ranksModels[dataset][models[i]];
}
}
for (int i = 0; i < models.size(); ++i) {
if (totals[i] < min) {
min = totals[i];
idx = i;
}
}
model = models.at(idx); model = models.at(idx);
excel.reportSingle(model, path + bestResultFile()); excel.reportSingle(model, path + Paths::bestResultsFile(score, model));
} }
messageExcelFile(excel.getFileName()); messageOutputFile("Excel", excel.getFileName());
excelFileName = excel.getFileName();
} }
} }
void BestResults::messageExcelFile(const std::string& fileName) void BestResults::messageOutputFile(const std::string& title, const std::string& fileName)
{ {
std::cout << Colors::YELLOW() << "** Excel file generated: " << fileName << Colors::RESET() << std::endl; std::cout << Colors::YELLOW() << "** " << std::setw(8) << std::left << title
<< " file generated: " << fileName << Colors::RESET() << std::endl;
} }
} }

View File

@@ -2,35 +2,39 @@
#define BESTRESULTS_H #define BESTRESULTS_H
#include <string> #include <string>
#include <nlohmann/json.hpp> #include <nlohmann/json.hpp>
using json = nlohmann::json;
namespace platform { namespace platform {
using json = nlohmann::ordered_json;
class BestResults { class BestResults {
public: public:
explicit BestResults(const std::string& path, const std::string& score, const std::string& model, bool friedman, double significance = 0.05) explicit BestResults(const std::string& path, const std::string& score, const std::string& model, const std::string& dataset, bool friedman, double significance = 0.05)
: path(path), score(score), model(model), friedman(friedman), significance(significance) : path(path), score(score), model(model), dataset(dataset), friedman(friedman), significance(significance)
{ {
} }
std::string build(); std::string build();
void reportSingle(bool excel); void reportSingle(bool excel);
void reportAll(bool excel); void reportAll(bool excel, bool tex, bool index);
void buildAll(); void buildAll();
std::string getExcelFileName() const { return excelFileName; }
private: private:
std::vector<std::string> getModels(); std::vector<std::string> getModels();
std::vector<std::string> getDatasets(json table); std::vector<std::string> getDatasets(json table);
std::vector<std::string> loadResultFiles(); std::vector<std::string> loadResultFiles();
void messageExcelFile(const std::string& fileName); void messageOutputFile(const std::string& title, const std::string& fileName);
json buildTableResults(std::vector<std::string> models); json buildTableResults(std::vector<std::string> models);
void printTableResults(std::vector<std::string> models, json table); void printTableResults(std::vector<std::string> models, json table, bool tex, bool index);
std::string bestResultFile();
json loadFile(const std::string& fileName); json loadFile(const std::string& fileName);
void listFile(); void listFile();
std::string path; std::string path;
std::string score; std::string score;
std::string model; std::string model;
std::string dataset;
bool friedman; bool friedman;
double significance; double significance;
int maxModelName = 0; int maxModelName = 0;
int maxDatasetName = 0; int maxDatasetName = 0;
int minLength = 13; // Minimum length for scores
std::string excelFileName;
}; };
} }
#endif //BESTRESULTS_H #endif

View File

@@ -1,10 +1,10 @@
#include <sstream> #include <sstream>
#include "BestResultsExcel.h"
#include "Paths.h"
#include <map> #include <map>
#include <nlohmann/json.hpp> #include <nlohmann/json.hpp>
#include "Statistics.h" #include "common/Paths.h"
#include "ReportExcel.h" #include "reports/ReportExcel.h"
#include "best/Statistics.h"
#include "BestResultsExcel.h"
namespace platform { namespace platform {
json loadResultData(const std::string& fileName) json loadResultData(const std::string& fileName)
@@ -30,9 +30,9 @@ namespace platform {
} }
return columnName; return columnName;
} }
BestResultsExcel::BestResultsExcel(const std::string& score, const std::vector<std::string>& datasets) : score(score), datasets(datasets) BestResultsExcel::BestResultsExcel(const std::string& path, const std::string& score, const std::vector<std::string>& datasets) : path(path), score(score), datasets(datasets)
{ {
file_name = "BestResults.xlsx"; file_name = Paths::bestResultsExcel(score);
workbook = workbook_new(getFileName().c_str()); workbook = workbook_new(getFileName().c_str());
setProperties("Best Results"); setProperties("Best Results");
int maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size(); int maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
@@ -64,19 +64,21 @@ namespace platform {
json data = loadResultData(fileName); json data = loadResultData(fileName);
std::string title = "Best results for " + model; std::string title = "Best results for " + model;
worksheet_merge_range(worksheet, 0, 0, 0, 4, title.c_str(), styles["headerFirst"]); worksheet_merge_range(worksheet, 0, 0, 0, 5, title.c_str(), styles["headerFirst"]);
// Body header // Body header
row = 3; row = 3;
int col = 1; int col = 1;
writeString(row, 0, "", "bodyHeader"); writeString(row, 0, "#", "bodyHeader");
writeString(row, 1, "Dataset", "bodyHeader"); writeString(row, 1, "Dataset", "bodyHeader");
writeString(row, 2, "Score", "bodyHeader"); writeString(row, 2, "Score", "bodyHeader");
writeString(row, 3, "File", "bodyHeader"); writeString(row, 3, "File", "bodyHeader");
writeString(row, 4, "Hyperparameters", "bodyHeader"); writeString(row, 4, "Hyperparameters", "bodyHeader");
writeString(row, 5, "F", "bodyHeader");
auto i = 0; auto i = 0;
std::string hyperparameters; std::string hyperparameters;
int hypSize = 22; int hypSize = 22;
std::map<std::string, std::string> files; // map of files imported and their tabs std::map<std::string, std::string> files; // map of files imported and their tabs
int numLines = data.size();
for (auto const& item : data.items()) { for (auto const& item : data.items()) {
row++; row++;
writeInt(row, 0, i++, "ints"); writeInt(row, 0, i++, "ints");
@@ -90,7 +92,7 @@ namespace platform {
catch (const std::out_of_range& oor) { catch (const std::out_of_range& oor) {
auto tabName = "table_" + std::to_string(i); auto tabName = "table_" + std::to_string(i);
auto worksheetNew = workbook_add_worksheet(workbook, tabName.c_str()); auto worksheetNew = workbook_add_worksheet(workbook, tabName.c_str());
json data = loadResultData(Paths::results() + fileName); json data = loadResultData(path + fileName);
auto report = ReportExcel(data, false, workbook, worksheetNew); auto report = ReportExcel(data, false, workbook, worksheetNew);
report.show(); report.show();
hyperlink = "#table_" + std::to_string(i); hyperlink = "#table_" + std::to_string(i);
@@ -104,6 +106,8 @@ namespace platform {
hypSize = hyperparameters.size(); hypSize = hyperparameters.size();
} }
writeString(row, 4, hyperparameters, "text"); writeString(row, 4, hyperparameters, "text");
std::string countHyperparameters = "=COUNTIF(e5:e" + std::to_string(numLines + 4) + ", e" + std::to_string(row + 1) + ")";
worksheet_write_formula(worksheet, row, 5, countHyperparameters.c_str(), efectiveStyle("ints"));
} }
row++; row++;
// Set Totals // Set Totals
@@ -160,6 +164,7 @@ namespace platform {
addConditionalFormat("max"); addConditionalFormat("max");
footer(false); footer(false);
if (friedman) { if (friedman) {
if (score == "accuracy") {
// Create Sheet with ranks // Create Sheet with ranks
worksheet = workbook_add_worksheet(workbook, "Ranks"); worksheet = workbook_add_worksheet(workbook, "Ranks");
formatColumns(); formatColumns();
@@ -167,6 +172,7 @@ namespace platform {
body(true); body(true);
addConditionalFormat("min"); addConditionalFormat("min");
footer(true); footer(true);
}
// Create Sheet with Friedman Test // Create Sheet with Friedman Test
doFriedman(); doFriedman();
} }
@@ -180,7 +186,7 @@ namespace platform {
// Body header // Body header
row = 3; row = 3;
int col = 1; int col = 1;
writeString(row, 0, "", "bodyHeader"); writeString(row, 0, "#", "bodyHeader");
writeString(row, 1, "Dataset", "bodyHeader"); writeString(row, 1, "Dataset", "bodyHeader");
for (const auto& model : models) { for (const auto& model : models) {
writeString(row, ++col, model.c_str(), "bodyHeader"); writeString(row, ++col, model.c_str(), "bodyHeader");
@@ -237,11 +243,12 @@ namespace platform {
} }
worksheet_merge_range(worksheet, 0, 0, 0, 7, "Friedman Test", styles["headerFirst"]); worksheet_merge_range(worksheet, 0, 0, 0, 7, "Friedman Test", styles["headerFirst"]);
row = 2; row = 2;
Statistics stats(models, datasets, table, significance, false); Statistics stats(score, models, datasets, table, significance, false); // No output
auto result = stats.friedmanTest(); auto result = stats.friedmanTest();
stats.postHocHolmTest(result); stats.postHocTest();
stats.postHocTestReport(result, false); // No tex output
auto friedmanResult = stats.getFriedmanResult(); auto friedmanResult = stats.getFriedmanResult();
auto holmResult = stats.getHolmResult(); auto postHocResults = stats.getPostHocResults();
worksheet_merge_range(worksheet, row, 0, row, 7, "Null hypothesis: H0 'There is no significant differences between all the classifiers.'", styles["headerSmall"]); worksheet_merge_range(worksheet, row, 0, row, 7, "Null hypothesis: H0 'There is no significant differences between all the classifiers.'", styles["headerSmall"]);
row += 2; row += 2;
writeString(row, 1, "Friedman Q", "bodyHeader"); writeString(row, 1, "Friedman Q", "bodyHeader");
@@ -260,7 +267,7 @@ namespace platform {
row += 2; row += 2;
worksheet_merge_range(worksheet, row, 0, row, 7, "Null hypothesis: H0 'There is no significant differences between the control model and the other models.'", styles["headerSmall"]); worksheet_merge_range(worksheet, row, 0, row, 7, "Null hypothesis: H0 'There is no significant differences between the control model and the other models.'", styles["headerSmall"]);
row += 2; row += 2;
std::string controlModel = "Control Model: " + holmResult.model; std::string controlModel = "Control Model: " + postHocResults.at(0).model;
worksheet_merge_range(worksheet, row, 1, row, 7, controlModel.c_str(), styles["bodyHeader_odd"]); worksheet_merge_range(worksheet, row, 1, row, 7, controlModel.c_str(), styles["bodyHeader_odd"]);
row++; row++;
writeString(row, 1, "Model", "bodyHeader"); writeString(row, 1, "Model", "bodyHeader");
@@ -272,7 +279,7 @@ namespace platform {
writeString(row, 7, "Reject H0", "bodyHeader"); writeString(row, 7, "Reject H0", "bodyHeader");
row++; row++;
bool first = true; bool first = true;
for (const auto& item : holmResult.holmLines) { for (const auto& item : postHocResults) {
writeString(row, 1, item.model, "text"); writeString(row, 1, item.model, "text");
if (first) { if (first) {
// Control model info // Control model info

View File

@@ -1,17 +1,16 @@
#ifndef BESTRESULTS_EXCEL_H #ifndef BESTRESULTSEXCEL_H
#define BESTRESULTS_EXCEL_H #define BESTRESULTSEXCEL_H
#include "ExcelFile.h"
#include <vector> #include <vector>
#include <map> #include <map>
#include <nlohmann/json.hpp> #include <nlohmann/json.hpp>
#include "reports/ExcelFile.h"
using json = nlohmann::json;
namespace platform { namespace platform {
using json = nlohmann::ordered_json;
class BestResultsExcel : public ExcelFile { class BestResultsExcel : public ExcelFile {
public: public:
BestResultsExcel(const std::string& score, const std::vector<std::string>& datasets); BestResultsExcel(const std::string& path, const std::string& score, const std::vector<std::string>& datasets);
~BestResultsExcel(); ~BestResultsExcel();
void reportAll(const std::vector<std::string>& models, const json& table, const std::map<std::string, std::map<std::string, float>>& ranks, bool friedman, double significance); void reportAll(const std::vector<std::string>& models, const json& table, const std::map<std::string, std::map<std::string, float>>& ranks, bool friedman, double significance);
void reportSingle(const std::string& model, const std::string& fileName); void reportSingle(const std::string& model, const std::string& fileName);
@@ -23,6 +22,7 @@ namespace platform {
void formatColumns(); void formatColumns();
void doFriedman(); void doFriedman();
void addConditionalFormat(std::string formula); void addConditionalFormat(std::string formula);
std::string path;
std::string score; std::string score;
std::vector<std::string> models; std::vector<std::string> models;
std::vector<std::string> datasets; std::vector<std::string> datasets;
@@ -34,4 +34,4 @@ namespace platform {
int datasetNameSize = 25; // Min size of the column int datasetNameSize = 25; // Min size of the column
}; };
} }
#endif //BESTRESULTS_EXCEL_H #endif

105
src/best/BestResultsMd.cpp Normal file
View File

@@ -0,0 +1,105 @@
#include <iostream>
#include "BestResultsMd.h"
#include "common/Utils.h" // compute_std
namespace platform {
using json = nlohmann::ordered_json;
// Opens `name` on the shared `handler` stream. If the file cannot be opened,
// an error is printed to stderr and the process exits with status 1.
void BestResultsMd::openMdFile(const std::string& name)
{
    handler.open(name);
    if (handler.is_open()) {
        return;
    }
    std::cerr << "Error opening file " << name << std::endl;
    exit(1);
}
// Starts the Markdown results file: remembers the model list for the later
// body/footer calls, opens the output file and writes the provenance comment
// followed by the table title row and the alignment row.
void BestResultsMd::results_header(const std::vector<std::string>& models, const std::string& date)
{
    this->models = models;
    openMdFile(Paths::tex() + Paths::md_output());
    // Provenance banner, emitted as an HTML comment
    handler << "<!-- This file has been generated by the platform program" << std::endl
            << " Date: " << date.c_str() << std::endl
            << "" << std::endl
            << " Table of results" << std::endl
            << "-->" << std::endl;
    // Title row: index, dataset and one column per model
    handler << "| # | Dataset |";
    for (const auto& name : models) {
        handler << " " << name.c_str() << " |";
    }
    handler << std::endl;
    // Alignment row: one centered column per model
    handler << "|--: | :--- |";
    for (std::size_t column = 0; column < models.size(); ++column) {
        handler << " :---: |";
    }
    handler << std::endl;
}
void BestResultsMd::results_body(const std::vector<std::string>& datasets, json& table)
{
int i = 0;
for (auto const& dataset : datasets) {
// Find out max value for this dataset
double max_value = 0;
// Find out the max value for this dataset
for (const auto& model : models) {
double value;
try {
value = table[model].at(dataset).at(0).get<double>();
}
catch (nlohmann::json_abi_v3_11_3::detail::out_of_range err) {
value = -1.0;
}
if (value > max_value) {
max_value = value;
}
}
handler << "| " << ++i << " | " << dataset.c_str() << " | ";
for (const auto& model : models) {
double value = table[model].at(dataset).at(0).get<double>();
double std_value = table[model].at(dataset).at(3).get<double>();
const char* bold = value == max_value ? "**" : "";
handler << bold << std::setprecision(4) << std::fixed << value << "±" << std::setprecision(3) << std_value << bold << " | ";
}
handler << std::endl;
}
}
// Writes the "Average Score" row and closes the Markdown results file.
// `totals` maps each model to the list of scores it obtained; `best_model`
// is the model whose average is emphasised with "**".
// Each model is averaged over its own number of scores, so the mean handed to
// compute_std matches the vector it is computed from. A model with no scores
// (absent from `totals` or with an empty list) is written as "N/A".
void BestResultsMd::results_footer(const std::map<std::string, std::vector<double>>& totals, const std::string& best_model)
{
    handler << "| | **Average Score** | ";
    for (const auto& model : models) {
        const auto found = totals.find(model);
        if (found == totals.end() || found->second.empty()) {
            handler << "N/A | ";
            continue;
        }
        const auto& scores = found->second;
        double value = std::reduce(scores.begin(), scores.end()) / scores.size();
        double std_value = compute_std(scores, value);
        const char* bold = model == best_model ? "**" : "";
        handler << bold << std::setprecision(4) << std::fixed << value << "±" << std::setprecision(3) << std::fixed << std_value << bold << " | ";
    }
    handler.close();
}
// Writes the post-hoc test report as a Markdown table and closes the file.
// The first entry of `postHocResults` is written as the control model (rank
// only); every other entry gets its p-value, rank, win/tie/loss counts and the
// H0 verdict. Entries whose hypothesis is not rejected are emphasised with "**".
void BestResultsMd::postHoc_test(std::vector<PostHocLine>& postHocResults, const std::string& kind, const std::string& date)
{
    openMdFile(Paths::tex() + Paths::md_post_hoc());
    // Provenance banner, emitted as an HTML comment
    handler << "<!-- This file has been generated by the platform program" << std::endl
            << " Date: " << date.c_str() << std::endl
            << std::endl
            << " Post-hoc handler test" << std::endl
            << "-->" << std::endl;
    handler << "Post-hoc " << kind << " test: H<sub>0</sub>: There is no significant differences between the control model and the other models." << std::endl << std::endl;
    handler << "| classifier | pvalue | rank | win | tie | loss | H<sub>0</sub> |" << std::endl;
    handler << "| :-- | --: | --: | --:| --: | --: | :--: |" << std::endl;
    bool control = true;
    for (auto const& entry : postHocResults) {
        if (control) {
            // Control model row: only the rank is meaningful
            control = false;
            handler << "| " << entry.model << " | - | " << std::fixed << std::setprecision(2) << entry.rank << " | - | - | - |" << std::endl;
            continue;
        }
        const char* emphasis = entry.reject ? " " : "**";
        const char* verdict = entry.reject ? "rejected" : "**accepted**";
        handler << "| " << entry.model << " | " << emphasis << std::scientific << std::setprecision(4) << entry.pvalue << emphasis << " |";
        handler << std::fixed << std::setprecision(2) << entry.rank << " | " << entry.wtl.win << " | " << entry.wtl.tie << " | " << entry.wtl.loss << " |";
        handler << verdict << " |" << std::endl;
    }
    handler << std::endl;
    handler.close();
}
}

24
src/best/BestResultsMd.h Normal file
View File

@@ -0,0 +1,24 @@
#ifndef BEST_RESULTS_MD_H
#define BEST_RESULTS_MD_H
#include <map>
#include <vector>
#include <nlohmann/json.hpp>
#include "common/Paths.h"
#include "Statistics.h"
namespace platform {
using json = nlohmann::ordered_json;
// Writes the best-results table and the post-hoc test report as Markdown files.
// The results table is produced by calling results_header, results_body and
// results_footer in that order: the header opens the output file and the
// footer closes it.
class BestResultsMd {
public:
BestResultsMd() = default;
~BestResultsMd() = default;
// Opens the results file, stores `models` for the later calls and writes the banner and table header.
void results_header(const std::vector<std::string>& models, const std::string& date);
// Writes one row per dataset with each model's score and standard deviation read from `table`.
void results_body(const std::vector<std::string>& datasets, json& table);
// Writes the average row (`best_model` emphasised) and closes the results file.
void results_footer(const std::map<std::string, std::vector<double>>& totals, const std::string& best_model);
// Writes the post-hoc report to its own file; the first entry of `postHocResults` is written as the control model.
void postHoc_test(std::vector<PostHocLine>& postHocResults, const std::string& kind, const std::string& date);
private:
// Opens `name` on `handler`; terminates the program if the file cannot be opened.
void openMdFile(const std::string& name);
// Output stream shared by all the writer methods.
std::ofstream handler;
// Model names, in column order, captured by results_header.
std::vector<std::string> models;
};
}
#endif

124
src/best/BestResultsTex.cpp Normal file
View File

@@ -0,0 +1,124 @@
#include <iostream>
#include "BestResultsTex.h"
#include "common/Utils.h" // compute_std
namespace platform {
using json = nlohmann::ordered_json;
// Opens `name` on the shared `handler` stream. If the file cannot be opened,
// an error is printed to stderr and the process exits with status 1.
void BestResultsTex::openTexFile(const std::string& name)
{
    handler.open(name);
    if (handler.is_open()) {
        return;
    }
    std::cerr << "Error opening file " << name << std::endl;
    exit(1);
}
// Starts the LaTeX results file: remembers the model list for the later
// body/footer calls, opens the output file and writes the provenance comment,
// the table environment preamble (caption, label, column spec) and the title
// row with one column per model. When `index` is true the first column is
// right-aligned, otherwise it is left-aligned.
void BestResultsTex::results_header(const std::vector<std::string>& models, const std::string& date, bool index)
{
    this->models = models;
    openTexFile(Paths::tex() + Paths::tex_output());
    // Provenance banner
    handler << "%% This file has been generated by the platform program" << std::endl
            << "%% Date: " << date.c_str() << std::endl
            << "%%" << std::endl
            << "%% Table of results" << std::endl
            << "%%" << std::endl;
    // Table environment and layout tweaks
    handler << "\\begin{table}[htbp] " << std::endl
            << "\\centering " << std::endl
            << "\\tiny " << std::endl
            << "\\renewcommand{\\arraystretch }{1.2} " << std::endl
            << "\\renewcommand{\\tabcolsep }{0.07cm} " << std::endl;
    // The caption uses the score name with its first letter upper-cased
    std::string caption_metric = score;
    caption_metric[0] = toupper(caption_metric[0]);
    handler << "\\caption{" << caption_metric << " results(mean $\\pm$ std) for all the algorithms and datasets} " << std::endl;
    handler << "\\label{tab:results_" << score << "}" << std::endl;
    // Column spec: first column followed by one centered column per model
    const std::string first_column = index ? "r" : "l";
    const std::string model_columns(models.size(), 'c');
    handler << "\\begin{tabular} {{" << first_column << model_columns << "}}" << std::endl;
    handler << "\\hline " << std::endl;
    handler << "" << std::endl;
    // Title row (the first cell is left empty)
    for (const auto& name : models) {
        handler << "& " << name.c_str();
    }
    handler << "\\\\" << std::endl;
    handler << "\\hline" << std::endl;
}
void BestResultsTex::results_body(const std::vector<std::string>& datasets, json& table, bool index)
{
int i = 0;
for (auto const& dataset : datasets) {
// Find out max value for this dataset
double max_value = 0;
for (const auto& model : models) {
double value;
try {
value = table[model].at(dataset).at(0).get<double>();
}
catch (nlohmann::json_abi_v3_11_3::detail::out_of_range err) {
value = -1.0;
}
if (value > max_value) {
max_value = value;
}
}
if (index)
handler << ++i << " ";
else
handler << dataset << " ";
for (const auto& model : models) {
double value = table[model].at(dataset).at(0).get<double>();
double std_value = table[model].at(dataset).at(3).get<double>();
const char* bold = value == max_value ? "\\bfseries" : "";
handler << "& " << bold << std::setprecision(4) << std::fixed << value << "$\\pm$" << std::setprecision(3) << std_value;
}
handler << "\\\\" << std::endl;
}
}
// Writes the "Average" row, closes the tabular/table environments and closes
// the LaTeX results file. `totals` maps each model to the list of scores it
// obtained; `best_model` is the model whose average is set in bold.
// Each model is averaged over its own number of scores, so the mean handed to
// compute_std matches the vector it is computed from. A model with no scores
// (absent from `totals` or with an empty list) is written as "N/A".
void BestResultsTex::results_footer(const std::map<std::string, std::vector<double>>& totals, const std::string& best_model)
{
    handler << "\\hline" << std::endl;
    handler << "Average ";
    for (const auto& model : models) {
        const auto found = totals.find(model);
        if (found == totals.end() || found->second.empty()) {
            handler << "& N/A";
            continue;
        }
        const auto& scores = found->second;
        double value = std::reduce(scores.begin(), scores.end()) / scores.size();
        double std_value = compute_std(scores, value);
        const char* bold = model == best_model ? "\\bfseries" : "";
        handler << "& " << bold << std::setprecision(4) << std::fixed << value << "$\\pm$" << std::setprecision(3) << std::fixed << std_value;
    }
    handler << "\\\\" << std::endl;
    handler << "\\hline " << std::endl;
    handler << "\\end{tabular}" << std::endl;
    handler << "\\end{table}" << std::endl;
    handler.close();
}
// Writes the post-hoc test report as a LaTeX table and closes the file.
// The first entry of `postHocResults` is written as the control model (rank
// only); every other entry gets its p-value, rank and win/tie/loss counts.
// P-values whose hypothesis is not rejected are set in bold.
void BestResultsTex::postHoc_test(std::vector<PostHocLine>& postHocResults, const std::string& kind, const std::string& date)
{
    auto file_name = Paths::tex() + Paths::tex_post_hoc();
    openTexFile(file_name);
    handler << "%% This file has been generated by the platform program" << std::endl;
    handler << "%% Date: " << date.c_str() << std::endl;
    handler << "%%" << std::endl;
    handler << "%% Post-hoc " << kind << " test" << std::endl;
    handler << "%%" << std::endl;
    handler << "\\begin{table}[htbp]" << std::endl;
    handler << "\\centering" << std::endl;
    // The label key must not contain spaces: LaTeX keeps them as part of the
    // key, so "\label{ tab:tests }" could never be matched by \ref{tab:tests}.
    handler << "\\caption{Results of the post-hoc " << kind << " test for the mean " << score << " of the algorithms.}\\label{tab:tests}" << std::endl;
    handler << "\\begin{tabular}{lrrrrr}" << std::endl;
    handler << "\\hline" << std::endl;
    handler << "classifier & pvalue & rank & win & tie & loss\\\\" << std::endl;
    handler << "\\hline" << std::endl;
    bool first = true;
    for (auto const& line : postHocResults) {
        auto textStatus = !line.reject ? "\\bf " : " ";
        if (first) {
            // Control model row: only the rank is meaningful
            handler << line.model << " & - & " << std::fixed << std::setprecision(2) << line.rank << " & - & - & - \\\\" << std::endl;
            first = false;
        } else {
            handler << line.model << " & " << textStatus << std::scientific << std::setprecision(4) << line.pvalue << " & ";
            handler << std::fixed << std::setprecision(2) << line.rank << " & " << line.wtl.win << " & " << line.wtl.tie << " & " << line.wtl.loss << "\\\\" << std::endl;
        }
    }
    handler << "\\hline " << std::endl;
    handler << "\\end{tabular}" << std::endl;
    handler << "\\end{table}" << std::endl;
    handler.close();
}
}

26
src/best/BestResultsTex.h Normal file
View File

@@ -0,0 +1,26 @@
#ifndef BEST_RESULTS_TEX_H
#define BEST_RESULTS_TEX_H
#include <map>
#include <vector>
#include <nlohmann/json.hpp>
#include "common/Paths.h"
#include "Statistics.h"
namespace platform {
using json = nlohmann::ordered_json;
// Writer of LaTeX tables for the best-results reports.
// The methods stream their output to `handler`; results_footer() and
// postHoc_test() close it when they finish.
class BestResultsTex {
public:
// score: name of the score, shown in the caption written by postHoc_test().
// dataset_name: stored as given. NOTE(review): presumably selects whether the
// dataset name is printed in the results table - confirm in results_body().
BestResultsTex(const std::string score, bool dataset_name = true) : score{ score }, dataset_name{ dataset_name } {};
~BestResultsTex() = default;
// Starts the results table (defined in the .cpp, not shown here).
// NOTE(review): presumably also stores `models` for later use - confirm.
void results_header(const std::vector<std::string>& models, const std::string& date, bool index);
// Writes one row per dataset with "value$\pm$std" for every model.
void results_body(const std::vector<std::string>& datasets, json& table, bool index);
// Writes the "Average" row, closes the tabular/table environments and the file.
void results_footer(const std::map<std::string, std::vector<double>>& totals, const std::string& best_model);
// Writes the post-hoc test table to Paths::tex() + Paths::tex_post_hoc().
// `kind` is the name of the test, `date` goes to the banner of the file.
void postHoc_test(std::vector<PostHocLine>& postHocResults, const std::string& kind, const std::string& date);
private:
std::string score; // score name used in captions
bool dataset_name;
// Opens `name` for writing through `handler` (defined in the .cpp).
void openTexFile(const std::string& name);
std::ofstream handler; // output file currently being written
std::vector<std::string> models; // model names, one column each in results_footer()
};
}
#endif

View File

@@ -3,7 +3,7 @@
#include <string> #include <string>
#include <map> #include <map>
#include <utility> #include <utility>
#include "DotEnv.h" #include "common/DotEnv.h"
namespace platform { namespace platform {
class BestScore { class BestScore {
public: public:
@@ -24,5 +24,4 @@ namespace platform {
} }
}; };
} }
#endif #endif

View File

@@ -1,22 +1,31 @@
#include <sstream> #include <sstream>
#include "Statistics.h"
#include "Colors.h"
#include "Symbols.h"
#include <boost/math/distributions/chi_squared.hpp> #include <boost/math/distributions/chi_squared.hpp>
#include <boost/math/distributions/normal.hpp> #include <boost/math/distributions/normal.hpp>
#include "CLocale.h" #include "common/Colors.h"
#include "common/Symbols.h"
#include "common/CLocale.h"
#include "BestResultsTex.h"
#include "BestResultsMd.h"
#include "Statistics.h"
#include "WilcoxonTest.hpp"
namespace platform { namespace platform {
Statistics::Statistics(const std::vector<std::string>& models, const std::vector<std::string>& datasets, const json& data, double significance, bool output) : Statistics::Statistics(const std::string& score, const std::vector<std::string>& models, const std::vector<std::string>& datasets, const json& data, double significance, bool output) :
models(models), datasets(datasets), data(data), significance(significance), output(output) score(score), models(models), datasets(datasets), data(data), significance(significance), output(output)
{ {
if (score == "accuracy") {
postHocType = "Holm";
hlen = 85;
} else {
postHocType = "Wilcoxon";
hlen = 88;
}
nModels = models.size(); nModels = models.size();
nDatasets = datasets.size(); nDatasets = datasets.size();
auto temp = ConfigLocale(); auto temp = ConfigLocale();
} }
void Statistics::fit() void Statistics::fit()
{ {
if (nModels < 3 || nDatasets < 3) { if (nModels < 3 || nDatasets < 3) {
@@ -25,9 +34,11 @@ namespace platform {
throw std::runtime_error("Can't make the Friedman test with less than 3 models and/or less than 3 datasets."); throw std::runtime_error("Can't make the Friedman test with less than 3 models and/or less than 3 datasets.");
} }
ranksModels.clear(); ranksModels.clear();
computeRanks(); computeRanks(); // compute greaterAverage and ranks
// Set the control model as the one with the lowest average rank // Set the control model as the one with the lowest average rank
controlIdx = distance(ranks.begin(), min_element(ranks.begin(), ranks.end(), [](const auto& l, const auto& r) { return l.second < r.second; })); controlIdx = score == "accuracy" ?
distance(ranks.begin(), min_element(ranks.begin(), ranks.end(), [](const auto& l, const auto& r) { return l.second < r.second; }))
: greaterAverage; // The model with the greater average score
computeWTL(); computeWTL();
maxModelName = (*std::max_element(models.begin(), models.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size(); maxModelName = (*std::max_element(models.begin(), models.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
maxDatasetName = (*std::max_element(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size(); maxDatasetName = (*std::max_element(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
@@ -64,11 +75,16 @@ namespace platform {
void Statistics::computeRanks() void Statistics::computeRanks()
{ {
std::map<std::string, float> ranksLine; std::map<std::string, float> ranksLine;
std::map<std::string, float> averages;
for (const auto& model : models) {
averages[model] = 0;
}
for (const auto& dataset : datasets) { for (const auto& dataset : datasets) {
std::vector<std::pair<std::string, double>> ranksOrder; std::vector<std::pair<std::string, double>> ranksOrder;
for (const auto& model : models) { for (const auto& model : models) {
double value = data[model].at(dataset).at(0).get<double>(); double value = data[model].at(dataset).at(0).get<double>();
ranksOrder.push_back({ model, value }); ranksOrder.push_back({ model, value });
averages[model] += value;
} }
// Assign the ranks // Assign the ranks
ranksLine = assignRanks(ranksOrder); ranksLine = assignRanks(ranksOrder);
@@ -86,10 +102,17 @@ namespace platform {
for (const auto& rank : ranks) { for (const auto& rank : ranks) {
ranks[rank.first] /= nDatasets; ranks[rank.first] /= nDatasets;
} }
// Average the scores
for (const auto& average : averages) {
averages[average.first] /= nDatasets;
}
// Get the model with the greater average score
greaterAverage = distance(averages.begin(), max_element(averages.begin(), averages.end(), [](const auto& l, const auto& r) { return l.second < r.second; }));
} }
void Statistics::computeWTL() void Statistics::computeWTL()
{ {
// Compute the WTL matrix const double practical_threshold = 0.0005;
// Compute the WTL matrix (Win Tie Loss)
for (int i = 0; i < nModels; ++i) { for (int i = 0; i < nModels; ++i) {
wtl[i] = { 0, 0, 0 }; wtl[i] = { 0, 0, 0 };
} }
@@ -102,23 +125,85 @@ namespace platform {
continue; continue;
} }
double value = data[models[i]].at(item.key()).at(0).get<double>(); double value = data[models[i]].at(item.key()).at(0).get<double>();
if (value < controlValue) { double diff = controlValue - value; // control comparison
wtl[i].win++; if (std::fabs(diff) <= practical_threshold) {
} else if (value == controlValue) {
wtl[i].tie++; wtl[i].tie++;
} else if (diff < 0) {
wtl[i].win++;
} else { } else {
wtl[i].loss++; wtl[i].loss++;
} }
} }
} }
} }
int Statistics::getControlIdx()
void Statistics::postHocHolmTest(bool friedmanResult) {
if (!fitted) {
fit();
}
return controlIdx;
}
/**
 * @brief Runs the post-hoc analysis that corresponds to the score in use:
 *        the Holm test for "accuracy", the Wilcoxon test for any other score.
 */
void Statistics::postHocTest()
{
    if (score != "accuracy") {
        postHocWilcoxonTest();
        return;
    }
    postHocHolmTest();
}
void Statistics::postHocWilcoxonTest()
{
if (!fitted) {
fit();
}
// Reference: Wilcoxon, F. (1945). “Individual Comparisons by Ranking Methods”. Biometrics Bulletin, 1(6), 80-83.
auto wilcoxon = WilcoxonTest(models, datasets, data, significance);
controlIdx = wilcoxon.getControlIdx();
postHocResults = wilcoxon.getPostHocResults();
setResultsOrder();
// Fill the ranks info
for (const auto& item : postHocResults) {
ranks[item.model] = item.rank;
}
Holm_Bonferroni();
restoreResultsOrder();
}
/**
 * @brief Holm-Bonferroni step-down adjustment of the post-hoc p-values.
 *
 * postHocResults is sorted by ascending p-value (the caller is expected to
 * restore its own order afterwards). The i-th p-value (0-based) is multiplied
 * by (nModels - i), capped at 1 and forced to be at least as large as the
 * previous adjusted value, so the adjusted sequence never decreases.
 *
 * NOTE(review): the `reject` flag of each line is left as computed by the
 * caller from the unadjusted p-value - confirm whether it should be refreshed.
 */
void Statistics::Holm_Bonferroni()
{
    // The algorithm needs the p-values sorted from the lowest to the highest
    std::sort(postHocResults.begin(), postHocResults.end(), [](const PostHocLine& a, const PostHocLine& b) {
        return a.pvalue < b.pvalue;
        });
    double previous = 0.0; // adjusted p-value of the preceding line
    for (std::size_t i = 0; i < postHocResults.size(); ++i) {
        auto& line = postHocResults[i]; // adjusted in place, no copy of the line
        const long double scaled = line.pvalue * (nModels - static_cast<int>(i));
        double adjusted = static_cast<double>(std::min(1.0L, scaled));
        adjusted = std::max(previous, adjusted);
        line.pvalue = adjusted;
        previous = adjusted;
    }
}
/**
 * @brief Stores in every post-hoc line its current position inside
 *        postHocResults, so that restoreResultsOrder() can bring it back.
 */
void Statistics::setResultsOrder()
{
    for (std::size_t position = 0; position < postHocResults.size(); ++position) {
        postHocResults[position].idx = static_cast<unsigned int>(position);
    }
}
/**
 * @brief Sorts postHocResults by the `idx` field of its lines, i.e. back to
 *        the order recorded by setResultsOrder().
 */
void Statistics::restoreResultsOrder()
{
    const auto byRecordedPosition = [](const PostHocLine& lhs, const PostHocLine& rhs) {
        return lhs.idx < rhs.idx;
    };
    std::sort(postHocResults.begin(), postHocResults.end(), byRecordedPosition);
}
void Statistics::postHocHolmTest()
{ {
if (!fitted) { if (!fitted) {
fit(); fit();
} }
std::stringstream oss;
// Reference https://link.springer.com/article/10.1007/s44196-022-00083-8 // Reference https://link.springer.com/article/10.1007/s44196-022-00083-8
// Post-hoc Holm test // Post-hoc Holm test
// Calculate the p-value for the models paired with the control model // Calculate the p-value for the models paired with the control model
@@ -126,75 +211,67 @@ namespace platform {
boost::math::normal dist(0.0, 1.0); boost::math::normal dist(0.0, 1.0);
double diff = sqrt(nModels * (nModels + 1) / (6.0 * nDatasets)); double diff = sqrt(nModels * (nModels + 1) / (6.0 * nDatasets));
for (int i = 0; i < nModels; i++) { for (int i = 0; i < nModels; i++) {
PostHocLine line;
line.model = models[i];
line.rank = ranks.at(models[i]);
line.wtl = wtl.at(i);
line.reject = false;
if (i == controlIdx) { if (i == controlIdx) {
stats[i] = 0.0; postHocResults.push_back(line);
continue; continue;
} }
double z = abs(ranks.at(models[controlIdx]) - ranks.at(models[i])) / diff; double z = std::abs(ranks.at(models[controlIdx]) - ranks.at(models[i])) / diff;
double p_value = (long double)2 * (1 - cdf(dist, z)); line.pvalue = (long double)2 * (1 - cdf(dist, z));
stats[i] = p_value; line.reject = (line.pvalue < significance);
postHocResults.push_back(line);
} }
// Sort the models by p-value std::sort(postHocResults.begin(), postHocResults.end(), [](const PostHocLine& a, const PostHocLine& b) {
std::vector<std::pair<int, double>> statsOrder; return a.rank < b.rank;
for (const auto& stat : stats) {
statsOrder.push_back({ stat.first, stat.second });
}
std::sort(statsOrder.begin(), statsOrder.end(), [](const std::pair<int, double>& a, const std::pair<int, double>& b) {
return a.second < b.second;
}); });
setResultsOrder();
// Holm adjustment Holm_Bonferroni();
for (int i = 0; i < statsOrder.size(); ++i) { restoreResultsOrder();
auto item = statsOrder.at(i);
double before = i == 0 ? 0.0 : statsOrder.at(i - 1).second;
double p_value = std::min((double)1.0, item.second * (nModels - i));
p_value = std::max(before, p_value);
statsOrder[i] = { item.first, p_value };
} }
holmResult.model = models.at(controlIdx);
void Statistics::postHocTestReport(bool friedmanResult, bool tex)
{
std::stringstream oss;
auto color = friedmanResult ? Colors::CYAN() : Colors::YELLOW(); auto color = friedmanResult ? Colors::CYAN() : Colors::YELLOW();
oss << color; oss << color;
oss << " *************************************************************************************************************" << std::endl; oss << " " << std::string(hlen + 25, '*') << std::endl;
oss << " Post-hoc Holm test: H0: 'There is no significant differences between the control model and the other models.'" << std::endl; oss << " Post-hoc " << postHocType << " test: H0: 'There is no significant differences between the control model and the other models.'" << std::endl;
oss << " Control model: " << models.at(controlIdx) << std::endl; oss << " Control model: " << models.at(controlIdx) << std::endl;
oss << " " << std::left << std::setw(maxModelName) << std::string("Model") << " p-value rank win tie loss Status" << std::endl; oss << " " << std::left << std::setw(maxModelName) << std::string("Model") << " p-value rank win tie loss Status" << std::endl;
oss << " " << std::string(maxModelName, '=') << " ============ ========= === === ==== =============" << std::endl; oss << " " << std::string(maxModelName, '=') << " ============ ========= === === ==== =============" << std::endl;
// sort ranks from lowest to highest bool first = true;
std::vector<std::pair<std::string, float>> ranksOrder; for (const auto& item : postHocResults) {
for (const auto& rank : ranks) { if (first) {
ranksOrder.push_back({ rank.first, rank.second }); oss << " " << Colors::BLUE() << std::left << std::setw(maxModelName) << item.model << " ";
} oss << std::setw(12) << " " << std::setprecision(7) << std::fixed << " " << item.rank << std::endl;
std::sort(ranksOrder.begin(), ranksOrder.end(), [](const std::pair<std::string, float>& a, const std::pair<std::string, float>& b) { first = false;
return a.second < b.second;
});
// Show the control model info.
oss << " " << Colors::BLUE() << std::left << std::setw(maxModelName) << ranksOrder.at(0).first << " ";
oss << std::setw(12) << " " << std::setprecision(7) << std::fixed << " " << ranksOrder.at(0).second << std::endl;
for (const auto& item : ranksOrder) {
auto idx = distance(models.begin(), find(models.begin(), models.end(), item.first));
double pvalue = 0.0;
for (const auto& stat : statsOrder) {
if (stat.first == idx) {
pvalue = stat.second;
}
}
holmResult.holmLines.push_back({ item.first, pvalue, item.second, wtl.at(idx), pvalue < significance });
if (item.first == models.at(controlIdx)) {
continue; continue;
} }
auto pvalue = item.pvalue;
auto colorStatus = pvalue > significance ? Colors::GREEN() : Colors::MAGENTA(); auto colorStatus = pvalue > significance ? Colors::GREEN() : Colors::MAGENTA();
auto status = pvalue > significance ? Symbols::check_mark : Symbols::cross; auto status = pvalue > significance ? Symbols::check_mark : Symbols::cross;
auto textStatus = pvalue > significance ? " accepted H0" : " rejected H0"; auto textStatus = pvalue > significance ? " accepted H0" : " rejected H0";
oss << " " << colorStatus << std::left << std::setw(maxModelName) << item.first << " "; oss << " " << colorStatus << std::left << std::setw(maxModelName) << item.model << " ";
oss << std::setprecision(6) << std::scientific << pvalue << std::setprecision(7) << std::fixed << " " << item.second; oss << std::setprecision(6) << std::scientific << pvalue << std::setprecision(7) << std::fixed << " " << item.rank;
oss << " " << std::right << std::setw(3) << wtl.at(idx).win << " " << std::setw(3) << wtl.at(idx).tie << " " << std::setw(4) << wtl.at(idx).loss; oss << " " << std::right << std::setw(3) << item.wtl.win << " " << std::setw(3) << item.wtl.tie << " " << std::setw(4) << item.wtl.loss;
oss << " " << status << textStatus << std::endl; oss << " " << status << textStatus << std::endl;
} }
oss << color << " *************************************************************************************************************" << std::endl; oss << color << " " << std::string(hlen + 25, '*') << std::endl;
oss << Colors::RESET(); oss << Colors::RESET();
if (output) { if (output) {
std::cout << oss.str(); std::cout << oss.str();
} }
if (tex) {
BestResultsTex bestResultsTex(score);
BestResultsMd bestResultsMd;
bestResultsTex.postHoc_test(postHocResults, postHocType, get_date() + " " + get_time());
bestResultsMd.postHoc_test(postHocResults, postHocType, get_date() + " " + get_time());
}
} }
bool Statistics::friedmanTest() bool Statistics::friedmanTest()
{ {
@@ -205,7 +282,7 @@ namespace platform {
// Friedman test // Friedman test
// Calculate the Friedman statistic // Calculate the Friedman statistic
oss << Colors::BLUE() << std::endl; oss << Colors::BLUE() << std::endl;
oss << "***************************************************************************************************************" << std::endl; oss << std::string(hlen, '*') << std::endl;
oss << Colors::GREEN() << "Friedman test: H0: 'There is no significant differences between all the classifiers.'" << Colors::BLUE() << std::endl; oss << Colors::GREEN() << "Friedman test: H0: 'There is no significant differences between all the classifiers.'" << Colors::BLUE() << std::endl;
double degreesOfFreedom = nModels - 1.0; double degreesOfFreedom = nModels - 1.0;
double sumSquared = 0; double sumSquared = 0;
@@ -230,23 +307,11 @@ namespace platform {
oss << Colors::YELLOW() << "The null hypothesis H0 is accepted. Computed p-values will not be significant." << std::endl; oss << Colors::YELLOW() << "The null hypothesis H0 is accepted. Computed p-values will not be significant." << std::endl;
result = false; result = false;
} }
oss << Colors::BLUE() << "***************************************************************************************************************" << Colors::RESET() << std::endl; oss << Colors::BLUE() << std::string(hlen, '*') << Colors::RESET() << std::endl;
if (output) { if (output) {
std::cout << oss.str(); std::cout << oss.str();
} }
friedmanResult = { friedmanQ, criticalValue, p_value, result }; friedmanResult = { friedmanQ, criticalValue, p_value, result };
return result; return result;
} }
FriedmanResult& Statistics::getFriedmanResult()
{
return friedmanResult;
}
HolmResult& Statistics::getHolmResult()
{
return holmResult;
}
std::map<std::string, std::map<std::string, float>>& Statistics::getRanks()
{
return ranksModels;
}
} // namespace platform } // namespace platform

View File

@@ -5,13 +5,13 @@
#include <map> #include <map>
#include <nlohmann/json.hpp> #include <nlohmann/json.hpp>
using json = nlohmann::json;
namespace platform { namespace platform {
using json = nlohmann::ordered_json;
struct WTL { struct WTL {
int win; uint win;
int tie; uint tie;
int loss; uint loss;
}; };
struct FriedmanResult { struct FriedmanResult {
double statistic; double statistic;
@@ -19,29 +19,36 @@ namespace platform {
long double pvalue; long double pvalue;
bool reject; bool reject;
}; };
struct HolmLine { struct PostHocLine {
uint idx; //index of the main order
std::string model; std::string model;
long double pvalue; long double pvalue;
double rank; double rank;
WTL wtl; WTL wtl;
bool reject; bool reject;
}; };
struct HolmResult {
std::string model;
std::vector<HolmLine> holmLines;
};
class Statistics { class Statistics {
public: public:
Statistics(const std::vector<std::string>& models, const std::vector<std::string>& datasets, const json& data, double significance = 0.05, bool output = true); Statistics(const std::string& score, const std::vector<std::string>& models, const std::vector<std::string>& datasets, const json& data, double significance = 0.05, bool output = true);
bool friedmanTest(); bool friedmanTest();
void postHocHolmTest(bool friedmanResult); void postHocTest();
FriedmanResult& getFriedmanResult(); void postHocTestReport(bool friedmanResult, bool tex);
HolmResult& getHolmResult(); int getControlIdx();
std::map<std::string, std::map<std::string, float>>& getRanks(); FriedmanResult& getFriedmanResult() { return friedmanResult; }
std::vector<PostHocLine>& getPostHocResults() { return postHocResults; }
std::map<std::string, std::map<std::string, float>>& getRanks() { return ranksModels; } // ranks of the models per dataset
private: private:
void fit(); void fit();
void postHocHolmTest();
void postHocWilcoxonTest();
void computeRanks(); void computeRanks();
void computeWTL(); void computeWTL();
void Holm_Bonferroni();
void setResultsOrder(); // Set the order of the results based on the statistic analysis needed
void restoreResultsOrder(); // Restore the order of the results after the Holm-Bonferroni adjustment
const std::string& score;
std::string postHocType;
const std::vector<std::string>& models; const std::vector<std::string>& models;
const std::vector<std::string>& datasets; const std::vector<std::string>& datasets;
const json& data; const json& data;
@@ -51,13 +58,15 @@ namespace platform {
int nModels = 0; int nModels = 0;
int nDatasets = 0; int nDatasets = 0;
int controlIdx = 0; int controlIdx = 0;
int greaterAverage = -1; // The model with the greater average score
std::map<int, WTL> wtl; std::map<int, WTL> wtl;
std::map<std::string, float> ranks; std::map<std::string, float> ranks;
int maxModelName = 0; int maxModelName = 0;
int maxDatasetName = 0; int maxDatasetName = 0;
int hlen; // length of the line
FriedmanResult friedmanResult; FriedmanResult friedmanResult;
HolmResult holmResult; std::vector<PostHocLine> postHocResults;
std::map<std::string, std::map<std::string, float>> ranksModels; std::map<std::string, std::map<std::string, float>> ranksModels;
}; };
} }
#endif // !STATISTICS_H #endif

245
src/best/WilcoxonTest.hpp Normal file
View File

@@ -0,0 +1,245 @@
#ifndef BEST_WILCOXON_TEST_HPP
#define BEST_WILCOXON_TEST_HPP
// WilcoxonTest.hpp
// Stand-alone class for paired Wilcoxon signed-rank post-hoc analysis
// ------------------------------------------------------------------
// * Constructor takes the *already-loaded* nlohmann::json object plus the
// vectors of model and dataset names.
// * Internally selects a control model (highest average AUC) and builds all
// statistics (ranks, W/T/L counts, Wilcoxon p-values).
// * Public API:
// int getControlIdx() const;
// const std::vector<PostHocLine>& getPostHocResults() const;
//
#include <vector>
#include <string>
#include <cmath>
#include <algorithm>
#include <numeric>
#include <limits>
#include <nlohmann/json.hpp>
#include "Statistics.h"
namespace platform {
/**
 * Paired Wilcoxon signed-rank post-hoc analysis against a control model.
 *
 * The constructor reads the score stored at data[model][dataset][0] for every
 * model/dataset pair (missing values are kept as NaN), selects as control the
 * model with the highest average score and compares every model with it.
 * The outcome is one PostHocLine per model, sorted by descending average score.
 *
 * Note that PostHocLine::rank is filled with the average score of the model;
 * the Friedman-style average ranks are computed as well but are not part of
 * the published lines.
 */
class WilcoxonTest {
public:
    /**
     * @param models   names of the models (first level keys of data)
     * @param datasets names of the datasets (second level keys of data)
     * @param data     already-loaded results
     * @param alpha    significance level used to set PostHocLine::reject
     */
    WilcoxonTest(const std::vector<std::string>& models, const std::vector<std::string>& datasets,
        const json& data, double alpha = 0.05) : models_(models), datasets_(datasets), data_(data), alpha_(alpha)
    {
        buildAUCTable(); // extracts all AUCs into a dense matrix
        computeAverageAUCs(); // per-model mean (-> control selection)
        computeAverageRanks(); // Friedman-style ranks per model
        selectControlModel(); // sets control_idx_
        buildPostHocResult(); // fills postHocResults_
    }
    /// Index (in the models vector) of the control model.
    int getControlIdx() const noexcept { return control_idx_; }
    /// One line per model, sorted by descending average score.
    const std::vector<PostHocLine>& getPostHocResults() const noexcept { return postHocResults_; }
private:
    //-------------------------------------------------- helper structs ----
    // When a value is missing we keep NaN so that ordinary arithmetic still
    // works (NaN simply propagates and we can test with std::isnan).
    using Matrix = std::vector<std::vector<double>>; // [model][dataset]
    //------------------------------------------------- implementation ----
    // Fills auc_ with data_[model][dataset][0]; any failure leaves the cell as NaN.
    void buildAUCTable()
    {
        const std::size_t M = models_.size();
        const std::size_t D = datasets_.size();
        auc_.assign(M, std::vector<double>(D, std::numeric_limits<double>::quiet_NaN()));
        for (std::size_t i = 0; i < M; ++i) {
            const auto& model = models_[i];
            for (std::size_t j = 0; j < D; ++j) {
                const auto& ds = datasets_[j];
                try {
                    auc_[i][j] = data_.at(model).at(ds).at(0).get<double>();
                }
                catch (...) {
                    // deliberate best effort: leave as NaN when the value is missing
                }
            }
        }
    }
    // Mean of the available (non-NaN) scores of every model; NaN if it has none.
    void computeAverageAUCs()
    {
        const std::size_t M = models_.size();
        avg_auc_.resize(M, std::numeric_limits<double>::quiet_NaN());
        for (std::size_t i = 0; i < M; ++i) {
            double sum = 0.0;
            std::size_t cnt = 0;
            for (double v : auc_[i]) {
                if (!std::isnan(v)) { sum += v; ++cnt; }
            }
            avg_auc_[i] = cnt ? sum / cnt : std::numeric_limits<double>::quiet_NaN();
        }
    }
    // Average rank across datasets (1 = best).
    void computeAverageRanks()
    {
        const std::size_t M = models_.size();
        const std::size_t D = datasets_.size();
        rank_sum_.assign(M, 0.0);
        rank_cnt_.assign(M, 0);
        const double EPS = 1e-10;
        for (std::size_t j = 0; j < D; ++j) {
            // Collect present values for this dataset
            std::vector<std::pair<double, std::size_t>> vals; // (auc, model_idx)
            vals.reserve(M);
            for (std::size_t i = 0; i < M; ++i) {
                if (!std::isnan(auc_[i][j]))
                    vals.emplace_back(auc_[i][j], i);
            }
            if (vals.empty()) continue; // no info for this dataset
            // Sort descending (higher AUC better)
            std::sort(vals.begin(), vals.end(), [](auto a, auto b) {
                return a.first > b.first;
                });
            // Assign ranks with average for ties
            std::size_t k = 0;
            while (k < vals.size()) {
                std::size_t l = k + 1;
                while (l < vals.size() && std::fabs(vals[l].first - vals[k].first) < EPS) ++l;
                const double avg_rank = (k + 1 + l) * 0.5; // average of ranks (1-based)
                for (std::size_t m = k; m < l; ++m) {
                    const auto idx = vals[m].second;
                    rank_sum_[idx] += avg_rank;
                    ++rank_cnt_[idx];
                }
                k = l;
            }
        }
        // Final average
        avg_rank_.resize(M, std::numeric_limits<double>::quiet_NaN());
        for (std::size_t i = 0; i < M; ++i) {
            avg_rank_[i] = rank_cnt_[i] ? rank_sum_[i] / rank_cnt_[i]
                : std::numeric_limits<double>::quiet_NaN();
        }
    }
    // Picks the model with the highest average AUC (ties -> first).
    // Models without any score (NaN average) are never preferred over a model
    // that has one: a plain '>' comparison would keep model 0 as control when
    // its own average is NaN, because every comparison with NaN is false.
    void selectControlModel()
    {
        control_idx_ = 0;
        for (std::size_t i = 1; i < avg_auc_.size(); ++i) {
            const double candidate = avg_auc_[i];
            if (std::isnan(candidate)) continue;
            const double best = avg_auc_[control_idx_];
            if (std::isnan(best) || candidate > best) control_idx_ = static_cast<int>(i);
        }
    }
    // Builds one PostHocLine per model: W/T/L counts and Wilcoxon p-value of
    // the paired differences (control - model) over the datasets where both
    // scores are available.
    void buildPostHocResult()
    {
        const std::size_t M = models_.size();
        const std::size_t D = datasets_.size();
        const double practical_threshold = 0.0005; // same heuristic as original code
        for (std::size_t i = 0; i < M; ++i) {
            PostHocLine line;
            line.model = models_[i];
            line.rank = avg_auc_[i]; // average score, used as ordering key
            WTL wtl = { 0, 0, 0 }; // win, tie, loss
            std::vector<double> differences;
            differences.reserve(D);
            for (std::size_t j = 0; j < D; ++j) {
                double auc_control = auc_[control_idx_][j];
                double auc_other = auc_[i][j];
                if (std::isnan(auc_control) || std::isnan(auc_other)) continue;
                double diff = auc_control - auc_other; // control comparison
                if (std::fabs(diff) <= practical_threshold) {
                    ++wtl.tie;
                } else if (diff < 0) {
                    ++wtl.win; // comparison wins
                } else {
                    ++wtl.loss; // control wins
                }
                differences.push_back(diff);
            }
            line.wtl = wtl;
            line.pvalue = differences.empty() ? 1.0L : static_cast<long double>(wilcoxonSignedRankTest(differences));
            line.reject = (line.pvalue < alpha_);
            postHocResults_.push_back(std::move(line));
        }
        // Sort results by rank (descending). NaN ranks go last: comparing them
        // with '>' alone would not be a strict weak ordering for std::sort.
        std::sort(postHocResults_.begin(), postHocResults_.end(), [](const PostHocLine& a, const PostHocLine& b) {
            if (std::isnan(a.rank)) return false;
            if (std::isnan(b.rank)) return true;
            return a.rank > b.rank;
            });
    }
    // ------------------------------------------------ Wilcoxon (private) --
    // Two-sided p-value of the Wilcoxon signed-rank test computed with the
    // normal approximation of the statistic W = min(W+, W-). Zero differences
    // are discarded; tied absolute values share their average rank.
    // Returns 1.0 when there is nothing to compare.
    static double wilcoxonSignedRankTest(const std::vector<double>& diffs)
    {
        if (diffs.empty()) return 1.0;
        // Build |diff| + sign vector (exclude zeros)
        struct Node { double absval; int sign; };
        std::vector<Node> v;
        v.reserve(diffs.size());
        for (double d : diffs) {
            if (d != 0.0) v.push_back({ std::fabs(d), d > 0 ? 1 : -1 });
        }
        if (v.empty()) return 1.0;
        // Sort by absolute value
        std::sort(v.begin(), v.end(), [](const Node& a, const Node& b) { return a.absval < b.absval; });
        const double EPS = 1e-10;
        const std::size_t n = v.size();
        std::vector<double> ranks(n, 0.0);
        std::size_t i = 0;
        while (i < n) {
            std::size_t j = i + 1;
            while (j < n && std::fabs(v[j].absval - v[i].absval) < EPS) ++j;
            double avg_rank = (i + 1 + j) * 0.5; // 1-based ranks
            for (std::size_t k = i; k < j; ++k) ranks[k] = avg_rank;
            i = j;
        }
        double w_plus = 0.0, w_minus = 0.0;
        for (std::size_t k = 0; k < n; ++k) {
            if (v[k].sign > 0) w_plus += ranks[k];
            else w_minus += ranks[k];
        }
        double w = std::min(w_plus, w_minus);
        double mean_w = n * (n + 1) / 4.0;
        double sd_w = std::sqrt(n * (n + 1) * (2 * n + 1) / 24.0);
        if (sd_w == 0.0) return 1.0; // defensive guard, sd_w is positive for n >= 1
        double z = (w - mean_w) / sd_w;
        double p_two = std::erfc(std::fabs(z) / std::sqrt(2.0)); // 2-sided tail
        return p_two;
    }
    //-------------------------------------------------------- data ----
    std::vector<std::string> models_;
    std::vector<std::string> datasets_;
    json data_;
    double alpha_;
    Matrix auc_; // [model][dataset]
    std::vector<double> avg_auc_; // mean AUC per model
    std::vector<double> avg_rank_; // mean rank per model
    std::vector<double> rank_sum_; // helper for ranks
    std::vector<int> rank_cnt_; // datasets counted per model
    int control_idx_ = -1;
    std::vector<PostHocLine> postHocResults_;
};
} // namespace platform
#endif // BEST_WILCOXON_TEST_HPP

View File

@@ -1,16 +1,25 @@
#include <iostream> #include <iostream>
#include <argparse/argparse.hpp> #include <argparse/argparse.hpp>
#include "Paths.h" #include "main/Models.h"
#include "BestResults.h" #include "main/modelRegister.h"
#include "Colors.h" #include "common/Paths.h"
#include "config.h" #include "common/Colors.h"
#include "common/Utils.h"
#include "best/BestResults.h"
#include "common/DotEnv.h"
#include "config_platform.h"
void manageArguments(argparse::ArgumentParser& program) void manageArguments(argparse::ArgumentParser& program)
{ {
program.add_argument("-m", "--model").default_value("").help("Filter results of the selected model) (any for all models)"); auto env = platform::DotEnv();
program.add_argument("-s", "--score").default_value("accuracy").help("Filter results of the score name supplied"); program.add_argument("-m", "--model").help("Model to use or any").default_value("any");
program.add_argument("--folder").help("Results folder to use").default_value(platform::Paths::results());
program.add_argument("-d", "--dataset").default_value("any").help("Filter results of the selected model) (any for all datasets)");
program.add_argument("-s", "--score").default_value(env.get("score")).help("Filter results of the score name supplied");
program.add_argument("--friedman").help("Friedman test").default_value(false).implicit_value(true); program.add_argument("--friedman").help("Friedman test").default_value(false).implicit_value(true);
program.add_argument("--excel").help("Output to excel").default_value(false).implicit_value(true); program.add_argument("--excel").help("Output to excel").default_value(false).implicit_value(true);
program.add_argument("--tex").help("Output results to TeX & Markdown files").default_value(false).implicit_value(true);
program.add_argument("--index").help("In tex output show the index of the dataset instead of the name to save space").default_value(false).implicit_value(true);
program.add_argument("--level").help("significance level").default_value(0.05).scan<'g', double>().action([](const std::string& value) { program.add_argument("--level").help("significance level").default_value(0.05).scan<'g', double>().action([](const std::string& value) {
try { try {
auto k = std::stod(value); auto k = std::stod(value);
@@ -29,23 +38,30 @@ void manageArguments(argparse::ArgumentParser& program)
int main(int argc, char** argv) int main(int argc, char** argv)
{ {
argparse::ArgumentParser program("b_best", { project_version.begin(), project_version.end() }); argparse::ArgumentParser program("b_best", { platform_project_version.begin(), platform_project_version.end() });
manageArguments(program); manageArguments(program);
std::string model, score; std::string model, dataset, score, folder;
bool build, report, friedman, excel; bool build, report, friedman, excel, tex, index;
double level; double level;
try { try {
program.parse_args(argc, argv); program.parse_args(argc, argv);
model = program.get<std::string>("model"); model = program.get<std::string>("model");
folder = program.get<std::string>("folder");
if (folder.back() != '/') {
folder += '/';
}
dataset = program.get<std::string>("dataset");
score = program.get<std::string>("score"); score = program.get<std::string>("score");
friedman = program.get<bool>("friedman"); friedman = program.get<bool>("friedman");
excel = program.get<bool>("excel"); excel = program.get<bool>("excel");
tex = program.get<bool>("tex");
index = program.get<bool>("index");
level = program.get<double>("level"); level = program.get<double>("level");
if (model == "" || score == "") { if (model == "" || score == "") {
throw std::runtime_error("Model and score name must be supplied"); throw std::runtime_error("Model and score name must be supplied");
} }
if (friedman && model != "any") { if (friedman && (model != "any" || dataset != "any")) {
std::cerr << "Friedman test can only be used with all models" << std::endl; std::cerr << "Friedman test can only be used with all models and all the datasets" << std::endl;
std::cerr << program; std::cerr << program;
exit(1); exit(1);
} }
@@ -56,15 +72,20 @@ int main(int argc, char** argv)
exit(1); exit(1);
} }
// Generate report // Generate report
auto results = platform::BestResults(platform::Paths::results(), score, model, friedman, level); auto results = platform::BestResults(folder, score, model, dataset, friedman, level);
if (model == "any") { if (model == "any") {
results.buildAll(); results.buildAll();
results.reportAll(excel); results.reportAll(excel, tex, index);
} else { } else {
std::string fileName = results.build(); std::string fileName = results.build();
std::cout << Colors::GREEN() << fileName << " created!" << Colors::RESET() << std::endl; std::cout << Colors::GREEN() << fileName << " created!" << Colors::RESET() << std::endl;
results.reportSingle(excel); results.reportSingle(excel);
} }
if (excel) {
auto fileName = results.getExcelFileName();
std::cout << "Opening " << fileName << std::endl;
platform::openFile(fileName);
}
std::cout << Colors::RESET(); std::cout << Colors::RESET();
return 0; return 0;
} }

View File

@@ -1,45 +1,50 @@
#include <iostream> #include <iostream>
#include <argparse/argparse.hpp> #include <argparse/argparse.hpp>
#include <map> #include <map>
#include <tuple>
#include <nlohmann/json.hpp> #include <nlohmann/json.hpp>
#include <mpi.h> #include <mpi.h>
#include "DotEnv.h" #include "main/Models.h"
#include "Models.h" #include "main/ArgumentsExperiment.h"
#include "modelRegister.h" #include "common/Paths.h"
#include "GridSearch.h" #include "common/Timer.hpp"
#include "Paths.h" #include "common/Colors.h"
#include "Timer.h" #include "common/DotEnv.h"
#include "Colors.h" #include "grid/GridSearch.h"
#include "config.h" #include "grid/GridExperiment.h"
#include "config_platform.h"
using json = nlohmann::json; using json = nlohmann::ordered_json;
const int MAXL = 133; const int MAXL = 133;
void assignModel(argparse::ArgumentParser& parser) void assignModel(argparse::ArgumentParser& parser)
{ {
auto models = platform::Models::instance(); auto models = platform::Models::instance();
parser.add_argument("-m", "--model") parser.add_argument("-m", "--model")
.help("Model to use " + models->tostring()) .help("Model to use " + models->toString())
.required() .required()
.action([models](const std::string& value) { .action([models](const std::string& value) {
static const std::vector<std::string> choices = models->getNames(); static const std::vector<std::string> choices = models->getNames();
if (find(choices.begin(), choices.end(), value) != choices.end()) { if (find(choices.begin(), choices.end(), value) != choices.end()) {
return value; return value;
} }
throw std::runtime_error("Model must be one of " + models->tostring()); throw std::runtime_error("Model must be one of " + models->toString());
} }
); );
} }
void add_compute_args(argparse::ArgumentParser& program) void add_search_args(argparse::ArgumentParser& program)
{ {
auto env = platform::DotEnv(); auto env = platform::DotEnv();
program.add_argument("--discretize").help("Discretize input datasets").default_value((bool)stoi(env.get("discretize"))).implicit_value(true); program.add_argument("--discretize").help("Discretize input datasets").default_value((bool)stoi(env.get("discretize"))).implicit_value(true);
program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value((bool)stoi(env.get("stratified"))).implicit_value(true); program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value((bool)stoi(env.get("stratified"))).implicit_value(true);
program.add_argument("--quiet").help("Don't display detailed progress").default_value(false).implicit_value(true); program.add_argument("--quiet").help("Don't display detailed progress").default_value(false).implicit_value(true);
program.add_argument("--continue").help("Continue computing from that dataset").default_value(platform::GridSearch::NO_CONTINUE()); program.add_argument("--continue").help("Continue computing from that dataset").default_value(platform::GridSearch::NO_CONTINUE());
program.add_argument("--only").help("Used with continue to compute that dataset only").default_value(false).implicit_value(true); program.add_argument("--only").help("Used with continue to search with that dataset only").default_value(false).implicit_value(true);
program.add_argument("--exclude").default_value("[]").help("Datasets to exclude in json format, e.g. [\"dataset1\", \"dataset2\"]"); program.add_argument("--exclude").default_value("[]").help("Datasets to exclude in json format, e.g. [\"dataset1\", \"dataset2\"]");
auto valid_choices = env.valid_tokens("smooth_strat");
auto& smooth_arg = program.add_argument("--smooth-strat").help("Smooth strategy used in Bayes Network node initialization. Valid values: " + env.valid_values("smooth_strat")).default_value(env.get("smooth_strat"));
for (auto choice : valid_choices) {
smooth_arg.choices(choice);
}
program.add_argument("--nested").help("Set the double/nested cross validation number of folds").default_value(5).scan<'i', int>().action([](const std::string& value) { program.add_argument("--nested").help("Set the double/nested cross validation number of folds").default_value(5).scan<'i', int>().action([](const std::string& value) {
try { try {
auto k = stoi(value); auto k = stoi(value);
@@ -93,21 +98,27 @@ void list_dump(std::string& model)
if (item.first.size() > max_dataset) { if (item.first.size() > max_dataset) {
max_dataset = item.first.size(); max_dataset = item.first.size();
} }
if (item.second.dump().size() > max_hyper) { for (auto const& [key, value] : item.second.items()) {
max_hyper = item.second.dump().size(); if (value.dump().size() > max_hyper) {
max_hyper = value.dump().size();
}
} }
} }
std::cout << Colors::GREEN() << left << " # " << left << setw(max_dataset) << "Dataset" << " #Com. " std::cout << Colors::GREEN() << left << " # " << left << setw(max_dataset) << "Dataset" << " #Com. "
<< setw(max_hyper) << "Hyperparameters" << std::endl; << setw(max_hyper) << "Hyperparameters" << std::endl;
std::cout << "=== " << string(max_dataset, '=') << " ===== " << string(max_hyper, '=') << std::endl; std::cout << "=== " << string(max_dataset, '=') << " ===== " << string(max_hyper, '=') << std::endl;
bool odd = true; int i = 0;
for (auto const& item : combinations) { for (auto const& item : combinations) {
auto color = odd ? Colors::CYAN() : Colors::BLUE(); auto color = (i++ % 2) ? Colors::CYAN() : Colors::BLUE();
std::cout << color; std::cout << color;
auto num_combinations = data.getNumCombinations(item.first); auto num_combinations = data.getNumCombinations(item.first);
std::cout << setw(3) << fixed << right << ++index << left << " " << setw(max_dataset) << item.first std::cout << setw(3) << fixed << right << ++index << left << " " << setw(max_dataset) << item.first
<< " " << setw(5) << right << num_combinations << " " << setw(max_hyper) << left << item.second.dump() << std::endl; << " " << setw(5) << right << num_combinations << " ";
odd = !odd; std::string prefix = "";
for (auto const& [key, value] : item.second.items()) {
std::cout << prefix << setw(max_hyper) << std::left << value.dump() << std::endl;
prefix = string(11 + max_dataset, ' ');
}
} }
std::cout << Colors::RESET() << std::endl; std::cout << Colors::RESET() << std::endl;
} }
@@ -127,7 +138,8 @@ void list_results(json& results, std::string& model)
std::cout << std::string(MAXL, '*') << std::endl; std::cout << std::string(MAXL, '*') << std::endl;
int spaces = 7; int spaces = 7;
int hyperparameters_spaces = 15; int hyperparameters_spaces = 15;
for (const auto& item : results["results"].items()) { nlohmann::json temp = results["results"]; // To show in alphabetical order of the dataset
for (const auto& item : temp.items()) {
auto key = item.key(); auto key = item.key();
auto value = item.value(); auto value = item.value();
if (key.size() > spaces) { if (key.size() > spaces) {
@@ -141,17 +153,15 @@ void list_results(json& results, std::string& model)
<< "Duration " << setw(8) << "Score" << " " << "Hyperparameters" << std::endl; << "Duration " << setw(8) << "Score" << " " << "Hyperparameters" << std::endl;
std::cout << "=== " << string(spaces, '=') << " " << string(19, '=') << " " << string(8, '=') << " " std::cout << "=== " << string(spaces, '=') << " " << string(19, '=') << " " << string(8, '=') << " "
<< string(8, '=') << " " << string(hyperparameters_spaces, '=') << std::endl; << string(8, '=') << " " << string(hyperparameters_spaces, '=') << std::endl;
bool odd = true;
int index = 0; int index = 0;
for (const auto& item : results["results"].items()) { for (const auto& item : temp.items()) {
auto color = odd ? Colors::CYAN() : Colors::BLUE(); auto color = (index % 2) ? Colors::CYAN() : Colors::BLUE();
auto value = item.value(); auto value = item.value();
std::cout << color; std::cout << color;
std::cout << std::setw(3) << std::right << index++ << " "; std::cout << std::setw(3) << std::right << index++ << " ";
std::cout << left << setw(spaces) << item.key() << " " << value["date"].get<string>() std::cout << left << setw(spaces) << item.key() << " " << value["date"].get<string>()
<< " " << setw(8) << right << value["duration"].get<string>() << " " << setw(8) << setprecision(6) << " " << setw(8) << right << value["duration"].get<string>() << " " << setw(8) << setprecision(6)
<< fixed << right << value["score"].get<double>() << " " << value["hyperparameters"].dump() << std::endl; << fixed << right << value["score"].get<double>() << " " << value["hyperparameters"].dump() << std::endl;
odd = !odd;
} }
std::cout << Colors::RESET() << std::endl; std::cout << Colors::RESET() << std::endl;
} }
@@ -177,13 +187,14 @@ void report(argparse::ArgumentParser& program)
list_results(results, config.model); list_results(results, config.model);
} }
} }
void compute(argparse::ArgumentParser& program) void search(argparse::ArgumentParser& program)
{ {
struct platform::ConfigGrid config; struct platform::ConfigGrid config;
config.model = program.get<std::string>("model"); config.model = program.get<std::string>("model");
config.score = program.get<std::string>("score"); config.score = program.get<std::string>("score");
config.discretize = program.get<bool>("discretize"); config.discretize = program.get<bool>("discretize");
config.stratified = program.get<bool>("stratified"); config.stratified = program.get<bool>("stratified");
config.smooth_strategy = program.get<std::string>("smooth-strat");
config.n_folds = program.get<int>("folds"); config.n_folds = program.get<int>("folds");
config.quiet = program.get<bool>("quiet"); config.quiet = program.get<bool>("quiet");
config.only = program.get<bool>("only"); config.only = program.get<bool>("only");
@@ -195,9 +206,6 @@ void compute(argparse::ArgumentParser& program)
} }
auto excluded = program.get<std::string>("exclude"); auto excluded = program.get<std::string>("exclude");
config.excluded = json::parse(excluded); config.excluded = json::parse(excluded);
auto env = platform::DotEnv();
config.platform = env.get("platform");
platform::Paths::createPath(platform::Paths::grid()); platform::Paths::createPath(platform::Paths::grid());
auto grid_search = platform::GridSearch(config); auto grid_search = platform::GridSearch(config);
platform::Timer timer; platform::Timer timer;
@@ -208,22 +216,54 @@ void compute(argparse::ArgumentParser& program)
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_config.rank); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_config.rank);
MPI_Comm_size(MPI_COMM_WORLD, &mpi_config.n_procs); MPI_Comm_size(MPI_COMM_WORLD, &mpi_config.n_procs);
if (mpi_config.n_procs < 2) { if (mpi_config.n_procs < 2) {
throw std::runtime_error("Cannot use --compute with less than 2 mpi processes, try mpirun -np 2 ..."); throw std::runtime_error("Cannot use --search with less than 2 mpi processes, try mpirun -np 2 ...");
} }
grid_search.go(mpi_config); grid_search.go(mpi_config);
if (mpi_config.rank == mpi_config.manager) { if (mpi_config.rank == mpi_config.manager) {
auto results = grid_search.loadResults(); auto results = grid_search.loadResults();
std::cout << Colors::RESET() << "* Report of the computed hyperparameters" << std::endl;
list_results(results, config.model); list_results(results, config.model);
std::cout << "Process took " << timer.getDurationString() << std::endl; std::cout << "Process took " << timer.getDurationString() << std::endl;
} }
MPI_Finalize(); MPI_Finalize();
} }
/**
 * Run the grid experiment over MPI and, on the manager process, store and
 * report its outcome.
 *
 * @param program parser of the "experiment" subcommand, already parsed.
 * @throws std::runtime_error when fewer than 2 MPI processes are available.
 */
void experiment(argparse::ArgumentParser& program)
{
    struct platform::ConfigGrid config;
    auto arguments = platform::ArgumentsExperiment(program, platform::experiment_t::GRID);
    arguments.parse();
    auto path_results = arguments.getPathResults();
    auto grid_experiment = platform::GridExperiment(arguments, config);
    platform::Timer timer;
    timer.start();
    struct platform::ConfigMPI mpi_config;
    mpi_config.manager = 0; // which process is the manager
    MPI_Init(nullptr, nullptr);
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_config.rank);
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_config.n_procs);
    if (mpi_config.n_procs < 2) {
        // MPI was already initialised above: shut it down before bailing out
        // so the process does not exit with a live MPI environment.
        MPI_Finalize();
        throw std::runtime_error("Cannot use --experiment with less than 2 mpi processes, try mpirun -np 2 ...");
    }
    grid_experiment.go(mpi_config);
    // Only the manager process saves and reports the results
    if (mpi_config.rank == mpi_config.manager) {
        auto experiment = grid_experiment.getExperiment();
        std::cout << "* Report of the computed hyperparameters" << std::endl;
        auto duration = timer.getDuration();
        experiment.setDuration(duration);
        if (grid_experiment.haveToSaveResults()) {
            experiment.saveResult(path_results);
        }
        experiment.report();
        std::cout << "Process took " << duration << std::endl;
    }
    MPI_Finalize();
}
int main(int argc, char** argv) int main(int argc, char** argv)
{ {
// //
// Manage arguments // Manage arguments
// //
argparse::ArgumentParser program("b_grid", { project_version.begin(), project_version.end() }); argparse::ArgumentParser program("b_grid", { platform_project_version.begin(), platform_project_version.end() });
// grid dump subparser // grid dump subparser
argparse::ArgumentParser dump_command("dump"); argparse::ArgumentParser dump_command("dump");
dump_command.add_description("Dump the combinations of hyperparameters of a model."); dump_command.add_description("Dump the combinations of hyperparameters of a model.");
@@ -234,15 +274,21 @@ int main(int argc, char** argv)
assignModel(report_command); assignModel(report_command);
report_command.add_description("Report the computed hyperparameters of a model."); report_command.add_description("Report the computed hyperparameters of a model.");
// grid compute subparser // grid search subparser
argparse::ArgumentParser compute_command("compute"); argparse::ArgumentParser search_command("search");
compute_command.add_description("Compute using mpi the hyperparameters of a model."); search_command.add_description("Search using mpi the hyperparameters of a model.");
assignModel(compute_command); assignModel(search_command);
add_compute_args(compute_command); add_search_args(search_command);
// grid experiment subparser
argparse::ArgumentParser experiment_command("experiment");
experiment_command.add_description("Experiment like b_main using mpi.");
auto arguments = platform::ArgumentsExperiment(experiment_command, platform::experiment_t::GRID);
arguments.add_arguments();
program.add_subparser(dump_command); program.add_subparser(dump_command);
program.add_subparser(report_command); program.add_subparser(report_command);
program.add_subparser(compute_command); program.add_subparser(search_command);
program.add_subparser(experiment_command);
// //
// Process options // Process options
@@ -250,7 +296,7 @@ int main(int argc, char** argv)
try { try {
program.parse_args(argc, argv); program.parse_args(argc, argv);
bool found = false; bool found = false;
map<std::string, void(*)(argparse::ArgumentParser&)> commands = { {"dump", &dump}, {"report", &report}, {"compute", &compute} }; map<std::string, void(*)(argparse::ArgumentParser&)> commands = { {"dump", &dump}, {"report", &report}, {"search", &search}, { "experiment",&experiment } };
for (const auto& command : commands) { for (const auto& command : commands) {
if (program.is_subcommand_used(command.first)) { if (program.is_subcommand_used(command.first)) {
std::invoke(command.second, program.at<argparse::ArgumentParser>(command.first)); std::invoke(command.second, program.at<argparse::ArgumentParser>(command.first));
@@ -259,7 +305,7 @@ int main(int argc, char** argv)
} }
} }
if (!found) { if (!found) {
throw std::runtime_error("You must specify one of the following commands: dump, report, compute, export\n"); throw std::runtime_error("You must specify one of the following commands: dump, experiment, report, search \n");
} }
} }
catch (const exception& err) { catch (const exception& err) {

119
src/commands/b_list.cpp Normal file
View File

@@ -0,0 +1,119 @@
#include <iostream>
#include <locale>
#include <map>
#include <argparse/argparse.hpp>
#include <nlohmann/json.hpp>
#include "main/Models.h"
#include "main/modelRegister.h"
#include "common/Paths.h"
#include "common/Colors.h"
#include "common/Datasets.h"
#include "common/Utils.h"
#include "reports/DatasetsExcel.h"
#include "reports/DatasetsConsole.h"
#include "results/ResultsDatasetConsole.h"
#include "results/ResultsDataset.h"
#include "results/ResultsDatasetExcel.h"
#include "config_platform.h"
/**
 * Print the datasets report on the console and, when --excel was given,
 * also generate the Excel report and open the resulting file.
 *
 * @param program parser of the "datasets" subcommand, already parsed.
 */
void list_datasets(argparse::ArgumentParser& program)
{
    auto excel = program.get<bool>("excel");
    auto report = platform::DatasetsConsole();
    report.report();
    std::cout << report.getOutput();
    if (!excel) {
        return;
    }
    // The Excel report lives on the stack inside this lambda, so it is
    // destroyed before the file is opened (as the explicit delete did before)
    // and it is also released if anything in between throws.
    auto fileName = [&report]() {
        auto data = report.getData();
        platform::DatasetsExcel ereport;
        ereport.report(data);
        std::cout << std::endl << Colors::GREEN() << "Output saved in " << ereport.getFileName() << std::endl;
        return ereport.getExcelFileName();
    }();
    std::cout << "Opening " << fileName << std::endl;
    platform::openFile(fileName);
}
/**
 * Print the results of one dataset on the console and, when --excel was
 * given, also generate the Excel report and open the resulting file.
 * Nothing else is done when the console report returns false.
 *
 * @param program parser of the "results" subcommand, already parsed.
 */
void list_results(argparse::ArgumentParser& program)
{
    auto dataset = program.get<std::string>("dataset");
    auto score = program.get<std::string>("score");
    auto model = program.get<std::string>("model");
    auto excel = program.get<bool>("excel");
    auto report = platform::ResultsDatasetsConsole();
    if (!report.report(dataset, score, model)) {
        return;
    }
    std::cout << report.getOutput();
    if (!excel) {
        return;
    }
    // The Excel report lives on the stack inside this lambda, so it is
    // destroyed before the file is opened (as the explicit delete did before)
    // and it is also released if anything in between throws.
    auto fileName = [&report]() {
        auto data = report.getData();
        platform::ResultsDatasetExcel ereport;
        ereport.report(data);
        std::cout << std::endl << Colors::GREEN() << "Output saved in " << ereport.getFileName() << std::endl;
        return ereport.getExcelFileName();
    }();
    std::cout << "Opening " << fileName << std::endl;
    platform::openFile(fileName);
}
int main(int argc, char** argv)
{
argparse::ArgumentParser program("b_list", { platform_project_version.begin(), platform_project_version.end() });
//
// datasets subparser
//
argparse::ArgumentParser datasets_command("datasets");
datasets_command.add_description("List datasets available in the platform.");
datasets_command.add_argument("--excel").help("Output in Excel format").default_value(false).implicit_value(true);
//
// results subparser
//
argparse::ArgumentParser results_command("results");
results_command.add_description("List the results of a given dataset.");
auto datasets = platform::Datasets(false, platform::Paths::datasets());
results_command.add_argument("-d", "--dataset")
.help("Dataset to use " + datasets.toString())
.required()
.action([](const std::string& value) {
auto datasets = platform::Datasets(false, platform::Paths::datasets());
static const std::vector<std::string> choices = datasets.getNames();
if (find(choices.begin(), choices.end(), value) != choices.end()) {
return value;
}
throw std::runtime_error("Dataset must be one of " + datasets.toString());
}
);
results_command.add_argument("-m", "--model")
.help("Model to use or any")
.default_value("any");
results_command.add_argument("--excel").help("Output in Excel format").default_value(false).implicit_value(true);
results_command.add_argument("-s", "--score").default_value("accuracy").help("Filter results of the score name supplied");
// Add subparsers
program.add_subparser(datasets_command);
program.add_subparser(results_command);
// Parse command line and execute
try {
program.parse_args(argc, argv);
bool found = false;
map<std::string, void(*)(argparse::ArgumentParser&)> commands = { {"datasets", &list_datasets}, {"results", &list_results} };
for (const auto& command : commands) {
if (program.is_subcommand_used(command.first)) {
std::invoke(command.second, program.at<argparse::ArgumentParser>(command.first));
found = true;
break;
}
}
if (!found) {
throw std::runtime_error("You must specify one of the following commands: {datasets, results}\n");
}
}
catch (const exception& err) {
cerr << err.what() << std::endl;
cerr << program;
exit(1);
}
std::cout << Colors::RESET() << std::endl;
return 0;
}

37
src/commands/b_main.cpp Normal file
View File

@@ -0,0 +1,37 @@
#include <argparse/argparse.hpp>
#include "main/Experiment.h"
#include "main/ArgumentsExperiment.h"
#include "config_platform.h"
using json = nlohmann::ordered_json;
/**
 * b_main entry point: parses the command line, runs the experiment and
 * then reports, saves and graphs it as requested by the options.
 */
int main(int argc, char** argv)
{
    // Command line handling for a NORMAL experiment
    argparse::ArgumentParser program("b_main", { platform_project_version.begin(), platform_project_version.end() });
    auto arguments = platform::ArgumentsExperiment(program, platform::experiment_t::NORMAL);
    arguments.add_arguments();
    arguments.parse_args(argc, argv);

    // Build the experiment from the parsed options
    auto experiment = arguments.initializedExperiment();
    auto path_results = arguments.getPathResults();

    // Run it and record how long it took
    platform::Timer timer;
    timer.start();
    experiment.go();
    auto elapsed = timer.getDuration();
    experiment.setDuration(elapsed);

    // Console report is skipped in quiet mode
    const bool verbose = !arguments.isQuiet();
    if (verbose) {
        experiment.report();
    }
    if (arguments.haveToSaveResults()) {
        experiment.saveResult(path_results);
    }
    if (arguments.doGraph()) {
        experiment.saveGraph();
    }
    return 0;
}

85
src/commands/b_manage.cpp Normal file
View File

@@ -0,0 +1,85 @@
#include <utility>
#include <iostream>
#include <sys/ioctl.h>
#include "common/Paths.h"
#include <argparse/argparse.hpp>
#include "manage/ManageScreen.h"
#include <signal.h>
#include "config_platform.h"
platform::ManageScreen* manager = nullptr;
/**
 * Declare the b_manage command line options and parse the command line.
 * On a parsing error the message and the usage are printed to stderr and
 * the process exits with status 1.
 *
 * @param program parser to configure and fill.
 * @param argc    argument count as received by main.
 * @param argv    argument vector as received by main.
 */
void manageArguments(argparse::ArgumentParser& program, int argc, char** argv)
{
    program.add_argument("-m", "--model").default_value("any").help("Filter results of the selected model");
    program.add_argument("-s", "--score").default_value("any").help("Filter results of the score name supplied");
    program.add_argument("--folder").help("Results folder to use").default_value(platform::Paths::results());
    program.add_argument("--platform").default_value("any").help("Filter results of the selected platform");
    program.add_argument("--complete").help("Show only results with all datasets").default_value(false).implicit_value(true);
    program.add_argument("--partial").help("Show only partial results").default_value(false).implicit_value(true);
    program.add_argument("--compare").help("Compare with best results").default_value(false).implicit_value(true);
    try {
        // Every option has a default value; the caller reads the parsed
        // values from the parser afterwards.
        program.parse_args(argc, argv);
    }
    catch (const std::exception& err) {
        std::cerr << err.what() << std::endl;
        std::cerr << program;
        exit(1);
    }
}
/**
 * Query the size of the terminal attached to standard input.
 *
 * @return {rows, columns}. When the size cannot be obtained (the ioctl
 *         fails, e.g. stdin is not a terminal, reports a zero dimension, or
 *         no suitable ioctl request exists) a 24x80 default is returned
 *         instead of uninitialised values.
 */
std::pair<int, int> numRowsCols()
{
    constexpr int default_rows = 24;
    constexpr int default_cols = 80;
    int rows = 0;
    int cols = 0;
#ifdef TIOCGSIZE
    struct ttysize ts {};
    if (ioctl(STDIN_FILENO, TIOCGSIZE, &ts) == 0) {
        rows = ts.ts_lines;
        cols = ts.ts_cols;
    }
#elif defined(TIOCGWINSZ)
    struct winsize ts {};
    if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ts) == 0) {
        rows = ts.ws_row;
        cols = ts.ws_col;
    }
#endif /* TIOCGSIZE */
    if (rows <= 0 || cols <= 0) {
        return { default_rows, default_cols };
    }
    return { rows, cols };
}
/**
 * SIGWINCH handler: forwards the new terminal size to the global screen
 * manager. Does nothing while no manager is registered, so a signal that
 * arrives before the manager is created cannot dereference a null pointer.
 *
 * NOTE(review): updateSize() runs in signal context; confirm it only does
 * work that is safe there.
 */
void handleResize([[maybe_unused]] int sig)
{
    if (manager == nullptr) {
        return;
    }
    auto [rows, cols] = numRowsCols();
    manager->updateSize(rows, cols);
}
/**
 * b_manage entry point: parses the options, runs the interactive results
 * manager and, if it produced an Excel file, opens it afterwards.
 */
int main(int argc, char** argv)
{
    auto program = argparse::ArgumentParser("b_manage", { platform_project_version.begin(), platform_project_version.end() });
    manageArguments(program, argc, argv);
    std::string model = program.get<std::string>("model");
    std::string path = program.get<std::string>("folder");
    // back() on an empty string is undefined, so only normalise real paths
    if (!path.empty() && path.back() != '/') {
        path += '/';
    }
    std::string score = program.get<std::string>("score");
    std::string platform = program.get<std::string>("platform");
    bool complete = program.get<bool>("complete");
    bool partial = program.get<bool>("partial");
    bool compare = program.get<bool>("compare");
    // --complete takes precedence over --partial
    if (complete)
        partial = false;
    auto [rows, cols] = numRowsCols();
    manager = new platform::ManageScreen(path, rows, cols, model, score, platform, complete, partial, compare);
    // Install the resize handler only once the manager it uses exists
    signal(SIGWINCH, handleResize);
    manager->doMenu();
    auto fileName = manager->getExcelFileName();
    // Detach the handler before destroying the manager so a late resize
    // signal cannot touch a dangling pointer.
    signal(SIGWINCH, SIG_DFL);
    delete manager;
    manager = nullptr;
    if (!fileName.empty()) {
        std::cout << "Opening " << fileName << std::endl;
        platform::openFile(fileName);
    }
    return 0;
}

102
src/commands/b_results.cpp Normal file
View File

@@ -0,0 +1,102 @@
#include <iostream>
#include <filesystem>
#include <fstream>
#include <vector>
#include "nlohmann/json.hpp"
#include "argparse/argparse.hpp"
#include "common/Paths.h"
#include "results/JsonValidator.h"
#include "results/SchemaV1_0.h"
#include "config_platform.h"
using json = nlohmann::json;
namespace fs = std::filesystem;
/**
 * Print a three line banner on std::cout: a border, the message framed by
 * the symbol, and the border again.
 *
 * @param message text shown inside the banner, left aligned.
 * @param length  reference width; the border is length + 11 characters wide
 *                and the message is padded to length + 7.
 * @param symbol  frame string; its first character builds the border.
 */
void header(const std::string& message, int length, const std::string& symbol)
{
    const std::string border(length + 11, symbol[0]);
    std::cout << border << std::endl
        << symbol << " " << std::setw(length + 7) << std::left << message << " " << symbol << std::endl
        << border << std::endl;
}
int main(int argc, char* argv[])
{
argparse::ArgumentParser program("b_results", { platform_project_version.begin(), platform_project_version.end() });
program.add_description("Check the results files and optionally fixes them.");
program.add_argument("--fix").help("Fix any errors in results").default_value(false).implicit_value(true);
program.add_argument("--file").help("check only this results file").default_value("");
std::string nameSuffix = "results_";
std::string schemaVersion = "1.0";
bool fix_it = false;
std::string selected_file;
try {
program.parse_args(argc, argv);
fix_it = program.get<bool>("fix");
selected_file = program.get<std::string>("file");
}
catch (const std::exception& err) {
std::cerr << err.what() << std::endl;
std::cerr << program;
exit(1);
}
//
// Determine the files to process
//
std::vector<std::string> result_files;
int max_length = 0;
if (selected_file != "") {
if (!selected_file.starts_with(platform::Paths::results())) {
selected_file = platform::Paths::results() + selected_file;
}
// Only check the selected file
result_files.push_back(selected_file);
max_length = selected_file.length();
} else {
// Load the result files and find the longest file name
for (const auto& entry : fs::directory_iterator(platform::Paths::results())) {
if (entry.is_regular_file() && entry.path().filename().string().starts_with(nameSuffix) && entry.path().filename().string().ends_with(".json")) {
std::string fileName = entry.path().string();
if (fileName.length() > max_length) {
max_length = fileName.length();
}
result_files.push_back(fileName);
}
}
}
//
// Process the results files
//
if (result_files.empty()) {
std::cerr << "Error: No result files found." << std::endl;
return 1;
}
std::string header_message = "Processing " + std::to_string(result_files.size()) + " result files.";
header(header_message, max_length, "*");
platform::JsonValidator validator(platform::SchemaV1_0::schema);
int n_errors = 0;
std::vector<std::string> files_with_errors;
for (const auto& file_name : result_files) {
std::vector<std::string> errors = validator.validate_file(file_name);
if (!errors.empty()) {
n_errors++;
std::cout << std::setw(max_length) << std::left << file_name << ": " << errors.size() << " Errors:" << std::endl;
for (const auto& error : errors) {
std::cout << " - " << error << std::endl;
}
if (fix_it) {
validator.fix_it(file_name);
std::cout << " -> File fixed." << std::endl;
}
files_with_errors.push_back(file_name);
}
}
if (n_errors == 0) {
header("All files are valid.", max_length, "*");
} else {
std::string $verb = (fix_it) ? "had" : "have";
std::string msg = std::to_string(n_errors) + " files " + $verb + " errors.";
header(msg, max_length, "*");
for (const auto& file_name : files_with_errors) {
std::cout << "- " << file_name << std::endl;
}
}
return 0;
}

View File

@@ -1,5 +1,5 @@
#ifndef LOCALE_H #ifndef CLOCALE_H
#define LOCALE_H #define CLOCALE_H
#include <locale> #include <locale>
#include <iostream> #include <iostream>
#include <string> #include <string>

View File

@@ -1,15 +1,30 @@
#ifndef COLORS_H #ifndef COLORS_H
#define COLORS_H #define COLORS_H
#include <string>
class Colors { class Colors {
public: public:
static std::string MAGENTA() { return "\033[1;35m"; } static std::string BLACK() { return "\033[1;30m"; }
static std::string IBLACK() { return "\033[0;90m"; }
static std::string BLUE() { return "\033[1;34m"; } static std::string BLUE() { return "\033[1;34m"; }
static std::string CYAN() { return "\033[1;36m"; }
static std::string GREEN() { return "\033[1;32m"; }
static std::string YELLOW() { return "\033[1;33m"; }
static std::string RED() { return "\033[1;31m"; }
static std::string WHITE() { return "\033[1;37m"; }
static std::string IBLUE() { return "\033[0;94m"; } static std::string IBLUE() { return "\033[0;94m"; }
static std::string CYAN() { return "\033[1;36m"; }
static std::string ICYAN() { return "\033[0;96m"; }
static std::string GREEN() { return "\033[1;32m"; }
static std::string IGREEN() { return "\033[0;92m"; }
static std::string MAGENTA() { return "\033[1;35m"; }
static std::string IMAGENTA() { return "\033[0;95m"; }
static std::string RED() { return "\033[1;31m"; }
static std::string IRED() { return "\033[0;91m"; }
static std::string YELLOW() { return "\033[1;33m"; }
static std::string IYELLOW() { return "\033[0;93m"; }
static std::string WHITE() { return "\033[1;37m"; }
static std::string IWHITE() { return "\033[0;97m"; }
static std::string RESET() { return "\033[0m"; } static std::string RESET() { return "\033[0m"; }
static std::string BOLD() { return "\033[1m"; }
static std::string UNDERLINE() { return "\033[4m"; }
static std::string BLINK() { return "\033[5m"; }
static std::string REVERSE() { return "\033[7m"; }
static std::string CONCEALED() { return "\033[8m"; }
static std::string CLRSCR() { return "\033[2J\033[1;1H"; }
}; };
#endif // COLORS_H #endif

View File

@@ -1,215 +0,0 @@
#include "Dataset.h"
#include "ArffFiles.h"
#include <fstream>
namespace platform {
// Copy constructor: member-wise copy of every field of the source dataset.
Dataset::Dataset(const Dataset& dataset)
    : path(dataset.path),
    name(dataset.name),
    className(dataset.className),
    n_samples(dataset.n_samples),
    n_features(dataset.n_features),
    features(dataset.features),
    states(dataset.states),
    loaded(dataset.loaded),
    discretize(dataset.discretize),
    X(dataset.X),
    y(dataset.y),
    Xv(dataset.Xv),
    Xd(dataset.Xd),
    yv(dataset.yv),
    fileType(dataset.fileType)
{
}
std::string Dataset::getName() const
{
return name;
}
std::string Dataset::getClassName() const
{
return className;
}
// Feature names of the dataset.
// Throws std::invalid_argument if the dataset has not been loaded.
std::vector<std::string> Dataset::getFeatures() const
{
    if (!loaded) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return features;
}
// Number of features of the dataset.
// Throws std::invalid_argument if the dataset has not been loaded.
int Dataset::getNFeatures() const
{
    if (!loaded) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return n_features;
}
// Number of samples of the dataset.
// Throws std::invalid_argument if the dataset has not been loaded.
int Dataset::getNSamples() const
{
    if (!loaded) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return n_samples;
}
// Map from variable name to its list of states.
// Throws std::invalid_argument if the dataset has not been loaded.
std::map<std::string, std::vector<int>> Dataset::getStates() const
{
    if (!loaded) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return states;
}
// References to the float feature vectors (Xv) and the labels (yv).
// Throws std::invalid_argument if the dataset has not been loaded.
pair<std::vector<std::vector<float>>&, std::vector<int>&> Dataset::getVectors()
{
    if (!loaded) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return { Xv, yv };
}
// References to the discretized feature vectors (Xd) and the labels (yv).
// Throws std::invalid_argument if the dataset has not been loaded.
pair<std::vector<std::vector<int>>&, std::vector<int>&> Dataset::getVectorsDiscretized()
{
    if (!loaded) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return { Xd, yv };
}
// Calls buildTensors() and returns references to the X and y tensors.
// Throws std::invalid_argument if the dataset has not been loaded.
pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
{
    if (!loaded) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    buildTensors();
    return { X, y };
}
// Reads "<path>/<name>.csv".
// The header row supplies the feature names (every column but the last); each following
// row is parsed into Xv (one vector per feature) and yv (last column, parsed as int).
// Throws std::invalid_argument if the file cannot be opened.
void Dataset::load_csv()
{
    ifstream file(path + "/" + name + ".csv");
    if (!file.is_open()) {
        throw std::invalid_argument("Unable to open dataset file.");
    }
    std::string line;
    getline(file, line);
    std::vector<std::string> tokens = split(line, ',');
    features = std::vector<std::string>(tokens.begin(), tokens.end() - 1);
    // A class name of "-1" means "take the last header column as the class name".
    if (className == "-1") {
        className = tokens.back();
    }
    const auto columns = features.size();
    for (std::size_t col = 0; col < columns; ++col) {
        Xv.push_back(std::vector<float>());
    }
    while (getline(file, line)) {
        tokens = split(line, ',');
        for (std::size_t col = 0; col < columns; ++col) {
            Xv[col].push_back(stof(tokens[col]));
        }
        yv.push_back(stoi(tokens.back()));
    }
    file.close();
}
// Fills `states` with the value range of every discretized feature and of the class:
// each entry becomes the sequence 0, 1, ..., max, where max is the largest value found
// in Xd[i] (for features) or in yv (for the class).
void Dataset::computeStates()
{
    for (std::size_t i = 0; i < features.size(); ++i) {
        states[features[i]] = std::vector<int>(*std::max_element(Xd[i].begin(), Xd[i].end()) + 1);
        // Bind by reference: filling a copy would leave the stored states all zero.
        auto& item = states.at(features[i]);
        std::iota(item.begin(), item.end(), 0);
    }
    states[className] = std::vector<int>(*std::max_element(yv.begin(), yv.end()) + 1);
    auto& classStates = states.at(className);
    std::iota(classStates.begin(), classStates.end(), 0);
}
// Loads "<path>/<name>.arff" through ArffFiles and copies out the samples, labels,
// class name and attribute names (appended to `features`).
void Dataset::load_arff()
{
    auto arff = ArffFiles();
    const std::string fileName = path + "/" + name + ".arff";
    arff.load(fileName, className);
    // Samples and labels
    Xv = arff.getX();
    yv = arff.getY();
    // Class name and feature names
    className = arff.getClassName();
    auto attributes = arff.getAttributes();
    for (const auto& attribute : attributes) {
        features.push_back(attribute.first);
    }
}
// Splits `line` on runs of blanks (space, tab, newline) and returns the pieces.
// A line that starts with a blank yields one leading empty token; trailing blanks
// produce no extra token.
std::vector<std::string> tokenize(std::string line)
{
    static const char* const blanks = " \t\n";
    std::vector<std::string> pieces;
    if (!line.empty() && line.find_first_of(blanks) == 0) {
        pieces.push_back(std::string());
    }
    std::string::size_type from = line.find_first_not_of(blanks);
    while (from != std::string::npos) {
        const std::string::size_type to = line.find_first_of(blanks, from);
        if (to == std::string::npos) {
            pieces.push_back(line.substr(from));
            break;
        }
        pieces.push_back(line.substr(from, to - from));
        from = line.find_first_not_of(blanks, to);
    }
    return pieces;
}
// Reads "<path>/<name>_R.dat", a blank-separated table.
// The header row lists the feature names followed by the class name. Every data row
// starts with an instance number (skipped), then the feature values, then the label.
// Throws std::invalid_argument if the file cannot be opened.
void Dataset::load_rdata()
{
    ifstream file(path + "/" + name + "_R.dat");
    if (!file.is_open()) {
        throw std::invalid_argument("Unable to open dataset file.");
    }
    std::string line;
    getline(file, line);
    line = ArffFiles::trim(line);
    std::vector<std::string> tokens = tokenize(line);
    for (auto it = tokens.begin(); it != tokens.end() - 1; ++it) {
        features.push_back(ArffFiles::trim(*it));
    }
    if (className == "-1") {
        className = ArffFiles::trim(tokens.back());
    }
    const auto columns = features.size();
    for (std::size_t col = 0; col < columns; ++col) {
        Xv.push_back(std::vector<float>());
    }
    while (getline(file, line)) {
        tokens = tokenize(line);
        // Token 0 is the instance number, so feature `col` lives at token `col + 1`.
        for (std::size_t col = 0; col < columns; ++col) {
            const float value = stof(tokens[col + 1]);
            Xv[col].push_back(value);
        }
        yv.push_back(stoi(tokens.back()));
    }
    file.close();
}
// Loads the dataset from disk once (later calls are no-ops), dispatching on fileType,
// optionally discretizes it, and records the sample/feature counts.
// Throws std::invalid_argument if no feature column was loaded, or whatever the
// format-specific loader throws.
void Dataset::load()
{
    if (loaded) {
        return;
    }
    if (fileType == CSV) {
        load_csv();
    } else if (fileType == ARFF) {
        load_arff();
    } else if (fileType == RDATA) {
        load_rdata();
    }
    // Xv[0] is read below; fail clearly instead of indexing an empty vector.
    if (Xv.empty()) {
        throw std::invalid_argument("Dataset has no features.");
    }
    if (discretize) {
        Xd = discretizeDataset(Xv, yv);
        computeStates();
    }
    n_samples = Xv[0].size();
    n_features = Xv.size();
    loaded = true;
}
// Builds X as an (n_features x n_samples) tensor, Int32 from Xd when `discretize`
// is set and Float32 from Xv otherwise, and y as an Int32 tensor from yv.
void Dataset::buildTensors()
{
    const auto dtype = discretize ? torch::kInt32 : torch::kFloat32;
    X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, dtype);
    for (int i = 0; i < features.size(); ++i) {
        // Row i of X holds every sample of feature i.
        auto row = discretize ? torch::tensor(Xd[i], torch::kInt32) : torch::tensor(Xv[i], torch::kFloat32);
        X.index_put_({ i, "..." }, row);
    }
    y = torch::tensor(yv, torch::kInt32);
}
// Fits one CPPFImdlp discretizer per feature column of X (reusing the same object)
// and returns the transformed columns in the same order.
std::vector<mdlp::labels_t> Dataset::discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y)
{
    std::vector<mdlp::labels_t> Xd;
    auto fimdlp = mdlp::CPPFImdlp();
    for (auto& column : X) {
        fimdlp.fit(column, y);
        // push_back copies the result before the next fit() can overwrite it.
        Xd.push_back(fimdlp.transform(column));
    }
    return Xd;
}
}

278
src/common/Dataset.cpp Normal file
View File

@@ -0,0 +1,278 @@
#include <ArffFiles.hpp>
#include <fstream>
#include "Dataset.h"
namespace platform {
const std::string message_dataset_not_loaded = "Dataset not loaded.";
// Copy constructor: copies every data member of `dataset`.
// This includes the members the previous version left default-initialized (labels,
// numericFeaturesIdx, discretizer_algorithm, y_train, y_test), so that a copy keeps
// its class labels and can still discretize train/test splits.
Dataset::Dataset(const Dataset& dataset) :
    path(dataset.path), name(dataset.name), fileType(dataset.fileType), className(dataset.className),
    n_samples(dataset.n_samples), n_features(dataset.n_features),
    numericFeaturesIdx(dataset.numericFeaturesIdx), discretizer_algorithm(dataset.discretizer_algorithm),
    numericFeatures(dataset.numericFeatures), features(dataset.features), labels(dataset.labels),
    states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize),
    X(dataset.X), y(dataset.y),
    X_train(dataset.X_train), X_test(dataset.X_test), y_train(dataset.y_train), y_test(dataset.y_test),
    Xv(dataset.Xv), yv(dataset.yv)
{
}
std::string Dataset::getName() const
{
return name;
}
// Returns a copy of the feature names.
// Throws std::invalid_argument when the dataset has not been loaded yet.
std::vector<std::string> Dataset::getFeatures() const
{
    if (!loaded) {
        throw std::invalid_argument(message_dataset_not_loaded);
    }
    return features;
}
// Returns the number of features.
// Throws std::invalid_argument when the dataset has not been loaded yet.
int Dataset::getNFeatures() const
{
    if (!loaded) {
        throw std::invalid_argument(message_dataset_not_loaded);
    }
    return n_features;
}
// Returns the number of samples.
// Throws std::invalid_argument when the dataset has not been loaded yet.
int Dataset::getNSamples() const
{
    if (!loaded) {
        throw std::invalid_argument(message_dataset_not_loaded);
    }
    return n_samples;
}
std::string Dataset::getClassName() const
{
return className;
}
// Returns the number of classes, computed as (largest label in yv) + 1.
// Returns 0 when the dataset holds no samples, instead of dereferencing the end
// iterator that std::max_element returns for an empty range.
// Throws std::invalid_argument when the dataset has not been loaded yet.
int Dataset::getNClasses() const
{
    if (!loaded) {
        throw std::invalid_argument(message_dataset_not_loaded);
    }
    if (yv.empty()) {
        return 0;
    }
    return *std::max_element(yv.begin(), yv.end()) + 1;
}
// Returns a copy of the class labels collected while loading (the labels
// factorization result).
// Throws std::invalid_argument when the dataset has not been loaded yet.
std::vector<std::string> Dataset::getLabels() const
{
    if (!loaded) {
        throw std::invalid_argument(message_dataset_not_loaded);
    }
    return labels;
}
// Returns how many samples carry each class value: counts[c] is the number of
// entries of yv equal to c, for c in 0..max(yv).
// Returns an empty vector when the dataset holds no samples, instead of dereferencing
// the end iterator that std::max_element returns for an empty range.
// Assumes labels are non-negative, since they are used directly as indices.
// Throws std::invalid_argument when the dataset has not been loaded yet.
std::vector<int> Dataset::getClassesCounts() const
{
    if (!loaded) {
        throw std::invalid_argument(message_dataset_not_loaded);
    }
    if (yv.empty()) {
        return {};
    }
    std::vector<int> counts(*std::max_element(yv.begin(), yv.end()) + 1);
    for (auto label : yv) {
        ++counts[label];
    }
    return counts;
}
// Returns a copy of the name -> state-values map.
// Throws std::invalid_argument when the dataset has not been loaded yet.
std::map<std::string, std::vector<int>> Dataset::getStates() const
{
    if (!loaded) {
        throw std::invalid_argument(message_dataset_not_loaded);
    }
    return states;
}
// Returns references to the raw feature vectors (Xv) and the label vector (yv).
// Throws std::invalid_argument when the dataset has not been loaded yet.
pair<std::vector<std::vector<float>>&, std::vector<int>&> Dataset::getVectors()
{
    if (!loaded) {
        throw std::invalid_argument(message_dataset_not_loaded);
    }
    return { Xv, yv };
}
// Returns references to the X / y tensors built by load().
// Throws std::invalid_argument when the dataset has not been loaded yet.
pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
{
    if (!loaded) {
        throw std::invalid_argument(message_dataset_not_loaded);
    }
    return { X, y };
}
// Reads "<path>/<name>.csv".
// The header row supplies the feature names (every column but the last). Each following
// row is parsed into Xv (one vector per feature) and yv (last column, trimmed and parsed
// as int). `labels` collects the distinct trimmed label strings in order of first
// appearance.
// Throws std::invalid_argument if the file cannot be opened.
void Dataset::load_csv()
{
    ifstream file(path + "/" + name + ".csv");
    if (!file.is_open()) {
        throw std::invalid_argument("Unable to open dataset file.");
    }
    labels.clear();
    std::string line;
    getline(file, line);
    std::vector<std::string> tokens = split(line, ',');
    features = std::vector<std::string>(tokens.begin(), tokens.end() - 1);
    // A class name of "-1" means "take the last header column as the class name".
    if (className == "-1") {
        className = tokens.back();
    }
    const auto columns = features.size();
    for (std::size_t col = 0; col < columns; ++col) {
        Xv.push_back(std::vector<float>());
    }
    while (getline(file, line)) {
        tokens = split(line, ',');
        for (std::size_t col = 0; col < columns; ++col) {
            Xv[col].push_back(stof(tokens[col]));
        }
        auto label = trim(tokens.back());
        const bool alreadySeen = find(labels.begin(), labels.end(), label) != labels.end();
        if (!alreadySeen) {
            labels.push_back(label);
        }
        yv.push_back(stoi(label));
    }
    file.close();
}
void Dataset::computeStates()
{
for (int i = 0; i < features.size(); ++i) {
auto [max_value, idx] = torch::max(X_train.index({ i, "..." }), 0);
states[features[i]] = std::vector<int>(max_value.item<int>() + 1);
iota(begin(states.at(features[i])), end(states.at(features[i])), 0);
}
auto [max_value, idx] = torch::max(y_train, 0);
states[className] = std::vector<int>(max_value.item<int>() + 1);
iota(begin(states.at(className)), end(states.at(className)), 0);
}
// Loads "<path>/<name>.arff" through ArffFiles and copies out the samples, labels,
// class name, attribute names (appended to `features`) and label strings.
void Dataset::load_arff()
{
    auto arff = ArffFiles();
    const std::string fileName = path + "/" + name + ".arff";
    arff.load(fileName, className);
    // Samples and numeric labels
    Xv = arff.getX();
    yv = arff.getY();
    // Class name and feature names
    className = arff.getClassName();
    auto attributes = arff.getAttributes();
    for (const auto& attribute : attributes) {
        features.push_back(attribute.first);
    }
    labels = arff.getLabels();
}
// Splits `line` on runs of blanks (space, tab, newline) and returns the pieces.
//
// Output is the same as the previous erase-based version: a line that starts with a
// blank yields one leading empty token, and trailing blanks add nothing. The scan is
// now a single linear pass instead of erasing the consumed prefix after each token.
std::vector<std::string> tokenize(std::string line)
{
    const auto isBlank = [](char c) { return c == ' ' || c == '\t' || c == '\n'; };
    std::vector<std::string> tokens;
    // Kept for compatibility: a leading blank produces an empty first token.
    if (!line.empty() && isBlank(line.front())) {
        tokens.push_back(std::string());
    }
    std::size_t pos = 0;
    while (pos < line.size()) {
        while (pos < line.size() && isBlank(line[pos])) {
            ++pos;
        }
        const std::size_t start = pos;
        while (pos < line.size() && !isBlank(line[pos])) {
            ++pos;
        }
        if (pos > start) {
            tokens.push_back(line.substr(start, pos - start));
        }
    }
    return tokens;
}
// Reads "<path>/<name>_R.dat", a blank-separated table.
// The header row lists the feature names followed by the class name. Every data row
// starts with an instance number (skipped), then the feature values, then the label.
// `labels` collects the distinct trimmed label strings in order of first appearance.
// Throws std::invalid_argument if the file cannot be opened.
void Dataset::load_rdata()
{
    ifstream file(path + "/" + name + "_R.dat");
    if (!file.is_open()) {
        throw std::invalid_argument("Unable to open dataset file.");
    }
    labels.clear();
    std::string line;
    getline(file, line);
    line = ArffFiles::trim(line);
    std::vector<std::string> tokens = tokenize(line);
    for (auto it = tokens.begin(); it != tokens.end() - 1; ++it) {
        features.push_back(ArffFiles::trim(*it));
    }
    if (className == "-1") {
        className = ArffFiles::trim(tokens.back());
    }
    const auto columns = features.size();
    for (std::size_t col = 0; col < columns; ++col) {
        Xv.push_back(std::vector<float>());
    }
    while (getline(file, line)) {
        tokens = tokenize(line);
        // Token 0 is the instance number, so feature `col` lives at token `col + 1`.
        for (std::size_t col = 0; col < columns; ++col) {
            const float value = stof(tokens[col + 1]);
            Xv[col].push_back(value);
        }
        auto label = trim(tokens.back());
        const bool alreadySeen = find(labels.begin(), labels.end(), label) != labels.end();
        if (!alreadySeen) {
            labels.push_back(label);
        }
        yv.push_back(stoi(label));
    }
    file.close();
}
// Loads the dataset from disk once (later calls are no-ops), dispatching on fileType,
// then derives the sample/feature counts, the per-feature "is numeric" flags and the
// X (Float32, n_features x n_samples) and y (Int32) tensors.
//
// numericFeaturesIdx is interpreted as:
//   - empty             -> no feature is numeric
//   - first element -1  -> every feature is numeric
//   - otherwise         -> the listed feature indices are numeric
//
// Throws std::invalid_argument if no feature column was loaded or if a numeric
// feature index is outside [0, n_features), plus whatever the loaders throw.
void Dataset::load()
{
    if (loaded) {
        return;
    }
    if (fileType == CSV) {
        load_csv();
    } else if (fileType == ARFF) {
        load_arff();
    } else if (fileType == RDATA) {
        load_rdata();
    }
    // Xv[0] is read below; fail clearly instead of indexing an empty vector.
    if (Xv.empty()) {
        throw std::invalid_argument("Dataset has no features.");
    }
    n_samples = Xv[0].size();
    n_features = Xv.size();
    numericFeatures = std::vector<bool>(n_features, false);
    if (!numericFeaturesIdx.empty()) {
        if (numericFeaturesIdx.at(0) == -1) {
            numericFeatures = std::vector<bool>(n_features, true);
        } else {
            for (auto idx : numericFeaturesIdx) {
                // The indices come from the catalog file; reject bad ones instead of
                // writing past the end of numericFeatures.
                if (idx < 0 || idx >= n_features) {
                    throw std::invalid_argument("Numeric feature index out of range.");
                }
                numericFeatures[idx] = true;
            }
        }
    }
    // Build Tensors
    X = torch::zeros({ n_features, n_samples }, torch::kFloat32);
    for (int i = 0; i < features.size(); ++i) {
        X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
    }
    y = torch::tensor(yv, torch::kInt32);
    loaded = true;
}
// Splits the loaded data into train and test tensors using the sample indices in
// `train` and `test`, and returns references to the members
// (X_train, X_test, y_train, y_test), in that order.
//
// The members are overwritten on every call, so references returned by an earlier
// call observe the new split.
//
// When `discretize` is set, the X tensors are replaced by Int32 versions: each feature
// flagged in numericFeatures is discretized with a discretizer fitted on the training
// rows only and then applied to the test rows; the other features are cast to Int32.
// `states` is then recomputed from the training split.
//
// Throws std::invalid_argument when the dataset has not been loaded yet.
std::tuple<torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&> Dataset::getTrainTestTensors(std::vector<int>& train, std::vector<int>& test)
{
if (!loaded) {
throw std::invalid_argument(message_dataset_not_loaded);
}
auto train_t = torch::tensor(train);
int samples_train = train.size();
int samples_test = test.size();
auto test_t = torch::tensor(test);
// X is (n_features x n_samples): select sample columns along the last dimension.
X_train = X.index({ "...", train_t });
y_train = y.index({ train_t });
X_test = X.index({ "...", test_t });
y_test = y.index({ test_t });
if (discretize) {
// One discretizer object, created by name, is reused for every numeric feature.
auto discretizer = Discretization::instance()->create(discretizer_algorithm);
auto X_train_d = torch::zeros({ n_features, samples_train }, torch::kInt32);
auto X_test_d = torch::zeros({ n_features, samples_test }, torch::kInt32);
for (auto feature = 0; feature < n_features; ++feature) {
if (numericFeatures[feature]) {
auto feature_train = X_train.index({ feature, "..." });
auto feature_test = X_test.index({ feature, "..." });
// Fit on the training rows only, then apply the same transform to the test rows.
auto feature_train_disc = discretizer->fit_transform_t(feature_train, y_train);
auto feature_test_disc = discretizer->transform_t(feature_test);
X_train_d.index_put_({ feature, "..." }, feature_train_disc);
X_test_d.index_put_({ feature, "..." }, feature_test_disc);
} else {
// Features not flagged as numeric are only converted to Int32.
X_train_d.index_put_({ feature, "..." }, X_train.index({ feature, "..." }).to(torch::kInt32));
X_test_d.index_put_({ feature, "..." }, X_test.index({ feature, "..." }).to(torch::kInt32));
}
}
X_train = X_train_d;
X_test = X_test_d;
assert(X_train.dtype() == torch::kInt32);
assert(X_test.dtype() == torch::kInt32);
// states are derived from X_train / y_train, so refresh them for this split.
computeStates();
}
assert(y_train.dtype() == torch::kInt32);
assert(y_test.dtype() == torch::kInt32);
return { X_train, X_test, y_train, y_test };
}
}

View File

@@ -4,75 +4,57 @@
#include <map> #include <map>
#include <vector> #include <vector>
#include <string> #include <string>
#include "CPPFImdlp.h" #include <tuple>
#include <common/DiscretizationRegister.h>
#include "Utils.h" #include "Utils.h"
#include "SourceData.h"
namespace platform { namespace platform {
enum fileType_t { CSV, ARFF, RDATA };
// Maps a data-source name to the folder holding its datasets and to their file format:
//   "Surcov"  -> "datasets/", CSV
//   "Arff"    -> "datasets/", ARFF
//   "Tanveer" -> "data/",     RDATA
// Any other name makes the constructor throw std::invalid_argument.
class SourceData {
public:
    SourceData(std::string source)
    {
        if (source == "Tanveer") {
            path = "data/";
            fileType = RDATA;
        } else if (source == "Surcov" || source == "Arff") {
            path = "datasets/";
            fileType = (source == "Surcov") ? CSV : ARFF;
        } else {
            throw std::invalid_argument("Unknown source.");
        }
    }
    // Folder (with trailing slash) where the datasets of this source live.
    std::string getPath()
    {
        return path;
    }
    // File format used by this source.
    fileType_t getFileType()
    {
        return fileType;
    }
private:
    std::string path;
    fileType_t fileType;
};
class Dataset { class Dataset {
public:
Dataset(const std::string& path, const std::string& name, const std::string& className, bool discretize, fileType_t fileType, std::vector<int> numericFeaturesIdx, std::string discretizer_algo = "none") :
path(path), name(name), className(className), discretize(discretize),
loaded(false), fileType(fileType), numericFeaturesIdx(numericFeaturesIdx), discretizer_algorithm(discretizer_algo)
{
};
explicit Dataset(const Dataset&);
std::string getName() const;
std::string getClassName() const;
int getNClasses() const;
std::vector<std::string> getLabels() const; // return the labels factorization result
std::vector<int> getClassesCounts() const;
std::vector<string> getFeatures() const;
std::map<std::string, std::vector<int>> getStates() const;
std::pair<vector<std::vector<float>>&, std::vector<int>&> getVectors();
std::pair<torch::Tensor&, torch::Tensor&> getTensors();
std::tuple<torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&> getTrainTestTensors(std::vector<int>& train, std::vector<int>& test);
int getNFeatures() const;
int getNSamples() const;
std::vector<bool>& getNumericFeatures() { return numericFeatures; }
void load();
const bool inline isLoaded() const { return loaded; };
private: private:
std::string path; std::string path;
std::string name; std::string name;
fileType_t fileType; fileType_t fileType;
std::string className; std::string className;
int n_samples{ 0 }, n_features{ 0 }; int n_samples{ 0 }, n_features{ 0 };
std::vector<int> numericFeaturesIdx;
std::string discretizer_algorithm;
std::vector<bool> numericFeatures; // true if feature is numeric
std::vector<std::string> features; std::vector<std::string> features;
std::vector<std::string> labels;
std::map<std::string, std::vector<int>> states; std::map<std::string, std::vector<int>> states;
bool loaded; bool loaded;
bool discretize; bool discretize;
torch::Tensor X, y; torch::Tensor X, y;
torch::Tensor X_train, X_test, y_train, y_test;
std::vector<std::vector<float>> Xv; std::vector<std::vector<float>> Xv;
std::vector<std::vector<int>> Xd;
std::vector<int> yv; std::vector<int> yv;
void buildTensors();
void load_csv(); void load_csv();
void load_arff(); void load_arff();
void load_rdata(); void load_rdata();
void computeStates(); void computeStates();
std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y); std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y);
public:
Dataset(const std::string& path, const std::string& name, const std::string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {};
explicit Dataset(const Dataset&);
std::string getName() const;
std::string getClassName() const;
std::vector<string> getFeatures() const;
std::map<std::string, std::vector<int>> getStates() const;
std::pair<vector<std::vector<float>>&, std::vector<int>&> getVectors();
std::pair<vector<std::vector<int>>&, std::vector<int>&> getVectorsDiscretized();
std::pair<torch::Tensor&, torch::Tensor&> getTensors();
int getNFeatures() const;
int getNSamples() const;
void load();
const bool inline isLoaded() const { return loaded; };
}; };
}; };
#endif #endif

View File

@@ -1,129 +0,0 @@
#include "Datasets.h"
#include <fstream>
namespace platform {
// Reads the catalog "<path>all.txt" and creates one (not yet loaded) Dataset per entry.
// Blank lines and lines starting with '#' are skipped. Each remaining line is
// "name" or "name,className"; a missing class name is stored as "-1".
// Throws std::invalid_argument if the catalog cannot be opened.
void Datasets::load()
{
    auto sd = SourceData(sfileType);
    fileType = sd.getFileType();
    path = sd.getPath();
    ifstream catalog(path + "all.txt");
    if (!catalog.is_open()) {
        throw std::invalid_argument("Unable to open catalog file. [" + path + "all.txt" + "]");
    }
    std::string line;
    while (getline(catalog, line)) {
        const bool skip = line.empty() || line[0] == '#';
        if (skip) {
            continue;
        }
        std::vector<std::string> tokens = split(line, ',');
        std::string name = tokens[0];
        std::string className = (tokens.size() == 1) ? std::string("-1") : tokens[1];
        datasets[name] = make_unique<Dataset>(path, name, className, discretize, fileType);
    }
    catalog.close();
}
// Returns the names of all catalogued datasets, in the map's key order.
std::vector<std::string> Datasets::getNames()
{
    std::vector<std::string> result;
    for (const auto& d : datasets) {
        result.push_back(d.first);
    }
    return result;
}
// Returns the feature names of dataset `name`.
// Throws std::invalid_argument if that dataset is not loaded; std::map::at throws
// std::out_of_range for an unknown name.
std::vector<std::string> Datasets::getFeatures(const std::string& name) const
{
    if (!datasets.at(name)->isLoaded()) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return datasets.at(name)->getFeatures();
}
// Returns the states map of dataset `name`.
// Throws std::invalid_argument if that dataset is not loaded; std::map::at throws
// std::out_of_range for an unknown name.
map<std::string, std::vector<int>> Datasets::getStates(const std::string& name) const
{
    if (!datasets.at(name)->isLoaded()) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return datasets.at(name)->getStates();
}
// Loads dataset `name` from disk unless it is already loaded.
// std::map::at throws std::out_of_range for an unknown name.
void Datasets::loadDataset(const std::string& name) const
{
    if (!datasets.at(name)->isLoaded()) {
        datasets.at(name)->load();
    }
}
// Returns the class name of dataset `name`.
// Throws std::invalid_argument if that dataset is not loaded; std::map::at throws
// std::out_of_range for an unknown name.
std::string Datasets::getClassName(const std::string& name) const
{
    if (!datasets.at(name)->isLoaded()) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return datasets.at(name)->getClassName();
}
// Returns the number of samples of dataset `name`.
// Throws std::invalid_argument if that dataset is not loaded; std::map::at throws
// std::out_of_range for an unknown name.
int Datasets::getNSamples(const std::string& name) const
{
    if (!datasets.at(name)->isLoaded()) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return datasets.at(name)->getNSamples();
}
// Returns the number of classes of dataset `name`.
// With discretization enabled this is the size of the class entry in the states map;
// otherwise it is (largest label in yv) + 1.
// Throws std::invalid_argument if that dataset is not loaded.
int Datasets::getNClasses(const std::string& name)
{
    if (!datasets.at(name)->isLoaded()) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    if (!discretize) {
        auto [Xv, yv] = getVectors(name);
        return *std::max_element(yv.begin(), yv.end()) + 1;
    }
    const auto className = datasets.at(name)->getClassName();
    const auto states = getStates(name);
    return states.at(className).size();
}
// Returns how many samples of dataset `name` carry each class value:
// counts[c] is the number of labels equal to c, for c in 0..max label.
// Throws std::invalid_argument if that dataset is not loaded.
std::vector<int> Datasets::getClassesCounts(const std::string& name) const
{
    if (!datasets.at(name)->isLoaded()) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    auto [Xv, yv] = datasets.at(name)->getVectors();
    const auto largest = *std::max_element(yv.begin(), yv.end());
    std::vector<int> counts(largest + 1);
    for (auto label : yv) {
        ++counts[label];
    }
    return counts;
}
// Returns references to the raw vectors of dataset `name`, loading it first if needed.
// Note: the lookup uses operator[], which inserts an empty entry for an unknown name.
pair<std::vector<std::vector<float>>&, std::vector<int>&> Datasets::getVectors(const std::string& name)
{
    auto& dataset = datasets[name];
    if (!dataset->isLoaded()) {
        dataset->load();
    }
    return dataset->getVectors();
}
// Returns references to the discretized vectors of dataset `name`, loading it first
// if needed.
// Note: the lookup uses operator[], which inserts an empty entry for an unknown name.
pair<std::vector<std::vector<int>>&, std::vector<int>&> Datasets::getVectorsDiscretized(const std::string& name)
{
    auto& dataset = datasets[name];
    if (!dataset->isLoaded()) {
        dataset->load();
    }
    return dataset->getVectorsDiscretized();
}
// Returns references to the tensors of dataset `name`, loading it first if needed.
// Note: the lookup uses operator[], which inserts an empty entry for an unknown name.
pair<torch::Tensor&, torch::Tensor&> Datasets::getTensors(const std::string& name)
{
    auto& dataset = datasets[name];
    if (!dataset->isLoaded()) {
        dataset->load();
    }
    return dataset->getTensors();
}
// Tells whether `name` is one of the catalogued datasets.
bool Datasets::isDataset(const std::string& name) const
{
    const auto entry = datasets.find(name);
    return entry != datasets.end();
}
}

105
src/common/Datasets.cpp Normal file
View File

@@ -0,0 +1,105 @@
#include <fstream>
#include<algorithm>
#include "Datasets.h"
#include <nlohmann/json.hpp>
namespace platform {
using json = nlohmann::ordered_json;
const std::string message_dataset_not_loaded = "dataset not loaded.";
// Stores the configuration and immediately reads the dataset catalog via load().
// Throws std::runtime_error when discretization is requested without naming an
// algorithm ("none" or empty).
Datasets::Datasets(bool discretize, std::string sfileType, std::string discretizer_algorithm) :
    discretize(discretize), sfileType(sfileType), discretizer_algorithm(discretizer_algorithm)
{
    const bool noAlgorithm = discretizer_algorithm == "none" || discretizer_algorithm == "";
    if (noAlgorithm && discretize) {
        throw std::runtime_error("Can't discretize without discretization algorithm");
    }
    load();
}
// Reads the catalog "<path>all.txt" and creates one (not yet loaded) Dataset per entry.
//
// Blank lines and lines starting with '#' are ignored; the rest are sorted
// case-insensitively and split on ';' into 1 to 3 fields:
//   name                      -> class name "-1", all features numeric
//   name;className            -> all features numeric
//   name;className;numeric    -> numeric is "all", "none", or a JSON array of the
//                                indices of the numeric features
// "All features numeric" is encoded as a single -1 in numericFeaturesIdx.
//
// Throws std::invalid_argument if the catalog cannot be opened or a line has more
// than three fields.
void Datasets::load()
{
auto sd = SourceData(sfileType);
fileType = sd.getFileType();
path = sd.getPath();
ifstream catalog(path + "all.txt");
std::vector<int> numericFeaturesIdx;
if (!catalog.is_open()) {
throw std::invalid_argument("Unable to open catalog file. [" + path + "all.txt" + "]");
}
std::string line;
std::vector<std::string> sorted_lines;
while (getline(catalog, line)) {
if (line.empty() || line[0] == '#') {
continue;
}
sorted_lines.push_back(line);
}
// Case-insensitive lexicographic "less than": find the first position where the
// lowercased characters differ, then compare there (a proper prefix sorts first).
sort(sorted_lines.begin(), sorted_lines.end(), [](const auto& lhs, const auto& rhs) {
const auto result = mismatch(lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend(), [](const auto& lhs, const auto& rhs) {return tolower(lhs) == tolower(rhs);});
return result.second != rhs.cend() && (result.first == lhs.cend() || tolower(*result.first) < tolower(*result.second));
});
for (const auto& line : sorted_lines) {
std::vector<std::string> tokens = split(line, ';');
std::string name = tokens[0];
std::string className;
// The index list is reused across lines, so reset it for each dataset.
numericFeaturesIdx.clear();
int size = tokens.size();
switch (size) {
case 1:
className = "-1";
numericFeaturesIdx.push_back(-1);
break;
case 2:
className = tokens[1];
numericFeaturesIdx.push_back(-1);
break;
case 3:
{
className = tokens[1];
auto numericFeatures = tokens[2];
if (numericFeatures == "all") {
numericFeaturesIdx.push_back(-1);
} else {
// "none" leaves the list empty; anything else is parsed as a JSON array.
if (numericFeatures != "none") {
auto features = json::parse(numericFeatures);
for (auto& f : features) {
numericFeaturesIdx.push_back(f);
}
}
}
}
break;
default:
throw std::invalid_argument("Invalid catalog file format.");
}
datasets[name] = make_unique<Dataset>(path, name, className, discretize, fileType, numericFeaturesIdx, discretizer_algorithm);
}
catalog.close();
}
// Returns the names of all catalogued datasets sorted case-insensitively.
std::vector<std::string> Datasets::getNames()
{
    // Case-insensitive lexicographic "less than" (a proper prefix sorts first).
    const auto lessIgnoringCase = [](const auto& lhs, const auto& rhs) {
        const auto sameLetter = [](const auto& a, const auto& b) { return tolower(a) == tolower(b); };
        const auto diff = mismatch(lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend(), sameLetter);
        if (diff.second == rhs.cend()) {
            return false;
        }
        return diff.first == lhs.cend() || tolower(*diff.first) < tolower(*diff.second);
    };
    std::vector<std::string> result;
    for (const auto& d : datasets) {
        result.push_back(d.first);
    }
    sort(result.begin(), result.end(), lessIgnoringCase);
    return result;
}
// Tells whether `name` is one of the catalogued datasets.
bool Datasets::isDataset(const std::string& name) const
{
    const auto entry = datasets.find(name);
    return entry != datasets.end();
}
// Returns the dataset names as "{a, b, c}" (in the map's key order); "{}" when empty.
std::string Datasets::toString() const
{
    std::string joined;
    for (auto it = datasets.begin(); it != datasets.end(); ++it) {
        if (it != datasets.begin()) {
            joined += ", ";
        }
        joined += it->first;
    }
    return "{" + joined + "}";
}
}

View File

@@ -3,28 +3,20 @@
#include "Dataset.h" #include "Dataset.h"
namespace platform { namespace platform {
class Datasets { class Datasets {
public:
explicit Datasets(bool discretize, std::string sfileType, std::string discretizer_algorithm = "none");
std::vector<std::string> getNames();
bool isDataset(const std::string& name) const;
Dataset& getDataset(const std::string& name) const { return *datasets.at(name); }
std::string toString() const;
private: private:
std::string path; std::string path;
fileType_t fileType; fileType_t fileType;
std::string sfileType; std::string sfileType;
std::string discretizer_algorithm;
std::map<std::string, std::unique_ptr<Dataset>> datasets; std::map<std::string, std::unique_ptr<Dataset>> datasets;
bool discretize; bool discretize;
void load(); // Loads the list of datasets void load(); // Loads the list of datasets
public:
explicit Datasets(bool discretize, std::string sfileType) : discretize(discretize), sfileType(sfileType) { load(); };
std::vector<string> getNames();
std::vector<string> getFeatures(const std::string& name) const;
int getNSamples(const std::string& name) const;
std::string getClassName(const std::string& name) const;
int getNClasses(const std::string& name);
std::vector<int> getClassesCounts(const std::string& name) const;
std::map<std::string, std::vector<int>> getStates(const std::string& name) const;
std::pair<std::vector<std::vector<float>>&, std::vector<int>&> getVectors(const std::string& name);
std::pair<std::vector<std::vector<int>>&, std::vector<int>&> getVectorsDiscretized(const std::string& name);
std::pair<torch::Tensor&, torch::Tensor&> getTensors(const std::string& name);
bool isDataset(const std::string& name) const;
void loadDataset(const std::string& name) const;
}; };
}; };
#endif #endif

View File

@@ -0,0 +1,55 @@
#include "Discretization.h"
namespace platform {
// Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory
Discretization* Discretization::factory = nullptr;
// Returns the single shared registry, creating it on the first call.
Discretization* Discretization::instance()
{
    if (!factory) {
        factory = new Discretization();
    }
    return factory;
}
// Associates `name` with the function that creates that discretizer.
// Registering an existing name again replaces the previous function.
void Discretization::registerFactoryFunction(const std::string& name,
    function<mdlp::Discretizer* (void)> classFactoryFunction)
{
    auto& slot = functionRegistry[name];
    slot = classFactoryFunction;
}
std::shared_ptr<mdlp::Discretizer> Discretization::create(const std::string& name)
{
mdlp::Discretizer* instance = nullptr;
// find name in the registry and call factory method.
auto it = functionRegistry.find(name);
if (it != functionRegistry.end())
instance = it->second();
// wrap instance in a shared ptr and return
if (instance != nullptr)
return std::unique_ptr<mdlp::Discretizer>(instance);
else
throw std::runtime_error("Discretizer not found: " + name);
}
// Returns the names of all registered discretizers, in the registry's key order.
std::vector<std::string> Discretization::getNames()
{
    std::vector<std::string> names;
    for (const auto& entry : functionRegistry) {
        names.push_back(entry.first);
    }
    return names;
}
// Returns the registered discretizer names as "{a, b, c}"; "{}" when none exist.
std::string Discretization::toString()
{
    std::string joined;
    for (auto it = functionRegistry.begin(); it != functionRegistry.end(); ++it) {
        if (it != functionRegistry.begin()) {
            joined += ", ";
        }
        joined += it->first;
    }
    return "{" + joined + "}";
}
// Constructing a registrar registers `classFactoryFunction` under `name` in the
// shared Discretization registry.
RegistrarDiscretization::RegistrarDiscretization(const std::string& name, function<mdlp::Discretizer* (void)> classFactoryFunction)
{
    auto* registry = Discretization::instance();
    registry->registerFactoryFunction(name, classFactoryFunction);
}
}

View File

@@ -0,0 +1,33 @@
#ifndef DISCRETIZATION_H
#define DISCRETIZATION_H
#include <map>
#include <memory>
#include <string>
#include <functional>
#include <vector>
#include <fimdlp/Discretizer.h>
#include <fimdlp/BinDisc.h>
#include <fimdlp/CPPFImdlp.h>
namespace platform {
class Discretization {
public:
Discretization(Discretization&) = delete;
void operator=(const Discretization&) = delete;
// Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory
static Discretization* instance();
std::shared_ptr<mdlp::Discretizer> create(const std::string& name);
void registerFactoryFunction(const std::string& name,
function<mdlp::Discretizer* (void)> classFactoryFunction);
std::vector<string> getNames();
std::string toString();
private:
map<std::string, function<mdlp::Discretizer* (void)>> functionRegistry;
static Discretization* factory; //singleton
Discretization() {};
};
class RegistrarDiscretization {
public:
RegistrarDiscretization(const std::string& className, function<mdlp::Discretizer* (void)> classFactoryFunction);
};
}
#endif

View File

@@ -0,0 +1,38 @@
#ifndef DISCRETIZATIONREGISTER_H
#define DISCRETIZATIONREGISTER_H
#include <common/Discretization.h>
// Registration table of the available discretizers.
// Each static object below registers, when it is constructed, a factory lambda under
// the given name in the platform::Discretization registry:
//   "mdlp"           -> mdlp::CPPFImdlp
//   "bin<N>u"        -> mdlp::BinDisc(N, UNIFORM),  N = 3..10
//   "bin<N>q"        -> mdlp::BinDisc(N, QUANTILE), N = 3..10
// NOTE(review): N is presumably the number of bins; confirm against BinDisc.
// Because the objects are `static` in a header, every translation unit that includes
// it gets its own copies and registers the same names again.
static platform::RegistrarDiscretization registrarM("mdlp",
[](void) -> mdlp::Discretizer* { return new mdlp::CPPFImdlp();});
static platform::RegistrarDiscretization registrarBU3("bin3u",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(3, mdlp::strategy_t::UNIFORM);});
static platform::RegistrarDiscretization registrarBQ3("bin3q",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(3, mdlp::strategy_t::QUANTILE);});
static platform::RegistrarDiscretization registrarBU4("bin4u",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(4, mdlp::strategy_t::UNIFORM);});
static platform::RegistrarDiscretization registrarBQ4("bin4q",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(4, mdlp::strategy_t::QUANTILE);});
static platform::RegistrarDiscretization registrarBU5("bin5u",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(5, mdlp::strategy_t::UNIFORM);});
static platform::RegistrarDiscretization registrarBQ5("bin5q",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(5, mdlp::strategy_t::QUANTILE);});
static platform::RegistrarDiscretization registrarBU6("bin6u",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(6, mdlp::strategy_t::UNIFORM);});
static platform::RegistrarDiscretization registrarBQ6("bin6q",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(6, mdlp::strategy_t::QUANTILE);});
static platform::RegistrarDiscretization registrarBU7("bin7u",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(7, mdlp::strategy_t::UNIFORM);});
static platform::RegistrarDiscretization registrarBQ7("bin7q",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(7, mdlp::strategy_t::QUANTILE);});
static platform::RegistrarDiscretization registrarBU8("bin8u",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(8, mdlp::strategy_t::UNIFORM);});
static platform::RegistrarDiscretization registrarBQ8("bin8q",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(8, mdlp::strategy_t::QUANTILE);});
static platform::RegistrarDiscretization registrarBU9("bin9u",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(9, mdlp::strategy_t::UNIFORM);});
static platform::RegistrarDiscretization registrarBQ9("bin9q",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(9, mdlp::strategy_t::QUANTILE);});
static platform::RegistrarDiscretization registrarBU10("bin10u",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(10, mdlp::strategy_t::UNIFORM);});
static platform::RegistrarDiscretization registrarBQ10("bin10q",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(10, mdlp::strategy_t::QUANTILE);});
#endif

View File

@@ -13,9 +13,55 @@ namespace platform {
class DotEnv { class DotEnv {
private: private:
std::map<std::string, std::string> env; std::map<std::string, std::string> env;
std::map<std::string, std::vector<std::string>> valid;
public: public:
DotEnv() DotEnv(bool create = false)
{ {
valid =
{
{"depth", {"any"}},
{"discretize", {"0", "1"}},
{"discretize_algo", {"mdlp", "bin3u", "bin3q", "bin4u", "bin4q", "bin5q", "bin5u", "bin6q", "bin6u", "bin7q", "bin7u", "bin8q", "bin8u", "bin9q", "bin9u", "bin10q", "bin10u"}},
{"experiment", {"discretiz", "odte", "covid", "Test"}},
{"fit_features", {"0", "1"}},
{"framework", {"bulma", "bootstrap"}},
{"ignore_nan", {"0", "1"}},
{"leaves", {"any"}},
{"margin", {"0.1", "0.2", "0.3"}},
{"model", {"any"}},
{"n_folds", {"5", "10"}},
{"nodes", {"any"}},
{"platform", {"any"}},
{"stratified", {"0", "1"}},
{"score", {"accuracy", "roc-auc-ovr"}},
{"seeds", {"any"}},
{"smooth_strat", {"ORIGINAL", "LAPLACE", "CESTNIK"}},
{"source_data", {"Arff", "Tanveer", "Surcov", "Test"}},
};
if (create) {
// For testing purposes
std::ofstream file(".env");
file << "experiment=Test" << std::endl;
file << "source_data=Test" << std::endl;
file << "margin=0.1" << std::endl;
file << "score=accuracy" << std::endl;
file << "platform=um790Linux" << std::endl;
file << "n_folds=5" << std::endl;
file << "discretize_algo=mdlp" << std::endl;
file << "smooth_strat=ORIGINAL" << std::endl;
file << "stratified=0" << std::endl;
file << "model=TAN" << std::endl;
file << "seeds=[271]" << std::endl;
file << "discretize=0" << std::endl;
file << "ignore_nan=0" << std::endl;
file << "nodes=Nodes" << std::endl;
file << "leaves=Edges" << std::endl;
file << "depth=States" << std::endl;
file << "fit_features=0" << std::endl;
file << "framework=bulma" << std::endl;
file << "margin=0.1" << std::endl;
file.close();
}
std::ifstream file(".env"); std::ifstream file(".env");
if (!file.is_open()) { if (!file.is_open()) {
std::cerr << "File .env not found" << std::endl; std::cerr << "File .env not found" << std::endl;
@@ -30,12 +76,62 @@ namespace platform {
std::istringstream iss(line); std::istringstream iss(line);
std::string key, value; std::string key, value;
if (std::getline(iss, key, '=') && std::getline(iss, value)) { if (std::getline(iss, key, '=') && std::getline(iss, value)) {
key = trim(key);
value = trim(value);
parse(key, value);
env[key] = value; env[key] = value;
} }
} }
parseEnv();
}
// Validates one key/value pair read from .env against the `valid` table.
// Terminates the process with exit(1) when the key is unknown or when the
// value is not one of the accepted tokens. A key whose first accepted token
// is "any" accepts every value.
void parse(const std::string& key, const std::string& value)
{
    const auto entry = valid.find(key);
    if (entry == valid.end()) {
        std::cerr << "Invalid key in .env: " << key << std::endl;
        exit(1);
    }
    const auto& accepted = entry->second;
    if (accepted.front() == "any") {
        return;
    }
    const bool known = std::find(accepted.begin(), accepted.end(), value) != accepted.end();
    if (!known) {
        std::cerr << "Invalid value in .env: " << key << " = " << value << std::endl;
        exit(1);
    }
}
// Returns a copy of the accepted tokens registered for `key`, or an empty
// vector when the key is not part of the `valid` table.
std::vector<std::string> valid_tokens(const std::string& key)
{
    const auto entry = valid.find(key);
    if (entry != valid.end()) {
        return entry->second;
    }
    return {};
}
// Renders the accepted tokens of `key` as "{a, b, c}" for use in messages.
// Returns "{}" when the key is not part of the `valid` table.
std::string valid_values(const std::string& key)
{
    const auto entry = valid.find(key);
    if (entry == valid.end()) {
        return "{}";
    }
    std::string listing = "{";
    bool first = true;
    for (const auto& token : entry->second) {
        if (!first) {
            listing += ", ";
        }
        listing += token;
        first = false;
    }
    listing += "}";
    return listing;
}
void parseEnv()
{
for (auto& [key, values] : valid) {
if (env.find(key) == env.end()) {
std::cerr << "Key not found in .env: " << key << ", valid values: " << valid_values(key) << std::endl;
exit(1);
}
}
} }
std::string get(const std::string& key) std::string get(const std::string& key)
{ {
if (env.find(key) == env.end()) {
std::cerr << "Key not found in .env: " << key << std::endl;
exit(1);
}
return env.at(key); return env.at(key);
} }
std::vector<int> getSeeds() std::vector<int> getSeeds()

View File

@@ -6,15 +6,30 @@
namespace platform { namespace platform {
class Paths { class Paths {
public: public:
static std::string results() { return "results/"; } static std::string createIfNotExists(const std::string& folder)
static std::string hiddenResults() { return "hidden_results/"; } {
static std::string excel() { return "excel/"; } if (!std::filesystem::exists(folder)) {
static std::string grid() { return "grid/"; } std::filesystem::create_directory(folder);
}
return folder;
}
static std::string results() { return createIfNotExists("results/"); }
static std::string hiddenResults() { return createIfNotExists("hidden_results/"); }
static std::string excel() { return createIfNotExists("excel/"); }
static std::string grid() { return createIfNotExists("grid/"); }
static std::string graphs() { return createIfNotExists("graphs/"); }
static std::string tex() { return createIfNotExists("tex/"); }
static std::string datasets() static std::string datasets()
{ {
auto env = platform::DotEnv(); auto env = platform::DotEnv();
return env.get("source_data"); return env.get("source_data");
} }
// Builds the relative path of the JSON file that stores one experiment:
// datasets_experiment/<fileName>_{disc|ndisc}_{strat|nstrat}_<seed>_<nfold>.json
static std::string experiment_file(const std::string& fileName, bool discretize, bool stratified, int seed, int nfold)
{
    std::string path = "datasets_experiment/";
    path += fileName;
    path += discretize ? "_disc_" : "_ndisc_";
    path += stratified ? "strat_" : "nstrat_";
    path += std::to_string(seed);
    path += "_";
    path += std::to_string(nfold);
    path += ".json";
    return path;
}
static void createPath(const std::string& path) static void createPath(const std::string& path)
{ {
// Create directory if it does not exist // Create directory if it does not exist
@@ -25,7 +40,16 @@ namespace platform {
throw std::runtime_error("Could not create directory " + path); throw std::runtime_error("Could not create directory " + path);
} }
} }
// File name (no directory) of the best-results JSON for a score/model pair:
// best_results_<score>_<model>.json
static std::string bestResultsFile(const std::string& score, const std::string& model)
{
    std::string name = "best_results_";
    name += score;
    name += "_";
    name += model;
    name += ".json";
    return name;
}
// File name (no directory) of the best-results workbook for a score:
// BestResults_<score>.xlsx
static std::string bestResultsExcel(const std::string& score)
{
    std::string name = "BestResults_";
    name += score;
    name += ".xlsx";
    return name;
}
static std::string excelResults() { return "some_results.xlsx"; } static std::string excelResults() { return "some_results.xlsx"; }
// Returns the bare file name "datasets.xlsx" (no directory component).
static std::string excelDatasets() { return "datasets.xlsx"; }
static std::string grid_input(const std::string& model) static std::string grid_input(const std::string& model)
{ {
return grid() + "grid_" + model + "_input.json"; return grid() + "grid_" + model + "_input.json";
@@ -34,6 +58,23 @@ namespace platform {
{ {
return grid() + "grid_" + model + "_output.json"; return grid() + "grid_" + model + "_output.json";
} }
// Fixed file names (no directory component) of the generated reports.
// LaTeX version of the results report.
static std::string tex_output() { return "results.tex"; }
// Markdown version of the results report.
static std::string md_output() { return "results.md"; }
// LaTeX version of the post-hoc report.
static std::string tex_post_hoc() { return "post_hoc.tex"; }
// Markdown version of the post-hoc report.
static std::string md_post_hoc() { return "post_hoc.md"; }
}; };
} }
#endif #endif

View File

@@ -0,0 +1,38 @@
#ifndef SOURCEDATA_H
#define SOURCEDATA_H
namespace platform {
// On-disk format of a dataset collection.
enum fileType_t { CSV, ARFF, RDATA };
// Maps the `source_data` setting ("Surcov", "Arff", "Tanveer" or "Test") to
// the folder that holds the datasets and the file format used there.
class SourceData {
public:
    // Resolves `source` to its path/format pair.
    // Throws std::invalid_argument when the source name is not recognised.
    SourceData(const std::string& source)
    {
        if (source == "Surcov") {
            path = "datasets/";
            fileType = CSV;
        } else if (source == "Arff") {
            path = "datasets/";
            fileType = ARFF;
        } else if (source == "Tanveer") {
            path = "data/";
            fileType = RDATA;
        } else if (source == "Test") {
            // Placeholder in @VAR@ form, left exactly as written in the source
            path = "@TEST_DATA_PATH@/";
            fileType = ARFF;
        } else {
            throw std::invalid_argument("Unknown source.");
        }
    }
    // Folder (with trailing slash) that holds the datasets of this source.
    // const-qualified so it can be called on const objects and references.
    std::string getPath() const
    {
        return path;
    }
    // File format of the datasets of this source.
    fileType_t getFileType() const
    {
        return fileType;
    }
private:
    std::string path;
    fileType_t fileType;
};
}
#endif

View File

@@ -9,10 +9,13 @@ namespace platform {
inline static const std::string black_star{ "\u2605" }; inline static const std::string black_star{ "\u2605" };
inline static const std::string cross{ "\u2717" }; inline static const std::string cross{ "\u2717" };
inline static const std::string upward_arrow{ "\u27B6" }; inline static const std::string upward_arrow{ "\u27B6" };
inline static const std::string down_arrow{ "\u27B4" }; inline static const std::string downward_arrow{ "\u27B4" };
inline static const std::string up_arrow{ "\u2B06" };
inline static const std::string down_arrow{ "\u2B07" };
inline static const std::string ellipsis{ "\u2026" };
inline static const std::string equal_best{ check_mark }; inline static const std::string equal_best{ check_mark };
inline static const std::string better_best{ black_star }; inline static const std::string better_best{ black_star };
inline static const std::string notebook{ "\U0001F5C8" }; inline static const std::string notebook{ "\U0001F5C8" };
}; };
} }
#endif // !SYMBOLS_H #endif

106
src/common/TensorUtils.hpp Normal file
View File

@@ -0,0 +1,106 @@
#ifndef TENSORUTILS_HPP
#define TENSORUTILS_HPP
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
#include <torch/torch.h>
namespace platform {
// Stateless helpers that convert between torch tensors and std::vector
// containers. The element type requested from data_ptr<T>() must match the
// dtype of the tensor; torch reports a mismatch with an exception.
class TensorUtils {
public:
    // Copies all numel() elements of `tensor`, in contiguous (row-major)
    // order, into a flat vector.
    template <typename T>
    static std::vector<T> tensorToVector(const torch::Tensor& tensor)
    {
        torch::Tensor contig_tensor = tensor.contiguous();
        auto num_elements = contig_tensor.numel();
        const T* tensor_data = contig_tensor.data_ptr<T>();
        std::vector<T> result(tensor_data, tensor_data + num_elements);
        return result;
    }
    // Converts a 2-D tensor read through data_ptr<int>() into a vector of rows.
    static std::vector<std::vector<int>> to_matrix(const torch::Tensor& X)
    {
        // Work on a contiguous copy so that rows are laid out back to back
        const auto X_contig = X.contiguous();
        const int* data_ptr = X_contig.data_ptr<int>();
        // 64-bit extents: rows * cols may not fit in an int
        const int64_t rows = X_contig.size(0);
        const int64_t cols = X_contig.size(1);
        std::vector<std::vector<int>> data;
        data.reserve(static_cast<size_t>(rows));
        for (int64_t i = 0; i < rows; ++i) {
            const int* row_begin = data_ptr + i * cols;
            data.emplace_back(row_begin, row_begin + cols);
        }
        return data;
    }
    // Copies the first y.size(0) elements of `y` into a vector.
    template <typename T>
    static std::vector<T> to_vector(const torch::Tensor& y)
    {
        // Ensure the tensor is contiguous in memory
        auto y_contig = y.contiguous();
        // Access data pointer
        auto data_ptr = y_contig.data_ptr<T>();
        // Prepare output container
        std::vector<T> data(y.size(0));
        // Copy data efficiently
        std::copy(data_ptr, data_ptr + y.size(0), data.begin());
        return data;
    }
    // Builds a kInt64 tensor of shape {data.size(), data[0].size()} from a
    // vector of rows. An empty input yields an empty {0, 0} tensor.
    // Throws std::invalid_argument when a row is shorter than the first one
    // (it would otherwise be read out of bounds); extra trailing elements of
    // longer rows are ignored, as before.
    static torch::Tensor to_matrix(const std::vector<std::vector<int>>& data)
    {
        if (data.empty()) return torch::empty({ 0, 0 }, torch::kInt64);
        const auto rows = static_cast<int64_t>(data.size());
        const auto cols = static_cast<int64_t>(data.front().size());
        torch::Tensor tensor = torch::empty({ rows, cols }, torch::kInt64);
        // Write through an accessor instead of one index_put_ call (and one
        // scalar tensor allocation) per element
        auto out = tensor.accessor<int64_t, 2>();
        for (int64_t i = 0; i < rows; ++i) {
            const auto& row = data[static_cast<size_t>(i)];
            if (static_cast<int64_t>(row.size()) < cols) {
                throw std::invalid_argument("to_matrix: every row must have at least as many elements as the first row");
            }
            for (int64_t j = 0; j < cols; ++j) {
                out[i][j] = row[static_cast<size_t>(j)];
            }
        }
        return tensor;
    }
};
// Debug helper: prints `name` followed by one "[v0 v1 ... ]" line per row of
// `vec` to std::cout, then a blank line.
static void dumpVector(const std::vector<std::vector<int>>& vec, const std::string& name)
{
    std::cout << name << ": " << std::endl;
    for (auto row = vec.cbegin(); row != vec.cend(); ++row) {
        std::cout << "[";
        for (auto cell = row->cbegin(); cell != row->cend(); ++cell) {
            std::cout << *cell << " ";
        }
        std::cout << "]" << std::endl;
    }
    std::cout << std::endl;
}
// Debug helper: prints `name` followed by one "[v0 v1 ... ]" line per row
// (dimension 0) of a 2-D tensor to std::cout, then a blank line. Each element
// is read with item<int>().
static void dumpTensor(const torch::Tensor& tensor, const std::string& name)
{
    std::cout << name << ": " << std::endl;
    const auto rows = tensor.size(0);
    for (int r = 0; r < rows; ++r) {
        std::cout << "[";
        const auto cols = tensor.size(1);
        for (int c = 0; c < cols; ++c) {
            std::cout << tensor[r][c].item<int>() << " ";
        }
        std::cout << "]" << std::endl;
    }
    std::cout << std::endl;
}
// Debug helper: prints `name` followed by a single "[v0 v1 ... ]" line with
// the elements along dimension 0 of `tensor`, each read with item<int>().
static void dumpTensorV(const torch::Tensor& tensor, const std::string& name)
{
    std::cout << name << ": " << std::endl;
    std::cout << "[";
    const auto length = tensor.size(0);
    int position = 0;
    while (position < length) {
        std::cout << tensor[position].item<int>() << " ";
        ++position;
    }
    std::cout << "]" << std::endl;
}
}
#endif // TENSORUTILS_HPP

View File

@@ -40,4 +40,4 @@ namespace platform {
} }
}; };
} /* namespace platform */ } /* namespace platform */
#endif /* TIMER_H */ #endif

View File

@@ -1,20 +1,20 @@
#ifndef UTILS_H #ifndef UTILS_H
#define UTILS_H #define UTILS_H
#include <unistd.h>
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <vector> #include <vector>
#include <algorithm>
#include <cstdlib>
#include <cmath>
#include <ctime>
#include <iomanip>
#include <string.h>
extern char** environ;
namespace platform { namespace platform {
//static std::vector<std::string> split(const std::string& text, char delimiter);
static std::vector<std::string> split(const std::string& text, char delimiter)
{
std::vector<std::string> result;
std::stringstream ss(text);
std::string token;
while (std::getline(ss, token, delimiter)) {
result.push_back(token);
}
return result;
}
static std::string trim(const std::string& str) static std::string trim(const std::string& str)
{ {
std::string result = str; std::string result = str;
@@ -26,5 +26,104 @@ namespace platform {
}).base(), result.end()); }).base(), result.end());
return result; return result;
} }
// Splits `text` on `delimiter` and returns the pieces, each passed through
// trim(). Pieces are produced by std::getline, so a trailing delimiter does
// not add a final empty piece.
static std::vector<std::string> split(const std::string& text, char delimiter)
{
    std::vector<std::string> pieces;
    std::stringstream stream(text);
    for (std::string piece; std::getline(stream, piece, delimiter);) {
        pieces.push_back(trim(piece));
    }
    return pieces;
}
// Computes the population standard deviation of `values` around the supplied
// `mean` (divides by N, not N - 1).
// Returns 0.0 for an empty input instead of the NaN that 0/0 would produce.
// The vector is taken by const reference, so it is no longer copied per call.
inline double compute_std(const std::vector<double>& values, double mean)
{
    if (values.empty()) {
        return 0.0;
    }
    double squared_sum = 0.0;
    for (const double value : values) {
        const double deviation = value - mean;
        squared_sum += deviation * deviation;
    }
    const double variance = squared_sum / static_cast<double>(values.size());
    return std::sqrt(variance);
}
// Returns the current local date formatted as YYYY-MM-DD.
inline std::string get_date()
{
    time_t now;
    time(&now);
    std::ostringstream formatted;
    formatted << std::put_time(std::localtime(&now), "%Y-%m-%d");
    return formatted.str();
}
// Returns the current local time of day formatted as HH:MM:SS.
inline std::string get_time()
{
    time_t now;
    time(&now);
    std::ostringstream formatted;
    formatted << std::put_time(std::localtime(&now), "%H:%M:%S");
    return formatted.str();
}
// Opens `fileName` with the desktop's default handler ("open" on macOS,
// "xdg-open" on Linux) by calling execve().
// NOTE: execve replaces the image of the *calling* process, so on success this
// function never returns; on failure it reports the error and calls _exit().
// NOTE(review): callers presumably invoke this from a forked child - confirm.
static void openFile(const std::string& fileName)
{
#ifdef __APPLE__
    const char* tool = "/usr/bin/open";
#elif defined(__linux__)
    const char* tool = "/usr/bin/xdg-open";
#else
    std::cerr << "Unsupported platform." << std::endl;
    return;
#endif
    // argv for the new image: { tool, fileName, nullptr }
    char* const arguments[] = {
        const_cast<char*>(tool),
        const_cast<char*>(fileName.c_str()),
        nullptr
    };
    // Environment for the new image: the current one without the "NAME=VALUE"
    // entries whose name starts with BASH_FUNC_
    std::vector<char*> cleanEnv;
    for (char** entry = environ; *entry != nullptr; ++entry) {
        const bool isBashFunction = strncmp(*entry, "BASH_FUNC_", 10) == 0;
        if (!isBashFunction) {
            cleanEnv.push_back(*entry);
        }
    }
    cleanEnv.push_back(nullptr);
    // The tool is addressed by absolute path, so PATH is not consulted
    execve(tool, arguments, cleanEnv.data());
    // Only reached when execve failed
    perror("execve failed");
    _exit(EXIT_FAILURE);
}
} }
#endif #endif

View File

@@ -0,0 +1,492 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************
#include "AdaBoost.h"
#include "DecisionTree.h"
#include <cmath>
#include <algorithm>
#include <numeric>
#include <sstream>
#include <iomanip>
#include "common/TensorUtils.hpp"
// Conditional debug macro for performance-critical sections.
// Streams the variadic arguments to std::cout, followed by std::endl, only
// when `condition` is true; __builtin_expect marks that branch as the unlikely
// one. The do { ... } while(0) wrapper lets the macro be used as a statement.
#define DEBUG_LOG(condition, ...) \
do { \
if (__builtin_expect((condition), 0)) { \
std::cout << __VA_ARGS__ << std::endl; \
} \
} while(0)
namespace bayesnet {
// Creates an unfitted ensemble of at most `n_estimators` decision trees, each
// limited to `max_depth` levels, and registers the accepted hyperparameters.
AdaBoost::AdaBoost(int n_estimators, int max_depth)
    : Ensemble(true),
      n_estimators(n_estimators),
      base_max_depth(max_depth),
      n_classes(0),
      n(0)
{
    validHyperparameters = { "n_estimators", "base_max_depth" };
}
// Trains the ensemble with the multi-class SAMME formulation of AdaBoost.
// `weights` optionally supplies initial per-sample weights (one per column of
// `dataset`); when undefined or empty the samples start with uniform weights.
// Throws std::runtime_error when `weights` has the wrong length.
// Boosting stops early when an estimator is no better than random guessing or
// when it classifies the training set perfectly.
void AdaBoost::buildModel(const torch::Tensor& weights)
{
    // Drop anything left over from a previous fit
    models.clear();
    alphas.clear();
    training_errors.clear();
    // Initialize n (number of features) and n_classes
    n = static_cast<int>(dataset.size(0)) - 1; // Exclude the label row
    n_classes = static_cast<int>(states[className].size());
    const int n_samples = static_cast<int>(dataset.size(1));
    // Sample weights are always held in double precision: they are combined
    // with kDouble masks (torch::dot and in-place multiply) further on, and
    // torch rejects mixed dtypes in those operations.
    sample_weights = torch::ones({ n_samples }, torch::kDouble) / n_samples;
    // If initial weights are provided, incorporate them
    if (weights.defined() && weights.numel() > 0) {
        if (weights.size(0) != n_samples) {
            throw std::runtime_error("weights must have the same length as number of samples");
        }
        // clone() keeps the caller's tensor untouched by later in-place updates
        sample_weights = weights.to(torch::kDouble).clone();
        normalizeWeights();
    }
    // Conditional debug information (only when debug is enabled)
    DEBUG_LOG(debug, "Starting AdaBoost training with " << n_estimators << " estimators\n"
        << "Number of classes: " << n_classes << "\n"
        << "Number of features: " << n << "\n"
        << "Number of samples: " << n_samples);
    // Error of a classifier that guesses uniformly among the classes
    const double random_guess_error = 1.0 - (1.0 / static_cast<double>(n_classes));
    // Main AdaBoost training loop (SAMME algorithm)
    for (int iter = 0; iter < n_estimators; ++iter) {
        // Train base estimator with current sample weights
        auto estimator = trainBaseEstimator(sample_weights);
        // Calculate weighted error
        const double weighted_error = calculateWeightedError(estimator.get(), sample_weights);
        training_errors.push_back(weighted_error);
        // According to SAMME, we need error < random_guess_error
        if (weighted_error >= random_guess_error) {
            DEBUG_LOG(debug, "Error >= random guess (" << random_guess_error << "), stopping");
            // Keep a first estimator that is worse than random, with zero
            // weight, so that the ensemble is never empty
            if (models.empty()) {
                models.push_back(std::move(estimator));
                alphas.push_back(0.0);
            }
            break; // Stop boosting
        }
        // Check for perfect classification BEFORE calculating alpha
        if (weighted_error <= 1e-10) {
            DEBUG_LOG(debug, "Perfect classification achieved (error=" << weighted_error << ")");
            // For perfect classification, use a large but finite alpha
            const double alpha = 10.0 + std::log(static_cast<double>(n_classes - 1));
            models.push_back(std::move(estimator));
            alphas.push_back(alpha);
            DEBUG_LOG(debug, "Iteration " << iter << ":\n"
                << " Weighted error: " << weighted_error << "\n"
                << " Alpha (finite): " << alpha << "\n"
                << " Random guess error: " << random_guess_error);
            break; // Stop training as we have a perfect classifier
        }
        // Estimator weight, SAMME formula:
        //   alpha = log((1 - err) / err) + log(K - 1)
        // The error is clamped to avoid division by zero and infinite alpha
        const double clamped_error = std::max(1e-15, std::min(1.0 - 1e-15, weighted_error));
        double alpha = std::log((1.0 - clamped_error) / clamped_error) +
            std::log(static_cast<double>(n_classes - 1));
        // Clamp alpha to reasonable bounds to avoid numerical issues
        alpha = std::max(-10.0, std::min(10.0, alpha));
        // Store the estimator and its weight
        models.push_back(std::move(estimator));
        alphas.push_back(alpha);
        // Update sample weights (only if this is not the last iteration)
        if (iter < n_estimators - 1) {
            updateSampleWeights(models.back().get(), alpha);
            normalizeWeights();
        }
        DEBUG_LOG(debug, "Iteration " << iter << ":\n"
            << " Weighted error: " << weighted_error << "\n"
            << " Alpha: " << alpha << "\n"
            << " Random guess error: " << random_guess_error);
    }
    // Set the number of models actually trained
    n_models = models.size();
    DEBUG_LOG(debug, "AdaBoost training completed with " << n_models << " models");
}
// Fits the ensemble: delegates the training to buildModel() and then marks
// the classifier as fitted. The `smoothing` argument is not used here.
void AdaBoost::trainModel(const torch::Tensor& weights, const Smoothing_t smoothing)
{
    (void)smoothing;
    buildModel(weights);
    fitted = true;
}
// Builds and fits one base estimator: a DecisionTree limited to
// base_max_depth, trained on the ensemble's dataset with `weights` rescaled
// so that they sum to one.
std::unique_ptr<Classifier> AdaBoost::trainBaseEstimator(const torch::Tensor& weights)
{
    std::unique_ptr<DecisionTree> base_tree = std::make_unique<DecisionTree>(base_max_depth);
    torch::Tensor unit_sum_weights = weights / weights.sum();
    base_tree->fit(dataset, features, className, states, unit_sum_weights, Smoothing_t::NONE);
    return base_tree;
}
// Returns the weighted training error of `estimator`: the sum of `weights`
// over the misclassified training samples, clamped to [1e-15, 1 - 1e-15].
double AdaBoost::calculateWeightedError(Classifier* estimator, const torch::Tensor& weights)
{
    using torch::indexing::Slice;
    // Every row but the last holds a feature; the last row holds the labels
    auto feature_rows = dataset.index({ Slice(0, dataset.size(0) - 1), Slice() });
    auto labels = dataset.index({ -1, Slice() });
    auto predicted = estimator->predict(feature_rows);
    // 1.0 where the prediction is wrong, 0.0 where it is right
    auto miss_mask = (predicted != labels).to(torch::kDouble);
    const double raw_error = torch::dot(miss_mask, weights).item<double>();
    return std::clamp(raw_error, 1e-15, 1.0 - 1e-15);
}
// Re-weights the training samples after an estimator has been accepted:
// every sample it misclassifies has its weight multiplied by exp(alpha),
// correctly classified samples keep theirs. The weights are then clamped to
// [1e-15, 1e15]; normalisation is left to the caller.
void AdaBoost::updateSampleWeights(Classifier* estimator, double alpha)
{
    using torch::indexing::Slice;
    auto feature_rows = dataset.index({ Slice(0, dataset.size(0) - 1), Slice() });
    auto labels = dataset.index({ -1, Slice() });
    auto predicted = estimator->predict(feature_rows);
    // 1.0 where the prediction is wrong, 0.0 where it is right
    auto miss_mask = (predicted != labels).to(torch::kDouble);
    // exp(alpha * 0) == 1, so correctly classified samples are unchanged
    auto boost_factor = torch::exp(alpha * miss_mask);
    sample_weights *= boost_factor;
    sample_weights = torch::clamp(sample_weights, 1e-15, 1e15);
}
void AdaBoost::normalizeWeights()
{
// Single-pass normalization using PyTorch operations
double sum_weights = torch::sum(sample_weights).item<double>();
if (__builtin_expect(sum_weights <= 0, 0)) {
// Reset to uniform if all weights are zero/negative (rare case)
sample_weights = torch::ones_like(sample_weights) / sample_weights.size(0);
} else {
// Vectorized normalization
sample_weights /= sum_weights;
// Vectorized minimum weight enforcement
sample_weights = torch::clamp_min(sample_weights, 1e-15);
// Renormalize after clamping (if any weights were clamped)
double new_sum = torch::sum(sample_weights).item<double>();
if (new_sum != 1.0) {
sample_weights /= new_sum;
}
}
}
// Returns a Graphviz "dot" description of the ensemble, one line per element:
// an input node, one node per base estimator (labelled with its alpha), a
// weighted-vote node and an output node. `title`, when not empty, becomes the
// graph label.
std::vector<std::string> AdaBoost::graph(const std::string& title) const
{
    std::vector<std::string> dot;
    dot.push_back("digraph AdaBoost {");
    dot.push_back(" rankdir=TB;");
    dot.push_back(" node [shape=box];");
    if (!title.empty()) {
        dot.push_back(" label=\"" + title + "\";");
        dot.push_back(" labelloc=t;");
    }
    dot.push_back(" Input [shape=ellipse, label=\"Input Features\"];");
    const size_t estimator_count = models.size();
    // One node per estimator plus the edge that feeds it
    for (size_t idx = 0; idx < estimator_count; ++idx) {
        std::stringstream node;
        node << " Estimator" << idx << " [label=\"Base Estimator " << idx + 1
            << "\\nα = " << std::fixed << std::setprecision(3) << alphas[idx] << "\"];";
        dot.push_back(node.str());
        std::stringstream edge;
        edge << " Input -> Estimator" << idx << ";";
        dot.push_back(edge.str());
    }
    dot.push_back(" Combination [shape=diamond, label=\"Weighted Vote\"];");
    // Every estimator feeds the weighted vote
    for (size_t idx = 0; idx < estimator_count; ++idx) {
        std::stringstream edge;
        edge << " Estimator" << idx << " -> Combination;";
        dot.push_back(edge.str());
    }
    dot.push_back(" Output [shape=ellipse, label=\"Final Prediction\"];");
    dot.push_back(" Combination -> Output;");
    dot.push_back("}");
    return dot;
}
// Validates the hyperparameters currently stored in the object.
// Throws std::invalid_argument when n_estimators or base_max_depth is not
// strictly positive (n_estimators is checked first).
void AdaBoost::checkValues() const
{
    const bool estimators_ok = n_estimators > 0;
    if (!estimators_ok) {
        throw std::invalid_argument("n_estimators must be positive");
    }
    const bool depth_ok = base_max_depth > 0;
    if (!depth_ok) {
        throw std::invalid_argument("base_max_depth must be positive");
    }
}
// Applies the hyperparameters found in the JSON object. "n_estimators" and
// "base_max_depth" are consumed here and validated with checkValues(); every
// remaining entry is forwarded to Ensemble::setHyperparameters().
void AdaBoost::setHyperparameters(const nlohmann::json& hyperparameters_)
{
    nlohmann::json remaining = hyperparameters_;
    // Moves an integer setting out of `remaining` into `target`, if present
    const auto consume_int = [&remaining](const std::string& key, int& target) {
        const auto found = remaining.find(key);
        if (found != remaining.end()) {
            target = found->get<int>();
            remaining.erase(key);
        }
    };
    consume_int("n_estimators", n_estimators);
    consume_int("base_max_depth", base_max_depth);
    checkValues();
    Ensemble::setHyperparameters(remaining);
}
// Predicts the class of a single sample `x` by weighted vote: each base
// estimator with a positive, finite alpha adds its alpha to the class it
// predicts, and the class with the largest total wins (the lowest index on
// ties). Throws std::runtime_error when the ensemble has not been fitted.
int AdaBoost::predictSample(const torch::Tensor& x) const
{
    if (!fitted || models.empty()) {
        throw std::runtime_error(CLASSIFIER_NOT_FITTED);
    }
    // Per-thread scratch buffer, reused across calls to avoid reallocations
    static thread_local std::vector<double> tally;
    tally.assign(static_cast<size_t>(n_classes), 0.0);
    const size_t estimator_count = models.size();
    for (size_t idx = 0; idx < estimator_count; ++idx) {
        const double weight = alphas[idx];
        const bool votes = weight > 0 && std::isfinite(weight);
        if (!votes) {
            continue;
        }
        // Base estimators are called through their concrete DecisionTree type
        const int voted_class = static_cast<DecisionTree*>(models[idx].get())->predictSample(x);
        // Votes for a class outside [0, n_classes) are ignored
        if (__builtin_expect(voted_class >= 0 && voted_class < n_classes, 1)) {
            tally[voted_class] += weight;
        }
    }
    const auto winner = std::max_element(tally.begin(), tally.end());
    return std::distance(tally.begin(), winner);
}
// Returns a kFloat32 tensor with n_classes entries holding, for sample `x`,
// the share of the total vote weight received by each class. When no
// estimator casts a valid vote the result is the uniform distribution.
// Throws std::runtime_error when the ensemble has not been fitted.
torch::Tensor AdaBoost::predictProbaSample(const torch::Tensor& x) const
{
    if (!fitted || models.empty()) {
        throw std::runtime_error(CLASSIFIER_NOT_FITTED);
    }
    // Vote buffer: a stack array for up to 32 classes, heap storage otherwise
    constexpr int STACK_THRESHOLD = 32;
    double small_buffer[STACK_THRESHOLD];
    std::vector<double> large_buffer;
    double* tally = nullptr;
    if (n_classes > STACK_THRESHOLD) {
        large_buffer.resize(n_classes, 0.0);
        tally = large_buffer.data();
    } else {
        tally = small_buffer;
        std::fill_n(tally, n_classes, 0.0);
    }
    double vote_mass = 0.0;
    const size_t estimator_count = models.size();
    for (size_t idx = 0; idx < estimator_count; ++idx) {
        const double weight = alphas[idx];
        // Only estimators with a positive, finite alpha take part in the vote
        if (weight <= 0 || !std::isfinite(weight)) continue;
        const int voted_class = static_cast<DecisionTree*>(models[idx].get())->predictSample(x);
        if (__builtin_expect(voted_class >= 0 && voted_class < n_classes, 1)) {
            tally[voted_class] += weight;
            vote_mass += weight;
        }
    }
    torch::Tensor class_probs = torch::empty({ n_classes }, torch::TensorOptions().dtype(torch::kFloat32));
    auto out = class_probs.accessor<float, 1>();
    if (__builtin_expect(vote_mass > 0.0, 1)) {
        const double scale = 1.0 / vote_mass;
        for (int cls = 0; cls < n_classes; ++cls) {
            out[cls] = static_cast<float>(tally[cls] * scale);
        }
    } else {
        // No valid vote: fall back to the uniform distribution
        const float uniform_prob = 1.0f / n_classes;
        for (int cls = 0; cls < n_classes; ++cls) {
            out[cls] = uniform_prob;
        }
    }
    return class_probs;
}
// Returns a kFloat32 tensor of shape {n_samples, n_classes} with the class
// probabilities of every sample. `X` is laid out as features x samples
// (X.size(0) must equal the number of features seen during training).
// Throws std::runtime_error when the model is not fitted or when the number
// of features does not match.
// The caller's tensor is left untouched: the contiguous copy, when one is
// needed, is kept in a local variable instead of being assigned back to X.
torch::Tensor AdaBoost::predict_proba(torch::Tensor& X)
{
    if (!fitted || models.empty()) {
        throw std::runtime_error(CLASSIFIER_NOT_FITTED);
    }
    // Input validation
    if (X.size(0) != n) {
        throw std::runtime_error("Input has wrong number of features. Expected " +
            std::to_string(n) + " but got " + std::to_string(X.size(0)));
    }
    // contiguous() returns the same tensor when it is already contiguous
    const torch::Tensor X_contig = X.contiguous();
    const int n_samples = X_contig.size(1);
    // Pre-allocate the output, one row per sample
    torch::Tensor probabilities = torch::empty({ n_samples, n_classes },
        torch::TensorOptions().dtype(torch::kFloat32));
    for (int i = 0; i < n_samples; ++i) {
        // Column i holds the feature values of sample i
        const auto sample = X_contig.select(1, i);
        // Copy the per-sample distribution into row i of the output
        probabilities[i].copy_(predictProbaSample(sample));
    }
    return probabilities;
}
std::vector<std::vector<double>> AdaBoost::predict_proba(std::vector<std::vector<int>>& X)
{
const size_t n_samples = X[0].size();
// Pre-allocate result with exact size
std::vector<std::vector<double>> result;
result.reserve(n_samples);
// Avoid repeated allocations
for (size_t i = 0; i < n_samples; ++i) {
result.emplace_back(n_classes, 0.0);
}
// Convert to tensor only once (batch conversion is more efficient)
torch::Tensor X_tensor = platform::TensorUtils::to_matrix(X);
torch::Tensor proba_tensor = predict_proba(X_tensor);
// Optimized tensor-to-vector conversion
auto proba_accessor = proba_tensor.accessor<float, 2>();
for (size_t i = 0; i < n_samples; ++i) {
for (int j = 0; j < n_classes; ++j) {
result[i][j] = static_cast<double>(proba_accessor[i][j]);
}
}
return result;
}
// Returns a kInt32 tensor with the predicted class of every sample. `X` is
// laid out as features x samples (X.size(0) must equal the number of features
// seen during training).
// Throws std::runtime_error when the model is not fitted or when the number
// of features does not match.
// The caller's tensor is left untouched: the contiguous copy, when one is
// needed, is kept in a local variable instead of being assigned back to X.
torch::Tensor AdaBoost::predict(torch::Tensor& X)
{
    if (!fitted || models.empty()) {
        throw std::runtime_error(CLASSIFIER_NOT_FITTED);
    }
    if (X.size(0) != n) {
        throw std::runtime_error("Input has wrong number of features. Expected " +
            std::to_string(n) + " but got " + std::to_string(X.size(0)));
    }
    // contiguous() returns the same tensor when it is already contiguous
    const torch::Tensor X_contig = X.contiguous();
    const int n_samples = X_contig.size(1);
    // Pre-allocate with correct dtype
    torch::Tensor predictions = torch::empty({ n_samples }, torch::TensorOptions().dtype(torch::kInt32));
    auto pred_accessor = predictions.accessor<int32_t, 1>();
    for (int i = 0; i < n_samples; ++i) {
        // Column i holds the feature values of sample i
        const auto sample = X_contig.select(1, i);
        pred_accessor[i] = predictSample(sample);
    }
    return predictions;
}
std::vector<int> AdaBoost::predict(std::vector<std::vector<int>>& X)
{
// Single tensor conversion for batch processing
torch::Tensor X_tensor = platform::TensorUtils::to_matrix(X);
torch::Tensor predictions_tensor = predict(X_tensor);
// Optimized tensor-to-vector conversion
std::vector<int> result = platform::TensorUtils::to_vector<int>(predictions_tensor);
return result;
}
} // namespace bayesnet

View File

@@ -0,0 +1,81 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************
#ifndef ADABOOST_H
#define ADABOOST_H
#include <vector>
#include <memory>
#include "bayesnet/ensembles/Ensemble.h"
namespace bayesnet {
// AdaBoost ensemble classifier built on top of Ensemble. The implementation
// file trains DecisionTree base estimators and combines them by weighted vote.
class AdaBoost : public Ensemble {
public:
// n_estimators: maximum number of base estimators to train.
// max_depth: maximum depth of each base decision tree.
explicit AdaBoost(int n_estimators = 100, int max_depth = 1);
virtual ~AdaBoost() = default;
// Override base class methods
// Returns a textual graph description of the ensemble, one line per element.
std::vector<std::string> graph(const std::string& title = "") const override;
// AdaBoost specific methods
// NOTE(review): both setters assign first and validate afterwards, so the
// object keeps the rejected value when checkValues() throws.
void setNEstimators(int n_estimators) { this->n_estimators = n_estimators; checkValues(); }
int getNEstimators() const { return n_estimators; }
void setBaseMaxDepth(int depth) { this->base_max_depth = depth; checkValues(); }
int getBaseMaxDepth() const { return base_max_depth; }
// Get the weight of each base estimator
std::vector<double> getEstimatorWeights() const { return alphas; }
// Get training errors for each iteration
std::vector<double> getTrainingErrors() const { return training_errors; }
// Override setHyperparameters from BaseClassifier
void setHyperparameters(const nlohmann::json& hyperparameters) override;
// Prediction entry points; X is indexed as features x samples.
torch::Tensor predict(torch::Tensor& X) override;
std::vector<int> predict(std::vector<std::vector<int>>& X) override;
torch::Tensor predict_proba(torch::Tensor& X) override;
std::vector<std::vector<double>> predict_proba(std::vector<std::vector<int>>& X) override;
// Turns the debug output of the training loop on or off
void setDebug(bool debug) { this->debug = debug; }
protected:
// Runs the boosting loop; weights are the optional initial sample weights
void buildModel(const torch::Tensor& weights) override;
// Calls buildModel() and marks the classifier as fitted
void trainModel(const torch::Tensor& weights, const Smoothing_t smoothing) override;
private:
int n_estimators;
int base_max_depth; // Max depth for base decision trees
std::vector<double> alphas; // Weight of each base estimator
std::vector<double> training_errors; // Training error at each iteration
torch::Tensor sample_weights; // Current sample weights
int n_classes; // Number of classes in the target variable
int n; // Number of features
// Train a single base estimator
std::unique_ptr<Classifier> trainBaseEstimator(const torch::Tensor& weights);
// Calculate weighted error
double calculateWeightedError(Classifier* estimator, const torch::Tensor& weights);
// Update sample weights based on predictions
void updateSampleWeights(Classifier* estimator, double alpha);
// Normalize weights to sum to 1
void normalizeWeights();
// Check if hyperparameters values are valid
void checkValues() const;
// Make predictions for a single sample
int predictSample(const torch::Tensor& x) const;
// Make probabilistic predictions for a single sample
torch::Tensor predictProbaSample(const torch::Tensor& x) const;
bool debug = false; // Enable debug mode for debug output
};
}
#endif // ADABOOST_H

View File

@@ -0,0 +1,53 @@
#ifndef COUNTING_SEMAPHORE_H
#define COUNTING_SEMAPHORE_H
#include <mutex>
#include <condition_variable>
#include <algorithm>
#include <thread>
#include <mutex>
#include <condition_variable>
// Process-wide counting semaphore (singleton). The number of permits is 95%
// of std::thread::hardware_concurrency(), truncated, with a minimum of one.
class CountingSemaphore {
public:
    // Returns the single shared instance (created on first use).
    static CountingSemaphore& getInstance()
    {
        static CountingSemaphore instance;
        return instance;
    }
    // Delete copy constructor and assignment operator
    CountingSemaphore(const CountingSemaphore&) = delete;
    CountingSemaphore& operator=(const CountingSemaphore&) = delete;
    // Takes one permit, blocking until one is available.
    void acquire()
    {
        std::unique_lock<std::mutex> lock(mtx_);
        cv_.wait(lock, [this]() { return count_ > 0; });
        --count_;
    }
    // Gives one permit back and wakes one waiter.
    void release()
    {
        std::lock_guard<std::mutex> lock(mtx_);
        ++count_;
        if (count_ <= max_count_) {
            cv_.notify_one();
        }
    }
    // Number of permits currently available. Read under the mutex so that it
    // does not race with acquire()/release() running on other threads.
    unsigned int getCount() const
    {
        std::lock_guard<std::mutex> lock(mtx_);
        return count_;
    }
    // Total number of permits; fixed at construction, so no lock is needed.
    unsigned int getMaxCount() const
    {
        return max_count_;
    }
private:
    CountingSemaphore()
        : max_count_(std::max(1u, static_cast<unsigned int>(0.95 * std::thread::hardware_concurrency()))),
        count_(max_count_)
    {
    }
    mutable std::mutex mtx_; // mutable: locked from the const getter
    std::condition_variable cv_;
    const unsigned int max_count_;
    unsigned int count_;
};
#endif

View File

@@ -0,0 +1,495 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************
#include "DecisionTree.h"
#include <algorithm>
#include <numeric>
#include <sstream>
#include <iomanip>
#include <limits>
#include "common/TensorUtils.hpp"
namespace bayesnet {
// Create an unfitted decision tree with the given hyperparameters and register
// their names as the valid hyperparameters of this classifier.
// Throws std::invalid_argument (through checkValues) when any of the three values
// is not positive, the same rule the setters and setHyperparameters apply.
DecisionTree::DecisionTree(int max_depth, int min_samples_split, int min_samples_leaf)
    : Classifier(Network()), max_depth(max_depth),
    min_samples_split(min_samples_split), min_samples_leaf(min_samples_leaf)
{
    validHyperparameters = { "max_depth", "min_samples_split", "min_samples_leaf" };
    // Reject invalid values at construction instead of deferring the failure to a later setter call
    checkValues();
}
// Update the tree hyperparameters from a JSON object.
// Recognised keys are "max_depth", "min_samples_split" and "min_samples_leaf"; each one
// found is read as an int and removed from a working copy, which is then handed to
// Classifier::setHyperparameters. Finally the resulting values are validated with checkValues.
// If any step throws, the three hyperparameters are restored to the values they had on entry
// and the exception is rethrown, so a rejected update never leaves the tree half-modified.
void DecisionTree::setHyperparameters(const nlohmann::json& hyperparameters_)
{
    auto hyperparameters = hyperparameters_;
    // Snapshot used to roll back on failure
    const int previous_max_depth = max_depth;
    const int previous_min_samples_split = min_samples_split;
    const int previous_min_samples_leaf = min_samples_leaf;
    // Read an optional integer hyperparameter into target and drop it from the working copy
    auto extract = [&hyperparameters](const std::string& key, int& target) {
        auto it = hyperparameters.find(key);
        if (it != hyperparameters.end()) {
            target = it->get<int>();
            hyperparameters.erase(key);
        }
    };
    try {
        extract("max_depth", max_depth);
        extract("min_samples_split", min_samples_split);
        extract("min_samples_leaf", min_samples_leaf);
        Classifier::setHyperparameters(hyperparameters);
        checkValues();
    } catch (...) {
        max_depth = previous_max_depth;
        min_samples_split = previous_min_samples_split;
        min_samples_leaf = previous_min_samples_leaf;
        throw;
    }
}
// Validate the current hyperparameters.
// Throws std::invalid_argument for the first of max_depth, min_samples_leaf,
// min_samples_split (checked in that order) that is not strictly positive.
void DecisionTree::checkValues()
{
    auto requirePositive = [](int value, const char* message) {
        if (value <= 0) {
            throw std::invalid_argument(message);
        }
    };
    requirePositive(max_depth, "max_depth must be positive");
    requirePositive(min_samples_leaf, "min_samples_leaf must be positive");
    requirePositive(min_samples_split, "min_samples_split must be positive");
}
// Build the tree from the classifier's dataset.
// The dataset is read as rows = variables and columns = samples, the last row being the
// class labels; X is transposed so that its rows are samples.
// weights: optional per-sample weights. When undefined or empty, uniform weights are used.
// The weights are normalised to sum to 1 before the tree is grown.
// Throws std::runtime_error when sizes are inconsistent or when the weights do not have a
// positive sum (which would otherwise turn every weight into NaN during normalisation).
void DecisionTree::buildModel(const torch::Tensor& weights)
{
    // Extract features (X) and labels (y) from dataset
    auto X = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), torch::indexing::Slice() }).t();
    auto y = dataset.index({ -1, torch::indexing::Slice() });
    if (X.size(0) != y.size(0)) {
        throw std::runtime_error("X and y must have the same number of samples");
    }
    n_classes = states[className].size();
    // Use provided weights or uniform weights
    torch::Tensor sample_weights;
    if (weights.defined() && weights.numel() > 0) {
        if (weights.size(0) != X.size(0)) {
            throw std::runtime_error("weights must have the same length as number of samples");
        }
        sample_weights = weights;
    } else {
        sample_weights = torch::ones({ X.size(0) }) / X.size(0);
    }
    // Normalize weights; the negated comparison also rejects a NaN sum
    const double total_weight = sample_weights.sum().item<double>();
    if (!(total_weight > 0)) {
        throw std::runtime_error("sample weights must have a positive sum");
    }
    sample_weights = sample_weights / total_weight;
    // Build the tree
    root = buildTree(X, y, sample_weights, 0);
    // Mark as fitted
    fitted = true;
}
// Return true only when X, y and sample_weights have the same, non-zero size along dimension 0.
bool DecisionTree::validateTensors(const torch::Tensor& X, const torch::Tensor& y,
    const torch::Tensor& sample_weights) const
{
    const bool same_length = X.size(0) == y.size(0) && X.size(0) == sample_weights.size(0);
    return same_length && X.size(0) != 0;
}
// Recursively grow the (sub)tree for the samples in X / y.
//   X              - samples of this node, one row per sample
//   y              - class index of each sample
//   sample_weights - weight of each sample
//   current_depth  - depth of the node being built (the root is built with 0)
// A leaf is produced when the depth limit is reached, when there are fewer than
// min_samples_split samples, when all samples share one class, when there are at most
// min_samples_leaf samples, or when findBestSplit returns no split with a positive
// impurity decrease. Otherwise the node splits on "feature == value" (left) versus
// the rest (right) and both children are built recursively.
std::unique_ptr<TreeNode> DecisionTree::buildTree(
    const torch::Tensor& X,
    const torch::Tensor& y,
    const torch::Tensor& sample_weights,
    int current_depth)
{
    auto node = std::make_unique<TreeNode>();
    int n_samples = y.size(0);
    // Turn node into a leaf whose class probabilities are the weighted class frequencies
    auto makeLeaf = [&]() {
        node->is_leaf = true;
        node->class_probabilities = torch::zeros({ n_classes });
        for (int i = 0; i < n_samples; i++) {
            int class_idx = y[i].item<int>();
            node->class_probabilities[class_idx] += sample_weights[i].item<float>();
        }
        const double total = node->class_probabilities.sum().item<double>();
        if (total > 0) {
            node->class_probabilities /= total;
        } else if (n_classes > 0) {
            // All weights in this node are zero: dividing would yield NaN probabilities,
            // so fall back to a uniform distribution
            node->class_probabilities.fill_(1.0 / n_classes);
        }
        // Predicted class is the one with highest probability
        node->predicted_class = torch::argmax(node->class_probabilities).item<int>();
    };
    // Build one child: recurse when it holds enough samples, otherwise force a leaf
    // that predicts the most frequent label with probability 1
    auto buildChild = [&](const torch::Tensor& child_X, const torch::Tensor& child_y,
        const torch::Tensor& child_weights) -> std::unique_ptr<TreeNode> {
        if (child_X.size(0) >= min_samples_leaf) {
            return buildTree(child_X, child_y, child_weights, current_depth + 1);
        }
        auto leaf = std::make_unique<TreeNode>();
        leaf->is_leaf = true;
        auto mode = std::get<0>(torch::mode(child_y));
        leaf->predicted_class = mode.item<int>();
        leaf->class_probabilities = torch::zeros({ n_classes });
        leaf->class_probabilities[leaf->predicted_class] = 1.0;
        return leaf;
    };
    // Check stopping criteria
    auto unique = at::_unique(y);
    bool should_stop = (current_depth >= max_depth) ||
        (n_samples < min_samples_split) ||
        (std::get<0>(unique).size(0) == 1); // All samples same class
    if (should_stop || n_samples <= min_samples_leaf) {
        makeLeaf();
        return node;
    }
    // Find best split
    SplitInfo best_split = findBestSplit(X, y, sample_weights);
    // If no valid split found, create leaf
    if (best_split.feature_index == -1 || best_split.impurity_decrease <= 0) {
        makeLeaf();
        return node;
    }
    // Create internal node
    node->is_leaf = false;
    node->split_feature = best_split.feature_index;
    node->split_value = best_split.split_value;
    // Split data and recursively build subtrees
    node->left = buildChild(
        X.index({ best_split.left_mask }),
        y.index({ best_split.left_mask }),
        sample_weights.index({ best_split.left_mask }));
    node->right = buildChild(
        X.index({ best_split.right_mask }),
        y.index({ best_split.right_mask }),
        sample_weights.index({ best_split.right_mask }));
    return node;
}
// Search every (feature, value) pair for the split with the largest weighted Gini decrease.
// A candidate split sends samples whose feature equals the value to the left and all others
// to the right; candidates leaving fewer than min_samples_leaf samples on either side are
// skipped. When no candidate qualifies the returned SplitInfo keeps feature_index == -1.
DecisionTree::SplitInfo DecisionTree::findBestSplit(
    const torch::Tensor& X,
    const torch::Tensor& y,
    const torch::Tensor& sample_weights)
{
    SplitInfo winner;
    winner.feature_index = -1;
    winner.split_value = -1;
    winner.impurity_decrease = -std::numeric_limits<double>::infinity();
    const int feature_count = X.size(1);
    // Impurity and total weight of the node being split
    const double parent_impurity = calculateGiniImpurity(y, sample_weights);
    const double parent_weight = sample_weights.sum().item<double>();
    for (int feature = 0; feature < feature_count; ++feature) {
        const auto column = X.index({ torch::indexing::Slice(), feature });
        // Distinct values of this feature: sort, then collapse consecutive duplicates
        const auto sorted_column = std::get<0>(torch::sort(column));
        const auto candidates = std::get<0>(torch::unique_consecutive(sorted_column));
        for (int c = 0; c < candidates.size(0); ++c) {
            const int candidate = candidates[c].item<int>();
            const auto equal_mask = column == candidate;
            const auto other_mask = ~equal_mask;
            const int equal_count = equal_mask.sum().item<int>();
            const int other_count = other_mask.sum().item<int>();
            const bool big_enough = equal_count >= min_samples_leaf && other_count >= min_samples_leaf;
            if (!big_enough) {
                continue;
            }
            // Weighted impurity of each side
            const auto equal_y = y.index({ equal_mask });
            const auto equal_w = sample_weights.index({ equal_mask });
            const double equal_weight = equal_w.sum().item<double>();
            const double equal_impurity = calculateGiniImpurity(equal_y, equal_w);
            const auto other_y = y.index({ other_mask });
            const auto other_w = sample_weights.index({ other_mask });
            const double other_weight = other_w.sum().item<double>();
            const double other_impurity = calculateGiniImpurity(other_y, other_w);
            const double gain = parent_impurity -
                (equal_weight / parent_weight * equal_impurity +
                    other_weight / parent_weight * other_impurity);
            // Strict comparison: the first candidate reaching a given gain is kept
            if (gain > winner.impurity_decrease) {
                winner.feature_index = feature;
                winner.split_value = candidate;
                winner.impurity_decrease = gain;
                winner.left_mask = equal_mask;
                winner.right_mask = other_mask;
            }
        }
    }
    return winner;
}
// Weighted Gini impurity, 1 - sum(p_i^2), where p_i is the share of the total weight
// carried by class i.
// Returns 0.0 when either tensor is empty or when the accumulated weight is zero.
// Throws std::runtime_error when the tensors differ in length or when a label lies
// outside [0, n_classes).
double DecisionTree::calculateGiniImpurity(
    const torch::Tensor& y,
    const torch::Tensor& sample_weights)
{
    const auto sample_count = y.size(0);
    if (sample_count == 0 || sample_weights.size(0) == 0) {
        return 0.0;
    }
    if (sample_count != sample_weights.size(0)) {
        throw std::runtime_error("y and sample_weights must have same size");
    }
    // Accumulate the weight observed for every class
    torch::Tensor weight_per_class = torch::zeros({ n_classes });
    for (int s = 0; s < sample_count; ++s) {
        const int label = y[s].item<int>();
        const bool in_range = label >= 0 && label < n_classes;
        if (!in_range) {
            throw std::runtime_error("Invalid class index: " + std::to_string(label));
        }
        weight_per_class[label] += sample_weights[s].item<float>();
    }
    const double weight_sum = weight_per_class.sum().item<double>();
    if (weight_sum == 0) {
        return 0.0;
    }
    weight_per_class /= weight_sum;
    double impurity = 1.0;
    for (int cls = 0; cls < n_classes; ++cls) {
        const double share = weight_per_class[cls].item<double>();
        impurity -= share * share;
    }
    return impurity;
}
// Predict a class for every column of X (sample i is read as X[:, i]).
// Returns an int32 tensor with one predicted class per sample.
// Throws std::runtime_error if the tree has not been fitted.
torch::Tensor DecisionTree::predict(torch::Tensor& X)
{
    if (!fitted) {
        throw std::runtime_error(CLASSIFIER_NOT_FITTED);
    }
    const int total = X.size(1);
    torch::Tensor labels = torch::zeros({ total }, torch::kInt32);
    for (int column = 0; column < total; ++column) {
        const auto sample_features = X.index({ torch::indexing::Slice(), column }).ravel();
        labels[column] = predictSample(sample_features);
    }
    return labels;
}
std::vector<int> DecisionTree::predict(std::vector<std::vector<int>>& X)
{
// Convert to tensor
long n = X.size();
long m = X.at(0).size();
torch::Tensor X_tensor = platform::TensorUtils::to_matrix(X);
auto predictions = predict(X_tensor);
std::vector<int> result = platform::TensorUtils::to_vector<int>(predictions);
return result;
}
// Class probabilities for every column of X (sample i is read as X[:, i]).
// Returns a tensor of shape (n_samples, n_classes); row i is the probability vector
// stored in the leaf reached by sample i.
// Throws std::runtime_error if the tree has not been fitted.
torch::Tensor DecisionTree::predict_proba(torch::Tensor& X)
{
    if (!fitted) {
        throw std::runtime_error(CLASSIFIER_NOT_FITTED);
    }
    const int total = X.size(1);
    torch::Tensor result = torch::zeros({ total, n_classes });
    for (int column = 0; column < total; ++column) {
        const auto sample_features = X.index({ torch::indexing::Slice(), column }).ravel();
        result[column] = predictProbaSample(sample_features);
    }
    return result;
}
std::vector<std::vector<double>> DecisionTree::predict_proba(std::vector<std::vector<int>>& X)
{
auto n_samples = X.at(0).size();
// Convert to tensor
torch::Tensor X_tensor = platform::TensorUtils::to_matrix(X);
auto proba_tensor = predict_proba(X_tensor);
std::vector<std::vector<double>> result(n_samples, std::vector<double>(n_classes, 0.0));
for (int i = 0; i < n_samples; i++) {
for (int j = 0; j < n_classes; j++) {
result[i][j] = proba_tensor[i][j].item<double>();
}
}
return result;
}
// Predict the class of a single sample x (one value per feature).
// Throws std::runtime_error if the tree is not fitted or if x does not hold n values.
int DecisionTree::predictSample(const torch::Tensor& x) const
{
    if (!fitted) {
        throw std::runtime_error(CLASSIFIER_NOT_FITTED);
    }
    // n should be the number of features
    const bool right_width = x.size(0) == n;
    if (!right_width) {
        throw std::runtime_error("Input sample has wrong number of features");
    }
    return traverseTree(x, root.get())->predicted_class;
}
// Return a copy (clone) of the class-probability tensor stored in the leaf reached by x.
// Unlike predictSample, no fitted or feature-count check is made here; traverseTree
// throws std::runtime_error when the root is null.
torch::Tensor DecisionTree::predictProbaSample(const torch::Tensor& x) const
{
const TreeNode* leaf = traverseTree(x, root.get());
return leaf->class_probabilities.clone();
}
// Walk from node down to the leaf selected by sample x and return that leaf.
// At each internal node the sample goes left when x[split_feature] equals split_value
// and right otherwise.
// Throws std::runtime_error on a null starting node, an out-of-range split feature,
// or a missing child.
const TreeNode* DecisionTree::traverseTree(const torch::Tensor& x, const TreeNode* node) const
{
    const TreeNode* current = node;
    while (true) {
        if (current == nullptr) {
            throw std::runtime_error("Null node encountered during tree traversal");
        }
        if (current->is_leaf) {
            return current;
        }
        const int feature = current->split_feature;
        if (feature < 0 || feature >= x.size(0)) {
            throw std::runtime_error("Invalid split_feature index: " + std::to_string(current->split_feature));
        }
        const bool go_left = x[feature].item<int>() == current->split_value;
        const auto& child = go_left ? current->left : current->right;
        if (!child) {
            throw std::runtime_error(go_left ? "Missing left child in tree" : "Missing right child in tree");
        }
        current = child.get();
    }
}
// Render the tree as Graphviz DOT source, one line per vector element.
// A non-empty title is emitted as the graph label placed at the top.
// When there is no root only the graph preamble and closing brace are returned.
std::vector<std::string> DecisionTree::graph(const std::string& title) const
{
    std::vector<std::string> dot = {
        "digraph DecisionTree {",
        " rankdir=TB;",
        " node [shape=box, style=\"filled, rounded\", fontname=\"helvetica\"];",
        " edge [fontname=\"helvetica\"];"
    };
    if (!title.empty()) {
        dot.push_back(" label=\"" + title + "\";");
        dot.push_back(" labelloc=t;");
    }
    if (root) {
        int next_id = 0; // running counter used to give every node a unique DOT id
        treeToGraph(root.get(), dot, next_id);
    }
    dot.push_back("}");
    return dot;
}
// Append the DOT description of node and its whole subtree to lines.
//   node_id    - next free node number; incremented for every node emitted
//   parent_id  - id of the parent node, or a negative value for the root (no edge emitted)
//   edge_label - label of the edge coming from the parent ("Yes" for left children,
//                "No" for right children)
// Leaves show the predicted class and its probability with three decimals; internal
// nodes show the "feature = value?" test.
void DecisionTree::treeToGraph(
    const TreeNode* node,
    std::vector<std::string>& lines,
    int& node_id,
    int parent_id,
    const std::string& edge_label) const
{
    const int this_id = node_id++;
    // Node declaration
    std::stringstream declaration;
    declaration << " node" << this_id << " [label=\"";
    if (node->is_leaf) {
        declaration << "Class: " << node->predicted_class
            << "\\nProb: " << std::fixed << std::setprecision(3)
            << node->class_probabilities[node->predicted_class].item<float>()
            << "\", fillcolor=\"lightblue\"];";
    } else {
        declaration << features[node->split_feature]
            << " = " << node->split_value << "?\", fillcolor=\"lightgreen\"];";
    }
    lines.push_back(declaration.str());
    // Edge from the parent, if any
    if (parent_id >= 0) {
        std::stringstream edge;
        edge << " node" << parent_id << " -> node" << this_id;
        if (edge_label.empty()) {
            edge << ";";
        } else {
            edge << " [label=\"" << edge_label << "\"];";
        }
        lines.push_back(edge.str());
    }
    if (node->is_leaf) {
        return;
    }
    // Children: left is the "equal" branch, right is the "different" branch
    if (node->left) {
        treeToGraph(node->left.get(), lines, node_id, this_id, "Yes");
    }
    if (node->right) {
        treeToGraph(node->right.get(), lines, node_id, this_id, "No");
    }
}
} // namespace bayesnet

View File

@@ -0,0 +1,134 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************
#ifndef DECISION_TREE_H
#define DECISION_TREE_H
#include <memory>
#include <vector>
#include <map>
#include <torch/torch.h>
#include "bayesnet/classifiers/Classifier.h"
namespace bayesnet {
// Forward declaration
struct TreeNode;
// Decision tree classifier that splits on "feature == value" tests chosen by weighted
// Gini impurity decrease. Hyperparameters: max_depth, min_samples_split, min_samples_leaf;
// all three must be positive (enforced by checkValues).
class DecisionTree : public Classifier {
public:
    explicit DecisionTree(int max_depth = 3, int min_samples_split = 2, int min_samples_leaf = 1);
    virtual ~DecisionTree() = default;
    // Override graph method to show tree structure (Graphviz DOT, one line per element)
    std::vector<std::string> graph(const std::string& title = "") const override;
    // Setters for hyperparameters. Each one throws std::invalid_argument when the
    // resulting configuration is invalid and, in that case, keeps the previous value.
    void setMaxDepth(int depth) { assignChecked(max_depth, depth); }
    void setMinSamplesSplit(int samples) { assignChecked(min_samples_split, samples); }
    void setMinSamplesLeaf(int samples) { assignChecked(min_samples_leaf, samples); }
    int getMaxDepth() const { return max_depth; }
    int getMinSamplesSplit() const { return min_samples_split; }
    int getMinSamplesLeaf() const { return min_samples_leaf; }
    // Override setHyperparameters
    void setHyperparameters(const nlohmann::json& hyperparameters) override;
    torch::Tensor predict(torch::Tensor& X) override;
    std::vector<int> predict(std::vector<std::vector<int>>& X) override;
    torch::Tensor predict_proba(torch::Tensor& X) override;
    std::vector<std::vector<double>> predict_proba(std::vector<std::vector<int>>& X) override;
    // Make predictions for a single sample
    int predictSample(const torch::Tensor& x) const;
    // Make probabilistic predictions for a single sample
    torch::Tensor predictProbaSample(const torch::Tensor& x) const;
protected:
    void buildModel(const torch::Tensor& weights) override;
    void trainModel(const torch::Tensor& weights, const Smoothing_t smoothing) override
    {
        // Decision trees do not require training in the traditional sense
        // as they are built from the data directly.
        // This method can be used to set weights or other parameters if needed.
    }
private:
    // Throw std::invalid_argument when any hyperparameter is not positive
    void checkValues();
    // Store value in target, validate with checkValues(), and restore the previous value
    // before rethrowing if validation fails
    void assignChecked(int& target, int value)
    {
        const int previous = target;
        target = value;
        try {
            checkValues();
        } catch (...) {
            target = previous;
            throw;
        }
    }
    bool validateTensors(const torch::Tensor& X, const torch::Tensor& y, const torch::Tensor& sample_weights) const;
    // Tree hyperparameters
    int max_depth;
    int min_samples_split;
    int min_samples_leaf;
    int n_classes = 0; // Number of classes in the target variable (set by buildModel)
    // Root of the decision tree
    std::unique_ptr<TreeNode> root;
    // Build tree recursively
    std::unique_ptr<TreeNode> buildTree(
        const torch::Tensor& X,
        const torch::Tensor& y,
        const torch::Tensor& sample_weights,
        int current_depth
    );
    // Result of the search for the best split of a node
    struct SplitInfo {
        int feature_index;
        int split_value;
        double impurity_decrease;
        torch::Tensor left_mask;
        torch::Tensor right_mask;
    };
    SplitInfo findBestSplit(
        const torch::Tensor& X,
        const torch::Tensor& y,
        const torch::Tensor& sample_weights
    );
    // Calculate weighted Gini impurity for multi-class
    double calculateGiniImpurity(
        const torch::Tensor& y,
        const torch::Tensor& sample_weights
    );
    // Traverse tree to find leaf node
    const TreeNode* traverseTree(const torch::Tensor& x, const TreeNode* node) const;
    // Convert tree to graph representation
    void treeToGraph(
        const TreeNode* node,
        std::vector<std::string>& lines,
        int& node_id,
        int parent_id = -1,
        const std::string& edge_label = ""
    ) const;
};
// Tree node structure
// Node of the decision tree. Internal nodes use the split_* fields and the two children;
// leaves use predicted_class and class_probabilities. Index-like fields start at -1.
struct TreeNode {
    bool is_leaf = false;
    // For internal nodes: samples with feature split_feature equal to split_value go left
    int split_feature = -1;
    int split_value = -1;
    std::unique_ptr<TreeNode> left;
    std::unique_ptr<TreeNode> right;
    // For leaf nodes
    int predicted_class = -1;
    torch::Tensor class_probabilities; // Probability for each class
    TreeNode() {}
};
} // namespace bayesnet
#endif // DECISION_TREE_H

View File

@@ -0,0 +1,182 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************
#include "ExpClf.h"
#include "common/TensorUtils.hpp"
namespace platform {
ExpClf::ExpClf() : semaphore_{ CountingSemaphore::getInstance() }, Boost(false)
{
validHyperparameters = {};
}
//
// Parents
//
void ExpClf::add_active_parents(const std::vector<int>& active_parents)
{
for (const auto& parent : active_parents)
aode_.add_active_parent(parent);
}
// Register a single parent with the underlying Xaode model (delegates to aode_.add_active_parent).
void ExpClf::add_active_parent(int parent)
{
aode_.add_active_parent(parent);
}
// Undo the most recent parent registration (delegates to aode_.remove_last_parent).
void ExpClf::remove_last_parent()
{
aode_.remove_last_parent();
}
//
// Predict
//
// Predict every test sample with aode_.predict_spode(instance, parent), in parallel.
// test_data is indexed as [feature][sample]. The samples are cut into chunks of at most
// 150; one thread is started per chunk after acquiring a permit from the shared
// CountingSemaphore, and each worker releases its permit when its chunk is done.
// Returns one prediction per sample; an empty test_data yields an empty vector.
std::vector<int> ExpClf::predict_spode(std::vector<std::vector<int>>& test_data, int parent)
{
    if (test_data.empty()) {
        return {}; // nothing to index: avoids reading test_data[0]
    }
    int test_size = test_data[0].size();
    int sample_size = test_data.size();
    auto predictions = std::vector<int>(test_size);
    int chunk_size = std::min(150, int(test_size / semaphore_.getMaxCount()) + 1);
    std::vector<std::thread> threads;
    auto worker = [&](const std::vector<std::vector<int>>& samples, int begin, int chunk, int sample_size, std::vector<int>& predictions) {
        std::string threadName = "(V)PWorker-" + std::to_string(begin) + "-" + std::to_string(chunk);
#if defined(__linux__)
        pthread_setname_np(pthread_self(), threadName.c_str());
#else
        pthread_setname_np(threadName.c_str());
#endif
        std::vector<int> instance(sample_size);
        for (int sample = begin; sample < begin + chunk; ++sample) {
            // Gather column `sample` of the [feature][sample] matrix into one instance
            for (int feature = 0; feature < sample_size; ++feature) {
                instance[feature] = samples[feature][sample];
            }
            predictions[sample] = aode_.predict_spode(instance, parent);
        }
        semaphore_.release();
    };
    for (int begin = 0; begin < test_size; begin += chunk_size) {
        int chunk = std::min(chunk_size, test_size - begin);
        semaphore_.acquire();
        // std::thread copies its arguments: pass the data through std::cref so that each
        // worker reads the caller's matrix instead of receiving its own full copy.
        // All threads are joined below, so the reference outlives every worker.
        threads.emplace_back(worker, std::cref(test_data), begin, chunk, sample_size, std::ref(predictions));
    }
    for (auto& thread : threads) {
        thread.join();
    }
    return predictions;
}
// Tensor overload of predict: converts X with TensorUtils::to_matrix, runs the
// vector-based predict and wraps the resulting labels in a tensor.
torch::Tensor ExpClf::predict(torch::Tensor& X)
{
    auto matrix = TensorUtils::to_matrix(X);
    return torch::tensor(predict(matrix));
}
// Tensor overload of predict_proba: converts X with TensorUtils::to_matrix, runs the
// vector-based predict_proba and copies the result into a tensor of shape
// (X.size(1), number of classes), the class count being the length of the first row.
torch::Tensor ExpClf::predict_proba(torch::Tensor& X)
{
    auto matrix = TensorUtils::to_matrix(X);
    auto per_sample = predict_proba(matrix);
    auto rows = X.size(1);
    int columns = per_sample[0].size();
    auto result = torch::zeros({ rows, columns });
    for (int r = 0; r < rows; ++r) {
        const auto& source_row = per_sample[r];
        for (int c = 0; c < columns; ++c) {
            result[r][c] = source_row[c];
        }
    }
    return result;
}
// Tensor overload of score: converts X with TensorUtils::to_matrix and y with
// TensorUtils::to_vector<int>, then delegates to the vector-based score overload.
float ExpClf::score(torch::Tensor& X, torch::Tensor& y)
{
auto X_ = TensorUtils::to_matrix(X);
auto y_ = TensorUtils::to_vector<int>(y);
return score(X_, y_);
}
// Class probabilities for every test sample, computed in parallel with aode_.predict_proba.
// test_data is indexed as [feature][sample]. The samples are cut into chunks of at most
// 150; one thread is started per chunk after acquiring a permit from the shared
// CountingSemaphore, and each worker releases its permit when its chunk is done.
// Returns one probability vector per sample; an empty test_data yields an empty result.
std::vector<std::vector<double>> ExpClf::predict_proba(const std::vector<std::vector<int>>& test_data)
{
    if (test_data.empty()) {
        return {}; // nothing to index: avoids reading test_data[0]
    }
    int test_size = test_data[0].size();
    int sample_size = test_data.size();
    auto probabilities = std::vector<std::vector<double>>(test_size, std::vector<double>(aode_.statesClass()));
    int chunk_size = std::min(150, int(test_size / semaphore_.getMaxCount()) + 1);
    std::vector<std::thread> threads;
    auto worker = [&](const std::vector<std::vector<int>>& samples, int begin, int chunk, int sample_size, std::vector<std::vector<double>>& predictions) {
        std::string threadName = "(V)PWorker-" + std::to_string(begin) + "-" + std::to_string(chunk);
#if defined(__linux__)
        pthread_setname_np(pthread_self(), threadName.c_str());
#else
        pthread_setname_np(threadName.c_str());
#endif
        std::vector<int> instance(sample_size);
        for (int sample = begin; sample < begin + chunk; ++sample) {
            // Gather column `sample` of the [feature][sample] matrix into one instance
            for (int feature = 0; feature < sample_size; ++feature) {
                instance[feature] = samples[feature][sample];
            }
            predictions[sample] = aode_.predict_proba(instance);
        }
        semaphore_.release();
    };
    for (int begin = 0; begin < test_size; begin += chunk_size) {
        int chunk = std::min(chunk_size, test_size - begin);
        semaphore_.acquire();
        // std::thread copies its arguments: pass the data through std::cref so that each
        // worker reads the caller's matrix instead of receiving its own full copy.
        // All threads are joined below, so the reference outlives every worker.
        threads.emplace_back(worker, std::cref(test_data), begin, chunk, sample_size, std::ref(probabilities));
    }
    for (auto& thread : threads) {
        thread.join();
    }
    return probabilities;
}
// Predict the class of every sample as the index of its largest probability
// (the first one when several are equal, as std::max_element does).
// Throws std::logic_error if the classifier has not been fitted.
std::vector<int> ExpClf::predict(std::vector<std::vector<int>>& test_data)
{
    if (!fitted) {
        throw std::logic_error(CLASSIFIER_NOT_FITTED);
    }
    const auto probabilities = predict_proba(test_data);
    std::vector<int> labels;
    labels.reserve(probabilities.size());
    for (const auto& class_probabilities : probabilities) {
        const auto best = std::max_element(class_probabilities.begin(), class_probabilities.end());
        labels.push_back(std::distance(class_probabilities.begin(), best));
    }
    return labels;
}
float ExpClf::score(std::vector<std::vector<int>>& test_data, std::vector<int>& labels)
{
Timer timer;
timer.start();
std::vector<int> predictions = predict(test_data);
int correct = 0;
for (size_t i = 0; i < predictions.size(); i++) {
if (predictions[i] == labels[i]) {
correct++;
}
}
if (debug) {
std::cout << "* Time to predict: " << timer.getDurationString() << std::endl;
}
return static_cast<float>(correct) / predictions.size();
}
//
// statistics
//
// Number of nodes as reported by the underlying Xaode model (aode_.getNumberOfNodes).
int ExpClf::getNumberOfNodes() const
{
return aode_.getNumberOfNodes();
}
// Number of edges as reported by the underlying Xaode model (aode_.getNumberOfEdges).
int ExpClf::getNumberOfEdges() const
{
return aode_.getNumberOfEdges();
}
// Number of states as reported by the underlying Xaode model (aode_.getNumberOfStates).
int ExpClf::getNumberOfStates() const
{
return aode_.getNumberOfStates();
}
// Number of states of the class variable as reported by aode_.statesClass().
int ExpClf::getClassNumStates() const
{
return aode_.statesClass();
}
}

View File

@@ -0,0 +1,67 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************
#ifndef EXPCLF_H
#define EXPCLF_H
#include <vector>
#include <string>
#include <cmath>
#include <algorithm>
#include <limits>
#include <bayesnet/ensembles/Boost.h>
#include <bayesnet/network/Smoothing.h>
#include "common/Timer.hpp"
#include "CountingSemaphore.hpp"
#include "Xaode.hpp"
namespace platform {
// Experimental classifier built on bayesnet::Boost that delegates its predictions and
// statistics to an Xaode model and runs predictions on worker threads limited by the
// shared CountingSemaphore.
class ExpClf : public bayesnet::Boost {
public:
    ExpClf();
    virtual ~ExpClf() = default;
    std::vector<int> predict(std::vector<std::vector<int>>& X) override;
    torch::Tensor predict(torch::Tensor& X) override;
    torch::Tensor predict_proba(torch::Tensor& X) override;
    // Predict with aode_.predict_spode for the given parent; test_data is [feature][sample]
    std::vector<int> predict_spode(std::vector<std::vector<int>>& test_data, int parent);
    std::vector<std::vector<double>> predict_proba(const std::vector<std::vector<int>>& X);
    float score(std::vector<std::vector<int>>& X, std::vector<int>& y) override;
    float score(torch::Tensor& X, torch::Tensor& y) override;
    int getNumberOfNodes() const override;
    int getNumberOfEdges() const override;
    int getNumberOfStates() const override;
    int getClassNumStates() const override;
    // The following overrides return empty results
    std::vector<std::string> show() const override { return {}; }
    std::vector<std::string> topological_order() override { return {}; }
    std::string dump_cpt() const override { return ""; }
    // Enable or disable timing output in score()
    void setDebug(bool debug) { this->debug = debug; }
    bayesnet::status_t getStatus() const override { return status; }
    std::vector<std::string> getNotes() const override { return notes; }
    std::vector<std::string> graph(const std::string& title = "") const override { return {}; }
    void add_active_parents(const std::vector<int>& active_parents);
    void add_active_parent(int parent);
    void remove_last_parent();
    // Hyperparameters are accepted but ignored
    void setHyperparameters(const nlohmann::json& hyperparameters_) override {}
protected:
    bool debug = false;
    Xaode aode_;
    torch::Tensor weights_;
    const std::string CLASSIFIER_NOT_FITTED = "Classifier has not been fitted";
    // Rescale weights_ so that they sum to num_instances.
    // When the current sum is zero, weights_ is replaced by num_instances ones.
    inline void normalize_weights(int num_instances)
    {
        double sum = weights_.sum().item<double>();
        if (sum == 0) {
            weights_ = torch::full({ num_instances }, 1.0);
        } else {
            // One in-place tensor operation instead of a per-element item() round trip
            weights_.mul_(static_cast<double>(num_instances) / sum);
        }
    }
private:
    CountingSemaphore& semaphore_;
};
}
#endif // EXPCLF_H

View File

@@ -0,0 +1,158 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************
#include "ExpEnsemble.h"
#include "common/TensorUtils.hpp"
namespace platform {
ExpEnsemble::ExpEnsemble() : semaphore_{ CountingSemaphore::getInstance() }, Boost(false)
{
validHyperparameters = {};
}
//
// Parents
//
void ExpEnsemble::add_model(std::unique_ptr<XSpode> model)
{
models.push_back(std::move(model));
n_models++;
}
void ExpEnsemble::remove_last_model()
{
models.pop_back();
n_models--;
}
//
// Predict
//
// Tensor overload of predict: converts X with TensorUtils::to_matrix, runs the
// vector-based predict and wraps the resulting labels in a tensor.
torch::Tensor ExpEnsemble::predict(torch::Tensor& X)
{
    auto matrix = TensorUtils::to_matrix(X);
    return torch::tensor(predict(matrix));
}
// Tensor overload of predict_proba: converts X with TensorUtils::to_matrix, runs the
// vector-based predict_proba and copies the result into a tensor of shape
// (X.size(1), number of classes), the class count being the length of the first row.
torch::Tensor ExpEnsemble::predict_proba(torch::Tensor& X)
{
    auto matrix = TensorUtils::to_matrix(X);
    auto per_sample = predict_proba(matrix);
    auto rows = X.size(1);
    int columns = per_sample[0].size();
    auto result = torch::zeros({ rows, columns });
    for (int r = 0; r < rows; ++r) {
        const auto& source_row = per_sample[r];
        for (int c = 0; c < columns; ++c) {
            result[r][c] = source_row[c];
        }
    }
    return result;
}
// Tensor overload of score: converts X with TensorUtils::to_matrix and y with
// TensorUtils::to_vector<int>, then delegates to the vector-based score overload.
float ExpEnsemble::score(torch::Tensor& X, torch::Tensor& y)
{
auto X_ = TensorUtils::to_matrix(X);
auto y_ = TensorUtils::to_vector<int>(y);
return score(X_, y_);
}
// Class probabilities for every test sample. test_data is indexed as [feature][sample].
// The samples are cut into chunks of at most 150; one thread is started per chunk after
// acquiring a permit from the shared CountingSemaphore, and each worker releases its
// permit when its chunk is done.
// TODO: the workers currently only assemble each instance; the line that would compute
// its probabilities is disabled, so every returned probability is 0.0.
// An empty test_data yields an empty result.
std::vector<std::vector<double>> ExpEnsemble::predict_proba(const std::vector<std::vector<int>>& test_data)
{
    if (test_data.empty()) {
        return {}; // nothing to index: avoids reading test_data[0]
    }
    int test_size = test_data[0].size();
    int sample_size = test_data.size();
    auto probabilities = std::vector<std::vector<double>>(test_size, std::vector<double>(getClassNumStates()));
    int chunk_size = std::min(150, int(test_size / semaphore_.getMaxCount()) + 1);
    std::vector<std::thread> threads;
    auto worker = [&](const std::vector<std::vector<int>>& samples, int begin, int chunk, int sample_size, std::vector<std::vector<double>>& predictions) {
        std::string threadName = "(V)PWorker-" + std::to_string(begin) + "-" + std::to_string(chunk);
#if defined(__linux__)
        pthread_setname_np(pthread_self(), threadName.c_str());
#else
        pthread_setname_np(threadName.c_str());
#endif
        std::vector<int> instance(sample_size);
        for (int sample = begin; sample < begin + chunk; ++sample) {
            // Gather column `sample` of the [feature][sample] matrix into one instance
            for (int feature = 0; feature < sample_size; ++feature) {
                instance[feature] = samples[feature][sample];
            }
            // predictions[sample] = aode_.predict_proba(instance);
        }
        semaphore_.release();
    };
    for (int begin = 0; begin < test_size; begin += chunk_size) {
        int chunk = std::min(chunk_size, test_size - begin);
        semaphore_.acquire();
        // std::thread copies its arguments: pass the data through std::cref so that each
        // worker reads the caller's matrix instead of receiving its own full copy.
        // All threads are joined below, so the reference outlives every worker.
        threads.emplace_back(worker, std::cref(test_data), begin, chunk, sample_size, std::ref(probabilities));
    }
    for (auto& thread : threads) {
        thread.join();
    }
    return probabilities;
}
// Predict the class of every sample as the index of its largest probability
// (the first one when several are equal, as std::max_element does).
// Throws std::logic_error if the classifier has not been fitted.
std::vector<int> ExpEnsemble::predict(std::vector<std::vector<int>>& test_data)
{
    if (!fitted) {
        throw std::logic_error(CLASSIFIER_NOT_FITTED);
    }
    const auto probabilities = predict_proba(test_data);
    std::vector<int> labels;
    labels.reserve(probabilities.size());
    for (const auto& class_probabilities : probabilities) {
        const auto best = std::max_element(class_probabilities.begin(), class_probabilities.end());
        labels.push_back(std::distance(class_probabilities.begin(), best));
    }
    return labels;
}
float ExpEnsemble::score(std::vector<std::vector<int>>& test_data, std::vector<int>& labels)
{
Timer timer;
timer.start();
std::vector<int> predictions = predict(test_data);
int correct = 0;
for (size_t i = 0; i < predictions.size(); i++) {
if (predictions[i] == labels[i]) {
correct++;
}
}
if (debug) {
std::cout << "* Time to predict: " << timer.getDurationString() << std::endl;
}
return static_cast<float>(correct) / predictions.size();
}
//
// statistics
//
// Total node count: n_models times (features of the first model + 1); 0 for an empty ensemble.
int ExpEnsemble::getNumberOfNodes() const
{
    if (!models_.empty()) {
        const auto& first_model = models_.at(0);
        return n_models * (first_model->getNFeatures() + 1);
    }
    return 0;
}
// Total edge count: n_models times (2 * features of the first model - 1); 0 for an empty ensemble.
int ExpEnsemble::getNumberOfEdges() const
{
    if (!models_.empty()) {
        const auto& first_model = models_.at(0);
        return n_models * (2 * first_model->getNFeatures() - 1);
    }
    return 0;
}
// Sum of the first model's states, multiplied by its feature count and by n_models;
// 0 for an empty ensemble.
int ExpEnsemble::getNumberOfStates() const
{
    if (models_.empty()) {
        return 0;
    }
    const auto& first_model = models_.at(0);
    auto states = first_model->getStates();
    int nFeatures = first_model->getNFeatures();
    return std::accumulate(states.begin(), states.end(), 0) * nFeatures * n_models;
}
// Number of class states reported by the first model (statesClass); 0 for an empty ensemble.
int ExpEnsemble::getClassNumStates() const
{
    if (!models_.empty()) {
        return models_.at(0)->statesClass();
    }
    return 0;
}
}

View File

@@ -0,0 +1,66 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************
#ifndef EXPENSEMBLE_H
#define EXPENSEMBLE_H
#include <vector>
#include <string>
#include <cmath>
#include <algorithm>
#include <limits>
#include <bayesnet/ensembles/Boost.h>
#include <bayesnet/network/Smoothing.h>
#include "common/Timer.hpp"
#include "CountingSemaphore.hpp"
#include "XSpode.hpp"
namespace platform {
// Experimental ensemble built on bayesnet::Boost that owns a list of XSpode models and
// runs predictions on worker threads limited by the shared CountingSemaphore.
class ExpEnsemble : public bayesnet::Boost {
public:
    ExpEnsemble();
    virtual ~ExpEnsemble() = default;
    std::vector<int> predict(std::vector<std::vector<int>>& X) override;
    torch::Tensor predict(torch::Tensor& X) override;
    torch::Tensor predict_proba(torch::Tensor& X) override;
    // NOTE(review): no definition of predict_spode is visible next to the other
    // ExpEnsemble member definitions — confirm it is implemented before calling it.
    std::vector<int> predict_spode(std::vector<std::vector<int>>& test_data, int parent);
    std::vector<std::vector<double>> predict_proba(const std::vector<std::vector<int>>& X);
    float score(std::vector<std::vector<int>>& X, std::vector<int>& y) override;
    float score(torch::Tensor& X, torch::Tensor& y) override;
    int getNumberOfNodes() const override;
    int getNumberOfEdges() const override;
    int getNumberOfStates() const override;
    int getClassNumStates() const override;
    // The following overrides return empty results
    std::vector<std::string> show() const override { return {}; }
    std::vector<std::string> topological_order() override { return {}; }
    std::string dump_cpt() const override { return ""; }
    // Enable or disable timing output in score()
    void setDebug(bool debug) { this->debug = debug; }
    bayesnet::status_t getStatus() const override { return status; }
    std::vector<std::string> getNotes() const override { return notes; }
    std::vector<std::string> graph(const std::string& title = "") const override { return {}; }
protected:
    // Take ownership of a model and append it to the ensemble
    void add_model(std::unique_ptr<XSpode> model);
    // Drop the most recently added model
    void remove_last_model();
    bool debug = false;
    std::vector <std::unique_ptr<XSpode>> models_;
    torch::Tensor weights_;
    std::vector<double> significanceModels_;
    const std::string CLASSIFIER_NOT_FITTED = "Classifier has not been fitted";
    // Rescale weights_ so that they sum to num_instances.
    // When the current sum is zero, weights_ is replaced by num_instances ones.
    inline void normalize_weights(int num_instances)
    {
        double sum = weights_.sum().item<double>();
        if (sum == 0) {
            weights_ = torch::full({ num_instances }, 1.0);
        } else {
            // One in-place tensor operation instead of a per-element item() round trip
            weights_.mul_(static_cast<double>(num_instances) / sum);
        }
    }
private:
    CountingSemaphore& semaphore_;
};
}
#endif // EXPENSEMBLE_H

View File

@@ -0,0 +1,142 @@
# AdaBoost and DecisionTree Classifier Implementation
This implementation provides both a Decision Tree classifier and a multi-class AdaBoost classifier based on the SAMME (Stagewise Additive Modeling using a Multi-class Exponential loss) algorithm described in the paper "Multi-class AdaBoost" by Zhu et al. (2009). Both classifiers are implemented in C++ and were written with the help of Claude (<https://claude.ai>).
## Components
### 1. DecisionTree Classifier
A classic decision tree implementation that:
- Supports multi-class classification
- Handles weighted samples (essential for boosting)
- Uses Gini impurity as the splitting criterion
- Works with discrete/categorical features
- Provides both class predictions and probability estimates
#### Key Features
- **Max Depth Control**: Limit tree depth to create weak learners
- **Minimum Samples**: Control minimum samples for splitting and leaf nodes
- **Weighted Training**: Properly handles sample weights for boosting
- **Visualization**: Generates DOT format graphs of the tree structure
#### Hyperparameters
- `max_depth`: Maximum depth of the tree (default: 3)
- `min_samples_split`: Minimum samples required to split a node (default: 2)
- `min_samples_leaf`: Minimum samples required in a leaf node (default: 1)
### 2. AdaBoost Classifier
A multi-class AdaBoost implementation using DecisionTree as base estimators:
- **SAMME Algorithm**: Implements the multi-class extension of AdaBoost
- **Automatic Stumps**: Uses decision stumps (max_depth=1) by default
- **Early Stopping**: Stops if a base classifier performs worse than random guessing
- **Ensemble Visualization**: Shows the weighted combination of base estimators
#### Key Features
- **Multi-class Support**: Natural extension to K classes
- **Base Estimator Control**: Configure depth of base decision trees
- **Training Monitoring**: Track training errors and estimator weights
- **Probability Estimates**: Provides class probability predictions
#### Hyperparameters
- `n_estimators`: Number of base estimators to train (default: 50)
- `base_max_depth`: Maximum depth for base decision trees (default: 1)
## Algorithm Details
The SAMME algorithm differs from binary AdaBoost in the calculation of the estimator weight (alpha):
```
α = log((1 - err) / err) + log(K - 1)
```
where `K` is the number of classes. This formula ensures that:
- When K = 2, it reduces to standard AdaBoost
- For K > 2, base classifiers only need an accuracy better than random guessing (1/K) rather than better than 50%
## Usage Example
```cpp
// Create AdaBoost with decision stumps
AdaBoost ada(100, 1); // 100 estimators, max_depth=1
// Train
ada.fit(X_train, y_train, features, className, states, Smoothing_t::NONE);
// Predict
auto predictions = ada.predict(X_test);
auto probabilities = ada.predict_proba(X_test);
// Evaluate
float accuracy = ada.score(X_test, y_test);
// Get ensemble information
auto weights = ada.getEstimatorWeights();
auto errors = ada.getTrainingErrors();
```
## Implementation Structure
```
AdaBoost (inherits from Ensemble)
└── Uses multiple DecisionTree instances as base estimators
└── DecisionTree (inherits from Classifier)
└── Implements weighted Gini impurity splitting
```
## Visualization
Both classifiers support graph visualization:
- **DecisionTree**: Shows the tree structure with split conditions
- **AdaBoost**: Shows the ensemble of weighted base estimators
Generate visualizations using:
```cpp
auto graph = classifier.graph("Title");
```
## Data Format
Both classifiers expect discrete/categorical data:
- **Features**: Integer values representing categories (stored in `torch::Tensor` or `std::vector<std::vector<int>>`)
- **Labels**: Integer values representing class indices (0, 1, ..., K-1)
- **States**: Map defining possible values for each feature and the class variable
- **Sample Weights**: Optional weights for each training sample (important for boosting)
Example data setup:
```cpp
// Features matrix (n_features x n_samples)
torch::Tensor X = torch::tensor({{0, 1, 2}, {1, 0, 1}}); // 2 features, 3 samples
// Labels vector
torch::Tensor y = torch::tensor({0, 1, 0}); // 3 samples
// States definition
std::map<std::string, std::vector<int>> states;
states["feature1"] = {0, 1, 2}; // Feature 1 can take values 0, 1, or 2
states["feature2"] = {0, 1}; // Feature 2 can take values 0 or 1
states["class"] = {0, 1}; // Binary classification
```
## Notes
- The implementation handles discrete/categorical features as indicated by the int-based data structures
- Sample weights are properly propagated through the tree building process
- The DecisionTree implementation uses equality testing for splits (suitable for categorical data)
- Both classifiers support the standard fit/predict interface from the base framework
## References
- Zhu, J., Zou, H., Rosset, S., & Hastie, T. (2009). Multi-class AdaBoost. Statistics and Its Interface, 2(3), 349-360.
- Breiman, L., Friedman, J., Olshen, R., & Stone, C. (1984). Classification and Regression Trees. Wadsworth, Belmont, CA.

View File

@@ -0,0 +1,20 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************
#include "XA1DE.h"
#include "common/TensorUtils.hpp"
namespace platform {
// Trains the underlying AODE model from `dataset`.
//
// All rows of `dataset` but the last one are converted to the feature matrix and the
// last row is converted to the label vector. The incoming `weights` argument is not
// consulted: every instance is given weight 1.0 (stored in weights_) and the model is
// fitted with all parents enabled.
void XA1DE::trainModel(const torch::Tensor& weights, const bayesnet::Smoothing_t smoothing)
{
    const auto n_rows = dataset.size(0);
    auto feature_matrix = TensorUtils::to_matrix(dataset.slice(0, 0, n_rows - 1));
    auto labels = TensorUtils::to_vector<int>(dataset.index({ -1, "..." }));
    // One column per training instance.
    int n_samples = feature_matrix[0].size();
    weights_ = torch::full({ n_samples }, 1.0);
    // Weight normalisation is intentionally not applied here.
    aode_.fit(feature_matrix, labels, features, className, states, weights_, true, smoothing);
}
}

View File

@@ -0,0 +1,26 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************
#ifndef XA1DE_H
#define XA1DE_H
#include "Xaode.hpp"
#include "ExpClf.h"
#include <bayesnet/network/Smoothing.h>
namespace platform {
class XA1DE : public ExpClf {
public:
XA1DE() = default;
virtual ~XA1DE() override = default;
std::string getVersion() override { return version; };
protected:
void buildModel(const torch::Tensor& weights) override {};
void trainModel(const torch::Tensor& weights, const bayesnet::Smoothing_t smoothing) override;
private:
std::string version = "1.0.0";
};
}
#endif // XA1DE_H

View File

@@ -0,0 +1,183 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************
#include <random>
#include <set>
#include <functional>
#include <limits.h>
#include <tuple>
#include "XBAODE.h"
#include "XSpode.hpp"
#include "common/TensorUtils.hpp"
#include <loguru.hpp>
namespace platform {
// Registers the hyperparameter names this classifier accepts.
XBAODE::XBAODE()
{
    validHyperparameters = {
        "alpha_block",
        "order",
        "convergence",
        "convergence_best",
        "bisection",
        "threshold",
        "maxTolerance",
        "predict_voting",
        "select_features"
    };
}
// Takes ownership of `model`, appends it to `models` and bumps the model counter.
void XBAODE::add_model(std::unique_ptr<XSpode> model)
{
    models.emplace_back(std::move(model));
    ++n_models;
}
// Drops the most recently added entry of `models` and decrements the model counter.
// The caller must make sure that at least one model is stored.
void XBAODE::remove_last_model()
{
    --n_models;
    models.pop_back();
}
// Trains the boosted ensemble of XSpode models.
//
// Algorithm based on the AdaBoost algorithm for classification as explained in
// Ensemble Methods (Zhi-Hua Zhou, 2012):
//   1. Instance weights start uniform at 1/m.
//   2. Optionally (selectFeatures) a first batch of SPODEs is built from the features
//      returned by featureSelection().
//   3. While not finished, the remaining features are ranked with the weighted
//      SelectKBest, and a pack of k SPODEs (k = 2^tolerance when `bisection` is set,
//      otherwise 1) is fitted; each one updates the instance weights and receives
//      alpha_t as its significance.
//   4. With `convergence` enabled, accuracy on (X_test, y_test) is tracked; when it
//      stops improving for more than maxTolerance rounds the loop ends and the models
//      added during the non-improving rounds are removed.
//
// The `weights` argument is not read; the boosting weights are kept in weights_.
// `smoothing` is forwarded to every XSpode::fit call.
void XBAODE::trainModel(const torch::Tensor& weights, const bayesnet::Smoothing_t smoothing)
{
    fitted = true;
    X_train_ = TensorUtils::to_matrix(X_train);
    y_train_ = TensorUtils::to_vector<int>(y_train);
    X_test_ = TensorUtils::to_matrix(X_test);
    y_test_ = TensorUtils::to_vector<int>(y_test);
    maxTolerance = 3;
    //
    // Logging setup
    //
    // loguru::set_thread_name("XBAODE");
    // loguru::g_stderr_verbosity = loguru::Verbosity_OFF;
    // loguru::add_file("XBAODE.log", loguru::Truncate, loguru::Verbosity_MAX);
    double alpha_t = 0;
    weights_ = torch::full({ m }, 1.0 / static_cast<double>(m), torch::kFloat64); // m initialized in Classifier.cc
    significanceModels.resize(n, 0.0); // n initialized in Classifier.cc
    bool finished = false;
    std::vector<int> featuresUsed;
    n_models = 0;
    std::unique_ptr<XSpode> model;
    if (selectFeatures) {
        featuresUsed = featureSelection(weights_);
        for (const auto& parent : featuresUsed) {
            model = std::make_unique<XSpode>(parent);
            model->fit(X_train_, y_train_, weights_, smoothing);
            add_model(std::move(model));
        }
        notes.push_back("Used features in initialization: " + std::to_string(featuresUsed.size()) + " of " + std::to_string(features.size()) + " with " + select_features_algorithm);
        auto ypred = ExpEnsemble::predict(X_train);
        std::tie(weights_, alpha_t, finished) = update_weights(y_train, ypred, weights_);
        // Update significance of the models
        // NOTE(review): this writes significanceModels_ while the rest of the function uses
        // significanceModels (the one resized above) - confirm which container is intended.
        for (const auto& parent : featuresUsed) {
            significanceModels_[parent] = alpha_t;
        }
        n_models = featuresUsed.size();
        // VLOG_SCOPE_F(1, "SelectFeatures. alpha_t: %f n_models: %d", alpha_t, n_models);
        if (finished) {
            return;
        }
    }
    int numItemsPack = 0; // The counter of the models inserted in the current pack
    // Variables to control the accuracy finish condition
    double priorAccuracy = 0.0;
    double improvement = 1.0;
    double convergence_threshold = 1e-4;
    int tolerance = 0; // number of times the accuracy is lower than the convergence_threshold
    // Step 0: Set the finish condition
    // epsilon sub t > 0.5 => inverse the weights policy
    // validation error is not decreasing
    // run out of features
    bool ascending = order_algorithm == bayesnet::Orders.ASC;
    std::mt19937 g{ 173 };
    while (!finished) {
        // Step 1: Build ranking with mutual information
        auto candidates = metrics.SelectKBestWeighted(weights_, ascending, n); // Get all the features sorted
        if (order_algorithm == bayesnet::Orders.RAND) {
            std::shuffle(candidates.begin(), candidates.end(), g);
        }
        // Remove used features
        candidates.erase(std::remove_if(candidates.begin(), candidates.end(), [&](auto x)
            { return std::find(featuresUsed.begin(), featuresUsed.end(), x) != featuresUsed.end();}),
            candidates.end()
        );
        // Pack size doubles with every non-improving round when bisection is enabled.
        // tolerance never exceeds maxTolerance inside the loop, so the shift is safe.
        const int k = bisection ? (1 << tolerance) : 1;
        int counter = 0; // The model counter of the current pack
        // VLOG_SCOPE_F(1, "counter=%d k=%d featureSelection.size: %zu", counter, k, candidates.size());
        while (counter++ < k && !candidates.empty()) {
            auto feature = candidates[0];
            candidates.erase(candidates.begin());
            model = std::make_unique<XSpode>(feature);
            model->fit(X_train_, y_train_, weights_, smoothing);
            std::vector<int> ypred;
            if (alpha_block) {
                //
                // Compute the prediction with the current ensemble + model
                //
                // Add the model to the ensemble
                significanceModels[feature] = 1.0;
                add_model(std::move(model));
                // Compute the prediction
                ypred = ExpEnsemble::predict(X_train_);
                // Remove the model from the ensemble
                significanceModels[feature] = 0.0;
                // NOTE(review): add_model() stores into `models` but the model is taken back
                // from `models_` - confirm both names refer to the intended container.
                model = std::move(models_.back());
                remove_last_model();
            } else {
                ypred = model->predict(X_train_);
            }
            // Step 3.1: Compute the classifier amount of say
            auto ypred_t = torch::tensor(ypred);
            std::tie(weights_, alpha_t, finished) = update_weights(y_train, ypred_t, weights_);
            // Step 3.4: Store classifier and its accuracy to weigh its future vote
            numItemsPack++;
            featuresUsed.push_back(feature);
            add_model(std::move(model));
            significanceModels[feature] = alpha_t;
            // VLOG_SCOPE_F(2, "finished: %d numItemsPack: %d n_models: %d featuresUsed: %zu", finished, numItemsPack, n_models, featuresUsed.size());
        } // End of the pack
        if (convergence && !finished) {
            auto y_val_predict = ExpEnsemble::predict(X_test);
            double accuracy = (y_val_predict == y_test).sum().item<double>() / static_cast<double>(y_test.size(0));
            if (priorAccuracy == 0) {
                priorAccuracy = accuracy;
            } else {
                improvement = accuracy - priorAccuracy;
            }
            if (improvement < convergence_threshold) {
                // VLOG_SCOPE_F(3, " (improvement<threshold) tolerance: %d numItemsPack: %d improvement: %f prior: %f current: %f", tolerance, numItemsPack, improvement, priorAccuracy, accuracy);
                tolerance++;
            } else {
                // VLOG_SCOPE_F(3, "* (improvement>=threshold) Reset. tolerance: %d numItemsPack: %d improvement: %f prior: %f current: %f", tolerance, numItemsPack, improvement, priorAccuracy, accuracy);
                tolerance = 0; // Reset the counter if the model performs better
                numItemsPack = 0;
            }
            if (convergence_best) {
                // Keep the best accuracy until now as the prior accuracy
                priorAccuracy = std::max(accuracy, priorAccuracy);
            } else {
                // Keep the last accuracy obtained as the prior accuracy
                priorAccuracy = accuracy;
            }
        }
        // VLOG_SCOPE_F(1, "tolerance: %d featuresUsed.size: %zu features.size: %zu", tolerance, featuresUsed.size(), features.size());
        finished = finished || tolerance > maxTolerance || featuresUsed.size() == features.size();
    }
    if (tolerance > maxTolerance) {
        if (numItemsPack < n_models) {
            notes.push_back("Convergence threshold reached & " + std::to_string(numItemsPack) + " models eliminated");
            // VLOG_SCOPE_F(4, "Convergence threshold reached & %d models eliminated of %d", numItemsPack, n_models);
            // Walk back over the last numItemsPack used features with signed arithmetic only,
            // so the bound can never wrap around as an unsigned value.
            const int lastUsed = static_cast<int>(featuresUsed.size()) - 1;
            for (int i = lastUsed; i > lastUsed - numItemsPack && i >= 0; --i) {
                remove_last_model();
                significanceModels[featuresUsed[i]] = 0.0;
            }
            // VLOG_SCOPE_F(4, "*Convergence threshold %d models left & %d features used.", n_models, featuresUsed.size());
        } else {
            notes.push_back("Convergence threshold reached & 0 models eliminated");
            // VLOG_SCOPE_F(4, "Convergence threshold reached & 0 models eliminated n_models=%d numItemsPack=%d", n_models, numItemsPack);
        }
    }
    if (featuresUsed.size() != features.size()) {
        notes.push_back("Used features in train: " + std::to_string(featuresUsed.size()) + " of " + std::to_string(features.size()));
        status = bayesnet::WARNING;
    }
    notes.push_back("Number of models: " + std::to_string(n_models));
    return;
}
}

View File

@@ -0,0 +1,35 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************
#ifndef XBAODE_H
#define XBAODE_H
#include <iostream>
#include <vector>
#include <cmath>
#include <algorithm>
#include <limits>
#include "common/Timer.hpp"
#include "ExpEnsemble.h"
namespace platform {
// Boosted AODE classifier made of XSpode models.
//
// Derives from ExpEnsemble: the implementation file calls ExpEnsemble::predict and
// works with the ensemble's XSpode container and instance weights, and ExpEnsemble
// itself derives from bayesnet::Boost.
class XBAODE : public ExpEnsemble {
    // TODO: build a vector of trained models and run an ensemble predict over all of them
    // TODO: try XA1DE with ORIGINAL and LAPLACE smoothing and check the differences when the weights are set to 1 or to 1/m
public:
    XBAODE();
    // Version string of this classifier implementation.
    std::string getVersion() override { return version; }
protected:
    void trainModel(const torch::Tensor& weights, const bayesnet::Smoothing_t smoothing) override;
private:
    // Append / drop a trained SPODE, keeping the model counter in sync.
    void add_model(std::unique_ptr<XSpode> model);
    void remove_last_model();
    // Vector copies of the train/test tensors, filled by trainModel.
    std::vector<std::vector<int>> X_train_, X_test_;
    std::vector<int> y_train_, y_test_;
    std::string version = "0.9.7";
};
}
#endif // XBAODE_H

View File

@@ -0,0 +1,436 @@
#ifndef XSPODE_H
#define XSPODE_H
#include <vector>
#include <map>
#include <stdexcept>
#include <algorithm>
#include <numeric>
#include <string>
#include <cmath>
#include <limits>
#include <sstream>
#include <iostream>
#include <torch/torch.h>
#include <bayesnet/network/Smoothing.h>
#include <bayesnet/classifiers/Classifier.h>
#include "CountingSemaphore.hpp"
namespace platform {
class XSpode : public bayesnet::Classifier {
public:
// --------------------------------------
// Constructor
//
// Supply which feature index is the single super-parent (“spIndex”).
// --------------------------------------
explicit XSpode(int spIndex)
: superParent_{ spIndex },
nFeatures_{ 0 },
statesClass_{ 0 },
fitted_{ false },
alpha_{ 1.0 },
initializer_{ 1.0 },
semaphore_{ CountingSemaphore::getInstance() } : bayesnet::Classifier(bayesnet::Network())
{
}
// --------------------------------------
// fit
// --------------------------------------
//
// Trains the SPODE given data:
//   X: X[f][n] is the f-th feature value for instance n
//   y: y[n] is the class value for instance n
//   weights: per-instance weights (weights[n] is read as a double); pass 1.0 if unused
//   smoothing: ORIGINAL => alpha = 1/numInstances, LAPLACE => alpha = 1, otherwise no smoothing
//
// The number of states of every feature and of the class is derived from the data
// as (maximum value + 1), so values are used directly as array indices and must be
// non-negative.
//
// We only store conditional probabilities for:
//   p(x_sp | c)             (the super-parent feature)
//   p(x_child | c, x_sp)    for all child != sp
//
// All count tables are rebuilt from scratch, so the model can be fitted again.
//
// Throws std::invalid_argument if y is empty or a feature column has fewer values than y,
// and std::out_of_range if the super-parent index is not a valid feature index.
// --------------------------------------
void fit(const std::vector<std::vector<int>>& X,
    const std::vector<int>& y,
    const torch::Tensor& weights, const bayesnet::Smoothing_t smoothing)
{
    if (y.empty()) {
        throw std::invalid_argument("XSpode::fit: y is empty");
    }
    const int numInstances = static_cast<int>(y.size());
    nFeatures_ = static_cast<int>(X.size());
    if (superParent_ < 0 || superParent_ >= nFeatures_) {
        throw std::out_of_range("XSpode::fit: super-parent index is not a valid feature index");
    }
    // states_[f] = max value in X[f] + 1
    states_.assign(nFeatures_, 0);
    for (int f = 0; f < nFeatures_; f++) {
        if (X[f].size() < y.size()) {
            throw std::invalid_argument("XSpode::fit: feature column has fewer values than y");
        }
        states_[f] = *std::max_element(X[f].begin(), X[f].end()) + 1;
    }
    // For the class: statesClass_ = max(y) + 1
    statesClass_ = *std::max_element(y.begin(), y.end()) + 1;
    // assign() (not resize()) so that counts left by a previous fit are cleared
    classCounts_.assign(statesClass_, 0.0);
    // p(x_sp = spVal | c): counts stored in spFeatureCounts_[spVal * statesClass_ + c]
    spFeatureCounts_.assign(states_[superParent_] * statesClass_, 0.0);
    // For each child != sp, p(childVal | c, spVal) lives in its own block of childCounts_,
    // of size states_[child] * statesClass_ * states_[sp], starting at childOffsets_[child].
    childOffsets_.assign(nFeatures_, -1);
    int totalSize = 0;
    for (int f = 0; f < nFeatures_; f++) {
        if (f == superParent_) continue; // skip sp
        childOffsets_[f] = totalSize;
        totalSize += (states_[f] * statesClass_ * states_[superParent_]);
    }
    childCounts_.assign(totalSize, 0.0);
    // Accumulate raw counts; the instance buffer is reused for every sample
    std::vector<int> instance(nFeatures_ + 1);
    for (int n = 0; n < numInstances; n++) {
        for (int f = 0; f < nFeatures_; f++) {
            instance[f] = X[f][n];
        }
        instance[nFeatures_] = y[n];
        addSample(instance, weights[n].item<double>());
    }
    switch (smoothing) {
        case bayesnet::Smoothing_t::ORIGINAL:
            alpha_ = 1.0 / numInstances;
            break;
        case bayesnet::Smoothing_t::LAPLACE:
            alpha_ = 1.0;
            break;
        default:
            alpha_ = 0.0; // No smoothing
    }
    // Large starting factor for the probability products computed in predict_proba
    initializer_ = std::numeric_limits<double>::max() / (static_cast<double>(nFeatures_) * nFeatures_);
    // Convert raw counts to probabilities
    computeProbabilities();
    fitted_ = true;
}
// --------------------------------------
// addSample
// --------------------------------------
//
// instance has size nFeatures_ + 1, with the class at the end.
// Adds `weight` to the class counter, to the (super-parent value, class) counter
// and to the (super-parent value, child value, class) counter of every child.
// Samples whose weight is <= 0 are ignored.
//
void addSample(const std::vector<int>& instance, double weight)
{
    if (weight <= 0.0) {
        return;
    }
    const int classVal = instance.back();
    const int parentVal = instance[superParent_];
    // class and super-parent counters
    classCounts_[classVal] += weight;
    spFeatureCounts_[parentVal * statesClass_ + classVal] += weight;
    // child counters, laid out as [ offset + (parentVal * states_[child] + childVal) * statesClass_ + classVal ]
    for (int child = 0; child < nFeatures_; ++child) {
        if (child != superParent_) {
            const int stride = states_[child] * statesClass_;
            const int slot = childOffsets_[child] + parentVal * stride + instance[child] * statesClass_ + classVal;
            childCounts_[slot] += weight;
        }
    }
}
// --------------------------------------
// computeProbabilities
// --------------------------------------
//
// Turns the accumulated counts into (alpha_-smoothed) probabilities:
//   classPriors_     <- p(c)
//   spFeatureProbs_  <- p(x_sp = spVal | c)
//   childProbs_      <- p(x_child = v | c, x_sp = spVal)
// A ratio whose denominator is <= 0 is stored as 0. If no weight at all was
// accumulated the class priors fall back to a uniform distribution.
// --------------------------------------
void computeProbabilities()
{
    // Ratio that collapses to 0 when there is nothing to divide by.
    const auto guardedRatio = [](double numerator, double denominator) {
        return denominator <= 0.0 ? 0.0 : numerator / denominator;
    };
    // p(c)
    const double total = std::accumulate(classCounts_.begin(), classCounts_.end(), 0.0);
    classPriors_.resize(statesClass_, 0.0);
    if (total <= 0.0) {
        std::fill_n(classPriors_.begin(), statesClass_, 1.0 / static_cast<double>(statesClass_));
    } else {
        const double priorDenominator = total + alpha_ * statesClass_;
        for (int cls = 0; cls < statesClass_; ++cls) {
            classPriors_[cls] = (classCounts_[cls] + alpha_) / priorDenominator;
        }
    }
    // p(x_sp | c): denominator is classCounts_[c] + alpha_ * (#states of sp)
    const int parentCard = states_[superParent_];
    spFeatureProbs_.resize(spFeatureCounts_.size());
    for (int parentVal = 0; parentVal < parentCard; ++parentVal) {
        for (int cls = 0; cls < statesClass_; ++cls) {
            const int cell = parentVal * statesClass_ + cls;
            spFeatureProbs_[cell] = guardedRatio(spFeatureCounts_[cell] + alpha_,
                classCounts_[cls] + alpha_ * parentCard);
        }
    }
    // p(x_child | c, x_sp): denominator is spFeatureCounts_[spVal, c] + alpha_ * (#states of child)
    childProbs_.resize(childCounts_.size());
    for (int child = 0; child < nFeatures_; ++child) {
        if (child == superParent_) {
            continue;
        }
        const int childCard = states_[child];
        const int blockStart = childOffsets_[child];
        const double childSmoothing = alpha_ * childCard;
        for (int parentVal = 0; parentVal < parentCard; ++parentVal) {
            const int parentBase = blockStart + parentVal * (childCard * statesClass_);
            for (int childVal = 0; childVal < childCard; ++childVal) {
                for (int cls = 0; cls < statesClass_; ++cls) {
                    const int cell = parentBase + childVal * statesClass_ + cls;
                    childProbs_[cell] = guardedRatio(childCounts_[cell] + alpha_,
                        spFeatureCounts_[parentVal * statesClass_ + cls] + childSmoothing);
                }
            }
        }
    }
}
// --------------------------------------
// predict_proba
// --------------------------------------
//
// For a single instance x of dimension nFeatures_:
// P(c | x) ∝ p(c) × p(x_sp | c) × ∏(child ≠ sp) p(x_child | c, x_sp).
//
// Then we normalize the result.
// --------------------------------------
std::vector<double> predict_proba(const std::vector<int>& instance) const
{
std::vector<double> probs(statesClass_, 0.0);
// Multiply p(c) × p(x_sp | c)
int spVal = instance[superParent_];
for (int c = 0; c < statesClass_; c++) {
double pc = classPriors_[c];
double pSpC = spFeatureProbs_[spVal * statesClass_ + c];
probs[c] = pc * pSpC * initializer_;
}
// Multiply by each childs probability p(x_child | c, x_sp)
for (int feature = 0; feature < nFeatures_; feature++) {
if (feature == superParent_) continue; // skip sp
int sf = instance[feature];
int offset = childOffsets_[feature];
int childCard = states_[feature]; // not used directly, but for clarity
// Index into childProbs_ = offset + spVal*(childCard*statesClass_) + childVal*statesClass_ + c
int base = offset + spVal * (childCard * statesClass_) + sf * statesClass_;
for (int c = 0; c < statesClass_; c++) {
probs[c] *= childProbs_[base + c];
}
}
// Normalize
normalize(probs);
return probs;
}
// Class probabilities for a whole test set.
//
//   test_data: test_data[f][n] is the value of feature f for sample n.
//   returns:   one row per sample, each with statesClass_ probabilities.
//              An empty matrix (or one without samples) yields an empty result.
//
// The samples are split into chunks that are processed by worker threads; a slot of
// the shared CountingSemaphore is acquired before each thread is launched and
// released by the worker when it finishes its chunk.
std::vector<std::vector<double>> predict_proba(const std::vector<std::vector<int>>& test_data)
{
    if (test_data.empty() || test_data[0].empty()) {
        return {};
    }
    int test_size = test_data[0].size();
    int sample_size = test_data.size();
    auto probabilities = std::vector<std::vector<double>>(test_size, std::vector<double>(statesClass_));
    int chunk_size = std::min(150, int(test_size / semaphore_.getMaxCount()) + 1);
    std::vector<std::thread> threads;
    threads.reserve((test_size + chunk_size - 1) / chunk_size);
    auto worker = [&](const std::vector<std::vector<int>>& samples, int begin, int chunk, int sample_size, std::vector<std::vector<double>>& predictions) {
        std::string threadName = "(V)PWorker-" + std::to_string(begin) + "-" + std::to_string(chunk);
#if defined(__linux__)
        pthread_setname_np(pthread_self(), threadName.c_str());
#else
        pthread_setname_np(threadName.c_str());
#endif
        std::vector<int> instance(sample_size);
        for (int sample = begin; sample < begin + chunk; ++sample) {
            for (int feature = 0; feature < sample_size; ++feature) {
                instance[feature] = samples[feature][sample];
            }
            // Each worker writes only the rows of its own chunk
            predictions[sample] = predict_proba(instance);
        }
        semaphore_.release();
    };
    for (int begin = 0; begin < test_size; begin += chunk_size) {
        int chunk = std::min(chunk_size, test_size - begin);
        semaphore_.acquire();
        // std::thread copies its arguments: wrap the test matrix in std::cref so that
        // every worker reads the caller's data instead of a private copy of all of it.
        threads.emplace_back(worker, std::cref(test_data), begin, chunk, sample_size, std::ref(probabilities));
    }
    for (auto& thread : threads) {
        thread.join();
    }
    return probabilities;
}
// --------------------------------------
// predict (single instance)
// --------------------------------------
//
// Returns the index of the class with the highest P(c | x);
// on ties the lowest class index wins.
// --------------------------------------
int predict(const std::vector<int>& instance) const
{
    const auto classProbs = predict_proba(instance);
    const auto best = std::max_element(classProbs.begin(), classProbs.end());
    return static_cast<int>(std::distance(classProbs.begin(), best));
}
// Predicts the class of every sample in test_data (test_data[f][n] layout).
// Throws std::logic_error if the model has not been fitted.
std::vector<int> predict(std::vector<std::vector<int>>& test_data)
{
    if (!fitted_) {
        throw std::logic_error(CLASSIFIER_NOT_FITTED);
    }
    const auto probabilities = predict_proba(test_data);
    std::vector<int> predictions;
    predictions.reserve(probabilities.size());
    // arg-max of every probability row
    for (const auto& row : probabilities) {
        const auto best = std::max_element(row.begin(), row.end());
        predictions.push_back(static_cast<int>(std::distance(row.begin(), best)));
    }
    return predictions;
}
// --------------------------------------
// Utility: normalize
//
// Divides every element of v by the sum of the elements.
// v is left untouched when that sum is <= 0.
// --------------------------------------
void normalize(std::vector<double>& v) const
{
    const double total = std::accumulate(v.begin(), v.end(), 0.0);
    if (total <= 0.0) {
        return;
    }
    std::for_each(v.begin(), v.end(), [total](double& item) { item /= total; });
}
// --------------------------------------
// Debug dump of the whole model state:
// scalar settings first, then every table with its elements separated by blanks.
// --------------------------------------
std::string to_string() const
{
    std::ostringstream oss;
    // Writes each element followed by a blank and then closes the bracket.
    const auto dumpItems = [&oss](const auto& values) {
        for (const auto& value : values) {
            oss << value << " ";
        }
        oss << "]\n";
    };
    oss << "---- SPODE Model ----\n"
        << "nFeatures_ = " << nFeatures_ << "\n"
        << "superParent_ = " << superParent_ << "\n"
        << "statesClass_ = " << statesClass_ << "\n"
        << "\n";
    oss << "States: [";
    dumpItems(states_);
    oss << "classCounts_: [";
    dumpItems(classCounts_);
    oss << "classPriors_: [";
    dumpItems(classPriors_);
    oss << "spFeatureCounts_: size = " << spFeatureCounts_.size() << "\n[";
    dumpItems(spFeatureCounts_);
    oss << "spFeatureProbs_: size = " << spFeatureProbs_.size() << "\n[";
    dumpItems(spFeatureProbs_);
    oss << "childCounts_: size = " << childCounts_.size() << "\n[";
    dumpItems(childCounts_);
    oss << "childProbs_: size = " << childProbs_.size() << "\n[";
    dumpItems(childProbs_);
    oss << "childOffsets_: [";
    dumpItems(childOffsets_);
    oss << "---------------------\n";
    return oss.str();
}
// Number of class states found by fit (0 before fitting).
int statesClass() const
{
    return statesClass_;
}
// Number of features seen by fit (0 before fitting).
int getNFeatures() const
{
    return nFeatures_;
}
// Sum of the per-feature state counts multiplied by the number of features.
int getNumberOfStates() const
{
    const int stateSum = std::accumulate(states_.begin(), states_.end(), 0);
    return stateSum * nFeatures_;
}
// nFeatures_ * (2 * nFeatures_ - 1)
int getNumberOfEdges() const
{
    const int perFeature = 2 * nFeatures_ - 1;
    return nFeatures_ * perFeature;
}
// Mutable access to the per-feature state counts.
std::vector<int>& getStates()
{
    return states_;
}
private:
// --------------------------------------
// MEMBERS
// --------------------------------------
int superParent_; // which feature is the single super-parent
int nFeatures_; // number of features, set by fit
int statesClass_; // number of class states, set by fit as max(y) + 1
bool fitted_ = false; // set to true at the end of fit
std::vector<int> states_; // [states_feat0, ..., states_feat(N-1)] (class not included in this array)
const std::string CLASSIFIER_NOT_FITTED = "Classifier has not been fitted"; // message thrown by predict when not fitted
// Class counts
std::vector<double> classCounts_; // [c], accumulative
std::vector<double> classPriors_; // [c], after normalization
// For p(x_sp = spVal | c)
std::vector<double> spFeatureCounts_; // [spVal * statesClass_ + c]
std::vector<double> spFeatureProbs_; // same shape, after normalization
// For p(x_child = childVal | x_sp = spVal, c)
// childCounts_ is big enough to hold all child features except sp:
// For each child f, childOffsets_[f] is the start index of its block, and inside
// the block the element for (spVal, childVal, c) is at
// spVal * (states_[f] * statesClass_) + childVal * statesClass_ + c.
std::vector<double> childCounts_;
std::vector<double> childProbs_; // same shape as childCounts_, after normalization
std::vector<int> childOffsets_; // -1 for the super-parent itself
double alpha_ = 1.0; // smoothing constant, chosen in fit from the smoothing type
double initializer_; // for numerical stability
CountingSemaphore& semaphore_; // shared semaphore used by the multi-threaded predict_proba
};
} // namespace platform
#endif // XSPODE_H

View File

@@ -0,0 +1,478 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************
// Based on the Geoff. I. Webb A1DE java algorithm
// https://weka.sourceforge.io/packageMetaData/AnDE/Latest.html
#ifndef XAODE_H
#define XAODE_H
#include <vector>
#include <map>
#include <stdexcept>
#include <algorithm>
#include <numeric>
#include <string>
#include <cmath>
#include <limits>
#include <sstream>
#include <torch/torch.h>
#include <bayesnet/network/Smoothing.h>
namespace platform {
class Xaode {
public:
// -------------------------------------------------------
// The Xaode can be EMPTY (just created), in COUNTS mode (accumulating raw counts)
// or PROBS mode (storing conditional probabilities).
enum class MatrixState {
EMPTY, // state set by the constructor
COUNTS, // state set by fit() before the samples are added
PROBS // NOTE(review): presumably set once the counts are normalized - confirm in computeProbabilities
};
// One significance value per feature; fit() sizes it to the number of features,
// using 1.0 for new entries when all_parents is true and 0.0 otherwise.
std::vector<double> significance_models_;
// Creates an empty model: no features, no class states, EMPTY matrix state.
Xaode()
    : nFeatures_{ 0 },
    statesClass_{ 0 },
    matrixState_{ MatrixState::EMPTY }
{
}
// -------------------------------------------------------
// fit
// -------------------------------------------------------
//
// Classifiers interface.
// The all_parents parameter decides whether the model starts with all the
// parents active (significance 1.0) or none of them (significance 0.0).
//
// X[f][n] is the value of feature f for instance n and y[n] its class; the number
// of states of each feature and of the class is taken as (maximum value + 1).
// After the loop states_ holds one entry per feature plus a last one for the class.
// The features, className and states arguments are not read by this function.
//
// We'll store:
//   1) p(x_i=si | c) in classFeatureProbs_
//   2) p(x_j=sj | c, x_i=si) in data_, with i<j => i is "superparent," j is "child."
//
// Internally, in COUNTS mode, data_ accumulates raw counts, then
// computeProbabilities(...) normalizes them into conditionals.
void fit(std::vector<std::vector<int>>& X, std::vector<int>& y, const std::vector<std::string>& features, const std::string& className, std::map<std::string, std::vector<int>>& states, const torch::Tensor& weights, const bool all_parents, const bayesnet::Smoothing_t smoothing)
{
    int sampleCount = X[0].size();
    nFeatures_ = X.size();
    const double initialSignificance = all_parents ? 1.0 : 0.0;
    significance_models_.resize(nFeatures_, initialSignificance);
    for (int f = 0; f < nFeatures_; ++f) {
        if (all_parents) {
            active_parents.push_back(f);
        }
        const int cardinality = *std::max_element(X[f].begin(), X[f].end()) + 1;
        states_.push_back(cardinality);
    }
    // Class cardinality goes last in states_
    statesClass_ = *std::max_element(y.begin(), y.end()) + 1;
    states_.push_back(statesClass_);
    classCounts_.resize(statesClass_, 0.0);
    classPriors_.resize(statesClass_, 0.0);
    //
    // Initialize data structures
    //
    active_parents.resize(nFeatures_);
    // Total number of feature states (class states excluded)
    const int featureStates = std::accumulate(states_.begin(), states_.end(), 0) - statesClass_;
    // featureClassOffset_[i]: offset of feature i inside the p(x_i=si | c) tables.
    // pairOffset_[k]: offset of the k-th (feature, state) entry inside data_, where
    // p(x_child=sj | c, x_sp=si) is stored for each pair (i<j).
    featureClassOffset_.resize(nFeatures_);
    pairOffset_.resize(featureStates);
    int classFeatureSize = 0; // running sum of the feature cardinalities
    int statesBefore = 0;     // states of all the features preceding the current one
    int pairSlot = 0;         // next entry of pairOffset_ to fill
    int pairBase = 0;         // running offset into data_ (in units of statesClass_)
    for (int f = 0; f < nFeatures_; ++f) {
        const int cardinality = states_[f];
        featureClassOffset_[f] = classFeatureSize;
        classFeatureSize += cardinality;
        for (int s = 0; s < cardinality; ++s) {
            pairOffset_[pairSlot++] = pairBase;
            pairBase += statesBefore;
        }
        statesBefore += cardinality;
    }
    const int pairTableSize = pairBase * statesClass_;
    data_.resize(pairTableSize);
    dataOpp_.resize(pairTableSize);
    const int classFeatureTableSize = classFeatureSize * statesClass_;
    classFeatureCounts_.resize(classFeatureTableSize);
    classFeatureProbs_.resize(classFeatureTableSize);
    matrixState_ = MatrixState::COUNTS;
    //
    // Add samples
    //
    std::vector<int> instance(nFeatures_ + 1);
    for (int sample = 0; sample < sampleCount; ++sample) {
        for (int f = 0; f < nFeatures_; ++f) {
            instance[f] = X[f][sample];
        }
        instance[nFeatures_] = y[sample];
        addSample(instance, weights[sample].item<double>());
    }
    // Smoothing constant
    if (smoothing == bayesnet::Smoothing_t::ORIGINAL) {
        alpha_ = 1.0 / sampleCount;
    } else if (smoothing == bayesnet::Smoothing_t::LAPLACE) {
        alpha_ = 1.0;
    } else {
        alpha_ = 0.0; // No smoothing
    }
    initializer_ = std::numeric_limits<double>::max() / (nFeatures_ * nFeatures_);
    computeProbabilities();
}
// Returns a multi-line, human-readable dump of the model: the scalar settings
// followed by the size and the space-separated contents of every internal table.
// The matrix state is reported as EMPTY, COUNTS or PROBS (any other value falls
// back to "PROBS", as before).
std::string to_string() const
{
    std::ostringstream ostream;
    // Writes one "<label><size>" line followed by one line with the values.
    auto dump = [&ostream](const char* label, const auto& values) {
        ostream << label << values.size() << '\n';
        for (const auto& value : values) {
            ostream << value << " ";
        }
        ostream << '\n';
    };
    const char* stateName = "PROBS";
    if (matrixState_ == MatrixState::COUNTS) {
        stateName = "COUNTS";
    } else if (matrixState_ == MatrixState::EMPTY) {
        // An unfitted model used to be reported as PROBS.
        stateName = "EMPTY";
    }
    ostream << "-------- Xaode.status --------" << '\n'
        << "- nFeatures = " << nFeatures_ << '\n'
        << "- statesClass = " << statesClass_ << '\n'
        << "- matrixState = " << stateName << '\n';
    dump("- states: size: ", states_);
    dump("- classCounts: size: ", classCounts_);
    dump("- classPriors: size: ", classPriors_);
    dump("- classFeatureCounts: size: ", classFeatureCounts_);
    dump("- classFeatureProbs: size: ", classFeatureProbs_);
    dump("- featureClassOffset: size: ", featureClassOffset_);
    dump("- pairOffset_: size: ", pairOffset_);
    dump("- data: size: ", data_);
    dump("- dataOpp: size: ", dataOpp_);
    ostream << "--------------------------------" << '\n';
    return ostream.str();
}
// -------------------------------------------------------
// addSample (only in COUNTS mode)
// -------------------------------------------------------
//
// instance should have the class at the end.
//
void addSample(const std::vector<int>& instance, double weight)
{
//
// (A) increment classCounts_
// (B) increment featureclass counts => for p(x_i|c)
// (C) increment pair (superparent= i, child= j) counts => data_
//
int c = instance.back();
if (weight <= 0.0) {
return;
}
// (A) increment classCounts_
classCounts_[c] += weight;
// (B,C)
// We'll store raw counts now and turn them into p(child| c, superparent) later.
int idx, fcIndex, sp, sc, i_offset;
for (int parent = 0; parent < nFeatures_; ++parent) {
sp = instance[parent];
// (B) increment featureclass counts => for p(x_i|c)
fcIndex = (featureClassOffset_[parent] + sp) * statesClass_ + c;
classFeatureCounts_[fcIndex] += weight;
// (C) increment pair (superparent= i, child= j) counts => data_
i_offset = pairOffset_[featureClassOffset_[parent] + sp];
for (int child = 0; child < parent; ++child) {
sc = instance[child];
idx = (i_offset + featureClassOffset_[child] + sc) * statesClass_ + c;
data_[idx] += weight;
}
}
}
// -------------------------------------------------------
// computeProbabilities
// -------------------------------------------------------
//
// Once all samples are added in COUNTS mode, call this to:
// 1) compute p(c) => classPriors_
// 2) compute p(x_i=si | c) => classFeatureProbs_
// 3) turn the joint counts in data_ into the two conditionals of every stored pair:
//    data_ keeps p(lower-indexed feature | c, higher-indexed feature) and
//    dataOpp_ keeps p(higher-indexed feature | c, lower-indexed feature)
//
// Throws std::logic_error if the matrix is not in COUNTS mode; leaves it in PROBS mode.
// NOTE(review): with alpha_ == 0 a zero count in a denominator yields a division by
// zero (inf/NaN) -- confirm callers never rely on unsmoothed empty cells.
//
void computeProbabilities()
{
if (matrixState_ != MatrixState::COUNTS) {
throw std::logic_error("computeProbabilities: must be in COUNTS mode.");
}
double totalCount = std::accumulate(classCounts_.begin(), classCounts_.end(), 0.0);
// (1) p(c)
if (totalCount <= 0.0) {
// fallback => uniform
double unif = 1.0 / statesClass_;
for (int c = 0; c < statesClass_; ++c) {
classPriors_[c] = unif;
}
} else {
for (int c = 0; c < statesClass_; ++c) {
classPriors_[c] = (classCounts_[c] + alpha_) / (totalCount + alpha_ * statesClass_);
}
}
// (2) p(x_i=si | c) => classFeatureProbs_
int idx, sf;
double denom;
for (int feature = 0; feature < nFeatures_; ++feature) {
sf = states_[feature];
for (int c = 0; c < statesClass_; ++c) {
// smoothed class count, shared by every state of this feature
denom = classCounts_[c] + alpha_ * sf;
for (int sf_value = 0; sf_value < sf; ++sf_value) {
idx = (featureClassOffset_[feature] + sf_value) * statesClass_ + c;
classFeatureProbs_[idx] = (classFeatureCounts_[idx] + alpha_) / denom;
}
}
}
// (3) Pairwise conditionals. Here "parent" is the higher-indexed feature of the
// pair and "child" the lower-indexed one (child < parent):
// p(x_c=sc | c, x_p=sp) => data_(parent,sp,child,sc,c)
// p(x_p=sp | c, x_c=sc) => dataOpp_(parent,sp,child,sc,c)  (same index as data_)
//                     C(x_c, x_p, c) + alpha_
// P(x_p | x_c, c) = -----------------------------------
//                     C(x_c, c) + alpha_ * states(x_p)
double pcc_count, pc_count, cc_count;
double conditionalProb, oppositeCondProb;
int part1, part2, p1, part2_class, p1_class;
for (int parent = 1; parent < nFeatures_; ++parent) {
for (int sp = 0; sp < states_[parent]; ++sp) {
p1 = featureClassOffset_[parent] + sp;
part1 = pairOffset_[p1];
p1_class = p1 * statesClass_;
for (int child = 0; child < parent; ++child) {
for (int sc = 0; sc < states_[child]; ++sc) {
part2 = featureClassOffset_[child] + sc;
part2_class = part2 * statesClass_;
for (int c = 0; c < statesClass_; c++) {
idx = (part1 + part2) * statesClass_ + c;
// Parent, Child, Class Count
pcc_count = data_[idx];
// Parent, Class count
pc_count = classFeatureCounts_[p1_class + c];
// Child, Class count
cc_count = classFeatureCounts_[part2_class + c];
// p(x_c=sc | c, x_p=sp)
conditionalProb = (pcc_count + alpha_) / (pc_count + alpha_ * states_[child]);
// the raw joint count is overwritten in place by the conditional
data_[idx] = conditionalProb;
// p(x_p=sp | c, x_c=sc)
oppositeCondProb = (pcc_count + alpha_) / (cc_count + alpha_ * states_[parent]);
dataOpp_[idx] = oppositeCondProb;
}
}
}
}
}
matrixState_ = MatrixState::PROBS;
}
// -------------------------------------------------------
// predict_proba_spode
// -------------------------------------------------------
//
// Single-superparent approach:
// P(c | x) ∝ p(c) * p(x_sp| c) * ∏_{i≠sp} p(x_i | c, x_sp)
//
// instance: feature values, size == nFeatures_ (no class).
// parent:   index of the superparent feature, in [0..nFeatures_).
//
// Returns the normalized class distribution, or an all-zero vector when
// 'parent' is not registered in active_parents.
//
std::vector<double> predict_proba_spode(const std::vector<int>& instance, int parent)
{
    auto spodeProbs = std::vector<double>(statesClass_, 0.0);
    if (std::find(active_parents.begin(), active_parents.end(), parent) == active_parents.end()) {
        return spodeProbs;
    }
    // Start from p(x_sp | c) * p(c), scaled by initializer_.
    const int sp = instance[parent];
    const int parentRow = featureClassOffset_[parent] + sp;
    const int localOffset = parentRow * statesClass_;
    for (int c = 0; c < statesClass_; ++c) {
        spodeProbs[c] = classFeatureProbs_[localOffset + c] * classPriors_[c] * initializer_;
    }
    for (int child = 0; child < nFeatures_; ++child) {
        if (child == parent) {
            continue;
        }
        const int sc = instance[child];
        const int childRow = featureClassOffset_[child] + sc;
        // Pairs are stored under the higher-indexed feature's row. For such a pair
        // data_ holds p(lower | c, higher) and dataOpp_ holds p(higher | c, lower),
        // so p(x_child | c, x_parent) comes from dataOpp_ when child > parent and
        // from data_ otherwise.
        const bool childIsHigher = child > parent;
        const int base = (childIsHigher ? pairOffset_[childRow] + parentRow
                                        : pairOffset_[parentRow] + childRow) * statesClass_;
        const std::vector<double>& table = childIsHigher ? dataOpp_ : data_;
        for (int c = 0; c < statesClass_; ++c) {
            spodeProbs[c] *= table[base + c];
        }
    }
    // Normalize the probabilities
    normalize(spodeProbs);
    return spodeProbs;
}
// Returns the index of the most probable class according to the SPODE whose
// superparent is 'parent' (first maximum wins on ties).
int predict_spode(const std::vector<int>& instance, int parent)
{
    const auto scores = predict_proba_spode(instance, parent);
    const auto best = std::max_element(scores.begin(), scores.end());
    return static_cast<int>(std::distance(scores.begin(), best));
}
// -------------------------------------------------------
// predict_proba
// -------------------------------------------------------
//
// P(c | x) ∝ p(c) * ∏_{i} p(x_i | c) * ∏_{i<j} p(x_j | c, x_i) * p(x_i | c, x_j)
//
// 'instance' should have size == nFeatures_ (no class).
// We multiply p(c) * p(x_i| c) * p(x_j| c, x_i) for all i, j.
// Then normalize the distribution.
//
std::vector<double> predict_proba(const std::vector<int>& instance)
{
// accumulates posterior probabilities for each class
auto probs = std::vector<double>(statesClass_);
auto spodeProbs = std::vector<std::vector<double>>(nFeatures_, std::vector<double>(statesClass_));
// Initialize the probabilities with the feature|class probabilities
int localOffset;
for (int feature = 0; feature < nFeatures_; ++feature) {
// if feature is not in the active_parents, skip it
if (std::find(active_parents.begin(), active_parents.end(), feature) == active_parents.end()) {
continue;
}
localOffset = (featureClassOffset_[feature] + instance[feature]) * statesClass_;
for (int c = 0; c < statesClass_; ++c) {
spodeProbs[feature][c] = classFeatureProbs_[localOffset + c] * classPriors_[c] * initializer_;
}
}
int idx, base, sp, sc, parent_offset;
for (int parent = 1; parent < nFeatures_; ++parent) {
// if parent is not in the active_parents, skip it
if (std::find(active_parents.begin(), active_parents.end(), parent) == active_parents.end()) {
continue;
}
sp = instance[parent];
parent_offset = pairOffset_[featureClassOffset_[parent] + sp];
for (int child = 0; child < parent; ++child) {
sc = instance[child];
if (child > parent) {
parent_offset = pairOffset_[featureClassOffset_[child] + sc];
base = (parent_offset + featureClassOffset_[parent] + sp) * statesClass_;
} else {
parent_offset = pairOffset_[featureClassOffset_[parent] + sp];
base = (parent_offset + featureClassOffset_[child] + sc) * statesClass_;
}
for (int c = 0; c < statesClass_; ++c) {
/*
* The probability P(xc|xp,c) is stored in dataOpp_, and
* the probability P(xp|xc,c) is stored in data_
*/
idx = base + c;
double factor_child = child > parent ? data_[idx] : dataOpp_[idx];
double factor_parent = child > parent ? dataOpp_[idx] : data_[idx];
spodeProbs[child][c] *= factor_child;
spodeProbs[parent][c] *= factor_parent;
}
}
}
/* add all the probabilities for each class */
for (int c = 0; c < statesClass_; ++c) {
for (int i = 0; i < nFeatures_; ++i) {
probs[c] += spodeProbs[i][c] * significance_models_[i];
}
}
// Normalize the probabilities
normalize(probs);
return probs;
}
// Scales 'probs' in place so that its entries add up to 1.
// A vector whose sum is exactly zero is left untouched; a NaN sum raises
// std::runtime_error.
void normalize(std::vector<double>& probs) const
{
    const double total = std::accumulate(probs.begin(), probs.end(), 0.0);
    if (std::isnan(total)) {
        throw std::runtime_error("Can't normalize array. Sum is NaN.");
    }
    if (total != 0) {
        for (double& value : probs) {
            value /= total;
        }
    }
}
// Returns the current matrix mode: EMPTY (set by the constructor), COUNTS
// (set by fit() before samples are added) or PROBS (set by computeProbabilities()).
MatrixState state() const
{
return matrixState_;
}
// Number of class states found by fit() (0 before fitting).
int statesClass() const
{
return statesClass_;
}
// Number of features found by fit() (0 before fitting).
int nFeatures() const
{
return nFeatures_;
}
// Sum of all entries of states_ (feature states plus class states) times nFeatures_
// -- presumably the state total over an ensemble of nFeatures_ SPODEs; confirm.
int getNumberOfStates() const
{
return std::accumulate(states_.begin(), states_.end(), 0) * nFeatures_;
}
// nFeatures_ * (2 * nFeatures_ - 1) -- presumably 2n-1 edges per SPODE times n SPODEs; confirm.
int getNumberOfEdges() const
{
return nFeatures_ * (2 * nFeatures_ - 1);
}
// (nFeatures_ + 1) * nFeatures_ -- presumably n features plus the class per SPODE, times n SPODEs; confirm.
int getNumberOfNodes() const
{
return (nFeatures_ + 1) * nFeatures_;
}
// Appends 'active_parent' to the list of features whose SPODE takes part in the
// predictions. No range or duplicate check is performed.
void add_active_parent(int active_parent)
{
active_parents.push_back(active_parent);
}
// Drops the most recently registered active parent.
// Does nothing when no parent is registered (pop_back() on an empty vector is
// undefined behaviour).
void remove_last_parent()
{
    if (active_parents.empty()) {
        return;
    }
    active_parents.pop_back();
}
private:
// -----------
// MEMBER DATA
// -----------
std::vector<int> states_; // [states_feat0, ..., states_feat(n-1), statesClass_]
int nFeatures_;
int statesClass_;
// pairOffset_[featureClassOffset_[i] + si] = start of the row of feature i in
// state si inside data_ / dataOpp_ (before the class dimension is applied).
std::vector<int> pairOffset_;
// Each row pairs a feature with every lower-indexed feature. In COUNTS mode data_
// accumulates raw joint counts; after computeProbabilities() it holds
// p(lower-indexed feature | c, higher-indexed feature).
std::vector<double> data_;
// Same indexing as data_; holds p(higher-indexed feature | c, lower-indexed feature).
// Only filled by computeProbabilities().
std::vector<double> dataOpp_;
// classCounts_[c]: accumulated weight of the instances of class c
std::vector<double> classCounts_;
std::vector<double> classPriors_; // => p(c)
// For p(x_i=si| c), we store counts in classFeatureCounts_ => offset by featureClassOffset_[i]
std::vector<int> featureClassOffset_;
std::vector<double> classFeatureCounts_;
std::vector<double> classFeatureProbs_; // => p(x_i=si | c) after normalization
MatrixState matrixState_;
double alpha_ = 1.0; // smoothing constant; overwritten by fit() according to the smoothing strategy
double initializer_ = 1.0; // scale applied to every SPODE score before normalization; set by fit()
std::vector<int> active_parents; // features whose SPODE is used by the predict_* methods
};
}
#endif // XAODE_H

314
src/grid/GridBase.cpp Normal file
View File

@@ -0,0 +1,314 @@
#include <random>
#include <cstddef>
#include "common/DotEnv.h"
#include "common/Paths.h"
#include "common/Colors.h"
#include "GridBase.h"
namespace platform {
// Keeps a private copy of the supplied configuration and stamps it with the
// "platform" value read from the DotEnv settings.
GridBase::GridBase(struct ConfigGrid& config) : config(config)
{
    auto dotenv = platform::DotEnv();
    this->config.platform = dotenv.get("platform");
}
// Translates config.smooth_strategy into smooth_type.
// An unrecognised strategy is reported on stderr and terminates the process
// with exit code 1.
void GridBase::validate_config()
{
    const std::string& strategy = config.smooth_strategy;
    if (strategy == "ORIGINAL") {
        smooth_type = bayesnet::Smoothing_t::ORIGINAL;
        return;
    }
    if (strategy == "LAPLACE") {
        smooth_type = bayesnet::Smoothing_t::LAPLACE;
        return;
    }
    if (strategy == "CESTNIK") {
        smooth_type = bayesnet::Smoothing_t::CESTNIK;
        return;
    }
    std::cerr << "GridBase: Unknown smoothing strategy: " << config.smooth_strategy << std::endl;
    exit(1);
}
// Builds the marker used for a worker: one of eight colour codes (cycled by rank)
// followed by a single character taken from a 62-symbol alphabet (also cycled by rank).
std::string GridBase::get_color_rank(int rank)
{
    auto palette = { Colors::WHITE(), Colors::RED(), Colors::GREEN(), Colors::BLUE(), Colors::MAGENTA(), Colors::CYAN(), Colors::YELLOW(), Colors::BLACK() };
    const std::string symbols = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
    const auto& color = *(palette.begin() + rank % palette.size());
    const char symbol = symbols[rank % symbols.size()];
    return color + symbol;
}
// Shuffles the task list in place and prints the ruler of the progress bar:
// one column per task, with a separator at every tenth position.
void GridBase::shuffle_and_progress_bar(json& tasks)
{
    // Shuffle so that heavy datasets are spread more evenly across the workers.
    // The seed is fixed to obtain the same shuffle on every run.
    std::mt19937 generator{ 271 };
    std::shuffle(tasks.begin(), tasks.end(), generator);
    std::cout << "* Number of tasks: " << tasks.size() << std::endl;
    std::cout << separator << std::flush;
    for (size_t position = 1; position <= tasks.size(); ++position) {
        const auto digit = position % 10;
        if (digit == 0) {
            std::cout << separator;
        } else {
            std::cout << digit;
        }
    }
    std::cout << separator << std::endl << separator << std::flush;
}
// Builds the list of tasks to distribute: one per (dataset, seed, fold)
// combination of the datasets selected by filterDatasets(), then shuffles the
// list and prints the progress-bar ruler.
json GridBase::build_tasks(Datasets& datasets)
{
    /*
    * Each task is a json object with the following structure:
    * {
    *   "dataset": "dataset_name",
    *   "idx_dataset": idx_dataset, // used to identify the dataset in the results
    *   // this index is relative to the list of used datasets in the actual run not to the whole datasets list
    *   "seed": # of seed to use,
    *   "fold": # of fold to process
    * }
    * This way a task consists in process all combinations of hyperparameters for a dataset, seed and fold
    */
    auto tasks = json::array();
    const auto datasets_names = filterDatasets(datasets);
    const int n_datasets = static_cast<int>(datasets_names.size());
    for (int idx_dataset = 0; idx_dataset < n_datasets; ++idx_dataset) {
        const auto& dataset = datasets_names[idx_dataset];
        for (const auto& seed : config.seeds) {
            for (int n_fold = 0; n_fold < config.n_folds; n_fold++) {
                json task = {
                    { "dataset", dataset },
                    { "idx_dataset", idx_dataset},
                    { "seed", seed },
                    { "fold", n_fold},
                };
                tasks.push_back(task);
            }
        }
    }
    shuffle_and_progress_bar(tasks);
    return tasks;
}
// Prints, for every MPI process, the tasks it executed (dataset, seed, fold and
// time) plus a per-worker total when it ran more than one task.
// all_results: object keyed by dataset name; each value is an array of results
//              carrying at least "task", "time" and "process".
// tasks:       the task list; tasks[result["task"]] supplies "seed" and "fold".
// config_mpi:  supplies the number of processes and the manager rank.
void GridBase::summary(json& all_results, json& tasks, struct ConfigMPI& config_mpi)
{
// Report the tasks done by each worker, showing dataset number, seed, fold and time spent
// The format I want to show is:
// worker, dataset, seed, fold, time
// with headers
std::cout << Colors::RESET() << "* Summary of tasks done by each worker" << std::endl;
// worker_tasks[p] collects the lines to print for process p
json worker_tasks = json::array();
for (int i = 0; i < config_mpi.n_procs; ++i) {
worker_tasks.push_back(json::array());
}
// width of the dataset column: at least 7, the length of the "Dataset" header
int max_dataset = 7;
for (const auto& [key, results] : all_results.items()) {
auto dataset = key;
if (dataset.size() > max_dataset)
max_dataset = dataset.size();
for (const auto& result : results) {
int n_task = result["task"].get<int>();
json task = tasks[n_task];
auto seed = task["seed"].get<int>();
auto fold = task["fold"].get<int>();
auto time = result["time"].get<double>();
auto worker = result["process"].get<int>();
json line = {
{ "dataset", dataset },
{ "seed", seed },
{ "fold", fold },
{ "time", time }
};
worker_tasks[worker].push_back(line);
}
}
std::cout << Colors::MAGENTA() << " W " << setw(max_dataset) << std::left << "Dataset";
std::cout << " Seed Fold Time" << std::endl;
std::cout << "=== " << std::string(max_dataset, '=') << " ==== ==== " << std::string(15, '=') << std::endl;
for (int worker = 0; worker < config_mpi.n_procs; ++worker) {
// alternate colours so consecutive workers are easy to tell apart
auto color = (worker % 2) ? Colors::CYAN() : Colors::BLUE();
std::cout << color << std::right << setw(3) << worker << " ";
if (worker == config_mpi.manager) {
std::cout << "Manager" << std::endl;
continue;
}
if (worker_tasks[worker].empty()) {
std::cout << "No tasks" << std::endl;
continue;
}
bool first = true;
double total = 0.0;
int num_tasks = 0;
for (const auto& task : worker_tasks[worker]) {
num_tasks++;
// the worker number is printed only on the first line; later lines are indented
if (!first)
std::cout << std::string(4, ' ');
else
first = false;
std::cout << std::left << setw(max_dataset) << task["dataset"].get<std::string>();
std::cout << " " << setw(4) << std::right << task["seed"].get<int>();
std::cout << " " << setw(4) << task["fold"].get<int>();
std::cout << " " << setw(15) << std::setprecision(7) << std::fixed << task["time"].get<double>() << std::endl;
total += task["time"].get<double>();
}
if (num_tasks > 1) {
std::cout << Colors::MAGENTA() << " ";
std::cout << setw(max_dataset) << "Total (" << setw(2) << std::right << num_tasks << ")" << std::string(7, '.');
std::cout << " " << setw(15) << std::setprecision(7) << std::fixed << total << std::endl;
}
}
}
// Runs the whole distributed process on the calling MPI rank: the manager builds
// and broadcasts the tasks, hands them out, compiles, saves and (unless quiet)
// summarises the results; every other rank acts as a consumer.
void GridBase::go(struct ConfigMPI& config_mpi)
{
    /*
    * Each task is a json object with the data needed by the process
    *
    * The overall process consists in these steps:
    * 0. Validate config, create the MPI result type & tasks
    *  0.1 Create the MPI result type
    *  0.2 Manager creates the tasks
    * 1. Manager will broadcast the tasks to all the processes
    *  1.1 Broadcast the length of the following string
    *  1.2 Broadcast the tasks as a null-terminated char string
    * 2a. Producer delivers the tasks to the consumers
    *  2a.1 Producer will loop to send all the tasks to the consumers and receive the results
    *  2a.2 Producer will send the end message to all the consumers
    * 2b. Consumers process the tasks and send the results to the producer
    *  2b.1 Consumers announce to the producer that they are ready to receive a task
    *  2b.2 Consumers receive the task from the producer and process it
    *  2b.3 Consumers send the result to the producer
    * 3. Manager compile results for each dataset
    *  3.1 Loop thru all the results obtained from each outer fold (task) and select the best
    *  3.2 Save the results
    *  3.3 Summary of jobs done
    */
    //
    // 0.1 Create the MPI result type
    //
    validate_config();
    MPI_Datatype MPI_Result;
    MPI_Datatype type[11] = { MPI_UNSIGNED, MPI_UNSIGNED, MPI_INT, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_INT, MPI_INT };
    int blocklen[11] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
    MPI_Aint disp[11];
    disp[0] = offsetof(Task_Result, idx_dataset);
    disp[1] = offsetof(Task_Result, idx_combination);
    disp[2] = offsetof(Task_Result, n_fold);
    disp[3] = offsetof(Task_Result, score);
    disp[4] = offsetof(Task_Result, time);
    disp[5] = offsetof(Task_Result, time_train);
    disp[6] = offsetof(Task_Result, nodes);
    disp[7] = offsetof(Task_Result, leaves);
    disp[8] = offsetof(Task_Result, depth);
    disp[9] = offsetof(Task_Result, process);
    disp[10] = offsetof(Task_Result, task);
    MPI_Type_create_struct(11, blocklen, disp, type, &MPI_Result);
    MPI_Type_commit(&MPI_Result);
    //
    // 0.2 Manager creates the tasks
    //
    // The serialized tasks travel in an owning buffer, so nothing leaks if
    // json::parse throws.
    std::vector<char> msg;
    int tasks_size = 0;
    json tasks;
    auto env = platform::DotEnv();
    auto datasets = Datasets(config.discretize, Paths::datasets(), env.get("discretize_algo"));
    if (config_mpi.rank == config_mpi.manager) {
        timer.start();
        tasks = build_tasks(datasets);
        const auto tasks_str = tasks.dump();
        tasks_size = static_cast<int>(tasks_str.size());
        // copy the text together with its terminating '\0'
        msg.assign(tasks_str.c_str(), tasks_str.c_str() + tasks_size + 1);
    }
    //
    // 1. Manager will broadcast the tasks to all the processes
    //
    MPI_Bcast(&tasks_size, 1, MPI_INT, config_mpi.manager, MPI_COMM_WORLD);
    if (config_mpi.rank != config_mpi.manager) {
        msg.resize(tasks_size + 1);
    }
    MPI_Bcast(msg.data(), tasks_size + 1, MPI_CHAR, config_mpi.manager, MPI_COMM_WORLD);
    tasks = json::parse(msg.data());
    if (config_mpi.rank == config_mpi.manager) {
        //
        // 2a. Producer delivers the tasks to the consumers
        //
        auto datasets_names = filterDatasets(datasets);
        json all_results = producer(datasets_names, tasks, config_mpi, MPI_Result);
        std::cout << separator << std::endl;
        //
        // 3. Manager compile results for each dataset
        //
        auto results = initializeResults();
        compile_results(results, all_results, config.model);
        //
        // 3.2 Save the results
        //
        save(results);
        //
        // 3.3 Summary of jobs done
        //
        if (!config.quiet)
            summary(all_results, tasks, config_mpi);
    } else {
        //
        // 2b. Consumers process the tasks and send the results to the producer
        //
        consumer(datasets, tasks, config, config_mpi, MPI_Result);
    }
    // All blocking sends/receives using the datatype are done: release it.
    MPI_Type_free(&MPI_Result);
}
// Manager side of the task exchange: answers every message from a worker with the
// next task number and, once the tasks are exhausted, with an end message.
// Results carried by the incoming messages are accumulated through store_result()
// and returned.
json GridBase::producer(std::vector<std::string>& names, json& tasks, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result)
{
    Task_Result incoming;
    json collected;
    // Blocks until any worker sends a message, stores its payload when it is a
    // result, and reports which worker is now waiting for an answer.
    auto wait_for_worker = [&]() {
        MPI_Status status;
        MPI_Recv(&incoming, 1, MPI_Result, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
        if (status.MPI_TAG == TAG_RESULT) {
            store_result(names, incoming, collected);
        }
        return status.MPI_SOURCE;
    };
    //
    // 2a.1 Producer will loop to send all the tasks to the consumers and receive the results
    //
    const int total_tasks = tasks.size();
    for (int n_task = 0; n_task < total_tasks; ++n_task) {
        const int worker = wait_for_worker();
        MPI_Send(&n_task, 1, MPI_INT, worker, TAG_TASK, MPI_COMM_WORLD);
    }
    //
    // 2a.2 Producer will send the end message to all the consumers
    //
    const int n_consumers = config_mpi.n_procs - 1;
    for (int finished = 0; finished < n_consumers; ++finished) {
        const int worker = wait_for_worker();
        MPI_Send(&finished, 1, MPI_INT, worker, TAG_END, MPI_COMM_WORLD);
    }
    return collected;
}
// Worker side of the task exchange: announces itself to the manager, then keeps
// receiving task numbers, running them through consumer_go() and sending the
// result back, until the manager answers with TAG_END.
void GridBase::consumer(Datasets& datasets, json& tasks, struct ConfigGrid& config, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result)
{
    // Zero-initialised: the first message only announces availability, but its
    // payload is still transmitted and must not be indeterminate memory.
    Task_Result result{};
    //
    // 2b.1 Consumers announce to the producer that they are ready to receive a task
    //
    MPI_Send(&result, 1, MPI_Result, config_mpi.manager, TAG_QUERY, MPI_COMM_WORLD);
    int task = 0;
    while (true) {
        MPI_Status status;
        //
        // 2b.2 Consumers receive the task from the producer and process it
        //
        MPI_Recv(&task, 1, MPI_INT, config_mpi.manager, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
        if (status.MPI_TAG == TAG_END) {
            break;
        }
        consumer_go(config, config_mpi, tasks, task, datasets, &result);
        //
        // 2b.3 Consumers send the result to the producer
        //
        MPI_Send(&result, 1, MPI_Result, config_mpi.manager, TAG_RESULT, MPI_COMM_WORLD);
    }
}
}

39
src/grid/GridBase.h Normal file
View File

@@ -0,0 +1,39 @@
#ifndef GRIDBASE_H
#define GRIDBASE_H
#include <string>
#include <mpi.h>
#include <nlohmann/json.hpp>
#include "common/Datasets.h"
#include "common/Timer.hpp"
#include "main/HyperParameters.h"
#include "GridConfig.h"
namespace platform {
using json = nlohmann::ordered_json;
// Abstract driver of an MPI manager/worker run: it builds the task list,
// distributes it and gathers the results; derived classes decide which datasets
// take part, how a task is executed and how results are stored and saved.
class GridBase {
public:
    explicit GridBase(struct ConfigGrid& config);
    // Virtual: the class is a polymorphic base, so deleting a derived object
    // through a GridBase pointer must run the derived destructor.
    virtual ~GridBase() = default;
    // Runs the whole process on the calling MPI rank.
    void go(struct ConfigMPI& config_mpi);
    // Maps config.smooth_strategy onto smooth_type; exits on an unknown strategy.
    void validate_config();
protected:
    json build_tasks(Datasets& datasets);
    // --- customisation points implemented by derived classes ---
    virtual void save(json& results) = 0;
    virtual std::vector<std::string> filterDatasets(Datasets& datasets) const = 0;
    virtual json initializeResults() = 0;
    virtual void compile_results(json& results, json& all_results, std::string& model) = 0;
    virtual json store_result(std::vector<std::string>& names, Task_Result& result, json& results) = 0;
    virtual void consumer_go(struct ConfigGrid& config, struct ConfigMPI& config_mpi, json& tasks, int n_task, Datasets& datasets, Task_Result* result) = 0;
    // --- shared machinery ---
    void shuffle_and_progress_bar(json& tasks);
    json producer(std::vector<std::string>& names, json& tasks, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result);
    void consumer(Datasets& datasets, json& tasks, struct ConfigGrid& config, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result);
    std::string get_color_rank(int rank);
    void summary(json& all_results, json& tasks, struct ConfigMPI& config_mpi);
    struct ConfigGrid config;
    Timer timer; // used to measure the time of the whole process
    const std::string separator = "|";
    bayesnet::Smoothing_t smooth_type{ bayesnet::Smoothing_t::NONE };
};
} /* namespace platform */
#endif

55
src/grid/GridConfig.h Normal file
View File

@@ -0,0 +1,55 @@
#ifndef GRIDCONFIG_H
#define GRIDCONFIG_H
#include <string>
#include <map>
#include <mpi.h>
#include <nlohmann/json.hpp>
#include "common/Datasets.h"
#include "common/Timer.hpp"
#include "main/HyperParameters.h"
#include "GridData.h"
#include "GridConfig.h"
#include "bayesnet/network/Network.h"
namespace platform {
using json = nlohmann::ordered_json;
// Settings of a grid/experiment run.
// Scalar members carry default initializers so that a default-constructed
// ConfigGrid can be copied and read without touching indeterminate values.
struct ConfigGrid {
    std::string model;
    std::string score;
    std::string continue_from;
    std::string platform;
    std::string smooth_strategy;
    bool quiet{ false };
    bool only{ false }; // used with continue_from to only compute that dataset
    bool discretize{ false };
    bool stratified{ false };
    int nested{ 0 };
    int n_folds{ 0 };
    json excluded;
    std::vector<int> seeds;
};
// MPI layout of the run as seen by the calling process.
struct ConfigMPI {
int rank; // rank of the calling process
int n_procs; // total number of processes (manager included)
int manager; // rank that plays the manager/producer role
};
// Fixed-layout record exchanged between workers and the manager. It is kept a
// plain standard-layout struct because its member offsets are used to describe
// it as an MPI datatype.
// The two index fields are spelled 'unsigned int' (matching MPI_UNSIGNED) instead
// of the non-standard 'uint' alias.
struct Task_Result {
    unsigned int idx_dataset;
    unsigned int idx_combination;
    int n_fold;
    double score; // Experiment: Score test, no score train in this case
    double time; // Experiment: Time test
    double time_train;
    double nodes; // Experiment specific
    double leaves; // Experiment specific
    double depth; // Experiment specific
    int process;
    int task;
};
// MPI message tags of the manager/worker protocol.
const int TAG_QUERY = 1; // worker -> manager: ready for a first task (payload is not a result)
const int TAG_RESULT = 2; // worker -> manager: payload is the result of the last task
const int TAG_TASK = 3; // manager -> worker: payload is the number of the task to run
const int TAG_END = 4; // manager -> worker: no more tasks, stop
} /* namespace platform */
#endif

View File

@@ -1,5 +1,5 @@
#include "GridData.h"
#include <fstream> #include <fstream>
#include "GridData.h"
namespace platform { namespace platform {
GridData::GridData(const std::string& fileName) GridData::GridData(const std::string& fileName)

View File

@@ -6,7 +6,7 @@
#include <nlohmann/json.hpp> #include <nlohmann/json.hpp>
namespace platform { namespace platform {
using json = nlohmann::json; using json = nlohmann::ordered_json;
const std::string ALL_DATASETS = "all"; const std::string ALL_DATASETS = "all";
class GridData { class GridData {
public: public:
@@ -23,4 +23,4 @@ namespace platform {
std::map<std::string, json> grid; std::map<std::string, json> grid;
}; };
} /* namespace platform */ } /* namespace platform */
#endif /* GRIDDATA_H */ #endif

196
src/grid/GridExperiment.cpp Normal file
View File

@@ -0,0 +1,196 @@
#include <iostream>
#include <cstddef>
#include <torch/torch.h>
#include <folding.hpp>
#include "main/Models.h"
#include "common/Paths.h"
#include "common/Utils.h"
#include "GridExperiment.h"
namespace platform {
// GridExperiment::GridExperiment(argparse::ArgumentParser& program, struct ConfigGrid& config) : arguments(program), GridBase(config)
// Builds the experiment described by the command-line arguments and mirrors its
// settings into the grid configuration held by the base class.
GridExperiment::GridExperiment(ArgumentsExperiment& program, struct ConfigGrid& config) : GridBase(config), arguments(program)
{
    experiment = arguments.initializedExperiment();
    filesToTest = arguments.getFilesToTest();
    saveResults = arguments.haveToSaveResults();
    auto& grid = this->config;
    grid.model = experiment.getModel();
    grid.score = experiment.getScore();
    grid.discretize = experiment.isDiscretized();
    grid.stratified = experiment.isStratified();
    grid.smooth_strategy = experiment.getSmoothStrategy();
    grid.n_folds = experiment.getNFolds();
    grid.seeds = experiment.getRandomSeeds();
    grid.quiet = experiment.isQuiet();
}
// Returns a copy of computed_results, the value stored by compile_results().
json GridExperiment::getResults()
{
return computed_results;
}
// Returns the dataset names selected on the command line (filesToTest);
// the 'datasets' argument is not consulted.
std::vector<std::string> GridExperiment::filterDatasets(Datasets& datasets) const
{
return filesToTest;
}
// An experiment starts from an empty (null) results document.
json GridExperiment::initializeResults()
{
    return json();
}
// Intentionally a no-op: this class does not persist anything at this point.
void GridExperiment::save(json& results)
{
}
// Aggregates the per-task results of every dataset into one PartialResult
// (means and standard deviations over the tasks) and adds it to 'experiment'.
// all_results: object keyed by dataset name, each value an array of task results;
//              it is rewritten in place with its keys in sorted order.
// results:     not modified here; its value is copied into computed_results.
// model:       not read; the model name is taken from 'experiment'.
void GridExperiment::compile_results(json& results, json& all_results, std::string& model)
{
auto datasets = Datasets(false, Paths::datasets());
nlohmann::json temp = all_results; // To restore the order of the data by dataset name
all_results = temp;
for (const auto& result_item : all_results.items()) {
// each result has the results of all the outer folds as each one were a different task
auto dataset_name = result_item.key();
auto data = result_item.value();
// NOTE(review): 'result' and 'score_train' below are never used afterwards.
auto result = json::object();
int data_size = data.size();
// one slot per task of this dataset
auto score = torch::zeros({ data_size }, torch::kFloat64);
auto score_train = torch::zeros({ data_size }, torch::kFloat64);
auto time_test = torch::zeros({ data_size }, torch::kFloat64);
auto time_train = torch::zeros({ data_size }, torch::kFloat64);
auto nodes = torch::zeros({ data_size }, torch::kFloat64);
auto leaves = torch::zeros({ data_size }, torch::kFloat64);
auto depth = torch::zeros({ data_size }, torch::kFloat64);
auto& dataset = datasets.getDataset(dataset_name);
dataset.load();
//
// Prepare Result
//
auto partial_result = PartialResult();
partial_result.setSamples(dataset.getNSamples()).setFeatures(dataset.getNFeatures()).setClasses(dataset.getNClasses());
partial_result.setHyperparameters(experiment.getHyperParameters().get(dataset_name));
for (int fold = 0; fold < data_size; ++fold) {
partial_result.addScoreTest(data[fold]["score"]);
// no train score is transported in the task results, so 0.0 is recorded
partial_result.addScoreTrain(0.0);
partial_result.addTimeTest(data[fold]["time"]);
partial_result.addTimeTrain(data[fold]["time_train"]);
score[fold] = data[fold]["score"].get<double>();
time_test[fold] = data[fold]["time"].get<double>();
time_train[fold] = data[fold]["time_train"].get<double>();
nodes[fold] = data[fold]["nodes"].get<double>();
leaves[fold] = data[fold]["leaves"].get<double>();
depth[fold] = data[fold]["depth"].get<double>();
}
partial_result.setGraph(std::vector<std::string>());
// NOTE(review): with a single task per dataset the standard deviations below are
// presumably NaN (sample std of one value) -- confirm how consumers treat that.
partial_result.setScoreTest(torch::mean(score).item<double>()).setScoreTrain(0.0);
partial_result.setScoreTestStd(torch::std(score).item<double>()).setScoreTrainStd(0.0);
partial_result.setTrainTime(torch::mean(time_train).item<double>()).setTestTime(torch::mean(time_test).item<double>());
partial_result.setTrainTimeStd(torch::std(time_train).item<double>()).setTestTimeStd(torch::std(time_test).item<double>());
partial_result.setNodes(torch::mean(nodes).item<double>()).setLeaves(torch::mean(leaves).item<double>()).setDepth(torch::mean(depth).item<double>());
partial_result.setDataset(dataset_name).setNotes(std::vector<std::string>());
partial_result.setConfusionMatrices(json::array());
experiment.addResult(partial_result);
}
// a classifier instance is created only to read its version string
auto clf = Models::instance()->create(experiment.getModel());
experiment.setModelVersion(clf->getVersion());
// NOTE(review): 'results' is never filled in this function, so computed_results
// receives whatever the caller passed in -- confirm this is intended.
computed_results = results;
}
// Appends the outcome of one finished task to the list kept for its dataset.
//
// names   - dataset names; result.idx_dataset indexes into this vector
// result  - metrics reported by a consumer for a single task
// results - accumulator keyed by dataset name; modified in place
// Returns a copy of the updated accumulator.
json GridExperiment::store_result(std::vector<std::string>& names, Task_Result& result, json& results)
{
    const auto dataset_name = names[result.idx_dataset];
    // The first result for a dataset creates its (empty) list
    if (!results.contains(dataset_name)) {
        results[dataset_name] = json::array();
    }
    json entry = {
        { "score", result.score },
        { "combination", result.idx_combination },
        { "fold", result.n_fold },
        { "time", result.time },
        { "time_train", result.time_train },
        { "dataset", result.idx_dataset },
        { "nodes", result.nodes },
        { "leaves", result.leaves },
        { "depth", result.depth },
        { "process", result.process },
        { "task", result.task }
    };
    results[dataset_name].push_back(entry);
    return results;
}
// Runs one experiment task (a dataset / seed / fold triple) on a consumer.
//
// Reads task number `n_task` from `tasks`, loads the dataset it names, builds
// the requested fold, trains the model named by `config.model` with the
// hyperparameters the experiment holds for that dataset, scores it on the test
// split and writes the metrics into `*result`.
//
// config     - grid configuration (model, folds, stratification, smoothing)
// config_mpi - MPI information; only the rank is used here
// tasks      - array of task descriptions
// n_task     - index of the task to process
// datasets   - dataset container used to load the data
// result     - output record filled with the metrics of this task
//
// Throws std::invalid_argument when `config.smooth_strategy` is not ORIGINAL,
// LAPLACE or CESTNIK; the smoothing value would otherwise be passed to fit()
// uninitialized.
void GridExperiment::consumer_go(struct ConfigGrid& config, struct ConfigMPI& config_mpi, json& tasks, int n_task, Datasets& datasets, Task_Result* result)
{
    //
    // initialize
    //
    Timer train_timer, test_timer;
    json task = tasks[n_task];
    auto dataset_name = task["dataset"].get<std::string>();
    auto idx_dataset = task["idx_dataset"].get<int>();
    auto seed = task["seed"].get<int>();
    auto n_fold = task["fold"].get<int>();
    bool stratified = config.stratified;
    bayesnet::Smoothing_t smooth;
    if (config.smooth_strategy == "ORIGINAL")
        smooth = bayesnet::Smoothing_t::ORIGINAL;
    else if (config.smooth_strategy == "LAPLACE")
        smooth = bayesnet::Smoothing_t::LAPLACE;
    else if (config.smooth_strategy == "CESTNIK")
        smooth = bayesnet::Smoothing_t::CESTNIK;
    else
        throw std::invalid_argument(std::string("Unknown smooth strategy: ") + config.smooth_strategy);
    //
    // Load the dataset
    //
    auto& dataset = datasets.getDataset(dataset_name);
    dataset.load();
    auto [X, y] = dataset.getTensors();
    auto features = dataset.getFeatures();
    auto className = dataset.getClassName();
    //
    // Start working on task
    //
    // NOTE(review): fold is owned by this function and released with delete
    // below; it is not released if fit() or score() throw.
    folding::Fold* fold;
    if (stratified)
        fold = new folding::StratifiedKFold(config.n_folds, y, seed);
    else
        fold = new folding::KFold(config.n_folds, y.size(0), seed);
    // The train timer also covers fold extraction and the train/test split
    train_timer.start();
    auto [train, test] = fold->getFold(n_fold);
    auto [X_train, X_test, y_train, y_test] = dataset.getTrainTestTensors(train, test);
    auto states = dataset.getStates(); // Get the states of the features once they are discretized
    //
    // Build Classifier with selected hyperparameters
    //
    auto clf = Models::instance()->create(config.model);
    auto valid = clf->getValidHyperparameters();
    auto hyperparameters = experiment.getHyperParameters();
    hyperparameters.check(valid, dataset_name);
    clf->setHyperparameters(hyperparameters.get(dataset_name));
    //
    // Train model
    //
    clf->fit(X_train, y_train, features, className, states, smooth);
    auto train_time = train_timer.getDuration();
    //
    // Test model
    //
    test_timer.start();
    double score = clf->score(X_test, y_test);
    delete fold;
    auto test_time = test_timer.getDuration();
    //
    // Return the result
    //
    result->idx_dataset = idx_dataset;
    result->idx_combination = 0; // a single hyperparameter set is evaluated per task
    result->score = score;
    result->n_fold = n_fold;
    result->time = test_time;
    result->time_train = train_time;
    // The nodes/leaves/depth fields carry the classifier's node, edge and
    // state counts respectively.
    result->nodes = clf->getNumberOfNodes();
    result->leaves = clf->getNumberOfEdges();
    result->depth = clf->getNumberOfStates();
    result->process = config_mpi.rank;
    result->task = n_task;
    //
    // Update progress bar
    //
    std::cout << get_color_rank(config_mpi.rank) << std::flush;
}
} /* namespace platform */

38
src/grid/GridExperiment.h Normal file
View File

@@ -0,0 +1,38 @@
#ifndef GRIDEXPERIMENT_H
#define GRIDEXPERIMENT_H
#include <stdexcept>
#include <string>
#include <vector>
#include <mpi.h>
#include <argparse/argparse.hpp>
#include <nlohmann/json.hpp>
#include "common/Datasets.h"
#include "main/Experiment.h"
#include "main/HyperParameters.h"
#include "main/ArgumentsExperiment.h"
#include "GridBase.h"
namespace platform {
using json = nlohmann::ordered_json;
// Grid runner, derived from GridBase, that owns an Experiment and the json
// results computed for it.
// NOTE(review): the constructor and most private helpers are defined outside
// this header; their descriptions below are limited to what the declarations
// show.
class GridExperiment : public GridBase {
public:
// Builds the runner from the parsed experiment arguments and the grid configuration.
explicit GridExperiment(ArgumentsExperiment& program, struct ConfigGrid& config);
~GridExperiment() = default;
// Returns results as json (presumably computed_results - TODO confirm in the .cpp).
json getResults();
// Gives access to the Experiment owned by this object.
Experiment& getExperiment() { return experiment; }
// Number of entries in filesToTest.
size_t numFiles() const { return filesToTest.size(); }
// Value of the saveResults flag.
bool haveToSaveResults() const { return saveResults; }
private:
// Reference to the argument object; it must outlive this instance.
ArgumentsExperiment& arguments;
Experiment experiment;
// Results kept after compilation.
json computed_results;
bool saveResults = false;
std::vector<std::string> filesToTest;
void save(json& results);
json initializeResults();
std::vector<std::string> filterDatasets(Datasets& datasets) const;
void compile_results(json& results, json& all_results, std::string& model);
// Appends one task result to the per-dataset list in `results` and returns `results`.
json store_result(std::vector<std::string>& names, Task_Result& result, json& results);
// Processes task `n_task` of `tasks` and writes its metrics into `*result`.
void consumer_go(struct ConfigGrid& config, struct ConfigMPI& config_mpi, json& tasks, int n_task, Datasets& datasets, Task_Result* result);
};
} /* namespace platform */
#endif

View File

@@ -1,441 +0,0 @@
#include <iostream>
#include <cstddef>
#include <torch/torch.h>
#include "GridSearch.h"
#include "Models.h"
#include "Paths.h"
#include "folding.hpp"
#include "Colors.h"
namespace platform {
// Returns the current local date formatted as YYYY-MM-DD.
std::string get_date()
{
    const std::time_t now = std::time(nullptr);
    const std::tm* local_now = std::localtime(&now);
    std::ostringstream formatted;
    formatted << std::put_time(local_now, "%Y-%m-%d");
    return formatted.str();
}
// Returns the current local time of day formatted as HH:MM:SS.
std::string get_time()
{
    const std::time_t now = std::time(nullptr);
    const std::tm* local_now = std::localtime(&now);
    std::ostringstream formatted;
    formatted << std::put_time(local_now, "%H:%M:%S");
    return formatted.str();
}
std::string get_color_rank(int rank)
{
auto colors = { Colors::WHITE(), Colors::RED(), Colors::GREEN(), Colors::BLUE(), Colors::MAGENTA(), Colors::CYAN() };
return *(colors.begin() + rank % colors.size());
}
// Stores the grid configuration; no other work is done at construction time.
GridSearch::GridSearch(struct ConfigGrid& config)
    : config(config)
{}
// Reads the grid output file of the configured model.
// Returns the parsed document, or an empty json when the file cannot be opened.
json GridSearch::loadResults()
{
    std::ifstream input(Paths::grid_output(config.model));
    if (!input.is_open()) {
        return json();
    }
    return json::parse(input);
}
// Returns the names of the datasets to process.
//
// When a continuation point is configured, every dataset listed before it is
// dropped; with `config.only` set, everything except the continuation dataset
// is dropped. The datasets in `config.excluded` are then removed.
//
// Throws std::invalid_argument if the continuation dataset or an excluded
// dataset is not in the list.
std::vector<std::string> GridSearch::filterDatasets(Datasets& datasets) const
{
    auto selected = datasets.getNames();
    if (config.continue_from != NO_CONTINUE()) {
        const auto& resume_point = config.continue_from;
        const auto start = std::find(selected.begin(), selected.end(), resume_point);
        if (start == selected.end()) {
            throw std::invalid_argument("Dataset " + config.continue_from + " not found");
        }
        // Datasets listed before the continuation point were already processed
        selected.erase(selected.begin(), start);
        if (config.only) {
            // Keep just the continuation dataset
            const auto tail = std::remove_if(selected.begin(), selected.end(),
                [&resume_point](const std::string& candidate) { return candidate != resume_point; });
            selected.erase(tail, selected.end());
        }
    }
    // Exclude datasets
    for (const auto& entry : config.excluded) {
        const auto excluded_name = entry.get<std::string>();
        const auto position = std::find(selected.begin(), selected.end(), excluded_name);
        if (position == selected.end()) {
            throw std::invalid_argument("Dataset " + excluded_name + " already excluded or doesn't exist!");
        }
        selected.erase(position);
    }
    return selected;
}
// Builds the list of tasks to distribute: one task per selected dataset, seed
// and fold. The list is shuffled with a fixed seed and a progress-bar header
// is printed in the color of `rank`.
// Returns the shuffled json array of tasks.
json GridSearch::build_tasks_mpi(int rank)
{
    auto task_list = json::array();
    auto grid = GridData(Paths::grid_input(config.model));
    auto datasets = Datasets(false, Paths::datasets());
    auto all_datasets = datasets.getNames();
    auto selected = filterDatasets(datasets);
    const int n_selected = static_cast<int>(selected.size());
    for (int position = 0; position < n_selected; ++position) {
        auto dataset = selected[position];
        for (const auto& seed : config.seeds) {
            auto combinations = grid.getGrid(dataset);
            for (int fold = 0; fold < config.n_folds; ++fold) {
                json task;
                task["dataset"] = dataset;
                task["idx_dataset"] = position; // index within the selected datasets
                task["seed"] = seed;
                task["fold"] = fold;
                task_list.push_back(task);
            }
        }
    }
    // Shuffle the array so heavy datasets are spread across the workers;
    // the fixed seed yields the same shuffle on every run
    std::mt19937 generator{ 271 };
    std::shuffle(task_list.begin(), task_list.end(), generator);
    const auto n_tasks = task_list.size();
    std::cout << get_color_rank(rank) << "* Number of tasks: " << n_tasks << std::endl;
    // Ruler with the last digit of every task number
    std::cout << "|";
    for (std::size_t shown = 1; shown <= n_tasks; ++shown) {
        std::cout << shown % 10;
    }
    std::cout << "|" << std::endl << "|" << std::flush;
    return task_list;
}
// Processes one grid-search task (dataset / seed / outer fold) on a consumer.
//
// For every hyperparameter combination of the dataset's grid a nested
// cross-validation is run on the training part of the outer fold. The
// combination with the best mean nested score is then refitted on the whole
// training part and scored on the outer test part.
//
// config     - grid configuration (model, folds, nested folds, stratification)
// config_mpi - MPI information; only the rank is used (progress output)
// tasks      - array of task descriptions
// n_task     - index of the task to process
// datasets   - dataset container providing tensors, states and feature names
// result     - output record: dataset index, best combination index, outer
//              score, fold number and elapsed time
//
// The first combination is always accepted as the initial best, so a grid
// whose nested scores are all 0 still reports a valid combination index
// instead of -1 with a null hyperparameter set.
void process_task_mpi_consumer(struct ConfigGrid& config, struct ConfigMPI& config_mpi, json& tasks, int n_task, Datasets& datasets, Task_Result* result)
{
    // initialize
    Timer timer;
    timer.start();
    json task = tasks[n_task];
    auto grid = GridData(Paths::grid_input(config.model));
    auto dataset = task["dataset"].get<std::string>();
    auto idx_dataset = task["idx_dataset"].get<int>();
    auto seed = task["seed"].get<int>();
    auto n_fold = task["fold"].get<int>();
    bool stratified = config.stratified;
    // Generate the hyperparameters combinations
    auto combinations = grid.getGrid(dataset);
    auto [X, y] = datasets.getTensors(dataset);
    auto states = datasets.getStates(dataset);
    auto features = datasets.getFeatures(dataset);
    auto className = datasets.getClassName(dataset);
    //
    // Start working on task
    //
    // NOTE(review): fold and nested_fold are released with delete; they are
    // not released if fit() or score() throw.
    folding::Fold* fold;
    if (stratified)
        fold = new folding::StratifiedKFold(config.n_folds, y, seed);
    else
        fold = new folding::KFold(config.n_folds, y.size(0), seed);
    auto [train, test] = fold->getFold(n_fold);
    auto train_t = torch::tensor(train);
    auto test_t = torch::tensor(test);
    // Samples are selected along the last dimension of X
    auto X_train = X.index({ "...", train_t });
    auto y_train = y.index({ train_t });
    auto X_test = X.index({ "...", test_t });
    auto y_test = y.index({ test_t });
    double best_fold_score = 0.0;
    int best_idx_combination = -1;
    json best_fold_hyper;
    const int n_combinations = static_cast<int>(combinations.size());
    for (int idx_combination = 0; idx_combination < n_combinations; ++idx_combination) {
        auto hyperparam_line = combinations[idx_combination];
        auto hyperparameters = platform::HyperParameters(datasets.getNames(), hyperparam_line);
        folding::Fold* nested_fold;
        if (config.stratified)
            nested_fold = new folding::StratifiedKFold(config.nested, y_train, seed);
        else
            nested_fold = new folding::KFold(config.nested, y_train.size(0), seed);
        double score = 0.0;
        for (int n_nested_fold = 0; n_nested_fold < config.nested; n_nested_fold++) {
            // Nested level fold
            auto [train_nested, test_nested] = nested_fold->getFold(n_nested_fold);
            auto train_nested_t = torch::tensor(train_nested);
            auto test_nested_t = torch::tensor(test_nested);
            auto X_nested_train = X_train.index({ "...", train_nested_t });
            auto y_nested_train = y_train.index({ train_nested_t });
            auto X_nested_test = X_train.index({ "...", test_nested_t });
            auto y_nested_test = y_train.index({ test_nested_t });
            // Build Classifier with selected hyperparameters
            auto clf = Models::instance()->create(config.model);
            auto valid = clf->getValidHyperparameters();
            hyperparameters.check(valid, dataset);
            clf->setHyperparameters(hyperparameters.get(dataset));
            // Train model
            clf->fit(X_nested_train, y_nested_train, features, className, states);
            // Test model
            score += clf->score(X_nested_test, y_nested_test);
        }
        delete nested_fold;
        // Mean score over the nested folds
        score /= config.nested;
        // Accept the first combination unconditionally, later ones only when strictly better
        if (best_idx_combination < 0 || score > best_fold_score) {
            best_fold_score = score;
            best_idx_combination = idx_combination;
            best_fold_hyper = hyperparam_line;
        }
    }
    delete fold;
    // Build Classifier with the best hyperparameters to obtain the best score
    auto hyperparameters = platform::HyperParameters(datasets.getNames(), best_fold_hyper);
    auto clf = Models::instance()->create(config.model);
    auto valid = clf->getValidHyperparameters();
    hyperparameters.check(valid, dataset);
    clf->setHyperparameters(best_fold_hyper);
    clf->fit(X_train, y_train, features, className, states);
    best_fold_score = clf->score(X_test, y_test);
    // Return the result
    result->idx_dataset = idx_dataset;
    result->idx_combination = best_idx_combination;
    result->score = best_fold_score;
    result->n_fold = n_fold;
    result->time = timer.getDuration();
    // Update progress bar
    std::cout << get_color_rank(config_mpi.rank) << "*" << std::flush;
}
// Appends one task result to the list kept for its dataset.
//
// names   - dataset names; result.idx_dataset indexes into this vector
// result  - values reported by a consumer for one task
// results - accumulator keyed by dataset name; modified in place
// Returns a copy of the updated accumulator.
json store_result(std::vector<std::string>& names, Task_Result& result, json& results)
{
    const auto dataset_name = names[result.idx_dataset];
    // The first result for a dataset creates its (empty) list
    if (!results.contains(dataset_name)) {
        results[dataset_name] = json::array();
    }
    json entry = {
        { "score", result.score },
        { "combination", result.idx_combination },
        { "fold", result.n_fold },
        { "time", result.time },
        { "dataset", result.idx_dataset }
    };
    results[dataset_name].push_back(entry);
    return results;
}
// Manager side of the task distribution.
//
// Every round waits for a message from any consumer, stores it when it carries
// a result, and answers the sender. The first `tasks.size()` rounds answer
// with a task index (TAG_TASK); the following `n_procs - 1` rounds answer with
// TAG_END so every consumer stops.
// Returns the collected results keyed by dataset name.
json producer(std::vector<std::string>& names, json& tasks, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result)
{
    Task_Result incoming;
    json collected;
    // One receive/store/reply round; `payload` is the integer sent back
    auto serve = [&](int payload, int reply_tag) {
        MPI_Status status;
        MPI_Recv(&incoming, 1, MPI_Result, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
        if (status.MPI_TAG == TAG_RESULT) {
            // Store result
            store_result(names, incoming, collected);
        }
        MPI_Send(&payload, 1, MPI_INT, status.MPI_SOURCE, reply_tag, MPI_COMM_WORLD);
    };
    //
    // 2a.1 Producer will loop to send all the tasks to the consumers and receive the results
    //
    const int total_tasks = tasks.size();
    for (int next_task = 0; next_task < total_tasks; ++next_task) {
        serve(next_task, TAG_TASK);
    }
    //
    // 2a.2 Producer will send the end message to all the consumers
    //
    const int n_consumers = config_mpi.n_procs - 1;
    for (int stopped = 0; stopped < n_consumers; ++stopped) {
        serve(stopped, TAG_END);
    }
    return collected;
}
// Selects, for every dataset, the outer fold with the highest score and stores
// its summary (score, hyperparameters, date, grid and duration) in `results`.
//
// results     - output, keyed by dataset name; existing entries are overwritten
// all_results - per-dataset arrays of fold results
// model       - model name, used to locate the grid input file
//
// The first fold is always accepted as the initial best, so a dataset whose
// folds all score 0 still yields an entry; otherwise `best` would stay null
// and reading "combination" from it would throw. Datasets without any fold
// result are skipped.
void select_best_results_folds(json& results, json& all_results, std::string& model)
{
    Timer timer; // only used to format the duration
    auto grid = GridData(Paths::grid_input(model));
    //
    // Select the best result of the computed outer folds
    //
    for (const auto& result : all_results.items()) {
        // each result has the results of all the outer folds as if each one were a different task
        double best_score = 0.0;
        json best;
        bool has_best = false;
        for (const auto& result_fold : result.value()) {
            double score = result_fold["score"].get<double>();
            if (!has_best || score > best_score) {
                best_score = score;
                best = result_fold;
                has_best = true;
            }
        }
        if (!has_best) {
            continue;
        }
        auto dataset = result.key();
        auto combinations = grid.getGrid(dataset);
        json json_best = {
            { "score", best_score },
            { "hyperparameters", combinations[best["combination"].get<int>()] },
            { "date", get_date() + " " + get_time() },
            { "grid", grid.getInputGrid(dataset) },
            { "duration", timer.translate2String(best["time"].get<double>()) }
        };
        results[dataset] = json_best;
    }
}
// Consumer side of the task distribution: announces itself to the manager and
// then processes every task index it receives until a TAG_END message arrives.
void consumer(Datasets& datasets, json& tasks, struct ConfigGrid& config, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result)
{
    Task_Result outcome;
    //
    // 2b.1 Consumers announce to the producer that they are ready to receive a task
    //
    MPI_Send(&outcome, 1, MPI_Result, config_mpi.manager, TAG_QUERY, MPI_COMM_WORLD);
    int task_id;
    for (;;) {
        MPI_Status status;
        //
        // 2b.2 Consumers receive the task from the producer and process it
        //
        MPI_Recv(&task_id, 1, MPI_INT, config_mpi.manager, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
        if (status.MPI_TAG == TAG_END) {
            return;
        }
        process_task_mpi_consumer(config, config_mpi, tasks, task_id, datasets, &outcome);
        //
        // 2b.3 Consumers send the result to the producer
        //
        MPI_Send(&outcome, 1, MPI_Result, config_mpi.manager, TAG_RESULT, MPI_COMM_WORLD);
    }
}
void GridSearch::go(struct ConfigMPI& config_mpi)
{
/*
* Each task is a json object with the following structure:
* {
* "dataset": "dataset_name",
* "idx_dataset": idx_dataset, // used to identify the dataset in the results
* // this index is relative to the used datasets in the actual run not to the whole datasets
* "seed": # of seed to use,
* "Fold": # of fold to process
* }
*
* The overall process consists in these steps:
* 0. Create the MPI result type & tasks
* 0.1 Create the MPI result type
* 0.2 Manager creates the tasks
* 1. Manager will broadcast the tasks to all the processes
* 1.1 Broadcast the number of tasks
* 1.2 Broadcast the length of the following string
* 1.2 Broadcast the tasks as a char* string
* 2a. Producer delivers the tasks to the consumers
* 2a.1 Producer will loop to send all the tasks to the consumers and receive the results
* 2a.2 Producer will send the end message to all the consumers
* 2b. Consumers process the tasks and send the results to the producer
* 2b.1 Consumers announce to the producer that they are ready to receive a task
* 2b.2 Consumers receive the task from the producer and process it
* 2b.3 Consumers send the result to the producer
* 3. Manager select the bests sccores for each dataset
* 3.1 Loop thru all the results obtained from each outer fold (task) and select the best
* 3.2 Save the results
*/
//
// 0.1 Create the MPI result type
//
Task_Result result;
int tasks_size;
MPI_Datatype MPI_Result;
MPI_Datatype type[5] = { MPI_UNSIGNED, MPI_UNSIGNED, MPI_INT, MPI_DOUBLE, MPI_DOUBLE };
int blocklen[5] = { 1, 1, 1, 1, 1 };
MPI_Aint disp[5];
disp[0] = offsetof(Task_Result, idx_dataset);
disp[1] = offsetof(Task_Result, idx_combination);
disp[2] = offsetof(Task_Result, n_fold);
disp[3] = offsetof(Task_Result, score);
disp[4] = offsetof(Task_Result, time);
MPI_Type_create_struct(5, blocklen, disp, type, &MPI_Result);
MPI_Type_commit(&MPI_Result);
//
// 0.2 Manager creates the tasks
//
char* msg;
json tasks;
if (config_mpi.rank == config_mpi.manager) {
timer.start();
tasks = build_tasks_mpi(config_mpi.rank);
auto tasks_str = tasks.dump();
tasks_size = tasks_str.size();
msg = new char[tasks_size + 1];
strcpy(msg, tasks_str.c_str());
}
//
// 1. Manager will broadcast the tasks to all the processes
//
MPI_Bcast(&tasks_size, 1, MPI_INT, config_mpi.manager, MPI_COMM_WORLD);
if (config_mpi.rank != config_mpi.manager) {
msg = new char[tasks_size + 1];
}
MPI_Bcast(msg, tasks_size + 1, MPI_CHAR, config_mpi.manager, MPI_COMM_WORLD);
tasks = json::parse(msg);
delete[] msg;
auto datasets = Datasets(config.discretize, Paths::datasets());
if (config_mpi.rank == config_mpi.manager) {
//
// 2a. Producer delivers the tasks to the consumers
//
auto datasets_names = filterDatasets(datasets);
json all_results = producer(datasets_names, tasks, config_mpi, MPI_Result);
std::cout << get_color_rank(config_mpi.rank) << "|" << std::endl;
//
// 3. Manager select the bests sccores for each dataset
//
auto results = initializeResults();
select_best_results_folds(results, all_results, config.model);
//
// 3.2 Save the results
//
save(results);
} else {
//
// 2b. Consumers process the tasks and send the results to the producer
//
consumer(datasets, tasks, config, config_mpi, MPI_Result);
}
}
// Returns the "results" section of a previous grid output when a continuation
// point is configured; otherwise (or when the file cannot be read or parsed)
// returns an empty json.
json GridSearch::initializeResults()
{
    json previous;
    if (config.continue_from == NO_CONTINUE()) {
        // Fresh run: nothing to load
        return previous;
    }
    if (!config.quiet) {
        std::cout << "* Loading previous results" << std::endl;
    }
    try {
        std::ifstream input(Paths::grid_output(config.model));
        if (input.is_open()) {
            previous = json::parse(input);
            previous = previous["results"];
        }
    }
    catch (const std::exception&) {
        std::cerr << "* There were no previous results" << std::endl;
        std::cerr << "* Initizalizing new results" << std::endl;
        previous = json();
    }
    return previous;
}
// Writes the grid configuration together with `results` to the grid output
// file of the configured model, as json indented with 4 spaces.
void GridSearch::save(json& results)
{
    std::ofstream out(Paths::grid_output(config.model));
    const auto timestamp = get_date() + " " + get_time();
    json document = {
        { "model", config.model },
        { "score", config.score },
        { "discretize", config.discretize },
        { "stratified", config.stratified },
        { "n_folds", config.n_folds },
        { "seeds", config.seeds },
        { "date", timestamp },
        { "nested", config.nested },
        { "platform", config.platform },
        { "duration", timer.getDurationString(true) },
        { "results", results }
    };
    out << document.dump(4);
}
} /* namespace platform */

259
src/grid/GridSearch.cpp Normal file
View File

@@ -0,0 +1,259 @@
#include <iostream>
#include <torch/torch.h>
#include <folding.hpp>
#include "main/Models.h"
#include "common/Paths.h"
#include "common/Utils.h"
#include "common/Colors.h"
#include "GridSearch.h"
namespace platform {
// Forwards the grid configuration to GridBase; no other work is done here.
GridSearch::GridSearch(struct ConfigGrid& config)
    : GridBase(config)
{}
// Loads the grid output file of the configured model.
// Returns the parsed document, or an empty json when the file cannot be opened.
json GridSearch::loadResults()
{
    std::ifstream stored(Paths::grid_output(config.model));
    return stored.is_open() ? json::parse(stored) : json();
}
// Computes the list of dataset names this run has to process.
//
// With a continuation point configured, the datasets listed before it are
// removed; if `config.only` is also set, only the continuation dataset is
// kept. Afterwards each dataset named in `config.excluded` is removed.
//
// Throws std::invalid_argument when the continuation dataset or an excluded
// dataset cannot be found in the list.
std::vector<std::string> GridSearch::filterDatasets(Datasets& datasets) const
{
    auto names = datasets.getNames();
    if (config.continue_from != NO_CONTINUE()) {
        const auto& target = config.continue_from;
        const auto first_kept = std::find(names.begin(), names.end(), target);
        if (first_kept == names.end()) {
            throw std::invalid_argument("Dataset " + config.continue_from + " not found");
        }
        // Remove datasets already processed
        names.erase(names.begin(), first_kept);
        if (config.only) {
            // Only the continuation dataset has to be processed
            const auto is_other = [&target](const std::string& name) { return name != target; };
            names.erase(std::remove_if(names.begin(), names.end(), is_other), names.end());
        }
    }
    // Exclude datasets
    for (const auto& item : config.excluded) {
        const auto to_exclude = item.get<std::string>();
        const auto found = std::find(names.begin(), names.end(), to_exclude);
        if (found == names.end()) {
            throw std::invalid_argument("Dataset " + to_exclude + " already excluded or doesn't exist!");
        }
        names.erase(found);
    }
    return names;
}
// Returns the "results" section of a previous grid output when a continuation
// point is configured; otherwise (or when the file cannot be read or parsed)
// returns an empty json.
json GridSearch::initializeResults()
{
    json previous;
    if (config.continue_from == NO_CONTINUE()) {
        // Fresh run: nothing to load
        return previous;
    }
    if (!config.quiet) {
        std::cout << Colors::RESET() << "* Loading previous results" << std::endl;
    }
    try {
        std::ifstream input(Paths::grid_output(config.model));
        if (input.is_open()) {
            previous = json::parse(input);
            previous = previous["results"];
        }
    }
    catch (const std::exception&) {
        std::cerr << "* There were no previous results" << std::endl;
        std::cerr << "* Initizalizing new results" << std::endl;
        previous = json();
    }
    return previous;
}
// Serializes the grid configuration plus `results` into the grid output file
// of the configured model (json, 4-space indentation).
void GridSearch::save(json& results)
{
    std::ofstream target(Paths::grid_output(config.model));
    const auto stamp = get_date() + " " + get_time();
    json summary = {
        { "model", config.model },
        { "score", config.score },
        { "discretize", config.discretize },
        { "stratified", config.stratified },
        { "n_folds", config.n_folds },
        { "seeds", config.seeds },
        { "date", stamp },
        { "nested", config.nested },
        { "platform", config.platform },
        { "duration", timer.getDurationString(true) },
        { "results", results }
    };
    target << summary.dump(4);
}
// Compiles the per-task results into one entry per dataset: the outer fold
// with the highest score, together with its hyperparameters, the date, the
// input grid and the duration.
//
// results     - output, keyed by dataset name; existing entries are overwritten
// all_results - per-dataset arrays of fold results
// model       - model name, used to locate the grid input file
//
// The first fold is always accepted as the initial best, so a dataset whose
// folds all score 0 still yields an entry; otherwise `best` would stay null
// and reading "combination" from it would throw. Datasets without any fold
// result are skipped.
void GridSearch::compile_results(json& results, json& all_results, std::string& model)
{
    Timer timer; // only used to format the duration
    auto grid = GridData(Paths::grid_input(model));
    //
    // Select the best result of the computed outer folds
    //
    for (const auto& result : all_results.items()) {
        // each result has the results of all the outer folds as if each one were a different task
        double best_score = 0.0;
        json best;
        bool has_best = false;
        for (const auto& result_fold : result.value()) {
            double score = result_fold["score"].get<double>();
            if (!has_best || score > best_score) {
                best_score = score;
                best = result_fold;
                has_best = true;
            }
        }
        if (!has_best) {
            continue;
        }
        auto dataset = result.key();
        auto combinations = grid.getGrid(dataset);
        json json_best = {
            { "score", best_score },
            { "hyperparameters", combinations[best["combination"].get<int>()] },
            { "date", get_date() + " " + get_time() },
            { "grid", grid.getInputGrid(dataset) },
            { "duration", timer.translate2String(best["time"].get<double>()) }
        };
        results[dataset] = json_best;
    }
}
// Appends the outcome of one finished task to the list kept for its dataset.
//
// names   - dataset names; result.idx_dataset indexes into this vector
// result  - values reported by a consumer for one task
// results - accumulator keyed by dataset name; modified in place
// Returns a copy of the updated accumulator.
json GridSearch::store_result(std::vector<std::string>& names, Task_Result& result, json& results)
{
    const auto dataset_name = names[result.idx_dataset];
    // The first result for a dataset creates its (empty) list
    if (!results.contains(dataset_name)) {
        results[dataset_name] = json::array();
    }
    json entry = {
        { "score", result.score },
        { "combination", result.idx_combination },
        { "fold", result.n_fold },
        { "time", result.time },
        { "dataset", result.idx_dataset },
        { "process", result.process },
        { "task", result.task }
    };
    results[dataset_name].push_back(entry);
    return results;
}
void GridSearch::consumer_go(struct ConfigGrid& config, struct ConfigMPI& config_mpi, json& tasks, int n_task, Datasets& datasets, Task_Result* result)
{
//
// initialize
//
Timer timer;
timer.start();
json task = tasks[n_task];
auto model = config.model;
auto grid = GridData(Paths::grid_input(model));
auto dataset_name = task["dataset"].get<std::string>();
auto idx_dataset = task["idx_dataset"].get<int>();
auto seed = task["seed"].get<int>();
auto n_fold = task["fold"].get<int>();
bool stratified = config.stratified;
bayesnet::Smoothing_t smooth;
if (config.smooth_strategy == "ORIGINAL")
smooth = bayesnet::Smoothing_t::ORIGINAL;
else if (config.smooth_strategy == "LAPLACE")
smooth = bayesnet::Smoothing_t::LAPLACE;
else if (config.smooth_strategy == "CESTNIK")
smooth = bayesnet::Smoothing_t::CESTNIK;
//
// Generate the hyperparameters combinations
//
auto& dataset = datasets.getDataset(dataset_name);
auto combinations = grid.getGrid(dataset_name);
dataset.load();
auto [X, y] = dataset.getTensors();
auto features = dataset.getFeatures();
auto className = dataset.getClassName();
//
// Start working on task
//
folding::Fold* fold;
if (stratified)
fold = new folding::StratifiedKFold(config.n_folds, y, seed);
else
fold = new folding::KFold(config.n_folds, y.size(0), seed);
auto [train, test] = fold->getFold(n_fold);
auto [X_train, X_test, y_train, y_test] = dataset.getTrainTestTensors(train, test);
auto states = dataset.getStates(); // Get the states of the features Once they are discretized
float best_fold_score = 0.0;
int best_idx_combination = -1;
json best_fold_hyper;
for (int idx_combination = 0; idx_combination < combinations.size(); ++idx_combination) {
auto hyperparam_line = combinations[idx_combination];
auto hyperparameters = platform::HyperParameters(datasets.getNames(), hyperparam_line);
folding::Fold* nested_fold;
if (config.stratified)
nested_fold = new folding::StratifiedKFold(config.nested, y_train, seed);
else
nested_fold = new folding::KFold(config.nested, y_train.size(0), seed);
double score = 0.0;
for (int n_nested_fold = 0; n_nested_fold < config.nested; n_nested_fold++) {
//
// Nested level fold
//
auto [train_nested, test_nested] = nested_fold->getFold(n_nested_fold);
auto train_nested_t = torch::tensor(train_nested);
auto test_nested_t = torch::tensor(test_nested);
auto X_nested_train = X_train.index({ "...", train_nested_t });
auto y_nested_train = y_train.index({ train_nested_t });
auto X_nested_test = X_train.index({ "...", test_nested_t });
auto y_nested_test = y_train.index({ test_nested_t });
//
// Build Classifier with selected hyperparameters
//
auto clf = Models::instance()->create(config.model);
auto valid = clf->getValidHyperparameters();
hyperparameters.check(valid, dataset_name);
clf->setHyperparameters(hyperparameters.get(dataset_name));
//
// Train model
//
clf->fit(X_nested_train, y_nested_train, features, className, states, smooth);
//
// Test model
//
score += clf->score(X_nested_test, y_nested_test);
}
delete nested_fold;
score /= config.nested;
if (score > best_fold_score) {
best_fold_score = score;
best_idx_combination = idx_combination;
best_fold_hyper = hyperparam_line;
}
}
delete fold;
//
// Build Classifier with the best hyperparameters to obtain the best score
//
auto hyperparameters = platform::HyperParameters(datasets.getNames(), best_fold_hyper);
auto clf = Models::instance()->create(config.model);
auto valid = clf->getValidHyperparameters();
hyperparameters.check(valid, dataset_name);
clf->setHyperparameters(best_fold_hyper);
clf->fit(X_train, y_train, features, className, states, smooth);
best_fold_score = clf->score(X_test, y_test);
//
// Return the result
//
result->idx_dataset = task["idx_dataset"].get<int>();
result->idx_combination = best_idx_combination;
result->score = best_fold_score;
result->n_fold = n_fold;
result->time = timer.getDuration();
result->process = config_mpi.rank;
result->task = n_task;
//
// Update progress bar
//
std::cout << get_color_rank(config_mpi.rank) << std::flush;
}
} /* namespace platform */

Some files were not shown because too many files have changed in this diff Show More