Compare commits
313 Commits
2fd83e940a
...
withoutpy
Author | SHA1 | Date | |
---|---|---|---|
4116699b01
|
|||
b7f4651e2c
|
|||
e8b35d4c5e
|
|||
34a0719a16
|
|||
72e228f367
|
|||
c3c580a611
|
|||
515455695b
|
|||
f68d216150
|
|||
b990684581
|
|||
5fd0ef692d
|
|||
dfcdadbf38
|
|||
613f4b6813
|
|||
dc324fe5f7
|
|||
9816896240
|
|||
a3f765ce3c
|
|||
3d814a79c6
|
|||
1ef7ca6180 | |||
9448a971e8
|
|||
24cef7496d
|
|||
a1a6d3d612
|
|||
dda9740e83
|
|||
41afa1b888
|
|||
4e18dc87be
|
|||
56af1a5f85
|
|||
415a7ae608
|
|||
023d5613b4
|
|||
8c413a1eb0
|
|||
3b158e9fc1
|
|||
514968a082
|
|||
dcde8c01be
|
|||
a6b6efce95
|
|||
473d194dde
|
|||
a56ec98ef9
|
|||
70d8022926
|
|||
f5107abea7
|
|||
e64e281b63
|
|||
b639a2d79a
|
|||
d6603dd638
|
|||
321e2a2f28
|
|||
36c72491e7
|
|||
aa19ab6c21
|
|||
16b4923851
|
|||
b1965c8ae5
|
|||
7d3a2dd713
|
|||
50fde9521b | |||
cd2f47c58b | |||
facf6f6ddd
|
|||
c9ab88e475
|
|||
c2a4e3e64e
|
|||
664a6a5aeb
|
|||
ae7b89b134
|
|||
9c1852c6c3
|
|||
7a23782b05
|
|||
b2002d341c
|
|||
9a8b960ce8
|
|||
7bc8633ed1
|
|||
11155463b9
|
|||
12e69a7f53
|
|||
c127cb670a
|
|||
610c2a6a4a
|
|||
2dcd073299
|
|||
f51d5b5e40
|
|||
4e3043b2d1
|
|||
b055065e59
|
|||
0d1e4b3c6f
|
|||
1a688f90b4
|
|||
c63baf419f
|
|||
de7cf091be
|
|||
475a819a87
|
|||
ce6e192a33
|
|||
5daf7cbd69
|
|||
1b26de1e38
|
|||
d3de429f2c
|
|||
f48864a415
|
|||
c1531dba2a
|
|||
5556fbab03
|
|||
ac89cefab3
|
|||
14dd8ebb66
|
|||
bd5ba14f04
|
|||
17728212c1
|
|||
86b4558f9d
|
|||
505edc79ac
|
|||
73a4b3d5e5
|
|||
cbe8f4c79c
|
|||
0d08a526fa
|
|||
d0706da887
|
|||
07e3cc9599
|
|||
2a9652b450
|
|||
3397d0962f
|
|||
7aaf6d1bf8
|
|||
eb430a84c4
|
|||
d0e65348e0
|
|||
c1d5dd74e3
|
|||
9a9a9fb17a
|
|||
386faf960e
|
|||
28894004c8
|
|||
ae41975fb4
|
|||
0e475e4488
|
|||
909cec712c
|
|||
4901bb1f32
|
|||
0318dcf8e5
|
|||
1cc19a7b19
|
|||
f88944de36
|
|||
1a336a094e
|
|||
8705adf3ee
|
|||
017cb8a0dc
|
|||
e966c880e6
|
|||
70ea32dc9a
|
|||
ba455bb934
|
|||
a65955248a
|
|||
84930b0537
|
|||
10c65f44a0
|
|||
6d112f01e7
|
|||
401296293b
|
|||
9566ae4cf6
|
|||
55187ee521
|
|||
68ea06d129
|
|||
6c1d1d0d32
|
|||
b0853d169b
|
|||
26f8e07774
|
|||
315dfb104f
|
|||
381f226d53
|
|||
ea13835701
|
|||
d75468cf78
|
|||
c58bd9d60d
|
|||
148a3b831a
|
|||
69063badbb
|
|||
6ae2b2182a
|
|||
4dbd76df55
|
|||
4545f76667
|
|||
8372987dae
|
|||
d72943c749
|
|||
800246acd2
|
|||
0ea967dd9d
|
|||
97abec8b69
|
|||
17c9522e77
|
|||
45af550cf9
|
|||
5d5f49777e
|
|||
540a8ea06d
|
|||
1924c4392b
|
|||
f2556a30af
|
|||
2f2ed00ca1
|
|||
28f6a0d7a7
|
|||
028522f180
|
|||
84adf13a79
|
|||
26dfe6d056
|
|||
3acc34e4c6
|
|||
8f92b74260
|
|||
3d900f8c81
|
|||
e628d80f4c
|
|||
0f06f8971e
|
|||
f800772149
|
|||
b8a8ddaf8c
|
|||
90555489ff
|
|||
080f3cee34
|
|||
643633e6dd
|
|||
361c51d864
|
|||
5dd3deca1a
|
|||
2202a81782
|
|||
c4f4e332f6
|
|||
a7ec930fa0
|
|||
6858b3d89a
|
|||
5fb176d78a
|
|||
f5d5c35002
|
|||
b34af13eea
|
|||
e3a06264a9
|
|||
df82f82e88
|
|||
886dde7a06
|
|||
88468434e7
|
|||
ad5c3319bd
|
|||
594adb0534
|
|||
b9e0c92334
|
|||
25bd7a42c6
|
|||
c165a4bdda
|
|||
49a36904dc
|
|||
577351eda5
|
|||
a3c4bde460
|
|||
696c0564a7
|
|||
30a6d5e60d
|
|||
f8f3ca28dc
|
|||
5c190d7c66
|
|||
99c9c6731f
|
|||
8d20545fd2
|
|||
2b480cdcb7 | |||
ebaddf1a6c
|
|||
07a2efb298
|
|||
f88b223c46
|
|||
69b9609154
|
|||
6d4117d188
|
|||
ec0268c514
|
|||
dd94fd51f7
|
|||
009ed037b8
|
|||
6d1b78ada7
|
|||
3882ebd6e4
|
|||
423242d280
|
|||
b9381aa453
|
|||
33cfb78554
|
|||
1caa39c071
|
|||
018c94bfe6
|
|||
a54d6b8716
|
|||
6cde09d81e
|
|||
7be95d889d
|
|||
42d61c6fc4
|
|||
e5e947779f
|
|||
ad168d13ba
|
|||
78b8a8ae66
|
|||
7ed9073d15
|
|||
ee93789ca3
|
|||
375ed437ed
|
|||
5ec7fe8d00
|
|||
72ea62f783
|
|||
4b91f2bde0
|
|||
3bc51cb7b0
|
|||
cf83d1f8f4
|
|||
0dd10bcbe4
|
|||
622b36b2c7
|
|||
ea29a96ca1
|
|||
673a41fc4d
|
|||
634ea36169
|
|||
20fef5b6b3
|
|||
7cf864c3f3
|
|||
4a0fa33917
|
|||
d47da27571
|
|||
faccb09c43
|
|||
fa4f47ff35
|
|||
106a36109e
|
|||
37eba57765
|
|||
67487ffce1
|
|||
9c11dee019
|
|||
58ae2c7690
|
|||
fa366a4c22
|
|||
b9af086c29
|
|||
6a285b149b
|
|||
ad402ac21e
|
|||
38978aa7b7
|
|||
3691363b8e
|
|||
fe24aa0b3e
|
|||
175e0eb591
|
|||
1912d17498
|
|||
54249e5304
|
|||
d7f92c9682
|
|||
00bb7f4680
|
|||
bf5dabb169
|
|||
cdf339856a
|
|||
3ceea5677c
|
|||
260fd122eb
|
|||
eff0be1c1c
|
|||
0ade72a37a
|
|||
72cda3784a
|
|||
52d689666a
|
|||
26e87c9cb1 | |||
03cd6e5a51
|
|||
cd9ff89b52
|
|||
05d05e25c2
|
|||
5cd6e3d1a5
|
|||
d9e9356d92
|
|||
0010c840d1
|
|||
51f32113c0
|
|||
b3b3d9f1b9
|
|||
4c847fc3f6
|
|||
7e4ee0a9a9
|
|||
b7398db9b1
|
|||
0a9bd0d9c4
|
|||
7a3adaf4a9
|
|||
5c4efa08db
|
|||
576016bbd9 | |||
e26b3c0970
|
|||
183cf12300
|
|||
4eb08cd281
|
|||
4f5f629124
|
|||
df011f7e6b
|
|||
42648f3125
|
|||
d2832ed2b3
|
|||
ec323d86ab
|
|||
e4a6575722
|
|||
67f1feb71f | |||
23c3bed667
|
|||
b68d520726
|
|||
c69dc08134
|
|||
9a26baec47
|
|||
82f2c36621
|
|||
731e03681a
|
|||
643038fd19
|
|||
7d92876f06
|
|||
53dafa3404
|
|||
a1c7dbfea1
|
|||
581a8652cc
|
|||
4df1094340
|
|||
45d0886adb
|
|||
d996496f87
|
|||
ab03d1de49
|
|||
9d44ea4cf2
|
|||
4b5d2b4f82
|
|||
52d2004915
|
|||
3f3c14e8fc
|
|||
0907906ef6
|
|||
b490d406a2
|
|||
5993ece4fd
|
|||
c9dc378f98
|
|||
d7174e930b
|
|||
e336d39cfb
|
|||
7dbef9fc36
|
|||
889668bf00
|
|||
a220b847d4
|
|||
25a6975b02
|
|||
69bb930e3e
|
|||
24666a3a16
|
|||
210ce4a255
|
|||
5e1d59acdb
|
|||
2b20d0315c
|
|||
ecce7955f8
|
|||
6660e8b6ce
|
|||
d145e71909
|
@@ -4,8 +4,8 @@ diagrams:
|
||||
Platform:
|
||||
type: class
|
||||
glob:
|
||||
- src/Platform/*.cc
|
||||
- src/Command/*.cc
|
||||
- src/*.cpp
|
||||
- src/modules/*.cpp
|
||||
using_namespace: platform
|
||||
include:
|
||||
namespaces:
|
||||
@@ -17,7 +17,7 @@ diagrams:
|
||||
sequence:
|
||||
type: sequence
|
||||
glob:
|
||||
- src/Command/b_main.cc
|
||||
- src/b_main.cpp
|
||||
combine_free_functions_into_file_participants: true
|
||||
using_namespace:
|
||||
- std
|
||||
@@ -25,7 +25,6 @@ diagrams:
|
||||
- platform
|
||||
include:
|
||||
paths:
|
||||
- src/Command
|
||||
- src/Platform
|
||||
- src
|
||||
start_from:
|
||||
- function: main(int,const char **)
|
||||
|
16
.env.example
Normal file
16
.env.example
Normal file
@@ -0,0 +1,16 @@
|
||||
experiment=discretiz
|
||||
score=accuracy
|
||||
platform=um790Linux
|
||||
n_folds=5
|
||||
stratified=0
|
||||
model=TAN
|
||||
source_data=Arff
|
||||
seeds=[271]
|
||||
discretize=0
|
||||
ignore_nan=0
|
||||
nodes=Nodes
|
||||
leaves=Edges
|
||||
depth=States
|
||||
fit_features=0
|
||||
framework=bulma
|
||||
margin=0.1
|
6
.gitignore
vendored
6
.gitignore
vendored
@@ -39,3 +39,9 @@ cmake-build*/**
|
||||
puml/**
|
||||
.vscode/settings.json
|
||||
*.dot
|
||||
diagrams/html/**
|
||||
diagrams/latex/**
|
||||
.cache
|
||||
vcpkg_installed
|
||||
.claude/settings.local.json
|
||||
CMakeUserPresets.json
|
||||
|
18
.gitmodules
vendored
18
.gitmodules
vendored
@@ -1,18 +0,0 @@
|
||||
[submodule "lib/catch2"]
|
||||
path = lib/catch2
|
||||
url = https://github.com/catchorg/Catch2.git
|
||||
[submodule "lib/argparse"]
|
||||
path = lib/argparse
|
||||
url = https://github.com/p-ranav/argparse
|
||||
[submodule "lib/json"]
|
||||
path = lib/json
|
||||
url = https://github.com/nlohmann/json
|
||||
[submodule "lib/libxlsxwriter"]
|
||||
path = lib/libxlsxwriter
|
||||
url = https://github.com/jmcnamara/libxlsxwriter.git
|
||||
[submodule "lib/mdlp"]
|
||||
path = lib/mdlp
|
||||
url = https://github.com/rmontanana/mdlp
|
||||
[submodule "lib/PyClassifiers"]
|
||||
path = lib/PyClassifiers
|
||||
url = https://github.com/rmontanana/PyClassifiers
|
13
.vscode/c_cpp_properties.json
vendored
13
.vscode/c_cpp_properties.json
vendored
@@ -11,7 +11,18 @@
|
||||
],
|
||||
"cStandard": "c17",
|
||||
"cppStandard": "c++17",
|
||||
"compileCommands": "${workspaceFolder}/cmake-build-release/compile_commands.json"
|
||||
"compileCommands": "${workspaceFolder}/cmake-build-release/compile_commands.json",
|
||||
"configurationProvider": "ms-vscode.cmake-tools"
|
||||
},
|
||||
{
|
||||
"name": "Linux",
|
||||
"includePath": [
|
||||
"${workspaceFolder}/**"
|
||||
],
|
||||
"defines": [],
|
||||
"cStandard": "c17",
|
||||
"cppStandard": "c++17",
|
||||
"configurationProvider": "ms-vscode.cmake-tools"
|
||||
}
|
||||
],
|
||||
"version": 4
|
||||
|
49
.vscode/launch.json
vendored
49
.vscode/launch.json
vendored
@@ -2,9 +2,9 @@
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "sample",
|
||||
"type": "lldb",
|
||||
"request": "launch",
|
||||
"name": "sample",
|
||||
"program": "${workspaceFolder}/build_debug/sample/BayesNetSample",
|
||||
"args": [
|
||||
"-d",
|
||||
@@ -14,15 +14,15 @@
|
||||
"-s",
|
||||
"271",
|
||||
"-p",
|
||||
"/Users/rmontanana/Code/discretizbench/datasets/",
|
||||
"${workspaceFolder}/../discretizbench/datasets/",
|
||||
],
|
||||
//"cwd": "${workspaceFolder}/build/sample/",
|
||||
},
|
||||
{
|
||||
"name": "experimentPy",
|
||||
"type": "lldb",
|
||||
"request": "launch",
|
||||
"name": "experimentPy",
|
||||
"program": "${workspaceFolder}/build_debug/src/Platform/b_main",
|
||||
"program": "${workspaceFolder}/build_debug/src/b_main",
|
||||
"args": [
|
||||
"-m",
|
||||
"STree",
|
||||
@@ -36,10 +36,10 @@
|
||||
"cwd": "${workspaceFolder}/../discretizbench",
|
||||
},
|
||||
{
|
||||
"name": "gridsearch",
|
||||
"type": "lldb",
|
||||
"request": "launch",
|
||||
"name": "gridsearch",
|
||||
"program": "${workspaceFolder}/build_debug/src/Platform/b_grid",
|
||||
"program": "${workspaceFolder}/build_debug/src/b_grid",
|
||||
"args": [
|
||||
"-m",
|
||||
"KDB",
|
||||
@@ -52,41 +52,41 @@
|
||||
"cwd": "${workspaceFolder}/../discretizbench",
|
||||
},
|
||||
{
|
||||
"name": "experimentBayes",
|
||||
"type": "lldb",
|
||||
"request": "launch",
|
||||
"name": "experimentBayes",
|
||||
"program": "${workspaceFolder}/build_debug/src/Platform/b_main",
|
||||
"program": "${workspaceFolder}/build_debug/src/b_main",
|
||||
"args": [
|
||||
"-m",
|
||||
"TAN",
|
||||
"--stratified",
|
||||
"--discretize",
|
||||
"-d",
|
||||
"iris",
|
||||
"glass",
|
||||
"--hyperparameters",
|
||||
"{\"repeatSparent\": true, \"maxModels\": 12}"
|
||||
"{\"block_update\": true}"
|
||||
],
|
||||
"cwd": "/home/rmontanana/Code/discretizbench",
|
||||
},
|
||||
{
|
||||
"name": "best",
|
||||
"type": "lldb",
|
||||
"request": "launch",
|
||||
"name": "best",
|
||||
"program": "${workspaceFolder}/build_debug/src/Platform/b_best",
|
||||
"program": "${workspaceFolder}/build_debug/src/b_best",
|
||||
"args": [
|
||||
"-m",
|
||||
"BoostAODE",
|
||||
"-s",
|
||||
"accuracy",
|
||||
"--build",
|
||||
"--excel"
|
||||
],
|
||||
"cwd": "${workspaceFolder}/../discretizbench",
|
||||
},
|
||||
{
|
||||
"name": "manage",
|
||||
"type": "lldb",
|
||||
"request": "launch",
|
||||
"name": "manage",
|
||||
"program": "${workspaceFolder}/build_debug/src/Platform/b_manage",
|
||||
"program": "${workspaceFolder}/build_debug/src/b_manage",
|
||||
"args": [
|
||||
"-n",
|
||||
"20"
|
||||
@@ -94,24 +94,29 @@
|
||||
"cwd": "${workspaceFolder}/../discretizbench",
|
||||
},
|
||||
{
|
||||
"name": "list",
|
||||
"type": "lldb",
|
||||
"request": "launch",
|
||||
"name": "list",
|
||||
"program": "${workspaceFolder}/build_debug/src/Platform/b_list",
|
||||
"args": [],
|
||||
"program": "${workspaceFolder}/build_debug/src/b_list",
|
||||
"args": [
|
||||
"results",
|
||||
"-d",
|
||||
"mfeat-morphological"
|
||||
],
|
||||
//"cwd": "/Users/rmontanana/Code/discretizbench",
|
||||
"cwd": "${workspaceFolder}/../discretizbench",
|
||||
},
|
||||
{
|
||||
"name": "test",
|
||||
"type": "lldb",
|
||||
"request": "launch",
|
||||
"name": "test",
|
||||
"program": "${workspaceFolder}/build_debug/tests/unit_tests",
|
||||
"program": "${workspaceFolder}/build_debug/tests/unit_tests_platform",
|
||||
"args": [
|
||||
"-c=\"Metrics Test\"",
|
||||
"[Scores]",
|
||||
// "-c=\"Metrics Test\"",
|
||||
// "-s",
|
||||
],
|
||||
"cwd": "${workspaceFolder}/build/tests",
|
||||
"cwd": "${workspaceFolder}/build_debug/tests",
|
||||
},
|
||||
{
|
||||
"name": "Build & debug active file",
|
||||
|
93
CHANGELOG.md
Normal file
93
CHANGELOG.md
Normal file
@@ -0,0 +1,93 @@
|
||||
# Changelog
|
||||
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Changed
|
||||
- **BREAKING**: Migrated dependency management from vcpkg to Conan
|
||||
- Updated build system to use Conan toolchain files instead of vcpkg
|
||||
- Updated `make init` command to use `conan install` instead of `vcpkg install`
|
||||
- Modified CMakeLists.txt to use Conan's find_package mechanism
|
||||
- Updated documentation in CLAUDE.md to reflect Conan usage
|
||||
|
||||
### Added
|
||||
- `conanfile.py` - Conan recipe for dependency management with all required dependencies
|
||||
- `CMakeUserPresets.json` - CMake user presets file (generated by Conan)
|
||||
- Support for Conan build profiles (Release/Debug)
|
||||
|
||||
### Removed
|
||||
- `vcpkg.json` - vcpkg manifest file
|
||||
- `vcpkg-configuration.json` - vcpkg registry configuration
|
||||
- vcpkg toolchain dependency in build system
|
||||
|
||||
### Notes
|
||||
- The migration maintains compatibility with existing make targets and workflow
|
||||
- All dependencies now managed through Conan package manager
|
||||
|
||||
## [1.1.0] - 2025-07-02
|
||||
|
||||
### Added
|
||||
- **AdaBoost Implementation**: Complete multi-class SAMME AdaBoost classifier with optimization
|
||||
- Optimized AdaBoostPredict with 100 estimators as default
|
||||
- Enhanced predictProbaSample functionality
|
||||
- Full predict_proba support for probabilistic predictions
|
||||
- **Decision Tree Classifier**: New base classifier implementation with comprehensive tests
|
||||
- **XA1DE Model Family**: Extended Averaged One-Dependence Estimators
|
||||
- XA1DE, XBAODE, XSPODE variants with threading support
|
||||
- Complete integration with memory optimization
|
||||
- Prior probability computation in prediction
|
||||
- **Wilcoxon Statistical Test**: Statistical significance testing for model comparison
|
||||
- **Folder Management**: Enhanced file organization with folder parameter support across tools
|
||||
- Added folder parameter to b_best, b_grid, b_main, and b_manage
|
||||
- **vcpkg Integration**: Package management system integration (now migrated to Conan)
|
||||
|
||||
### Enhanced
|
||||
- **Grid Search System**: Complete refactoring with MPI parallelization
|
||||
- Grid experiment functionality with conditional result saving
|
||||
- Fixed smoothing problems and dataset ordering
|
||||
- Enhanced reporting and summary generation
|
||||
- **Excel Reporting**: Advanced Excel export capabilities
|
||||
- ReportExcelCompared class for side-by-side result comparison
|
||||
- Enhanced formatting with colors and fixed headers
|
||||
- Automatic file opening after generation
|
||||
- **Results Management**: Comprehensive result handling and validation
|
||||
- JSON schema validation for result format integrity
|
||||
- Improved console reporting with classification reports
|
||||
- Pagination support for large result sets
|
||||
- **Statistical Analysis**: Enhanced statistical testing and reporting
|
||||
- AUC (Area Under Curve) computation and reporting
|
||||
- Confusion matrix generation and visualization
|
||||
- Classification reports with color coding
|
||||
|
||||
### Performance Improvements
|
||||
- Optimized AdaBoost training and prediction algorithms
|
||||
- Enhanced memory management in XA1DE implementations
|
||||
- Improved discretization algorithms with MDLP integration
|
||||
- Faster ROC-AUC computation for binary classification problems
|
||||
|
||||
### Developer Experience
|
||||
- **Testing Framework**: Comprehensive test suite with Catch2
|
||||
- **Build System**: Streamlined CMake configuration with dependency management
|
||||
- **Documentation**: Enhanced project documentation and build instructions
|
||||
- **Code Quality**: Refactored codebase with improved error handling and logging
|
||||
|
||||
### Bug Fixes
|
||||
- Fixed predict_proba implementations across multiple classifiers
|
||||
- Resolved grid search dataset ordering issues
|
||||
- Fixed Excel report formatting and column width problems
|
||||
- Corrected time output formatting in various tools
|
||||
- Fixed memory leaks and stability issues in model implementations
|
||||
|
||||
## [1.0.0] - 2024-01-09
|
||||
|
||||
### Initial Release
|
||||
- **Core Framework**: Machine learning experimentation platform for Bayesian Networks
|
||||
- **Basic Classifiers**: Initial set of Bayesian network classifiers
|
||||
- **Experiment Management**: Basic experiment orchestration and result storage
|
||||
- **Dataset Support**: ARFF file format support with discretization
|
||||
- **Build System**: CMake-based build system with external library integration
|
||||
- **Command Line Tools**: Initial versions of b_main, b_best, b_list utilities
|
139
CLAUDE.md
Normal file
139
CLAUDE.md
Normal file
@@ -0,0 +1,139 @@
|
||||
# CLAUDE.md
|
||||
|
||||
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
||||
|
||||
## Project Overview
|
||||
|
||||
Platform is a C++ machine learning framework for running experiments with Bayesian Networks and other classifiers. It supports both research-focused experimental classifiers and production-ready models through a unified interface.
|
||||
|
||||
## Build System
|
||||
|
||||
The project uses CMake with Make as the primary build system:
|
||||
|
||||
- **Release build**: `make release` (creates `build_Release/` directory)
|
||||
- **Debug build**: `make debug` (creates `build_Debug/` directory with testing and coverage enabled)
|
||||
- **Install binaries**: `make install` (copies executables to `~/bin` by default)
|
||||
- **Clean project**: `make clean` (removes build directories)
|
||||
- **Initialize dependencies**: `make init` (runs conan install for both Release and Debug)
|
||||
|
||||
### Testing
|
||||
|
||||
- **Run tests**: `make test` (builds debug version and runs all tests)
|
||||
- **Coverage report**: `make coverage` (runs tests and generates coverage with gcovr)
|
||||
- **Single test with options**: `make test opt="-s"` (verbose) or `make test opt="-c='Test Name'"` (specific test)
|
||||
|
||||
### Build Targets
|
||||
|
||||
Main executables (built from `src/commands/`):
|
||||
- `b_main`: Main experiment runner
|
||||
- `b_grid`: Grid search over hyperparameters
|
||||
- `b_best`: Best results analysis and comparison
|
||||
- `b_list`: Dataset listing and properties
|
||||
- `b_manage`: Results management interface
|
||||
- `b_results`: Results processing
|
||||
|
||||
## Dependencies
|
||||
|
||||
The project uses Conan for package management with these key dependencies:
|
||||
- **libtorch**: PyTorch C++ backend for tensor operations
|
||||
- **nlohmann_json**: JSON processing
|
||||
- **catch2**: Unit testing framework
|
||||
- **cli11**: Command-line argument parsing (replacement for argparse)
|
||||
|
||||
Custom dependencies (not available in ConanCenter):
|
||||
- **fimdlp**: MDLP discretization library (needs manual integration)
|
||||
- **folding**: Cross-validation utilities (needs manual integration)
|
||||
- **arff-files**: ARFF dataset file handling (needs manual integration)
|
||||
|
||||
External dependencies (managed separately):
|
||||
- **BayesNet**: Core Bayesian network classifiers (from `../lib/`)
|
||||
- **PyClassifiers**: Python classifier wrappers (from `../lib/`)
|
||||
- **MPI**: Message Passing Interface for parallel processing
|
||||
- **Boost**: Python integration and utilities
|
||||
|
||||
**Note**: Some dependencies (fimdlp, folding, arff-files) are not available in ConanCenter and need to be:
|
||||
- Built as custom Conan packages, or
|
||||
- Integrated using CMake FetchContent, or
|
||||
- Built separately and found via find_package
|
||||
|
||||
## Architecture
|
||||
|
||||
### Core Components
|
||||
|
||||
**Experiment Framework** (`src/main/`):
|
||||
- `Experiment.cpp/h`: Main experiment orchestration
|
||||
- `Models.cpp/h`: Classifier factory and registration system
|
||||
- `Scores.cpp/h`: Performance metrics calculation
|
||||
- `HyperParameters.cpp/h`: Parameter management
|
||||
- `ArgumentsExperiment.cpp/h`: Command-line argument handling
|
||||
|
||||
**Data Handling** (`src/common/`):
|
||||
- `Dataset.cpp/h`: Individual dataset representation
|
||||
- `Datasets.cpp/h`: Dataset collection management
|
||||
- `Discretization.cpp/h`: Data discretization utilities
|
||||
|
||||
**Classifiers** (`src/experimental_clfs/`):
|
||||
- `AdaBoost.cpp/h`: Multi-class SAMME AdaBoost implementation
|
||||
- `DecisionTree.cpp/h`: Decision tree base classifier
|
||||
- `XA1DE.cpp/h`: Extended AODE variants
|
||||
- Experimental implementations of Bayesian network classifiers
|
||||
|
||||
**Grid Search** (`src/grid/`):
|
||||
- `GridSearch.cpp/h`: Hyperparameter optimization
|
||||
- `GridExperiment.cpp/h`: Grid search experiment management
|
||||
- Uses MPI for parallel hyperparameter evaluation
|
||||
|
||||
**Results & Reporting** (`src/results/`, `src/reports/`):
|
||||
- JSON-based result storage with schema validation
|
||||
- Excel export capabilities via libxlsxwriter
|
||||
- Console and paginated result display
|
||||
|
||||
### Model Registration System
|
||||
|
||||
The framework uses a factory pattern with automatic registration:
|
||||
- All classifiers inherit from `bayesnet::BaseClassifier`
|
||||
- Registration happens in `src/main/modelRegister.h`
|
||||
- Factory creates instances by string name via `Models::create()`
|
||||
|
||||
## Configuration
|
||||
|
||||
**Environment Configuration** (`.env` file):
|
||||
- `experiment`: Experiment name/type
|
||||
- `n_folds`: Cross-validation folds (default: 5)
|
||||
- `seeds`: Random seeds for reproducibility
|
||||
- `model`: Default classifier name
|
||||
- `score`: Primary evaluation metric
|
||||
- `platform`: System identifier for results
|
||||
|
||||
**Grid Search Configuration**:
|
||||
- `grid_<model_name>_input.json`: Hyperparameter search space
|
||||
- `grid_<model_name>_output.json`: Search results
|
||||
|
||||
## Data Format
|
||||
|
||||
**Dataset Requirements**:
|
||||
- ARFF format files in `datasets/` directory
|
||||
- `all.txt` file listing datasets: `<name>,<class_name>,<real_features>`
|
||||
- Supports both discrete and continuous features
|
||||
- Automatic discretization available via MDLP
|
||||
|
||||
**Experimental Data**:
|
||||
- Results stored in JSON format with versioned schemas
|
||||
- Test data in `tests/data/` for unit testing
|
||||
- Sample datasets: iris, diabetes, ecoli, glass, etc.
|
||||
|
||||
## Development Workflow
|
||||
|
||||
1. **Setup**: Run `make init` to install dependencies via Conan
|
||||
2. **Development**: Use `make debug` for development builds with testing
|
||||
3. **Testing**: Run `make test` after changes
|
||||
4. **Release**: Use `make release` for optimized builds
|
||||
5. **Experiments**: Use `.env` configuration and run `b_main` with appropriate flags
|
||||
|
||||
## Key Features
|
||||
|
||||
- **Parallel execution**: Uses MPI for parallel grid search and experiments
|
||||
- **Cross-platform**: Supports Linux and macOS via Conan
|
||||
- **Extensible**: Easy classifier registration and integration
|
||||
- **Research-focused**: Designed for machine learning experimentation
|
||||
- **Visualization**: DOT graph generation for decision trees and networks
|
@@ -1,95 +1,99 @@
|
||||
cmake_minimum_required(VERSION 3.20)
|
||||
|
||||
project(Platform
|
||||
VERSION 1.0.0
|
||||
VERSION 1.1.1
|
||||
DESCRIPTION "Platform to run Experiments with classifiers."
|
||||
HOMEPAGE_URL "https://github.com/rmontanana/platform"
|
||||
LANGUAGES CXX
|
||||
)
|
||||
|
||||
if (CODE_COVERAGE AND NOT ENABLE_TESTING)
|
||||
MESSAGE(FATAL_ERROR "Code coverage requires testing enabled")
|
||||
endif (CODE_COVERAGE AND NOT ENABLE_TESTING)
|
||||
|
||||
find_package(Torch REQUIRED)
|
||||
|
||||
if (POLICY CMP0135)
|
||||
cmake_policy(SET CMP0135 NEW)
|
||||
endif ()
|
||||
|
||||
# Global CMake variables
|
||||
# ----------------------
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_STANDARD 20)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_CXX_EXTENSIONS OFF)
|
||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
|
||||
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
|
||||
|
||||
# Options
|
||||
# -------
|
||||
option(ENABLE_CLANG_TIDY "Enable to add clang tidy." OFF)
|
||||
option(ENABLE_TESTING "Unit testing build" OFF)
|
||||
option(CODE_COVERAGE "Collect coverage from test library" OFF)
|
||||
|
||||
# CMake modules
|
||||
# --------------
|
||||
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH})
|
||||
|
||||
# MPI
|
||||
find_package(MPI REQUIRED)
|
||||
message("MPI_CXX_LIBRARIES=${MPI_CXX_LIBRARIES}")
|
||||
message("MPI_CXX_INCLUDE_DIRS=${MPI_CXX_INCLUDE_DIRS}")
|
||||
|
||||
# Boost Library
|
||||
cmake_policy(SET CMP0135 NEW)
|
||||
cmake_policy(SET CMP0167 NEW) # For FindBoost
|
||||
set(Boost_USE_STATIC_LIBS OFF)
|
||||
set(Boost_USE_MULTITHREADED ON)
|
||||
set(Boost_USE_STATIC_RUNTIME OFF)
|
||||
|
||||
find_package(Boost 1.66.0 REQUIRED COMPONENTS python3 numpy3)
|
||||
|
||||
# # Python
|
||||
find_package(Python3 REQUIRED COMPONENTS Development)
|
||||
|
||||
# # Boost Python
|
||||
# find_package(boost_python${Python3_VERSION_MAJOR}${Python3_VERSION_MINOR} CONFIG REQUIRED COMPONENTS python${Python3_VERSION_MAJOR}${Python3_VERSION_MINOR})
|
||||
# # target_link_libraries(MyTarget PRIVATE Boost::python${Python3_VERSION_MAJOR}${Python3_VERSION_MINOR})
|
||||
|
||||
|
||||
if(Boost_FOUND)
|
||||
message("Boost_INCLUDE_DIRS=${Boost_INCLUDE_DIRS}")
|
||||
message("Boost_LIBRARIES=${Boost_LIBRARIES}")
|
||||
message("Boost_VERSION=${Boost_VERSION}")
|
||||
include_directories(${Boost_INCLUDE_DIRS})
|
||||
endif()
|
||||
|
||||
# Python
|
||||
find_package(Python3 3.11...3.11.9 COMPONENTS Interpreter Development REQUIRED)
|
||||
message("Python3_LIBRARIES=${Python3_LIBRARIES}")
|
||||
|
||||
# CMake modules
|
||||
# --------------
|
||||
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH})
|
||||
include(AddGitSubmodule)
|
||||
|
||||
if (CODE_COVERAGE)
|
||||
enable_testing()
|
||||
include(CodeCoverage)
|
||||
MESSAGE("Code coverage enabled")
|
||||
set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage -O0 -g")
|
||||
SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage")
|
||||
endif (CODE_COVERAGE)
|
||||
|
||||
if (ENABLE_CLANG_TIDY)
|
||||
include(StaticAnalyzers) # clang-tidy
|
||||
endif (ENABLE_CLANG_TIDY)
|
||||
|
||||
# External libraries - dependencies of BayesNet
|
||||
# External libraries - dependencies of Platform
|
||||
# ---------------------------------------------
|
||||
add_git_submodule("lib/PyClassifiers")
|
||||
add_git_submodule("lib/argparse")
|
||||
|
||||
find_library(XLSXWRITER_LIB NAMES libxlsxwriter.dylib libxlsxwriter.so PATHS ${Platform_SOURCE_DIR}/lib/libxlsxwriter/lib)
|
||||
message("XLSXWRITER_LIB=${XLSXWRITER_LIB}")
|
||||
find_package(nlohmann_json CONFIG REQUIRED)
|
||||
find_package(argparse CONFIG REQUIRED)
|
||||
find_package(Torch CONFIG REQUIRED)
|
||||
find_package(arff-files CONFIG REQUIRED)
|
||||
find_package(fimdlp CONFIG REQUIRED)
|
||||
find_package(folding CONFIG REQUIRED)
|
||||
find_package(bayesnet CONFIG REQUIRED)
|
||||
# find_package(pyclassifiers CONFIG REQUIRED)
|
||||
find_package(libxlsxwriter CONFIG REQUIRED)
|
||||
find_package(Boost REQUIRED COMPONENTS python)
|
||||
|
||||
# Subdirectories
|
||||
# --------------
|
||||
## Configure test data path
|
||||
cmake_path(SET TEST_DATA_PATH "${CMAKE_CURRENT_SOURCE_DIR}/tests/data")
|
||||
configure_file(src/common/SourceData.h.in "${CMAKE_BINARY_DIR}/configured_files/include/SourceData.h")
|
||||
add_subdirectory(config)
|
||||
add_subdirectory(src/Platform)
|
||||
add_subdirectory(sample)
|
||||
file(GLOB Platform_SOURCES CONFIGURE_DEPENDS ${Platform_SOURCE_DIR}/src/Platform/*.cc)
|
||||
add_subdirectory(src)
|
||||
# add_subdirectory(sample)
|
||||
file(GLOB Platform_SOURCES CONFIGURE_DEPENDS ${Platform_SOURCE_DIR}/src/*.cpp)
|
||||
|
||||
# Testing
|
||||
# -------
|
||||
if (ENABLE_TESTING)
|
||||
MESSAGE("Testing enabled")
|
||||
if (NOT TARGET Catch2::Catch2)
|
||||
add_git_submodule("lib/catch2")
|
||||
endif (NOT TARGET Catch2::Catch2)
|
||||
set(CMAKE_CXX_FLAGS_DEBUG " ${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage -O0 -g")
|
||||
enable_testing()
|
||||
find_package(Catch2 CONFIG REQUIRED)
|
||||
set(CODE_COVERAGE ON)
|
||||
include(CTest)
|
||||
add_subdirectory(tests)
|
||||
endif (ENABLE_TESTING)
|
||||
if (CODE_COVERAGE)
|
||||
MESSAGE("Code coverage enabled")
|
||||
include(CodeCoverage)
|
||||
SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage")
|
||||
endif (CODE_COVERAGE)
|
||||
|
2
LICENSE
2
LICENSE
@@ -1,6 +1,6 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2024 rmontanana
|
||||
Copyright (c) 2024 Ricardo Montañana Gómez
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
|
120
Makefile
120
Makefile
@@ -1,12 +1,18 @@
|
||||
SHELL := /bin/bash
|
||||
.DEFAULT_GOAL := help
|
||||
.PHONY: coverage setup help build test clean debug release submodules buildr buildd install dependency testp testb clang-uml
|
||||
.PHONY: init clean coverage setup help build test clean debug release buildr buildd install dependency testp testb clang-uml example
|
||||
|
||||
f_release = build_release
|
||||
f_debug = build_debug
|
||||
app_targets = b_best b_list b_main b_manage b_grid
|
||||
test_targets = unit_tests_bayesnet unit_tests_platform
|
||||
n_procs = -j 16
|
||||
f_release = build_Release
|
||||
f_debug = build_Debug
|
||||
app_targets = b_best b_list b_main b_manage b_grid b_results
|
||||
test_targets = unit_tests_platform
|
||||
# Detect the number of available processors; the parallel job count (processors minus 7) is derived from it below
|
||||
CPUS := $(shell getconf _NPROCESSORS_ONLN 2>/dev/null \
|
||||
|| nproc --all 2>/dev/null \
|
||||
|| sysctl -n hw.ncpu)
|
||||
|
||||
# --- Your desired job count: CPUs – 7, but never less than 1 --------------
|
||||
JOBS := $(shell n=$(CPUS); [ $${n} -gt 7 ] && echo $$((n-7)) || echo 1)
|
||||
|
||||
define ClearTests
|
||||
@for t in $(test_targets); do \
|
||||
@@ -21,14 +27,43 @@ define ClearTests
|
||||
fi ;
|
||||
endef
|
||||
|
||||
define build_target
|
||||
@echo ">>> Building the project for $(1)..."
|
||||
@if [ -d $(2) ]; then rm -fr $(2); fi
|
||||
@conan install . --build=missing -of $(2) -s build_type=$(1)
|
||||
@cmake -S . -B $(2) -DCMAKE_TOOLCHAIN_FILE=$(2)/build/$(1)/generators/conan_toolchain.cmake -DCMAKE_BUILD_TYPE=$(1) -D$(3)
|
||||
@echo ">>> Will build using $(JOBS) parallel jobs"
|
||||
echo ">>> Done"
|
||||
endef
|
||||
|
||||
sub-init: ## Initialize submodules
|
||||
@git submodule update --init --recursive
|
||||
define compile_target
|
||||
@echo ">>> Compiling for $(1)..."
|
||||
if [ "$(3)" != "" ]; then \
|
||||
target="-t$(3)"; \
|
||||
else \
|
||||
target=""; \
|
||||
fi
|
||||
@cmake --build $(2) --config $(1) --parallel $(JOBS) $(target)
|
||||
@echo ">>> Done"
|
||||
endef
|
||||
|
||||
sub-update: ## Initialize submodules
|
||||
@git submodule update --remote --merge
|
||||
@git submodule foreach git pull origin master
|
||||
init: ## Initialize the project installing dependencies
|
||||
@echo ">>> Installing dependencies with Conan"
|
||||
@conan install . --output-folder=build --build=missing -s build_type=Release
|
||||
@conan install . --output-folder=build_debug --build=missing -s build_type=Debug
|
||||
@echo ">>> Done"
|
||||
|
||||
clean: ## Clean the project
|
||||
@echo ">>> Cleaning the project..."
|
||||
@if test -f CMakeCache.txt ; then echo "- Deleting CMakeCache.txt"; rm -f CMakeCache.txt; fi
|
||||
@for folder in $(f_release) $(f_debug) build build_debug install_test ; do \
|
||||
if test -d "$$folder" ; then \
|
||||
echo "- Deleting $$folder folder" ; \
|
||||
rm -rf "$$folder"; \
|
||||
fi; \
|
||||
done
|
||||
$(call ClearTests)
|
||||
@echo ">>> Done";
|
||||
setup: ## Install dependencies for tests and coverage
|
||||
@if [ "$(shell uname)" = "Darwin" ]; then \
|
||||
brew install gcovr; \
|
||||
@@ -41,13 +76,15 @@ setup: ## Install dependencies for tests and coverage
|
||||
dest ?= ${HOME}/bin
|
||||
install: ## Copy binary files to bin folder
|
||||
@echo "Destination folder: $(dest)"
|
||||
make buildr
|
||||
@make buildr
|
||||
@echo "*******************************************"
|
||||
@echo ">>> Copying files to $(dest)"
|
||||
@echo "*******************************************"
|
||||
@for item in $(app_targets); do \
|
||||
echo ">>> Copying $$item" ; \
|
||||
cp $(f_release)/src/Platform/$$item $(dest) ; \
|
||||
cp $(f_release)/src/$$item $(dest) || { \
|
||||
echo "*** Error copying $$item" ; \
|
||||
} ; \
|
||||
done
|
||||
|
||||
dependency: ## Create a dependency graph diagram of the project (build/dependency.png)
|
||||
@@ -56,38 +93,26 @@ dependency: ## Create a dependency graph diagram of the project (build/dependenc
|
||||
cd $(f_debug) && cmake .. --graphviz=dependency.dot && dot -Tpng dependency.dot -o dependency.png
|
||||
|
||||
buildd: ## Build the debug targets
|
||||
cmake --build $(f_debug) -t $(app_targets) PlatformSample $(n_procs)
|
||||
@$(call compile_target,"Debug","$(f_debug)")
|
||||
|
||||
buildr: ## Build the release targets
|
||||
cmake --build $(f_release) -t $(app_targets) $(n_procs)
|
||||
|
||||
clean: ## Clean the tests info
|
||||
@echo ">>> Cleaning Debug BayesNet tests...";
|
||||
$(call ClearTests)
|
||||
@echo ">>> Done";
|
||||
@$(call compile_target,"Release","$(f_release)")
|
||||
|
||||
clang-uml: ## Create uml class and sequence diagrams
|
||||
clang-uml -p --add-compile-flag -I /usr/lib/gcc/x86_64-redhat-linux/8/include/
|
||||
|
||||
debug: ## Build a debug version of the project
|
||||
@echo ">>> Building Debug BayesNet...";
|
||||
@if [ -d ./$(f_debug) ]; then rm -rf ./$(f_debug); fi
|
||||
@mkdir $(f_debug);
|
||||
@cmake -S . -B $(f_debug) -D CMAKE_BUILD_TYPE=Debug -D ENABLE_TESTING=ON -D CODE_COVERAGE=ON
|
||||
@echo ">>> Done";
|
||||
debug: ## Build a debug version of the project with Conan
|
||||
@$(call build_target,"Debug","$(f_debug)", "ENABLE_TESTING=ON")
|
||||
|
||||
release: ## Build a Release version of the project with Conan
|
||||
@$(call build_target,"Release","$(f_release)", "ENABLE_TESTING=OFF")
|
||||
|
||||
release: ## Build a Release version of the project
|
||||
@echo ">>> Building Release BayesNet...";
|
||||
@if [ -d ./$(f_release) ]; then rm -rf ./$(f_release); fi
|
||||
@mkdir $(f_release);
|
||||
@cmake -S . -B $(f_release) -D CMAKE_BUILD_TYPE=Release
|
||||
@echo ">>> Done";
|
||||
|
||||
opt = ""
|
||||
test: ## Run tests (opt="-s") to verbose output the tests, (opt="-c='Test Maximum Spanning Tree'") to run only that section
|
||||
@echo ">>> Running BayesNet & Platform tests...";
|
||||
@$(MAKE) clean
|
||||
@cmake --build $(f_debug) -t $(test_targets) $(n_procs)
|
||||
@echo ">>> Running Platform tests...";
|
||||
@$(MAKE) debug
|
||||
@$(call compile_target, "Debug", "$(f_debug)", $(test_targets))
|
||||
@for t in $(test_targets); do \
|
||||
if [ -f $(f_debug)/tests/$$t ]; then \
|
||||
cd $(f_debug)/tests ; \
|
||||
@@ -96,33 +121,24 @@ test: ## Run tests (opt="-s") to verbose output the tests, (opt="-c='Test Maximu
|
||||
done
|
||||
@echo ">>> Done";
|
||||
|
||||
opt = ""
|
||||
testp: ## Run platform tests (opt="-s") to verbose output the tests, (opt="-c='Stratified Fold Test'") to run only that section
|
||||
@echo ">>> Running Platform tests...";
|
||||
@$(MAKE) clean
|
||||
@cmake --build $(f_debug) --target unit_tests_platform $(n_procs)
|
||||
@if [ -f $(f_debug)/tests/unit_tests_platform ]; then cd $(f_debug)/tests ; ./unit_tests_platform $(opt) ; fi ;
|
||||
fname = iris
|
||||
example: ## Build sample
|
||||
@echo ">>> Building Sample...";
|
||||
@cmake --build $(f_release) -t sample
|
||||
$(f_release)/sample/PlatformSample --model BoostAODE --dataset $(fname) --discretize --stratified
|
||||
@echo ">>> Done";
|
||||
|
||||
opt = ""
|
||||
testb: ## Run BayesNet tests (opt="-s") to verbose output the tests, (opt="-c='Test Maximum Spanning Tree'") to run only that section
|
||||
@echo ">>> Running BayesNet tests...";
|
||||
@$(MAKE) clean
|
||||
@cmake --build $(f_debug) --target unit_tests_bayesnet $(n_procs)
|
||||
@if [ -f $(f_debug)/tests/unit_tests_bayesnet ]; then cd $(f_debug)/tests ; ./unit_tests_bayesnet $(opt) ; fi ;
|
||||
@echo ">>> Done";
|
||||
|
||||
coverage: ## Run tests and generate coverage report (build/index.html)
|
||||
@echo ">>> Building tests with coverage...";
|
||||
@echo ">>> Building tests with coverage..."
|
||||
@$(MAKE) test
|
||||
@cd $(f_debug) ; \
|
||||
gcovr --config ../gcovr.cfg tests ;
|
||||
@gcovr $(f_debug)/tests
|
||||
@echo ">>> Done";
|
||||
|
||||
|
||||
help: ## Show help message
|
||||
@IFS=$$'\n' ; \
|
||||
help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
|
||||
help_lines=(`grep -Fh "##" $(MAKEFILE_LIST) | grep -Fv fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
|
||||
printf "%s\n\n" "Usage: make [task]"; \
|
||||
printf "%-20s %s\n" "task" "help" ; \
|
||||
printf "%-20s %s\n" "------" "----" ; \
|
||||
|
88
README.md
88
README.md
@@ -1,16 +1,15 @@
|
||||
# Platform
|
||||
# <img src="logo.png" alt="logo" width="50"/> Platform
|
||||
|
||||
Platform to run Bayesian Networks and Machine Learning Classifiers experiments.
|
||||
|
||||
# Platform
|
||||
|
||||
[](https://opensource.org/licenses/MIT)
|
||||

|
||||
[](<https://opensource.org/licenses/MIT>)
|
||||
[](https://deepwiki.com/rmontanana/Platform)
|
||||

|
||||
|
||||
Platform to run Bayesian Networks and Machine Learning Classifiers experiments.
|
||||
|
||||
## 0. Setup
|
||||
|
||||
Before compiling BayesNet.
|
||||
Before compiling Platform.
|
||||
|
||||
### Miniconda
|
||||
|
||||
@@ -22,11 +21,18 @@ In Linux sometimes the library libstdc++ is mistaken from the miniconda installa
|
||||
libstdc++.so.6: version `GLIBCXX_3.4.32' not found (required by b_xxxx)
|
||||
```
|
||||
|
||||
The solution is to erase the libstdc++ library from the miniconda installation:
|
||||
The solution is to erase the libstdc++ library from the miniconda installation and no further compilation is needed.
|
||||
|
||||
### MPI
|
||||
|
||||
In Linux just install openmpi & openmpi-devel packages. Only if cmake can't find openmpi installation (like in Oracle Linux) set the following variable:
|
||||
In Linux just install openmpi & openmpi-devel packages.
|
||||
|
||||
```bash
|
||||
source /etc/profile.d/modules.sh
|
||||
module load mpi/openmpi-x86_64
|
||||
```
|
||||
|
||||
If cmake can't find openmpi installation (like in Oracle Linux) set the following variable:
|
||||
|
||||
```bash
|
||||
export MPI_HOME="/usr/lib64/openmpi"
|
||||
@@ -35,7 +41,7 @@ export MPI_HOME="/usr/lib64/openmpi"
|
||||
In Mac OS X, install mpich with brew and if cmake doesn't find it, edit mpicxx wrapper to remove the ",-commons,use_dylibs" from final_ldflags
|
||||
|
||||
```bash
|
||||
vi /opt/homebrew/bin/mpicx
|
||||
vi /opt/homebrew/bin/mpicxx
|
||||
```
|
||||
|
||||
### boost library
|
||||
@@ -86,4 +92,64 @@ make release
|
||||
make debug
|
||||
```
|
||||
|
||||
## 1. Introduction
|
||||
### Configuration
|
||||
|
||||
The configuration file is named .env and it should be located in the folder where the experiments should be run. In the root folder of the project there is a file named .env.example that can be used as a template.
|
||||
|
||||
## 1. Commands
|
||||
|
||||
### b_list
|
||||
|
||||
List all the datasets and their properties. The datasets are located in the _datasets_ folder under the experiments root folder. A special file called all.txt with the names of the datasets has to be created. The all.txt file is built with lines of the form:
|
||||
<name>,<class_name>,<real_features>
|
||||
|
||||
where <real_features> can be either the word _all_ or a list of numbers separated by commas, i.e. [0,3,6,7]
|
||||
|
||||
### b_grid
|
||||
|
||||
Run a grid search over the parameters of the classifiers. The parameters are defined in the file _grid.txt_ located in the grid folder of the experiments. The file has to be created with the following format:
|
||||
|
||||
```json
|
||||
{
|
||||
"all": [
|
||||
<set of hyperparams>, ...
|
||||
],
|
||||
"<dataset_name>": [
|
||||
<specific set of hyperparams for <dataset_name>>, ...
|
||||
],
|
||||
}
|
||||
```
|
||||
|
||||
The file has to be named _grid_<model_name>_input.json_
|
||||
|
||||
As a result it builds a file named _grid_<model_name>_output.json_ with the results of the grid search.
|
||||
|
||||
The computation is done in parallel using MPI.
|
||||
|
||||

|
||||
|
||||
### b_main
|
||||
|
||||
Run the main experiment. There are several hyperparameters that can be set on the command line:
|
||||
|
||||
- -d, -\-dataset <dataset_name> : Name of the dataset to run the experiment with. If no dataset is specified the experiment will run with all the datasets in the all.txt file.
|
||||
- -m, -\-model <classifier_name> : Name of the classifier to run the experiment with (i.e. BoostAODE, TAN, Odte, etc.).
|
||||
- -\-discretize: Discretize the dataset before running the experiment.
|
||||
- -\-stratified: Use stratified cross validation.
|
||||
- -\-folds <folds>: Number of folds for cross validation (optional, default value is in .env file).
|
||||
- -s, -\-seeds <seed>: Seeds for the random number generator (optional, default values are in .env file).
|
||||
- -\-no-train-score: Do not calculate the train score (optional), this is useful when the dataset is big and the training score is not needed.
|
||||
- -\-hyperparameters <hyperparameters>: Hyperparameters for the experiment in json format.
|
||||
- -\-hyper-file <hyperparameters_file>: File with the hyperparameters for the experiment in json format. This file uses the output format of the b_grid command.
|
||||
- -\-title <title_text>: Title of the experiment (optional if only one dataset is specified).
|
||||
- -\-quiet: Don't display detailed progress and result of the experiment.
|
||||
|
||||
### b_manage
|
||||
|
||||
Manage the results of the experiments.
|
||||
|
||||
### b_best
|
||||
|
||||
Get and optionally compare the best results of the experiments. The results can be stored in an MS Excel file.
|
||||
|
||||

|
||||
|
@@ -137,7 +137,7 @@
|
||||
|
||||
include(CMakeParseArguments)
|
||||
|
||||
option(CODE_COVERAGE_VERBOSE "Verbose information" FALSE)
|
||||
option(CODE_COVERAGE_VERBOSE "Verbose information" TRUE)
|
||||
|
||||
# Check prereqs
|
||||
find_program( GCOV_PATH gcov )
|
||||
@@ -160,8 +160,12 @@ foreach(LANG ${LANGUAGES})
|
||||
endif()
|
||||
elseif(NOT "${CMAKE_${LANG}_COMPILER_ID}" MATCHES "GNU"
|
||||
AND NOT "${CMAKE_${LANG}_COMPILER_ID}" MATCHES "(LLVM)?[Ff]lang")
|
||||
if ("${LANG}" MATCHES "CUDA")
|
||||
message(STATUS "Ignoring CUDA")
|
||||
else()
|
||||
message(FATAL_ERROR "Compiler is not GNU or Flang! Aborting...")
|
||||
endif()
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
set(COVERAGE_COMPILER_FLAGS "-g --coverage"
|
||||
|
42
conanfile.py
Normal file
42
conanfile.py
Normal file
@@ -0,0 +1,42 @@
|
||||
from conan import ConanFile
|
||||
from conan.tools.cmake import CMakeToolchain, CMakeDeps, cmake_layout
|
||||
|
||||
|
||||
class PlatformConan(ConanFile):
    """Conan recipe for the Platform project (package ``platform`` 1.1.0).

    Declares the library and tool requirements, uses the standard CMake
    layout and generates the CMakeDeps / CMakeToolchain files.
    """

    name = "platform"
    version = "1.1.0"

    # Binary configuration
    settings = ("os", "compiler", "build_type", "arch")

    # Sources are located in the same place as this recipe, copy them to the recipe
    exports_sources = ("CMakeLists.txt", "src/*", "tests/*", "config/*", "cmake/*")

    def requirements(self):
        """Register the libraries the project links against, in declaration order."""
        # Core dependencies from vcpkg.json
        for reference in (
            "argparse/3.2",
            "libtorch/2.7.1",
            "nlohmann_json/3.11.3",
            "folding/1.1.2",
            "fimdlp/2.1.1",
            "arff-files/1.2.1",
            "bayesnet/1.2.1",
            # "pyclassifiers/1.0.3",
            "libxlsxwriter/1.2.2",
        ):
            self.requires(reference)

    def build_requirements(self):
        """Register the build tool (CMake) and the test framework (Catch2)."""
        self.tool_requires("cmake/[>=3.30]")
        self.test_requires("catch2/3.8.1")

    def layout(self):
        """Apply the predefined CMake folder layout."""
        cmake_layout(self)

    def generate(self):
        """Emit the dependency files first, then the toolchain file."""
        CMakeDeps(self).generate()
        CMakeToolchain(self).generate()

    def configure(self):
        """Set the C++ standard used for the build."""
        # C++20 requirement
        self.settings.compiler.cppstd = "20"
|
@@ -1,4 +1,4 @@
|
||||
configure_file(
|
||||
"config.h.in"
|
||||
"${CMAKE_BINARY_DIR}/configured_files/include/config.h" ESCAPE_QUOTES
|
||||
"${CMAKE_BINARY_DIR}/configured_files/include/config_platform.h" ESCAPE_QUOTES
|
||||
)
|
||||
|
@@ -1,14 +1,11 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef PLATFORM_H
|
||||
#define PLATFORM_H
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
#define PROJECT_VERSION_MAJOR @PROJECT_VERSION_MAJOR @
|
||||
#define PROJECT_VERSION_MINOR @PROJECT_VERSION_MINOR @
|
||||
#define PROJECT_VERSION_PATCH @PROJECT_VERSION_PATCH @
|
||||
|
||||
static constexpr std::string_view project_name = "@PROJECT_NAME@";
|
||||
static constexpr std::string_view project_version = "@PROJECT_VERSION@";
|
||||
static constexpr std::string_view project_description = "@PROJECT_DESCRIPTION@";
|
||||
static constexpr std::string_view git_sha = "@GIT_SHA@";
|
||||
static constexpr std::string_view data_path = "@Platform_SOURCE_DIR@/tests/data/";
|
||||
static constexpr std::string_view platform_project_name = "@PROJECT_NAME@";
|
||||
static constexpr std::string_view platform_project_version = "@PROJECT_VERSION@";
|
||||
static constexpr std::string_view platform_project_description = "@PROJECT_DESCRIPTION@";
|
||||
static constexpr std::string_view platform_git_sha = "@GIT_SHA@";
|
||||
static constexpr std::string_view platform_data_path = "@Platform_SOURCE_DIR@/tests/data/";
|
||||
#endif
|
@@ -1,4 +1,4 @@
|
||||
filter = src/
|
||||
exclude-directories = build/lib/
|
||||
exclude-directories = build_debug/lib/
|
||||
print-summary = yes
|
||||
sort-percentage = yes
|
||||
|
31
gitmodules
31
gitmodules
@@ -1,31 +0,0 @@
|
||||
[submodule "lib/mdlp"]
|
||||
path = lib/mdlp
|
||||
url = https://github.com/rmontanana/mdlp
|
||||
main = main
|
||||
update = merge
|
||||
[submodule "lib/catch2"]
|
||||
path = lib/catch2
|
||||
main = v2.x
|
||||
update = merge
|
||||
url = https://github.com/catchorg/Catch2.git
|
||||
[submodule "lib/argparse"]
|
||||
path = lib/argparse
|
||||
url = https://github.com/p-ranav/argparse
|
||||
master = master
|
||||
update = merge
|
||||
[submodule "lib/json"]
|
||||
path = lib/json
|
||||
url = https://github.com/nlohmann/json.git
|
||||
master = master
|
||||
update = merge
|
||||
[submodule "lib/libxlsxwriter"]
|
||||
path = lib/libxlsxwriter
|
||||
url = https://github.com/jmcnamara/libxlsxwriter.git
|
||||
main = main
|
||||
update = merge
|
||||
[submodule "lib/PyClassifiers"]
|
||||
path = lib/PyClassifiers
|
||||
url = https://github.com/rmontanana/PyClassifiers
|
||||
[submodule "lib/folding"]
|
||||
path = lib/folding
|
||||
url = https://github.com/rmontanana/Folding
|
BIN
img/bbest.gif
Normal file
BIN
img/bbest.gif
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.9 MiB |
BIN
img/bgrid.gif
Normal file
BIN
img/bgrid.gif
Normal file
Binary file not shown.
After Width: | Height: | Size: 349 KiB |
BIN
img/blist.gif
Normal file
BIN
img/blist.gif
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.7 MiB |
BIN
img/bmain.gif
Normal file
BIN
img/bmain.gif
Normal file
Binary file not shown.
After Width: | Height: | Size: 3.3 MiB |
BIN
img/bmanage.gif
Normal file
BIN
img/bmanage.gif
Normal file
Binary file not shown.
After Width: | Height: | Size: 8.7 MiB |
@@ -1,168 +0,0 @@
|
||||
#include "ArffFiles.h"
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <map>
|
||||
#include <iostream>
|
||||
|
||||
ArffFiles::ArffFiles() = default;
|
||||
|
||||
std::vector<std::string> ArffFiles::getLines() const
|
||||
{
|
||||
return lines;
|
||||
}
|
||||
|
||||
unsigned long int ArffFiles::getSize() const
|
||||
{
|
||||
return lines.size();
|
||||
}
|
||||
|
||||
std::vector<std::pair<std::string, std::string>> ArffFiles::getAttributes() const
|
||||
{
|
||||
return attributes;
|
||||
}
|
||||
|
||||
std::string ArffFiles::getClassName() const
|
||||
{
|
||||
return className;
|
||||
}
|
||||
|
||||
std::string ArffFiles::getClassType() const
|
||||
{
|
||||
return classType;
|
||||
}
|
||||
|
||||
std::vector<std::vector<float>>& ArffFiles::getX()
|
||||
{
|
||||
return X;
|
||||
}
|
||||
|
||||
std::vector<int>& ArffFiles::getY()
|
||||
{
|
||||
return y;
|
||||
}
|
||||
|
||||
void ArffFiles::loadCommon(std::string fileName)
|
||||
{
|
||||
std::ifstream file(fileName);
|
||||
if (!file.is_open()) {
|
||||
throw std::invalid_argument("Unable to open file");
|
||||
}
|
||||
std::string line;
|
||||
std::string keyword;
|
||||
std::string attribute;
|
||||
std::string type;
|
||||
std::string type_w;
|
||||
while (getline(file, line)) {
|
||||
if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
|
||||
continue;
|
||||
}
|
||||
if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) {
|
||||
std::stringstream ss(line);
|
||||
ss >> keyword >> attribute;
|
||||
type = "";
|
||||
while (ss >> type_w)
|
||||
type += type_w + " ";
|
||||
attributes.emplace_back(trim(attribute), trim(type));
|
||||
continue;
|
||||
}
|
||||
if (line[0] == '@') {
|
||||
continue;
|
||||
}
|
||||
lines.push_back(line);
|
||||
}
|
||||
file.close();
|
||||
if (attributes.empty())
|
||||
throw std::invalid_argument("No attributes found");
|
||||
}
|
||||
|
||||
void ArffFiles::load(const std::string& fileName, bool classLast)
|
||||
{
|
||||
int labelIndex;
|
||||
loadCommon(fileName);
|
||||
if (classLast) {
|
||||
className = std::get<0>(attributes.back());
|
||||
classType = std::get<1>(attributes.back());
|
||||
attributes.pop_back();
|
||||
labelIndex = static_cast<int>(attributes.size());
|
||||
} else {
|
||||
className = std::get<0>(attributes.front());
|
||||
classType = std::get<1>(attributes.front());
|
||||
attributes.erase(attributes.begin());
|
||||
labelIndex = 0;
|
||||
}
|
||||
generateDataset(labelIndex);
|
||||
}
|
||||
void ArffFiles::load(const std::string& fileName, const std::string& name)
|
||||
{
|
||||
int labelIndex;
|
||||
loadCommon(fileName);
|
||||
bool found = false;
|
||||
for (int i = 0; i < attributes.size(); ++i) {
|
||||
if (attributes[i].first == name) {
|
||||
className = std::get<0>(attributes[i]);
|
||||
classType = std::get<1>(attributes[i]);
|
||||
attributes.erase(attributes.begin() + i);
|
||||
labelIndex = i;
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
throw std::invalid_argument("Class name not found");
|
||||
}
|
||||
generateDataset(labelIndex);
|
||||
}
|
||||
|
||||
void ArffFiles::generateDataset(int labelIndex)
|
||||
{
|
||||
X = std::vector<std::vector<float>>(attributes.size(), std::vector<float>(lines.size()));
|
||||
auto yy = std::vector<std::string>(lines.size(), "");
|
||||
auto removeLines = std::vector<int>(); // Lines with missing values
|
||||
for (size_t i = 0; i < lines.size(); i++) {
|
||||
std::stringstream ss(lines[i]);
|
||||
std::string value;
|
||||
int pos = 0;
|
||||
int xIndex = 0;
|
||||
while (getline(ss, value, ',')) {
|
||||
if (pos++ == labelIndex) {
|
||||
yy[i] = value;
|
||||
} else {
|
||||
if (value == "?") {
|
||||
X[xIndex++][i] = -1;
|
||||
removeLines.push_back(i);
|
||||
} else
|
||||
X[xIndex++][i] = stof(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto i : removeLines) {
|
||||
yy.erase(yy.begin() + i);
|
||||
for (auto& x : X) {
|
||||
x.erase(x.begin() + i);
|
||||
}
|
||||
}
|
||||
y = factorize(yy);
|
||||
}
|
||||
|
||||
// Returns `source` without leading and trailing spaces, single quotes,
// newlines, carriage returns and tabs. A string made only of those
// characters yields an empty string.
std::string ArffFiles::trim(const std::string& source)
{
    const char* const strippable = " '\n\r\t";
    const auto first = source.find_first_not_of(strippable);
    if (first == std::string::npos) {
        return std::string();
    }
    const auto last = source.find_last_not_of(strippable);
    return source.substr(first, last - first + 1);
}
|
||||
|
||||
// Encodes each label as an integer: the first distinct label gets 0, the
// next new one 1, and so on, in order of first appearance. The result has
// one code per input label.
std::vector<int> ArffFiles::factorize(const std::vector<std::string>& labels_t)
{
    std::map<std::string, int> codes;
    std::vector<int> encoded;
    encoded.reserve(labels_t.size());
    for (const auto& label : labels_t) {
        // emplace keeps an existing entry untouched; a new label receives
        // the count of distinct labels seen so far.
        const auto entry = codes.emplace(label, static_cast<int>(codes.size()));
        encoded.push_back(entry.first->second);
    }
    return encoded;
}
|
@@ -1,32 +0,0 @@
|
||||
#ifndef ARFFFILES_H
|
||||
#define ARFFFILES_H
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// Loader for ARFF data files: keeps the raw data rows, the attribute
// declarations, the selected class attribute and the resulting X / y data.
class ArffFiles {
public:
    ArffFiles();

    // Load a file taking the class from the last (default) or first attribute.
    void load(const std::string& fileName, bool classLast = true);
    // Load a file taking the class from the attribute with the given name.
    void load(const std::string& fileName, const std::string& name);

    std::vector<std::string> getLines() const;
    unsigned long int getSize() const;
    std::string getClassName() const;
    std::string getClassType() const;
    std::vector<std::vector<float>>& getX();
    std::vector<int>& getY();
    std::vector<std::pair<std::string, std::string>> getAttributes() const;

    // Stateless helpers.
    static std::string trim(const std::string& source);
    static std::vector<int> factorize(const std::vector<std::string>& labels_t);

private:
    std::vector<std::string> lines;                              // raw data rows
    std::vector<std::pair<std::string, std::string>> attributes; // (name, type)
    std::string className;
    std::string classType;
    std::vector<std::vector<float>> X;
    std::vector<int> y;

    void generateDataset(int labelIndex);
    void loadCommon(std::string fileName);
};
|
||||
|
||||
#endif
|
@@ -1 +0,0 @@
|
||||
add_library(ArffFiles ArffFiles.cc)
|
Submodule lib/PyClassifiers deleted from f46f6dcbb2
Submodule lib/argparse deleted from 69dabd88a8
Submodule lib/libxlsxwriter deleted from 6a2364c42c
14
remove_submodules.sh
Normal file
14
remove_submodules.sh
Normal file
@@ -0,0 +1,14 @@
|
||||
#!/usr/bin/env bash
# Removes every submodule listed in .gitmodules, then .gitmodules itself.
# Run from the repository root.

# Match only the submodule.<name>.path keys; the second field is the path.
# read -r keeps backslashes in a path literal, IFS= keeps surrounding blanks.
git config --file .gitmodules --get-regexp '^submodule\..*\.path$' | awk '{ print $2 }' | while IFS= read -r submodule_path; do
    echo "Removing $submodule_path"
    # Deinit the submodule
    git submodule deinit -f "$submodule_path"

    # Remove the submodule from the working tree
    git rm -f "$submodule_path"

    # Remove the submodule from .git/modules
    rm -rf ".git/modules/$submodule_path"
done

# Remove the .gitmodules file
git rm -f .gitmodules
|
@@ -1,14 +1,11 @@
|
||||
include_directories(
|
||||
${Platform_SOURCE_DIR}/src/Platform
|
||||
${Platform_SOURCE_DIR}/lib/PyClassifiers/src/PyClassifiers
|
||||
${TORCH_INCLUDE_DIRS}
|
||||
${Platform_SOURCE_DIR}/src/common
|
||||
${Platform_SOURCE_DIR}/src/main
|
||||
${Python3_INCLUDE_DIRS}
|
||||
${Platform_SOURCE_DIR}/lib/Files
|
||||
${Platform_SOURCE_DIR}/lib/argparse/include
|
||||
${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/src/BayesNet
|
||||
${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/folding
|
||||
${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/mdlp
|
||||
${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/json/include
|
||||
${CMAKE_BINARY_DIR}/configured_files/include
|
||||
${PyClassifiers_INCLUDE_DIRS}
|
||||
${bayesnet_INCLUDE_DIRS}
|
||||
)
|
||||
add_executable(PlatformSample sample.cc ${Platform_SOURCE_DIR}/src/Platform/Models.cc)
|
||||
target_link_libraries(PlatformSample PyClassifiers ArffFiles mdlp "${TORCH_LIBRARIES}")
|
||||
add_executable(PlatformSample sample.cpp ${Platform_SOURCE_DIR}/src/main/Models.cpp)
|
||||
target_link_libraries(PlatformSample "${PyClassifiers}" "${BayesNet}" fimdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} ${Boost_LIBRARIES})
|
236
sample/sample.cc
236
sample/sample.cc
@@ -1,236 +0,0 @@
|
||||
#include <iostream>
|
||||
#include <torch/torch.h>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <argparse/argparse.hpp>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include "ArffFiles.h"
|
||||
#include "BayesMetrics.h"
|
||||
#include "CPPFImdlp.h"
|
||||
#include "folding.hpp"
|
||||
#include "Models.h"
|
||||
#include "modelRegister.h"
|
||||
#include <fstream>
|
||||
#include "config.h"
|
||||
|
||||
const std::string PATH = { data_path.begin(), data_path.end() };
|
||||
|
||||
pair<std::vector<mdlp::labels_t>, map<std::string, int>> discretize(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y, std::vector<std::string> features)
|
||||
{
|
||||
std::vector<mdlp::labels_t>Xd;
|
||||
map<std::string, int> maxes;
|
||||
|
||||
auto fimdlp = mdlp::CPPFImdlp();
|
||||
for (int i = 0; i < X.size(); i++) {
|
||||
fimdlp.fit(X[i], y);
|
||||
mdlp::labels_t& xd = fimdlp.transform(X[i]);
|
||||
maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
|
||||
Xd.push_back(xd);
|
||||
}
|
||||
return { Xd, maxes };
|
||||
}
|
||||
|
||||
// Reports whether `name` can be opened for reading.
bool file_exists(const std::string& name)
{
    FILE* handle = fopen(name.c_str(), "r");
    if (handle == nullptr) {
        return false;
    }
    fclose(handle);
    return true;
}
|
||||
pair<std::vector<std::vector<int>>, std::vector<int>> extract_indices(std::vector<int> indices, std::vector<std::vector<int>> X, std::vector<int> y)
|
||||
{
|
||||
std::vector<std::vector<int>> Xr; // nxm
|
||||
std::vector<int> yr;
|
||||
for (int col = 0; col < X.size(); ++col) {
|
||||
Xr.push_back(std::vector<int>());
|
||||
}
|
||||
for (auto index : indices) {
|
||||
for (int col = 0; col < X.size(); ++col) {
|
||||
Xr[col].push_back(X[col][index]);
|
||||
}
|
||||
yr.push_back(y[index]);
|
||||
}
|
||||
return { Xr, yr };
|
||||
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
map<std::string, bool> datasets = {
|
||||
{"diabetes", true},
|
||||
{"ecoli", true},
|
||||
{"glass", true},
|
||||
{"iris", true},
|
||||
{"kdd_JapaneseVowels", false},
|
||||
{"letter", true},
|
||||
{"liver-disorders", true},
|
||||
{"mfeat-factors", true},
|
||||
};
|
||||
auto valid_datasets = std::vector<std::string>();
|
||||
transform(datasets.begin(), datasets.end(), back_inserter(valid_datasets),
|
||||
[](const pair<std::string, bool>& pair) { return pair.first; });
|
||||
argparse::ArgumentParser program("PlatformSample");
|
||||
program.add_argument("-d", "--dataset")
|
||||
.help("Dataset file name")
|
||||
.action([valid_datasets](const std::string& value) {
|
||||
if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) {
|
||||
return value;
|
||||
}
|
||||
throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}");
|
||||
}
|
||||
);
|
||||
program.add_argument("-p", "--path")
|
||||
.help(" folder where the data files are located, default")
|
||||
.default_value(std::string{ PATH }
|
||||
);
|
||||
program.add_argument("-m", "--model")
|
||||
.help("Model to use " + platform::Models::instance()->tostring())
|
||||
.action([](const std::string& value) {
|
||||
static const std::vector<std::string> choices = platform::Models::instance()->getNames();
|
||||
if (find(choices.begin(), choices.end(), value) != choices.end()) {
|
||||
return value;
|
||||
}
|
||||
throw runtime_error("Model must be one of " + platform::Models::instance()->tostring());
|
||||
}
|
||||
);
|
||||
program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true);
|
||||
program.add_argument("--dumpcpt").help("Dump CPT Tables").default_value(false).implicit_value(true);
|
||||
program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true);
|
||||
program.add_argument("--tensors").help("Use tensors to store samples").default_value(false).implicit_value(true);
|
||||
program.add_argument("-f", "--folds").help("Number of folds").default_value(5).scan<'i', int>().action([](const std::string& value) {
|
||||
try {
|
||||
auto k = stoi(value);
|
||||
if (k < 2) {
|
||||
throw runtime_error("Number of folds must be greater than 1");
|
||||
}
|
||||
return k;
|
||||
}
|
||||
catch (const runtime_error& err) {
|
||||
throw runtime_error(err.what());
|
||||
}
|
||||
catch (...) {
|
||||
throw runtime_error("Number of folds must be an integer");
|
||||
}});
|
||||
program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>();
|
||||
bool class_last, stratified, tensors, dump_cpt;
|
||||
std::string model_name, file_name, path, complete_file_name;
|
||||
int nFolds, seed;
|
||||
try {
|
||||
program.parse_args(argc, argv);
|
||||
file_name = program.get<std::string>("dataset");
|
||||
path = program.get<std::string>("path");
|
||||
model_name = program.get<std::string>("model");
|
||||
complete_file_name = path + file_name + ".arff";
|
||||
stratified = program.get<bool>("stratified");
|
||||
tensors = program.get<bool>("tensors");
|
||||
nFolds = program.get<int>("folds");
|
||||
seed = program.get<int>("seed");
|
||||
dump_cpt = program.get<bool>("dumpcpt");
|
||||
class_last = datasets[file_name];
|
||||
if (!file_exists(complete_file_name)) {
|
||||
throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist");
|
||||
}
|
||||
}
|
||||
catch (const exception& err) {
|
||||
cerr << err.what() << std::endl;
|
||||
cerr << program;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Begin Processing
|
||||
*/
|
||||
auto handler = ArffFiles();
|
||||
handler.load(complete_file_name, class_last);
|
||||
// Get Dataset X, y
|
||||
std::vector<mdlp::samples_t>& X = handler.getX();
|
||||
mdlp::labels_t& y = handler.getY();
|
||||
// Get className & Features
|
||||
auto className = handler.getClassName();
|
||||
std::vector<std::string> features;
|
||||
auto attributes = handler.getAttributes();
|
||||
transform(attributes.begin(), attributes.end(), back_inserter(features),
|
||||
[](const pair<std::string, std::string>& item) { return item.first; });
|
||||
// Discretize Dataset
|
||||
auto [Xd, maxes] = discretize(X, y, features);
|
||||
maxes[className] = *max_element(y.begin(), y.end()) + 1;
|
||||
map<std::string, std::vector<int>> states;
|
||||
for (auto feature : features) {
|
||||
states[feature] = std::vector<int>(maxes[feature]);
|
||||
}
|
||||
states[className] = std::vector<int>(maxes[className]);
|
||||
auto clf = platform::Models::instance()->create(model_name);
|
||||
clf->fit(Xd, y, features, className, states);
|
||||
if (dump_cpt) {
|
||||
std::cout << "--- CPT Tables ---" << std::endl;
|
||||
clf->dump_cpt();
|
||||
}
|
||||
auto lines = clf->show();
|
||||
for (auto line : lines) {
|
||||
std::cout << line << std::endl;
|
||||
}
|
||||
std::cout << "--- Topological Order ---" << std::endl;
|
||||
auto order = clf->topological_order();
|
||||
for (auto name : order) {
|
||||
std::cout << name << ", ";
|
||||
}
|
||||
std::cout << "end." << std::endl;
|
||||
auto score = clf->score(Xd, y);
|
||||
std::cout << "Score: " << score << std::endl;
|
||||
auto graph = clf->graph();
|
||||
auto dot_file = model_name + "_" + file_name;
|
||||
ofstream file(dot_file + ".dot");
|
||||
file << graph;
|
||||
file.close();
|
||||
std::cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << std::endl;
|
||||
std::cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << std::endl;
|
||||
std::string stratified_string = stratified ? " Stratified" : "";
|
||||
std::cout << nFolds << " Folds" << stratified_string << " Cross validation" << std::endl;
|
||||
std::cout << "==========================================" << std::endl;
|
||||
torch::Tensor Xt = torch::zeros({ static_cast<int>(Xd.size()), static_cast<int>(Xd[0].size()) }, torch::kInt32);
|
||||
torch::Tensor yt = torch::tensor(y, torch::kInt32);
|
||||
for (int i = 0; i < features.size(); ++i) {
|
||||
Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
|
||||
}
|
||||
float total_score = 0, total_score_train = 0, score_train, score_test;
|
||||
folding::Fold* fold;
|
||||
if (stratified)
|
||||
fold = new folding::StratifiedKFold(nFolds, y, seed);
|
||||
else
|
||||
fold = new folding::KFold(nFolds, y.size(), seed);
|
||||
for (auto i = 0; i < nFolds; ++i) {
|
||||
auto [train, test] = fold->getFold(i);
|
||||
std::cout << "Fold: " << i + 1 << std::endl;
|
||||
if (tensors) {
|
||||
auto ttrain = torch::tensor(train, torch::kInt64);
|
||||
auto ttest = torch::tensor(test, torch::kInt64);
|
||||
torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain);
|
||||
torch::Tensor ytraint = yt.index({ ttrain });
|
||||
torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest);
|
||||
torch::Tensor ytestt = yt.index({ ttest });
|
||||
clf->fit(Xtraint, ytraint, features, className, states);
|
||||
auto temp = clf->predict(Xtraint);
|
||||
score_train = clf->score(Xtraint, ytraint);
|
||||
score_test = clf->score(Xtestt, ytestt);
|
||||
} else {
|
||||
auto [Xtrain, ytrain] = extract_indices(train, Xd, y);
|
||||
auto [Xtest, ytest] = extract_indices(test, Xd, y);
|
||||
clf->fit(Xtrain, ytrain, features, className, states);
|
||||
score_train = clf->score(Xtrain, ytrain);
|
||||
score_test = clf->score(Xtest, ytest);
|
||||
}
|
||||
if (dump_cpt) {
|
||||
std::cout << "--- CPT Tables ---" << std::endl;
|
||||
clf->dump_cpt();
|
||||
}
|
||||
total_score_train += score_train;
|
||||
total_score += score_test;
|
||||
std::cout << "Score Train: " << score_train << std::endl;
|
||||
std::cout << "Score Test : " << score_test << std::endl;
|
||||
std::cout << "-------------------------------------------------------------------------------" << std::endl;
|
||||
}
|
||||
std::cout << "**********************************************************************************" << std::endl;
|
||||
std::cout << "Average Score Train: " << total_score_train / nFolds << std::endl;
|
||||
std::cout << "Average Score Test : " << total_score / nFolds << std::endl;return 0;
|
||||
}
|
279
sample/sample.cpp
Normal file
279
sample/sample.cpp
Normal file
@@ -0,0 +1,279 @@
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <fstream>
|
||||
#include <torch/torch.h>
|
||||
#include <argparse/argparse.hpp>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <ArffFiles.hpp>
|
||||
#include <fimdlp/CPPFImdlp.h>
|
||||
#include <folding.hpp>
|
||||
#include <bayesnet/utils/BayesMetrics.h>
|
||||
#include <bayesnet/classifiers/SPODE.h>
|
||||
#include "Models.h"
|
||||
#include "modelRegister.h"
|
||||
#include "config_platform.h"
|
||||
|
||||
// Default folder for the data files (used as the default of the --path option),
// built from the character range of platform_data_path.
// NOTE(review): platform_data_path is declared outside this file's visible code,
// presumably in config_platform.h — confirm.
const std::string PATH = { platform_data_path.begin(), platform_data_path.end() };
|
||||
|
||||
/**
 * Discretizes every feature column of X against the labels y.
 *
 * @param X        one vector of samples per feature
 * @param y        class labels
 * @param features feature names, indexed in the same order as the columns of X
 * @return the discretized columns and, for each feature name, its largest
 *         discretized value plus one
 */
pair<std::vector<mdlp::labels_t>, map<std::string, int>> discretize(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y, std::vector<std::string> features)
{
    map<std::string, int> cardinalities;
    std::vector<mdlp::labels_t> discretized;
    mdlp::CPPFImdlp discretizer = mdlp::CPPFImdlp();
    int column = 0;
    for (auto& samples : X) {
        // The same discretizer is re-fitted for each column
        discretizer.fit(samples, y);
        mdlp::labels_t& labels = discretizer.transform(samples);
        const auto highest = max_element(labels.begin(), labels.end());
        cardinalities[features[column]] = *highest + 1;
        // labels is a reference handed out by the discretizer, so store a copy
        discretized.push_back(labels);
        ++column;
    }
    return { discretized, cardinalities };
}
|
||||
|
||||
/**
 * Tells whether the file can be opened for reading.
 *
 * @param name path of the file to probe
 * @return true when fopen(name, "r") succeeds, false otherwise
 */
bool file_exists(const std::string& name)
{
    FILE* handle = fopen(name.c_str(), "r");
    if (handle == nullptr) {
        return false;
    }
    // Only the ability to open matters; release the handle immediately
    fclose(handle);
    return true;
}
|
||||
/**
 * Selects the samples listed in `indices` from a column-major dataset.
 *
 * The inputs are taken by const reference so that the whole dataset is not
 * copied on every call.
 *
 * @param indices positions of the samples to keep, in output order
 * @param X       dataset stored as one vector per feature
 * @param y       labels, one per sample
 * @return pair {Xr, yr}: Xr has one vector per feature of X, each holding the
 *         selected samples; yr holds the matching labels
 * @throws std::out_of_range if an index does not address a sample of X or y
 */
std::pair<std::vector<std::vector<int>>, std::vector<int>> extract_indices(const std::vector<int>& indices, const std::vector<std::vector<int>>& X, const std::vector<int>& y)
{
    std::vector<std::vector<int>> Xr(X.size());
    for (auto& column : Xr) {
        column.reserve(indices.size());
    }
    std::vector<int> yr;
    yr.reserve(indices.size());
    for (const int index : indices) {
        for (std::size_t col = 0; col < X.size(); ++col) {
            // at() turns a bad index into an exception instead of undefined behaviour
            Xr[col].push_back(X[col].at(index));
        }
        yr.push_back(y.at(index));
    }
    return { std::move(Xr), std::move(yr) };
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
map<std::string, bool> datasets = {
|
||||
{"diabetes", true},
|
||||
{"ecoli", true},
|
||||
{"glass", true},
|
||||
{"iris", true},
|
||||
{"kdd_JapaneseVowels", false},
|
||||
{"letter", true},
|
||||
{"liver-disorders", true},
|
||||
{"mfeat-factors", true},
|
||||
};
|
||||
auto valid_datasets = std::vector<std::string>();
|
||||
transform(datasets.begin(), datasets.end(), back_inserter(valid_datasets),
|
||||
[](const pair<std::string, bool>& pair) { return pair.first; });
|
||||
argparse::ArgumentParser program("PlatformSample");
|
||||
program.add_argument("-d", "--dataset")
|
||||
.help("Dataset file name")
|
||||
.action([valid_datasets](const std::string& value) {
|
||||
if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) {
|
||||
return value;
|
||||
}
|
||||
throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}");
|
||||
}
|
||||
);
|
||||
program.add_argument("-p", "--path")
|
||||
.help(" folder where the data files are located, default")
|
||||
.default_value(std::string{ PATH }
|
||||
);
|
||||
program.add_argument("-m", "--model")
|
||||
.help("Model to use " + platform::Models::instance()->toString())
|
||||
.action([](const std::string& value) {
|
||||
static const std::vector<std::string> choices = platform::Models::instance()->getNames();
|
||||
if (find(choices.begin(), choices.end(), value) != choices.end()) {
|
||||
return value;
|
||||
}
|
||||
throw runtime_error("Model must be one of " + platform::Models::instance()->toString());
|
||||
}
|
||||
);
|
||||
program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true);
|
||||
program.add_argument("--dumpcpt").help("Dump CPT Tables").default_value(false).implicit_value(true);
|
||||
program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true);
|
||||
program.add_argument("--tensors").help("Use tensors to store samples").default_value(false).implicit_value(true);
|
||||
program.add_argument("-f", "--folds").help("Number of folds").default_value(5).scan<'i', int>().action([](const std::string& value) {
|
||||
try {
|
||||
auto k = stoi(value);
|
||||
if (k < 2) {
|
||||
throw runtime_error("Number of folds must be greater than 1");
|
||||
}
|
||||
return k;
|
||||
}
|
||||
catch (const runtime_error& err) {
|
||||
throw runtime_error(err.what());
|
||||
}
|
||||
catch (...) {
|
||||
throw runtime_error("Number of folds must be an integer");
|
||||
}});
|
||||
program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>();
|
||||
bool class_last, stratified, tensors, dump_cpt;
|
||||
std::string model_name, file_name, path, complete_file_name;
|
||||
int nFolds, seed;
|
||||
try {
|
||||
program.parse_args(argc, argv);
|
||||
file_name = program.get<std::string>("dataset");
|
||||
path = program.get<std::string>("path");
|
||||
model_name = program.get<std::string>("model");
|
||||
complete_file_name = path + file_name + ".arff";
|
||||
stratified = program.get<bool>("stratified");
|
||||
tensors = program.get<bool>("tensors");
|
||||
nFolds = program.get<int>("folds");
|
||||
seed = program.get<int>("seed");
|
||||
dump_cpt = program.get<bool>("dumpcpt");
|
||||
class_last = datasets[file_name];
|
||||
if (!file_exists(complete_file_name)) {
|
||||
throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist");
|
||||
}
|
||||
}
|
||||
catch (const exception& err) {
|
||||
cerr << err.what() << std::endl;
|
||||
cerr << program;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Begin Processing
|
||||
*/
|
||||
auto handler = ArffFiles();
|
||||
handler.load(complete_file_name, class_last);
|
||||
// Get Dataset X, y
|
||||
std::vector<mdlp::samples_t>& X = handler.getX();
|
||||
mdlp::labels_t& y = handler.getY();
|
||||
// Get className & Features
|
||||
auto className = handler.getClassName();
|
||||
std::vector<std::string> features;
|
||||
auto attributes = handler.getAttributes();
|
||||
transform(attributes.begin(), attributes.end(), back_inserter(features),
|
||||
[](const pair<std::string, std::string>& item) { return item.first; });
|
||||
// Discretize Dataset
|
||||
auto [Xd, maxes] = discretize(X, y, features);
|
||||
maxes[className] = *max_element(y.begin(), y.end()) + 1;
|
||||
map<std::string, std::vector<int>> states;
|
||||
for (auto feature : features) {
|
||||
states[feature] = std::vector<int>(maxes[feature]);
|
||||
}
|
||||
states[className] = std::vector<int>(maxes[className]);
|
||||
// Output the states
|
||||
std::cout << std::string(80, '-') << std::endl;
|
||||
std::cout << "States" << std::endl;
|
||||
for (auto feature : features) {
|
||||
std::cout << feature << ": " << states[feature].size() << std::endl;
|
||||
}
|
||||
std::cout << std::string(80, '-') << std::endl;
|
||||
//auto clf = platform::Models::instance()->create("SPODE");
|
||||
auto clf = bayesnet::SPODE(2);
|
||||
|
||||
bayesnet::Smoothing_t smoothing = bayesnet::Smoothing_t::ORIGINAL;
|
||||
clf.fit(Xd, y, features, className, states, smoothing);
|
||||
if (dump_cpt) {
|
||||
std::cout << "--- CPT Tables ---" << std::endl;
|
||||
std::cout << clf.dump_cpt();
|
||||
}
|
||||
std::cout << "--- Datos predicción ---" << std::endl;
|
||||
std::cout << "Orden de variables: " << std::endl;
|
||||
for (auto feature : features) {
|
||||
std::cout << feature << ", ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
std::cout << "X[0]: ";
|
||||
for (int i = 0; i < Xd.size(); ++i) {
|
||||
std::cout << Xd[i][0] << ", ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
std::cout << std::string(80, '-') << std::endl;
|
||||
|
||||
auto lines = clf.show();
|
||||
for (auto line : lines) {
|
||||
std::cout << line << std::endl;
|
||||
}
|
||||
std::cout << "--- Topological Order ---" << std::endl;
|
||||
auto order = clf.topological_order();
|
||||
for (auto name : order) {
|
||||
std::cout << name << ", ";
|
||||
}
|
||||
auto predict_proba = clf.predict_proba(Xd);
|
||||
std::cout << "Instances predict_proba: ";
|
||||
for (int i = 0; i < predict_proba.size(); i++) {
|
||||
std::cout << "Instance " << i << ": ";
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
std::cout << Xd[j][i] << ", ";
|
||||
}
|
||||
std::cout << ": ";
|
||||
for (auto score : predict_proba[i]) {
|
||||
std::cout << score << ", ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
// std::cout << std::endl;
|
||||
// std::cout << "end." << std::endl;
|
||||
// auto score = clf->score(Xd, y);
|
||||
// std::cout << "Score: " << score << std::endl;
|
||||
// auto graph = clf->graph();
|
||||
// auto dot_file = model_name + "_" + file_name;
|
||||
// ofstream file(dot_file + ".dot");
|
||||
// file << graph;
|
||||
// file.close();
|
||||
// std::cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << std::endl;
|
||||
// std::cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << std::endl;
|
||||
// std::string stratified_string = stratified ? " Stratified" : "";
|
||||
// std::cout << nFolds << " Folds" << stratified_string << " Cross validation" << std::endl;
|
||||
// std::cout << "==========================================" << std::endl;
|
||||
// torch::Tensor Xt = torch::zeros({ static_cast<int>(Xd.size()), static_cast<int>(Xd[0].size()) }, torch::kInt32);
|
||||
// torch::Tensor yt = torch::tensor(y, torch::kInt32);
|
||||
// for (int i = 0; i < features.size(); ++i) {
|
||||
// Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
|
||||
// }
|
||||
// float total_score = 0, total_score_train = 0, score_train, score_test;
|
||||
// folding::Fold* fold;
|
||||
// double nodes = 0.0;
|
||||
// if (stratified)
|
||||
// fold = new folding::StratifiedKFold(nFolds, y, seed);
|
||||
// else
|
||||
// fold = new folding::KFold(nFolds, y.size(), seed);
|
||||
// for (auto i = 0; i < nFolds; ++i) {
|
||||
// auto [train, test] = fold->getFold(i);
|
||||
// std::cout << "Fold: " << i + 1 << std::endl;
|
||||
// if (tensors) {
|
||||
// auto ttrain = torch::tensor(train, torch::kInt64);
|
||||
// auto ttest = torch::tensor(test, torch::kInt64);
|
||||
// torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain);
|
||||
// torch::Tensor ytraint = yt.index({ ttrain });
|
||||
// torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest);
|
||||
// torch::Tensor ytestt = yt.index({ ttest });
|
||||
// clf->fit(Xtraint, ytraint, features, className, states, smoothing);
|
||||
// auto temp = clf->predict(Xtraint);
|
||||
// score_train = clf->score(Xtraint, ytraint);
|
||||
// score_test = clf->score(Xtestt, ytestt);
|
||||
// } else {
|
||||
// auto [Xtrain, ytrain] = extract_indices(train, Xd, y);
|
||||
// auto [Xtest, ytest] = extract_indices(test, Xd, y);
|
||||
// clf->fit(Xtrain, ytrain, features, className, states, smoothing);
|
||||
// std::cout << "Nodes: " << clf->getNumberOfNodes() << std::endl;
|
||||
// nodes += clf->getNumberOfNodes();
|
||||
// score_train = clf->score(Xtrain, ytrain);
|
||||
// score_test = clf->score(Xtest, ytest);
|
||||
// }
|
||||
// // if (dump_cpt) {
|
||||
// // std::cout << "--- CPT Tables ---" << std::endl;
|
||||
// // std::cout << clf->dump_cpt();
|
||||
// // }
|
||||
// total_score_train += score_train;
|
||||
// total_score += score_test;
|
||||
// std::cout << "Score Train: " << score_train << std::endl;
|
||||
// std::cout << "Score Test : " << score_test << std::endl;
|
||||
// std::cout << "-------------------------------------------------------------------------------" << std::endl;
|
||||
// }
|
||||
|
||||
// std::cout << "Nodes: " << nodes / nFolds << std::endl;
|
||||
// std::cout << "**********************************************************************************" << std::endl;
|
||||
// std::cout << "Average Score Train: " << total_score_train / nFolds << std::endl;
|
||||
// std::cout << "Average Score Test : " << total_score / nFolds << std::endl;return 0;
|
||||
}
|
80
src/CMakeLists.txt
Normal file
80
src/CMakeLists.txt
Normal file
@@ -0,0 +1,80 @@
|
||||
include_directories(
    ${Python3_INCLUDE_DIRS}
    ${MPI_CXX_INCLUDE_DIRS}
    ${CMAKE_BINARY_DIR}/configured_files/include
    ${Platform_SOURCE_DIR}/src
)

# Experimental classifier sources shared by b_best, b_grid, b_list and b_main.
# Kept in one place so every target lists each file exactly once.
set(experimental_sources
    experimental_clfs/XA1DE.cpp
    experimental_clfs/ExpClf.cpp
    experimental_clfs/DecisionTree.cpp
    experimental_clfs/AdaBoost.cpp
)

# b_best
add_executable(
    b_best commands/b_best.cpp best/Statistics.cpp
    best/BestResultsExcel.cpp best/BestResultsTex.cpp best/BestResultsMd.cpp best/BestResults.cpp
    common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
    main/Models.cpp main/Scores.cpp
    reports/ReportExcel.cpp reports/ReportBase.cpp reports/ExcelFile.cpp
    results/Result.cpp
    ${experimental_sources}
)
target_link_libraries(b_best Boost::boost bayesnet::bayesnet argparse::argparse fimdlp::fimdlp ${Python3_LIBRARIES} torch::torch Boost::python Boost::numpy libxlsxwriter::libxlsxwriter)

# b_grid
set(grid_sources GridSearch.cpp GridData.cpp GridExperiment.cpp GridBase.cpp )
list(TRANSFORM grid_sources PREPEND grid/)
add_executable(b_grid commands/b_grid.cpp ${grid_sources}
    common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
    main/HyperParameters.cpp main/Models.cpp main/Experiment.cpp main/Scores.cpp main/ArgumentsExperiment.cpp
    reports/ReportConsole.cpp reports/ReportBase.cpp
    results/Result.cpp
    ${experimental_sources}
)
target_link_libraries(b_grid ${MPI_CXX_LIBRARIES} bayesnet::bayesnet argparse::argparse fimdlp::fimdlp ${Python3_LIBRARIES} torch::torch Boost::python Boost::numpy)

# b_list
add_executable(b_list commands/b_list.cpp
    common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
    main/Models.cpp main/Scores.cpp
    reports/ReportExcel.cpp reports/ExcelFile.cpp reports/ReportBase.cpp reports/DatasetsExcel.cpp reports/DatasetsConsole.cpp reports/ReportsPaged.cpp
    results/Result.cpp results/ResultsDatasetExcel.cpp results/ResultsDataset.cpp results/ResultsDatasetConsole.cpp
    ${experimental_sources}
)
target_link_libraries(b_list bayesnet::bayesnet argparse::argparse fimdlp::fimdlp ${Python3_LIBRARIES} torch::torch Boost::python Boost::numpy libxlsxwriter::libxlsxwriter)

# b_main
set(main_sources Experiment.cpp Models.cpp HyperParameters.cpp Scores.cpp ArgumentsExperiment.cpp)
list(TRANSFORM main_sources PREPEND main/)
add_executable(b_main commands/b_main.cpp ${main_sources}
    common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
    reports/ReportConsole.cpp reports/ReportBase.cpp
    results/Result.cpp
    ${experimental_sources}
)
target_link_libraries(b_main PRIVATE nlohmann_json::nlohmann_json bayesnet::bayesnet argparse::argparse fimdlp::fimdlp ${Python3_LIBRARIES} torch::torch Boost::python Boost::numpy)

# b_manage
set(manage_sources ManageScreen.cpp OptionsMenu.cpp ResultsManager.cpp)
list(TRANSFORM manage_sources PREPEND manage/)
add_executable(
    b_manage commands/b_manage.cpp ${manage_sources}
    common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
    reports/ReportConsole.cpp reports/ReportExcel.cpp reports/ReportExcelCompared.cpp reports/ReportBase.cpp reports/ExcelFile.cpp reports/DatasetsConsole.cpp reports/ReportsPaged.cpp
    results/Result.cpp results/ResultsDataset.cpp results/ResultsDatasetConsole.cpp
    main/Scores.cpp
)
target_link_libraries(b_manage torch::torch libxlsxwriter::libxlsxwriter fimdlp::fimdlp bayesnet::bayesnet argparse::argparse)

# b_results
add_executable(b_results commands/b_results.cpp)
target_link_libraries(b_results torch::torch libxlsxwriter::libxlsxwriter fimdlp::fimdlp bayesnet::bayesnet argparse::argparse)
|
@@ -1,28 +0,0 @@
|
||||
# Header search paths for every target declared below
include_directories(
    ${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/src/BayesNet
    ${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/folding
    ${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/mdlp
    ${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/json/include
    ${Platform_SOURCE_DIR}/lib/PyClassifiers/src/PyClassifiers
    ${Platform_SOURCE_DIR}/src/Platform
    ${Platform_SOURCE_DIR}/lib/Files
    ${Platform_SOURCE_DIR}/lib/mdlp
    ${Platform_SOURCE_DIR}/lib/argparse/include
    ${Platform_SOURCE_DIR}/lib/json/include
    ${Platform_SOURCE_DIR}/lib/libxlsxwriter/include
    ${Python3_INCLUDE_DIRS}
    ${MPI_CXX_INCLUDE_DIRS}
    ${CMAKE_BINARY_DIR}/configured_files/include
)

# b_best
add_executable(b_best
    b_best.cc BestResults.cc Result.cc Statistics.cc BestResultsExcel.cc
    ReportExcel.cc ReportBase.cc Datasets.cc Dataset.cc ExcelFile.cc
)
target_link_libraries(b_best Boost::boost "${XLSXWRITER_LIB}" "${TORCH_LIBRARIES}" ArffFiles mdlp)

# b_grid
add_executable(b_grid
    b_grid.cc GridSearch.cc GridData.cc HyperParameters.cc Datasets.cc Dataset.cc Models.cc
)
target_link_libraries(b_grid PyClassifiers ${MPI_CXX_LIBRARIES} ArffFiles)

# b_list
add_executable(b_list
    b_list.cc Datasets.cc Dataset.cc
)
target_link_libraries(b_list ArffFiles mdlp "${TORCH_LIBRARIES}")

# b_main
add_executable(b_main
    b_main.cc Experiment.cc Datasets.cc Dataset.cc Models.cc HyperParameters.cc
    ReportConsole.cc ReportBase.cc
)
target_link_libraries(b_main PyClassifiers BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}")

# b_manage
add_executable(b_manage
    b_manage.cc Results.cc ManageResults.cc CommandParser.cc Result.cc
    ReportConsole.cc ReportExcel.cc ReportBase.cc Datasets.cc Dataset.cc ExcelFile.cc
)
target_link_libraries(b_manage "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" ArffFiles mdlp)
|
@@ -1,15 +0,0 @@
|
||||
#ifndef COLORS_H
#define COLORS_H
// std::string is used by every member, so the header includes it itself
#include <string>

/// Terminal escape sequences used to colour console output.
/// Every function returns the sequence as a new std::string.
class Colors {
public:
    static std::string MAGENTA() { return "\033[1;35m"; }
    static std::string BLUE() { return "\033[1;34m"; }
    static std::string CYAN() { return "\033[1;36m"; }
    static std::string GREEN() { return "\033[1;32m"; }
    static std::string YELLOW() { return "\033[1;33m"; }
    static std::string RED() { return "\033[1;31m"; }
    static std::string WHITE() { return "\033[1;37m"; }
    static std::string IBLUE() { return "\033[0;94m"; }
    /// Sequence that restores the terminal's default attributes
    static std::string RESET() { return "\033[0m"; }
};
#endif // COLORS_H
|
@@ -1,87 +0,0 @@
|
||||
#include "CommandParser.h"
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <algorithm>
|
||||
#include "Colors.h"
|
||||
#include "Utils.h"
|
||||
|
||||
namespace platform {
|
||||
void CommandParser::messageError(const std::string& message)
|
||||
{
|
||||
std::cout << Colors::RED() << message << Colors::RESET() << std::endl;
|
||||
}
|
||||
std::pair<char, int> CommandParser::parse(const std::string& color, const std::vector<std::tuple<std::string, char, bool>>& options, const char defaultCommand, const int maxIndex)
|
||||
{
|
||||
bool finished = false;
|
||||
while (!finished) {
|
||||
std::stringstream oss;
|
||||
std::string line;
|
||||
oss << color << "Choose option (";
|
||||
bool first = true;
|
||||
for (auto& option : options) {
|
||||
if (first) {
|
||||
first = false;
|
||||
} else {
|
||||
oss << ", ";
|
||||
}
|
||||
oss << std::get<char>(option) << "=" << std::get<std::string>(option);
|
||||
}
|
||||
oss << "): ";
|
||||
std::cout << oss.str();
|
||||
getline(std::cin, line);
|
||||
std::cout << Colors::RESET();
|
||||
line = trim(line);
|
||||
if (line.size() == 0)
|
||||
continue;
|
||||
if (all_of(line.begin(), line.end(), ::isdigit)) {
|
||||
command = defaultCommand;
|
||||
index = stoi(line);
|
||||
if (index > maxIndex || index < 0) {
|
||||
messageError("Index out of range");
|
||||
continue;
|
||||
}
|
||||
finished = true;
|
||||
break;
|
||||
}
|
||||
bool found = false;
|
||||
for (auto& option : options) {
|
||||
if (line[0] == std::get<char>(option)) {
|
||||
found = true;
|
||||
// it's a match
|
||||
line.erase(line.begin());
|
||||
line = trim(line);
|
||||
if (std::get<bool>(option)) {
|
||||
// The option requires a value
|
||||
if (line.size() == 0) {
|
||||
messageError("Option " + std::get<std::string>(option) + " requires a value");
|
||||
break;
|
||||
}
|
||||
try {
|
||||
index = stoi(line);
|
||||
if (index > maxIndex || index < 0) {
|
||||
messageError("Index out of range");
|
||||
break;
|
||||
}
|
||||
}
|
||||
catch (const std::invalid_argument& ia) {
|
||||
messageError("Invalid value: " + line);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
if (line.size() > 0) {
|
||||
messageError("option " + std::get<std::string>(option) + " doesn't accept values");
|
||||
break;
|
||||
}
|
||||
}
|
||||
command = std::get<char>(option);
|
||||
finished = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
messageError("I don't know " + line);
|
||||
}
|
||||
}
|
||||
return { command, index };
|
||||
}
|
||||
} /* namespace platform */
|
@@ -1,20 +0,0 @@
|
||||
#ifndef COMMAND_PARSER_H
|
||||
#define COMMAND_PARSER_H
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <tuple>
|
||||
|
||||
namespace platform {
|
||||
/// Interactive parser that reads a one-letter command, optionally followed by an index.
class CommandParser {
public:
    CommandParser() = default;
    /// Prompts until a valid command is typed and returns {command, index}.
    std::pair<char, int> parse(const std::string& color, const std::vector<std::tuple<std::string, char, bool>>& options, const char defaultCommand, const int maxIndex);
    /// Last command stored by parse(); '\0' before the first successful parse.
    char getCommand() const { return command; }
    /// Last index stored by parse(); 0 before the first successful parse.
    int getIndex() const { return index; }
private:
    void messageError(const std::string& message);
    // Initialized so the getters never read indeterminate values
    char command{ '\0' };
    int index{ 0 };
};
|
||||
} /* namespace platform */
|
||||
#endif /* COMMAND_PARSER_H */
|
@@ -1,215 +0,0 @@
|
||||
#include "Dataset.h"
|
||||
#include "ArffFiles.h"
|
||||
#include <fstream>
|
||||
namespace platform {
|
||||
// Copy constructor: copies each listed member from the source dataset.
Dataset::Dataset(const Dataset& dataset)
    : path(dataset.path),
    name(dataset.name),
    className(dataset.className),
    n_samples(dataset.n_samples),
    n_features(dataset.n_features),
    features(dataset.features),
    states(dataset.states),
    loaded(dataset.loaded),
    discretize(dataset.discretize),
    X(dataset.X),
    y(dataset.y),
    Xv(dataset.Xv),
    Xd(dataset.Xd),
    yv(dataset.yv),
    fileType(dataset.fileType)
{
}
|
||||
std::string Dataset::getName() const
|
||||
{
|
||||
return name;
|
||||
}
|
||||
std::string Dataset::getClassName() const
|
||||
{
|
||||
return className;
|
||||
}
|
||||
std::vector<std::string> Dataset::getFeatures() const
|
||||
{
|
||||
if (loaded) {
|
||||
return features;
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
int Dataset::getNFeatures() const
|
||||
{
|
||||
if (loaded) {
|
||||
return n_features;
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
int Dataset::getNSamples() const
|
||||
{
|
||||
if (loaded) {
|
||||
return n_samples;
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
std::map<std::string, std::vector<int>> Dataset::getStates() const
|
||||
{
|
||||
if (loaded) {
|
||||
return states;
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
// Returns references to the float samples (Xv) and the labels (yv).
// Throws std::invalid_argument when the dataset has not been loaded yet.
pair<std::vector<std::vector<float>>&, std::vector<int>&> Dataset::getVectors()
{
    if (!loaded) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return { Xv, yv };
}
|
||||
// Returns references to the discretized samples (Xd) and the labels (yv).
// Throws std::invalid_argument when the dataset has not been loaded yet.
pair<std::vector<std::vector<int>>&, std::vector<int>&> Dataset::getVectorsDiscretized()
{
    if (!loaded) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return { Xd, yv };
}
|
||||
// Rebuilds the tensors from the stored vectors and returns references to them.
// Throws std::invalid_argument when the dataset has not been loaded yet.
pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
{
    if (!loaded) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    buildTensors();
    return { X, y };
}
|
||||
void Dataset::load_csv()
|
||||
{
|
||||
ifstream file(path + "/" + name + ".csv");
|
||||
if (file.is_open()) {
|
||||
std::string line;
|
||||
getline(file, line);
|
||||
std::vector<std::string> tokens = split(line, ',');
|
||||
features = std::vector<std::string>(tokens.begin(), tokens.end() - 1);
|
||||
if (className == "-1") {
|
||||
className = tokens.back();
|
||||
}
|
||||
for (auto i = 0; i < features.size(); ++i) {
|
||||
Xv.push_back(std::vector<float>());
|
||||
}
|
||||
while (getline(file, line)) {
|
||||
tokens = split(line, ',');
|
||||
for (auto i = 0; i < features.size(); ++i) {
|
||||
Xv[i].push_back(stof(tokens[i]));
|
||||
}
|
||||
yv.push_back(stoi(tokens.back()));
|
||||
}
|
||||
file.close();
|
||||
} else {
|
||||
throw std::invalid_argument("Unable to open dataset file.");
|
||||
}
|
||||
}
|
||||
void Dataset::computeStates()
|
||||
{
|
||||
for (int i = 0; i < features.size(); ++i) {
|
||||
states[features[i]] = std::vector<int>(*max_element(Xd[i].begin(), Xd[i].end()) + 1);
|
||||
auto item = states.at(features[i]);
|
||||
iota(begin(item), end(item), 0);
|
||||
}
|
||||
states[className] = std::vector<int>(*max_element(yv.begin(), yv.end()) + 1);
|
||||
iota(begin(states.at(className)), end(states.at(className)), 0);
|
||||
}
|
||||
void Dataset::load_arff()
|
||||
{
|
||||
auto arff = ArffFiles();
|
||||
arff.load(path + "/" + name + ".arff", className);
|
||||
// Get Dataset X, y
|
||||
Xv = arff.getX();
|
||||
yv = arff.getY();
|
||||
// Get className & Features
|
||||
className = arff.getClassName();
|
||||
auto attributes = arff.getAttributes();
|
||||
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& attribute) { return attribute.first; });
|
||||
}
|
||||
/**
 * Splits a line into the tokens separated by runs of spaces, tabs or newlines.
 *
 * Leading and trailing separators are ignored, so no empty token is produced.
 * The line is scanned once instead of being erased from the front.
 *
 * @param line text to split
 * @return the tokens in order of appearance; empty when the line has none
 */
std::vector<std::string> tokenize(std::string line)
{
    static const char* const separators = " \t\n";
    std::vector<std::string> tokens;
    std::size_t start = line.find_first_not_of(separators);
    while (start != std::string::npos) {
        const std::size_t stop = line.find_first_of(separators, start);
        // When stop is npos, substr clamps the count to the end of the line
        tokens.push_back(line.substr(start, stop - start));
        start = line.find_first_not_of(separators, stop);
    }
    return tokens;
}
|
||||
void Dataset::load_rdata()
|
||||
{
|
||||
ifstream file(path + "/" + name + "_R.dat");
|
||||
if (file.is_open()) {
|
||||
std::string line;
|
||||
getline(file, line);
|
||||
line = ArffFiles::trim(line);
|
||||
std::vector<std::string> tokens = tokenize(line);
|
||||
transform(tokens.begin(), tokens.end() - 1, back_inserter(features), [](const auto& attribute) { return ArffFiles::trim(attribute); });
|
||||
if (className == "-1") {
|
||||
className = ArffFiles::trim(tokens.back());
|
||||
}
|
||||
for (auto i = 0; i < features.size(); ++i) {
|
||||
Xv.push_back(std::vector<float>());
|
||||
}
|
||||
while (getline(file, line)) {
|
||||
tokens = tokenize(line);
|
||||
// We have to skip the first token, which is the instance number.
|
||||
for (auto i = 1; i < features.size() + 1; ++i) {
|
||||
const float value = stof(tokens[i]);
|
||||
Xv[i - 1].push_back(value);
|
||||
}
|
||||
yv.push_back(stoi(tokens.back()));
|
||||
}
|
||||
file.close();
|
||||
} else {
|
||||
throw std::invalid_argument("Unable to open dataset file.");
|
||||
}
|
||||
}
|
||||
void Dataset::load()
|
||||
{
|
||||
if (loaded) {
|
||||
return;
|
||||
}
|
||||
if (fileType == CSV) {
|
||||
load_csv();
|
||||
} else if (fileType == ARFF) {
|
||||
load_arff();
|
||||
} else if (fileType == RDATA) {
|
||||
load_rdata();
|
||||
}
|
||||
if (discretize) {
|
||||
Xd = discretizeDataset(Xv, yv);
|
||||
computeStates();
|
||||
}
|
||||
n_samples = Xv[0].size();
|
||||
n_features = Xv.size();
|
||||
loaded = true;
|
||||
}
|
||||
void Dataset::buildTensors()
|
||||
{
|
||||
if (discretize) {
|
||||
X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kInt32);
|
||||
} else {
|
||||
X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kFloat32);
|
||||
}
|
||||
for (int i = 0; i < features.size(); ++i) {
|
||||
if (discretize) {
|
||||
X.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
|
||||
} else {
|
||||
X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
|
||||
}
|
||||
}
|
||||
y = torch::tensor(yv, torch::kInt32);
|
||||
}
|
||||
// Discretizes every feature vector of X against the labels y.
// The same CPPFImdlp instance is fitted on each vector in turn and a copy of its transformed
// output is collected, so the result has one entry per entry of X, in the same order.
std::vector<mdlp::labels_t> Dataset::discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y)
{
    auto discretizer = mdlp::CPPFImdlp();
    std::vector<mdlp::labels_t> discretized;
    for (auto& column : X) {
        discretizer.fit(column, y);
        // transform() hands back a reference; push_back stores a copy of it.
        discretized.push_back(discretizer.transform(column));
    }
    return discretized;
}
|
||||
}
|
@@ -1,129 +0,0 @@
|
||||
#include "Datasets.h"
|
||||
#include <fstream>
|
||||
namespace platform {
|
||||
void Datasets::load()
|
||||
{
|
||||
auto sd = SourceData(sfileType);
|
||||
fileType = sd.getFileType();
|
||||
path = sd.getPath();
|
||||
ifstream catalog(path + "all.txt");
|
||||
if (catalog.is_open()) {
|
||||
std::string line;
|
||||
while (getline(catalog, line)) {
|
||||
if (line.empty() || line[0] == '#') {
|
||||
continue;
|
||||
}
|
||||
std::vector<std::string> tokens = split(line, ',');
|
||||
std::string name = tokens[0];
|
||||
std::string className;
|
||||
if (tokens.size() == 1) {
|
||||
className = "-1";
|
||||
} else {
|
||||
className = tokens[1];
|
||||
}
|
||||
datasets[name] = make_unique<Dataset>(path, name, className, discretize, fileType);
|
||||
}
|
||||
catalog.close();
|
||||
} else {
|
||||
throw std::invalid_argument("Unable to open catalog file. [" + path + "all.txt" + "]");
|
||||
}
|
||||
}
|
||||
std::vector<std::string> Datasets::getNames()
|
||||
{
|
||||
std::vector<std::string> result;
|
||||
transform(datasets.begin(), datasets.end(), back_inserter(result), [](const auto& d) { return d.first; });
|
||||
return result;
|
||||
}
|
||||
std::vector<std::string> Datasets::getFeatures(const std::string& name) const
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
return datasets.at(name)->getFeatures();
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
// Returns the states map of an already loaded dataset.
// Throws std::invalid_argument when the dataset has not been loaded; map::at throws when the
// name is unknown.
map<std::string, std::vector<int>> Datasets::getStates(const std::string& name) const
{
    const auto& dataset = datasets.at(name);
    if (!dataset->isLoaded()) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return dataset->getStates();
}
|
||||
void Datasets::loadDataset(const std::string& name) const
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
return;
|
||||
} else {
|
||||
datasets.at(name)->load();
|
||||
}
|
||||
}
|
||||
std::string Datasets::getClassName(const std::string& name) const
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
return datasets.at(name)->getClassName();
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
int Datasets::getNSamples(const std::string& name) const
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
return datasets.at(name)->getNSamples();
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
int Datasets::getNClasses(const std::string& name)
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
auto className = datasets.at(name)->getClassName();
|
||||
if (discretize) {
|
||||
auto states = getStates(name);
|
||||
return states.at(className).size();
|
||||
}
|
||||
auto [Xv, yv] = getVectors(name);
|
||||
return *std::max_element(yv.begin(), yv.end()) + 1;
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
std::vector<int> Datasets::getClassesCounts(const std::string& name) const
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
auto [Xv, yv] = datasets.at(name)->getVectors();
|
||||
std::vector<int> counts(*std::max_element(yv.begin(), yv.end()) + 1);
|
||||
for (auto y : yv) {
|
||||
counts[y]++;
|
||||
}
|
||||
return counts;
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
// Returns references to the raw feature vectors and labels of a dataset, loading it on demand.
// The lookup uses map::at so an unknown name throws std::out_of_range instead of inserting a
// null entry into the catalog and dereferencing it.
pair<std::vector<std::vector<float>>&, std::vector<int>&> Datasets::getVectors(const std::string& name)
{
    auto& dataset = datasets.at(name);
    if (!dataset->isLoaded()) {
        dataset->load();
    }
    return dataset->getVectors();
}
|
||||
// Returns references to the discretized feature vectors and labels of a dataset, loading it on
// demand. The lookup uses map::at so an unknown name throws std::out_of_range instead of
// inserting a null entry into the catalog and dereferencing it.
pair<std::vector<std::vector<int>>&, std::vector<int>&> Datasets::getVectorsDiscretized(const std::string& name)
{
    auto& dataset = datasets.at(name);
    if (!dataset->isLoaded()) {
        dataset->load();
    }
    return dataset->getVectorsDiscretized();
}
|
||||
// Returns references to the X and y tensors of a dataset, loading it on demand.
// The lookup uses map::at so an unknown name throws std::out_of_range instead of inserting a
// null entry into the catalog and dereferencing it.
pair<torch::Tensor&, torch::Tensor&> Datasets::getTensors(const std::string& name)
{
    auto& dataset = datasets.at(name);
    if (!dataset->isLoaded()) {
        dataset->load();
    }
    return dataset->getTensors();
}
|
||||
bool Datasets::isDataset(const std::string& name) const
|
||||
{
|
||||
return datasets.find(name) != datasets.end();
|
||||
}
|
||||
}
|
@@ -1,30 +0,0 @@
|
||||
#ifndef DATASETS_H
|
||||
#define DATASETS_H
|
||||
#include "Dataset.h"
|
||||
namespace platform {
|
||||
class Datasets {
|
||||
private:
|
||||
std::string path;
|
||||
fileType_t fileType;
|
||||
std::string sfileType;
|
||||
std::map<std::string, std::unique_ptr<Dataset>> datasets;
|
||||
bool discretize;
|
||||
void load(); // Loads the list of datasets
|
||||
public:
|
||||
explicit Datasets(bool discretize, std::string sfileType) : discretize(discretize), sfileType(sfileType) { load(); };
|
||||
std::vector<string> getNames();
|
||||
std::vector<string> getFeatures(const std::string& name) const;
|
||||
int getNSamples(const std::string& name) const;
|
||||
std::string getClassName(const std::string& name) const;
|
||||
int getNClasses(const std::string& name);
|
||||
std::vector<int> getClassesCounts(const std::string& name) const;
|
||||
std::map<std::string, std::vector<int>> getStates(const std::string& name) const;
|
||||
std::pair<std::vector<std::vector<float>>&, std::vector<int>&> getVectors(const std::string& name);
|
||||
std::pair<std::vector<std::vector<int>>&, std::vector<int>&> getVectorsDiscretized(const std::string& name);
|
||||
std::pair<torch::Tensor&, torch::Tensor&> getTensors(const std::string& name);
|
||||
bool isDataset(const std::string& name) const;
|
||||
void loadDataset(const std::string& name) const;
|
||||
};
|
||||
};
|
||||
|
||||
#endif
|
@@ -1,55 +0,0 @@
|
||||
#ifndef DOTENV_H
|
||||
#define DOTENV_H
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include "Utils.h"
|
||||
|
||||
//#include "Dataset.h"
|
||||
namespace platform {
|
||||
class DotEnv {
|
||||
private:
|
||||
std::map<std::string, std::string> env;
|
||||
public:
|
||||
DotEnv()
|
||||
{
|
||||
std::ifstream file(".env");
|
||||
if (!file.is_open()) {
|
||||
std::cerr << "File .env not found" << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
std::string line;
|
||||
while (std::getline(file, line)) {
|
||||
line = trim(line);
|
||||
if (line.empty() || line[0] == '#') {
|
||||
continue;
|
||||
}
|
||||
std::istringstream iss(line);
|
||||
std::string key, value;
|
||||
if (std::getline(iss, key, '=') && std::getline(iss, value)) {
|
||||
env[key] = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
std::string get(const std::string& key)
|
||||
{
|
||||
return env.at(key);
|
||||
}
|
||||
std::vector<int> getSeeds()
|
||||
{
|
||||
auto seeds = std::vector<int>();
|
||||
auto seeds_str = env["seeds"];
|
||||
seeds_str = trim(seeds_str);
|
||||
seeds_str = seeds_str.substr(1, seeds_str.size() - 2);
|
||||
auto seeds_str_split = split(seeds_str, ',');
|
||||
transform(seeds_str_split.begin(), seeds_str_split.end(), back_inserter(seeds), [](const std::string& str) {
|
||||
return stoi(str);
|
||||
});
|
||||
return seeds;
|
||||
}
|
||||
};
|
||||
}
|
||||
#endif
|
@@ -1,226 +0,0 @@
|
||||
#include <fstream>
|
||||
#include "Experiment.h"
|
||||
#include "Datasets.h"
|
||||
#include "Models.h"
|
||||
#include "ReportConsole.h"
|
||||
#include "Paths.h"
|
||||
namespace platform {
|
||||
using json = nlohmann::json;
|
||||
// Returns the current local date formatted as "%Y-%m-%d".
std::string get_date()
{
    const time_t now = time(nullptr);
    std::ostringstream formatted;
    formatted << std::put_time(std::localtime(&now), "%Y-%m-%d");
    return formatted.str();
}
|
||||
// Returns the current local time formatted as "%H:%M:%S".
std::string get_time()
{
    const time_t now = time(nullptr);
    std::ostringstream formatted;
    formatted << std::put_time(std::localtime(&now), "%H:%M:%S");
    return formatted.str();
}
|
||||
std::string Experiment::get_file_name()
|
||||
{
|
||||
std::string result = "results_" + score_name + "_" + model + "_" + platform + "_" + get_date() + "_" + get_time() + "_" + (stratified ? "1" : "0") + ".json";
|
||||
return result;
|
||||
}
|
||||
|
||||
// Serializes the experiment: its settings at the top level plus one object per stored Result
// under "results". "score"/"score_std" repeat the test score values, and "time"/"time_std" are
// the sums of the test and train figures.
json Experiment::build_json()
{
    json output;
    // Experiment-wide settings
    output["title"] = title;
    output["date"] = get_date();
    output["time"] = get_time();
    output["model"] = model;
    output["version"] = model_version;
    output["platform"] = platform;
    output["score_name"] = score_name;
    output["language"] = language;
    output["language_version"] = language_version;
    output["discretized"] = discretized;
    output["stratified"] = stratified;
    output["folds"] = nfolds;
    output["seeds"] = randomSeeds;
    output["duration"] = duration;
    output["results"] = json::array();
    for (const auto& item : results) {
        json record;
        // Dataset description
        record["dataset"] = item.getDataset();
        record["hyperparameters"] = item.getHyperparameters();
        record["samples"] = item.getSamples();
        record["features"] = item.getFeatures();
        record["classes"] = item.getClasses();
        // Aggregated scores
        record["score_train"] = item.getScoreTrain();
        record["score_test"] = item.getScoreTest();
        record["score"] = item.getScoreTest();
        record["score_std"] = item.getScoreTestStd();
        record["score_train_std"] = item.getScoreTrainStd();
        record["score_test_std"] = item.getScoreTestStd();
        // Aggregated times
        record["train_time"] = item.getTrainTime();
        record["train_time_std"] = item.getTrainTimeStd();
        record["test_time"] = item.getTestTime();
        record["test_time_std"] = item.getTestTimeStd();
        record["time"] = item.getTestTime() + item.getTrainTime();
        record["time_std"] = item.getTestTimeStd() + item.getTrainTimeStd();
        // Per-fold values
        record["scores_train"] = item.getScoresTrain();
        record["scores_test"] = item.getScoresTest();
        record["times_train"] = item.getTimesTrain();
        record["times_test"] = item.getTimesTest();
        // Model size figures
        record["nodes"] = item.getNodes();
        record["leaves"] = item.getLeaves();
        record["depth"] = item.getDepth();
        output["results"].push_back(record);
    }
    return output;
}
|
||||
void Experiment::save(const std::string& path)
|
||||
{
|
||||
json data = build_json();
|
||||
ofstream file(path + "/" + get_file_name());
|
||||
file << data;
|
||||
file.close();
|
||||
}
|
||||
|
||||
void Experiment::report()
|
||||
{
|
||||
json data = build_json();
|
||||
ReportConsole report(data);
|
||||
report.show();
|
||||
}
|
||||
|
||||
void Experiment::show()
|
||||
{
|
||||
json data = build_json();
|
||||
std::cout << data.dump(4) << std::endl;
|
||||
}
|
||||
|
||||
void Experiment::go(std::vector<std::string> filesToProcess, bool quiet)
|
||||
{
|
||||
std::cout << "*** Starting experiment: " << title << " ***" << std::endl;
|
||||
for (auto fileName : filesToProcess) {
|
||||
std::cout << "- " << setw(20) << left << fileName << " " << right << flush;
|
||||
cross_validation(fileName, quiet);
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::string getColor(bayesnet::status_t status)
|
||||
{
|
||||
switch (status) {
|
||||
case bayesnet::NORMAL:
|
||||
return Colors::GREEN();
|
||||
case bayesnet::WARNING:
|
||||
return Colors::YELLOW();
|
||||
case bayesnet::ERROR:
|
||||
return Colors::RED();
|
||||
default:
|
||||
return Colors::RESET();
|
||||
}
|
||||
}
|
||||
|
||||
void showProgress(int fold, const std::string& color, const std::string& phase)
|
||||
{
|
||||
std::string prefix = phase == "a" ? "" : "\b\b\b\b";
|
||||
std::cout << prefix << color << fold << Colors::RESET() << "(" << color << phase << Colors::RESET() << ")" << flush;
|
||||
|
||||
}
|
||||
void Experiment::cross_validation(const std::string& fileName, bool quiet)
|
||||
{
|
||||
auto datasets = Datasets(discretized, Paths::datasets());
|
||||
// Get dataset
|
||||
auto [X, y] = datasets.getTensors(fileName);
|
||||
auto states = datasets.getStates(fileName);
|
||||
auto features = datasets.getFeatures(fileName);
|
||||
auto samples = datasets.getNSamples(fileName);
|
||||
auto className = datasets.getClassName(fileName);
|
||||
if (!quiet) {
|
||||
std::cout << " (" << setw(5) << samples << "," << setw(3) << features.size() << ") " << flush;
|
||||
}
|
||||
// Prepare Result
|
||||
auto result = Result();
|
||||
auto [values, counts] = at::_unique(y);
|
||||
result.setSamples(X.size(1)).setFeatures(X.size(0)).setClasses(values.size(0));
|
||||
result.setHyperparameters(hyperparameters.get(fileName));
|
||||
// Initialize results std::vectors
|
||||
int nResults = nfolds * static_cast<int>(randomSeeds.size());
|
||||
auto accuracy_test = torch::zeros({ nResults }, torch::kFloat64);
|
||||
auto accuracy_train = torch::zeros({ nResults }, torch::kFloat64);
|
||||
auto train_time = torch::zeros({ nResults }, torch::kFloat64);
|
||||
auto test_time = torch::zeros({ nResults }, torch::kFloat64);
|
||||
auto nodes = torch::zeros({ nResults }, torch::kFloat64);
|
||||
auto edges = torch::zeros({ nResults }, torch::kFloat64);
|
||||
auto num_states = torch::zeros({ nResults }, torch::kFloat64);
|
||||
Timer train_timer, test_timer;
|
||||
int item = 0;
|
||||
for (auto seed : randomSeeds) {
|
||||
if (!quiet)
|
||||
std::cout << "(" << seed << ") doing Fold: " << flush;
|
||||
folding::Fold* fold;
|
||||
if (stratified)
|
||||
fold = new folding::StratifiedKFold(nfolds, y, seed);
|
||||
else
|
||||
fold = new folding::KFold(nfolds, y.size(0), seed);
|
||||
for (int nfold = 0; nfold < nfolds; nfold++) {
|
||||
auto clf = Models::instance()->create(model);
|
||||
setModelVersion(clf->getVersion());
|
||||
auto valid = clf->getValidHyperparameters();
|
||||
hyperparameters.check(valid, fileName);
|
||||
clf->setHyperparameters(hyperparameters.get(fileName));
|
||||
// Split train - test dataset
|
||||
train_timer.start();
|
||||
auto [train, test] = fold->getFold(nfold);
|
||||
auto train_t = torch::tensor(train);
|
||||
auto test_t = torch::tensor(test);
|
||||
auto X_train = X.index({ "...", train_t });
|
||||
auto y_train = y.index({ train_t });
|
||||
auto X_test = X.index({ "...", test_t });
|
||||
auto y_test = y.index({ test_t });
|
||||
if (!quiet)
|
||||
showProgress(nfold + 1, getColor(clf->getStatus()), "a");
|
||||
// Train model
|
||||
clf->fit(X_train, y_train, features, className, states);
|
||||
if (!quiet)
|
||||
showProgress(nfold + 1, getColor(clf->getStatus()), "b");
|
||||
nodes[item] = clf->getNumberOfNodes();
|
||||
edges[item] = clf->getNumberOfEdges();
|
||||
num_states[item] = clf->getNumberOfStates();
|
||||
train_time[item] = train_timer.getDuration();
|
||||
// Score train
|
||||
auto accuracy_train_value = clf->score(X_train, y_train);
|
||||
// Test model
|
||||
if (!quiet)
|
||||
showProgress(nfold + 1, getColor(clf->getStatus()), "c");
|
||||
test_timer.start();
|
||||
auto accuracy_test_value = clf->score(X_test, y_test);
|
||||
test_time[item] = test_timer.getDuration();
|
||||
accuracy_train[item] = accuracy_train_value;
|
||||
accuracy_test[item] = accuracy_test_value;
|
||||
if (!quiet)
|
||||
std::cout << "\b\b\b, " << flush;
|
||||
// Store results and times in std::vector
|
||||
result.addScoreTrain(accuracy_train_value);
|
||||
result.addScoreTest(accuracy_test_value);
|
||||
result.addTimeTrain(train_time[item].item<double>());
|
||||
result.addTimeTest(test_time[item].item<double>());
|
||||
item++;
|
||||
}
|
||||
if (!quiet)
|
||||
std::cout << "end. " << flush;
|
||||
delete fold;
|
||||
}
|
||||
result.setScoreTest(torch::mean(accuracy_test).item<double>()).setScoreTrain(torch::mean(accuracy_train).item<double>());
|
||||
result.setScoreTestStd(torch::std(accuracy_test).item<double>()).setScoreTrainStd(torch::std(accuracy_train).item<double>());
|
||||
result.setTrainTime(torch::mean(train_time).item<double>()).setTestTime(torch::mean(test_time).item<double>());
|
||||
result.setTestTimeStd(torch::std(test_time).item<double>()).setTrainTimeStd(torch::std(train_time).item<double>());
|
||||
result.setNodes(torch::mean(nodes).item<double>()).setLeaves(torch::mean(edges).item<double>()).setDepth(torch::mean(num_states).item<double>());
|
||||
result.setDataset(fileName);
|
||||
addResult(result);
|
||||
}
|
||||
}
|
@@ -1,103 +0,0 @@
|
||||
#ifndef EXPERIMENT_H
|
||||
#define EXPERIMENT_H
|
||||
#include <torch/torch.h>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <string>
|
||||
#include "folding.hpp"
|
||||
#include "BaseClassifier.h"
|
||||
#include "HyperParameters.h"
|
||||
#include "TAN.h"
|
||||
#include "KDB.h"
|
||||
#include "AODE.h"
|
||||
#include "Timer.h"
|
||||
|
||||
namespace platform {
|
||||
using json = nlohmann::json;
|
||||
class Result {
|
||||
private:
|
||||
std::string dataset, model_version;
|
||||
json hyperparameters;
|
||||
int samples{ 0 }, features{ 0 }, classes{ 0 };
|
||||
double score_train{ 0 }, score_test{ 0 }, score_train_std{ 0 }, score_test_std{ 0 }, train_time{ 0 }, train_time_std{ 0 }, test_time{ 0 }, test_time_std{ 0 };
|
||||
float nodes{ 0 }, leaves{ 0 }, depth{ 0 };
|
||||
std::vector<double> scores_train, scores_test, times_train, times_test;
|
||||
public:
|
||||
Result() = default;
|
||||
Result& setDataset(const std::string& dataset) { this->dataset = dataset; return *this; }
|
||||
Result& setHyperparameters(const json& hyperparameters) { this->hyperparameters = hyperparameters; return *this; }
|
||||
Result& setSamples(int samples) { this->samples = samples; return *this; }
|
||||
Result& setFeatures(int features) { this->features = features; return *this; }
|
||||
Result& setClasses(int classes) { this->classes = classes; return *this; }
|
||||
Result& setScoreTrain(double score) { this->score_train = score; return *this; }
|
||||
Result& setScoreTest(double score) { this->score_test = score; return *this; }
|
||||
Result& setScoreTrainStd(double score_std) { this->score_train_std = score_std; return *this; }
|
||||
Result& setScoreTestStd(double score_std) { this->score_test_std = score_std; return *this; }
|
||||
Result& setTrainTime(double train_time) { this->train_time = train_time; return *this; }
|
||||
Result& setTrainTimeStd(double train_time_std) { this->train_time_std = train_time_std; return *this; }
|
||||
Result& setTestTime(double test_time) { this->test_time = test_time; return *this; }
|
||||
Result& setTestTimeStd(double test_time_std) { this->test_time_std = test_time_std; return *this; }
|
||||
Result& setNodes(float nodes) { this->nodes = nodes; return *this; }
|
||||
Result& setLeaves(float leaves) { this->leaves = leaves; return *this; }
|
||||
Result& setDepth(float depth) { this->depth = depth; return *this; }
|
||||
Result& addScoreTrain(double score) { scores_train.push_back(score); return *this; }
|
||||
Result& addScoreTest(double score) { scores_test.push_back(score); return *this; }
|
||||
Result& addTimeTrain(double time) { times_train.push_back(time); return *this; }
|
||||
Result& addTimeTest(double time) { times_test.push_back(time); return *this; }
|
||||
const float get_score_train() const { return score_train; }
|
||||
float get_score_test() { return score_test; }
|
||||
const std::string& getDataset() const { return dataset; }
|
||||
const json& getHyperparameters() const { return hyperparameters; }
|
||||
const int getSamples() const { return samples; }
|
||||
const int getFeatures() const { return features; }
|
||||
const int getClasses() const { return classes; }
|
||||
const double getScoreTrain() const { return score_train; }
|
||||
const double getScoreTest() const { return score_test; }
|
||||
const double getScoreTrainStd() const { return score_train_std; }
|
||||
const double getScoreTestStd() const { return score_test_std; }
|
||||
const double getTrainTime() const { return train_time; }
|
||||
const double getTrainTimeStd() const { return train_time_std; }
|
||||
const double getTestTime() const { return test_time; }
|
||||
const double getTestTimeStd() const { return test_time_std; }
|
||||
const float getNodes() const { return nodes; }
|
||||
const float getLeaves() const { return leaves; }
|
||||
const float getDepth() const { return depth; }
|
||||
const std::vector<double>& getScoresTrain() const { return scores_train; }
|
||||
const std::vector<double>& getScoresTest() const { return scores_test; }
|
||||
const std::vector<double>& getTimesTrain() const { return times_train; }
|
||||
const std::vector<double>& getTimesTest() const { return times_test; }
|
||||
};
|
||||
class Experiment {
|
||||
public:
|
||||
Experiment() = default;
|
||||
Experiment& setTitle(const std::string& title) { this->title = title; return *this; }
|
||||
Experiment& setModel(const std::string& model) { this->model = model; return *this; }
|
||||
Experiment& setPlatform(const std::string& platform) { this->platform = platform; return *this; }
|
||||
Experiment& setScoreName(const std::string& score_name) { this->score_name = score_name; return *this; }
|
||||
Experiment& setModelVersion(const std::string& model_version) { this->model_version = model_version; return *this; }
|
||||
Experiment& setLanguage(const std::string& language) { this->language = language; return *this; }
|
||||
Experiment& setLanguageVersion(const std::string& language_version) { this->language_version = language_version; return *this; }
|
||||
Experiment& setDiscretized(bool discretized) { this->discretized = discretized; return *this; }
|
||||
Experiment& setStratified(bool stratified) { this->stratified = stratified; return *this; }
|
||||
Experiment& setNFolds(int nfolds) { this->nfolds = nfolds; return *this; }
|
||||
Experiment& addResult(Result result) { results.push_back(result); return *this; }
|
||||
Experiment& addRandomSeed(int randomSeed) { randomSeeds.push_back(randomSeed); return *this; }
|
||||
Experiment& setDuration(float duration) { this->duration = duration; return *this; }
|
||||
Experiment& setHyperparameters(const HyperParameters& hyperparameters_) { this->hyperparameters = hyperparameters_; return *this; }
|
||||
std::string get_file_name();
|
||||
void save(const std::string& path);
|
||||
void cross_validation(const std::string& fileName, bool quiet);
|
||||
void go(std::vector<std::string> filesToProcess, bool quiet);
|
||||
void show();
|
||||
void report();
|
||||
private:
|
||||
std::string title, model, platform, score_name, model_version, language_version, language;
|
||||
bool discretized{ false }, stratified{ false };
|
||||
std::vector<Result> results;
|
||||
std::vector<int> randomSeeds;
|
||||
HyperParameters hyperparameters;
|
||||
int nfolds{ 0 };
|
||||
float duration{ 0 };
|
||||
json build_json();
|
||||
};
|
||||
}
|
||||
#endif
|
@@ -1,441 +0,0 @@
|
||||
#include <iostream>
|
||||
#include <cstddef>
|
||||
#include <torch/torch.h>
|
||||
#include "GridSearch.h"
|
||||
#include "Models.h"
|
||||
#include "Paths.h"
|
||||
#include "folding.hpp"
|
||||
#include "Colors.h"
|
||||
|
||||
namespace platform {
|
||||
// Returns the current local date formatted as "%Y-%m-%d".
std::string get_date()
{
    const time_t now = time(nullptr);
    std::ostringstream formatted;
    formatted << std::put_time(std::localtime(&now), "%Y-%m-%d");
    return formatted.str();
}
|
||||
// Returns the current local time formatted as "%H:%M:%S".
std::string get_time()
{
    const time_t now = time(nullptr);
    std::ostringstream formatted;
    formatted << std::put_time(std::localtime(&now), "%H:%M:%S");
    return formatted.str();
}
|
||||
std::string get_color_rank(int rank)
|
||||
{
|
||||
auto colors = { Colors::WHITE(), Colors::RED(), Colors::GREEN(), Colors::BLUE(), Colors::MAGENTA(), Colors::CYAN() };
|
||||
return *(colors.begin() + rank % colors.size());
|
||||
}
|
||||
// Stores the grid search configuration in the config member; no other work is done here.
GridSearch::GridSearch(struct ConfigGrid& config)
    : config(config)
{
}
|
||||
// Parses the grid output file of the configured model.
// Returns a default-constructed json when the file cannot be opened.
json GridSearch::loadResults()
{
    std::ifstream input(Paths::grid_output(config.model));
    if (!input.is_open()) {
        return json();
    }
    return json::parse(input);
}
|
||||
// Returns the dataset names to process, in catalog order.
// When a continue_from dataset is configured, every name before it is dropped; if "only" is
// also set, every name other than continue_from is dropped. Afterwards each excluded dataset
// is removed from the list.
// Throws std::invalid_argument when continue_from is not in the catalog or when an excluded
// dataset is not (or no longer) in the list.
std::vector<std::string> GridSearch::filterDatasets(Datasets& datasets) const
{
    auto names = datasets.getNames();
    if (config.continue_from != NO_CONTINUE()) {
        // Continue previous execution
        const auto& resume_from = config.continue_from;
        const auto first_kept = std::find(names.begin(), names.end(), resume_from);
        if (first_kept == names.end()) {
            throw std::invalid_argument("Dataset " + config.continue_from + " not found");
        }
        if (config.only) {
            // Keep nothing but the requested dataset
            const auto is_other = [&resume_from](const std::string& candidate) { return candidate != resume_from; };
            names.erase(std::remove_if(names.begin(), names.end(), is_other), names.end());
        } else {
            // Drop the datasets that precede the requested one
            names.erase(names.begin(), first_kept);
        }
    }
    // Exclude datasets
    for (const auto& excluded : config.excluded) {
        const auto excluded_name = excluded.get<std::string>();
        const auto position = std::find(names.begin(), names.end(), excluded_name);
        if (position == names.end()) {
            throw std::invalid_argument("Dataset " + excluded_name + " already excluded or doesn't exist!");
        }
        names.erase(position);
    }
    return names;
}
|
||||
// Builds the list of tasks to distribute: one task per (dataset, seed, fold) combination, each
// holding the dataset name, its index in the filtered list, the seed and the fold number.
// The list is shuffled with a fixed seed so every process obtains the same order, and a progress
// ruler with one digit per task is printed in the color of the given rank.
json GridSearch::build_tasks_mpi(int rank)
{
    auto tasks = json::array();
    auto grid = GridData(Paths::grid_input(config.model));
    auto datasets = Datasets(false, Paths::datasets());
    auto datasets_names = filterDatasets(datasets);
    const int n_datasets = static_cast<int>(datasets_names.size());
    for (int idx_dataset = 0; idx_dataset < n_datasets; ++idx_dataset) {
        const auto& dataset = datasets_names[idx_dataset];
        // NOTE(review): the combinations are not used to build the tasks; the lookup is kept
        // (once per dataset instead of once per seed) in case it validates that the dataset
        // has a grid entry - confirm whether it can be dropped.
        [[maybe_unused]] const auto combinations = grid.getGrid(dataset);
        for (const auto& seed : config.seeds) {
            for (int n_fold = 0; n_fold < config.n_folds; n_fold++) {
                json task = {
                    { "dataset", dataset },
                    { "idx_dataset", idx_dataset },
                    { "seed", seed },
                    { "fold", n_fold },
                };
                tasks.push_back(task);
            }
        }
    }
    // Shuffle the array so heavy datasets are spread across the workers
    std::mt19937 g{ 271 }; // Use fixed seed to obtain the same shuffle
    std::shuffle(tasks.begin(), tasks.end(), g);
    std::cout << get_color_rank(rank) << "* Number of tasks: " << tasks.size() << std::endl;
    std::cout << "|";
    const int n_tasks = static_cast<int>(tasks.size());
    for (int i = 0; i < n_tasks; ++i) {
        std::cout << (i + 1) % 10;
    }
    std::cout << "|" << std::endl << "|" << std::flush;
    return tasks;
}
|
||||
void process_task_mpi_consumer(struct ConfigGrid& config, struct ConfigMPI& config_mpi, json& tasks, int n_task, Datasets& datasets, Task_Result* result)
|
||||
{
|
||||
// initialize
|
||||
Timer timer;
|
||||
timer.start();
|
||||
json task = tasks[n_task];
|
||||
auto model = config.model;
|
||||
auto grid = GridData(Paths::grid_input(model));
|
||||
auto dataset = task["dataset"].get<std::string>();
|
||||
auto idx_dataset = task["idx_dataset"].get<int>();
|
||||
auto seed = task["seed"].get<int>();
|
||||
auto n_fold = task["fold"].get<int>();
|
||||
bool stratified = config.stratified;
|
||||
// Generate the hyperparamters combinations
|
||||
auto combinations = grid.getGrid(dataset);
|
||||
auto [X, y] = datasets.getTensors(dataset);
|
||||
auto states = datasets.getStates(dataset);
|
||||
auto features = datasets.getFeatures(dataset);
|
||||
auto className = datasets.getClassName(dataset);
|
||||
//
|
||||
// Start working on task
|
||||
//
|
||||
folding::Fold* fold;
|
||||
if (stratified)
|
||||
fold = new folding::StratifiedKFold(config.n_folds, y, seed);
|
||||
else
|
||||
fold = new folding::KFold(config.n_folds, y.size(0), seed);
|
||||
auto [train, test] = fold->getFold(n_fold);
|
||||
auto train_t = torch::tensor(train);
|
||||
auto test_t = torch::tensor(test);
|
||||
auto X_train = X.index({ "...", train_t });
|
||||
auto y_train = y.index({ train_t });
|
||||
auto X_test = X.index({ "...", test_t });
|
||||
auto y_test = y.index({ test_t });
|
||||
double best_fold_score = 0.0;
|
||||
int best_idx_combination = -1;
|
||||
json best_fold_hyper;
|
||||
for (int idx_combination = 0; idx_combination < combinations.size(); ++idx_combination) {
|
||||
auto hyperparam_line = combinations[idx_combination];
|
||||
auto hyperparameters = platform::HyperParameters(datasets.getNames(), hyperparam_line);
|
||||
folding::Fold* nested_fold;
|
||||
if (config.stratified)
|
||||
nested_fold = new folding::StratifiedKFold(config.nested, y_train, seed);
|
||||
else
|
||||
nested_fold = new folding::KFold(config.nested, y_train.size(0), seed);
|
||||
double score = 0.0;
|
||||
for (int n_nested_fold = 0; n_nested_fold < config.nested; n_nested_fold++) {
|
||||
// Nested level fold
|
||||
auto [train_nested, test_nested] = nested_fold->getFold(n_nested_fold);
|
||||
auto train_nested_t = torch::tensor(train_nested);
|
||||
auto test_nested_t = torch::tensor(test_nested);
|
||||
auto X_nested_train = X_train.index({ "...", train_nested_t });
|
||||
auto y_nested_train = y_train.index({ train_nested_t });
|
||||
auto X_nested_test = X_train.index({ "...", test_nested_t });
|
||||
auto y_nested_test = y_train.index({ test_nested_t });
|
||||
// Build Classifier with selected hyperparameters
|
||||
auto clf = Models::instance()->create(config.model);
|
||||
auto valid = clf->getValidHyperparameters();
|
||||
hyperparameters.check(valid, dataset);
|
||||
clf->setHyperparameters(hyperparameters.get(dataset));
|
||||
// Train model
|
||||
clf->fit(X_nested_train, y_nested_train, features, className, states);
|
||||
// Test model
|
||||
score += clf->score(X_nested_test, y_nested_test);
|
||||
}
|
||||
delete nested_fold;
|
||||
score /= config.nested;
|
||||
if (score > best_fold_score) {
|
||||
best_fold_score = score;
|
||||
best_idx_combination = idx_combination;
|
||||
best_fold_hyper = hyperparam_line;
|
||||
}
|
||||
}
|
||||
delete fold;
|
||||
// Build Classifier with the best hyperparameters to obtain the best score
|
||||
auto hyperparameters = platform::HyperParameters(datasets.getNames(), best_fold_hyper);
|
||||
auto clf = Models::instance()->create(config.model);
|
||||
auto valid = clf->getValidHyperparameters();
|
||||
hyperparameters.check(valid, dataset);
|
||||
clf->setHyperparameters(best_fold_hyper);
|
||||
clf->fit(X_train, y_train, features, className, states);
|
||||
best_fold_score = clf->score(X_test, y_test);
|
||||
// Return the result
|
||||
result->idx_dataset = task["idx_dataset"].get<int>();
|
||||
result->idx_combination = best_idx_combination;
|
||||
result->score = best_fold_score;
|
||||
result->n_fold = n_fold;
|
||||
result->time = timer.getDuration();
|
||||
// Update progress bar
|
||||
std::cout << get_color_rank(config_mpi.rank) << "*" << std::flush;
|
||||
}
|
||||
/// Appends one task result to the per-dataset result lists.
///
/// @param names   dataset names; `result.idx_dataset` indexes into this vector
/// @param result  result received for a single task
/// @param results accumulator keyed by dataset name; each value is an array of fold results
/// @return a copy of the updated accumulator
json store_result(std::vector<std::string>& names, Task_Result& result, json& results)
{
    const std::string& dataset_name = names[result.idx_dataset];
    // Serialise the plain result struct field by field
    json entry;
    entry["score"] = result.score;
    entry["combination"] = result.idx_combination;
    entry["fold"] = result.n_fold;
    entry["time"] = result.time;
    entry["dataset"] = result.idx_dataset;
    // First result for this dataset: start its list
    if (!results.contains(dataset_name)) {
        results[dataset_name] = json::array();
    }
    results[dataset_name].push_back(entry);
    return results;
}
|
||||
/// Manager side of the task distribution: hands out task indices to whichever
/// consumer reports in, collects their results and finally tells every
/// consumer to stop.
///
/// @param names      dataset names used to key the collected results
/// @param tasks      list of tasks; only its size is used here
/// @param config_mpi MPI configuration (number of processes)
/// @param MPI_Result committed MPI datatype describing Task_Result
/// @return the collected results, grouped by dataset name
json producer(std::vector<std::string>& names, json& tasks, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result)
{
    Task_Result incoming;
    json collected;
    const int total_tasks = tasks.size();
    // Wait for any consumer message, keep it if it carries a result and
    // answer the sender with the given payload and tag.
    auto serve = [&](int payload, int reply_tag) {
        MPI_Status status;
        MPI_Recv(&incoming, 1, MPI_Result, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
        if (status.MPI_TAG == TAG_RESULT) {
            store_result(names, incoming, collected);
        }
        MPI_Send(&payload, 1, MPI_INT, status.MPI_SOURCE, reply_tag, MPI_COMM_WORLD);
    };
    //
    // 2a.1 Producer will loop to send all the tasks to the consumers and receive the results
    //
    for (int task = 0; task < total_tasks; ++task) {
        serve(task, TAG_TASK);
    }
    //
    // 2a.2 Producer will send the end message to all the consumers
    //
    for (int consumer_no = 0; consumer_no < config_mpi.n_procs - 1; ++consumer_no) {
        serve(consumer_no, TAG_END);
    }
    return collected;
}
|
||||
/// For every dataset in `all_results`, picks the outer-fold result with the
/// highest score and stores a summary of it in `results`.
///
/// @param results     output; one summary object per dataset (existing entries are overwritten)
/// @param all_results per-dataset arrays of fold results with "score", "combination" and "time"
/// @param model       model name, used to locate the grid input file
void select_best_results_folds(json& results, json& all_results, std::string& model)
{
    Timer timer;
    auto grid = GridData(Paths::grid_input(model));
    //
    // Select the best result of the computed outer folds
    //
    for (const auto& result : all_results.items()) {
        // each result has the results of all the outer folds as each one were a different task
        const json* best = nullptr;
        double best_score = 0.0;
        for (const auto& result_fold : result.value()) {
            const double score = result_fold["score"].get<double>();
            // The first fold always becomes the candidate, so scores <= 0 are
            // handled too; on ties the earliest fold is kept.
            if (best == nullptr || score > best_score) {
                best_score = score;
                best = &result_fold;
            }
        }
        if (best == nullptr) {
            // No fold was computed for this dataset: nothing to summarise
            continue;
        }
        auto dataset = result.key();
        auto combinations = grid.getGrid(dataset);
        json json_best = {
            { "score", best_score },
            { "hyperparameters", combinations[best->at("combination").get<int>()] },
            { "date", get_date() + " " + get_time() },
            { "grid", grid.getInputGrid(dataset) },
            { "duration", timer.translate2String(best->at("time").get<double>()) }
        };
        results[dataset] = json_best;
    }
}
|
||||
/// Worker side of the task distribution: asks the manager for work, processes
/// each task index it receives and sends the result back, until the manager
/// answers with TAG_END.
///
/// @param datasets   datasets handed to the task processor
/// @param tasks      full task list (indexed by the received task number)
/// @param config     grid search configuration
/// @param config_mpi MPI configuration (manager rank)
/// @param MPI_Result committed MPI datatype describing Task_Result
void consumer(Datasets& datasets, json& tasks, struct ConfigGrid& config, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result)
{
    Task_Result result;
    //
    // 2b.1 Consumers announce to the producer that they are ready to receive a task
    //
    MPI_Send(&result, 1, MPI_Result, config_mpi.manager, TAG_QUERY, MPI_COMM_WORLD);
    for (;;) {
        int task;
        MPI_Status status;
        //
        // 2b.2 Consumers receive the task from the producer and process it
        //
        MPI_Recv(&task, 1, MPI_INT, config_mpi.manager, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
        if (status.MPI_TAG == TAG_END) {
            return;
        }
        process_task_mpi_consumer(config, config_mpi, tasks, task, datasets, &result);
        //
        // 2b.3 Consumers send the result to the producer
        //
        MPI_Send(&result, 1, MPI_Result, config_mpi.manager, TAG_RESULT, MPI_COMM_WORLD);
    }
}
|
||||
void GridSearch::go(struct ConfigMPI& config_mpi)
|
||||
{
|
||||
/*
|
||||
* Each task is a json object with the following structure:
|
||||
* {
|
||||
* "dataset": "dataset_name",
|
||||
* "idx_dataset": idx_dataset, // used to identify the dataset in the results
|
||||
* // this index is relative to the used datasets in the actual run not to the whole datasets
|
||||
* "seed": # of seed to use,
|
||||
* "Fold": # of fold to process
|
||||
* }
|
||||
*
|
||||
* The overall process consists in these steps:
|
||||
* 0. Create the MPI result type & tasks
|
||||
* 0.1 Create the MPI result type
|
||||
* 0.2 Manager creates the tasks
|
||||
* 1. Manager will broadcast the tasks to all the processes
|
||||
* 1.1 Broadcast the number of tasks
|
||||
* 1.2 Broadcast the length of the following string
|
||||
* 1.2 Broadcast the tasks as a char* string
|
||||
* 2a. Producer delivers the tasks to the consumers
|
||||
* 2a.1 Producer will loop to send all the tasks to the consumers and receive the results
|
||||
* 2a.2 Producer will send the end message to all the consumers
|
||||
* 2b. Consumers process the tasks and send the results to the producer
|
||||
* 2b.1 Consumers announce to the producer that they are ready to receive a task
|
||||
* 2b.2 Consumers receive the task from the producer and process it
|
||||
* 2b.3 Consumers send the result to the producer
|
||||
* 3. Manager select the bests sccores for each dataset
|
||||
* 3.1 Loop thru all the results obtained from each outer fold (task) and select the best
|
||||
* 3.2 Save the results
|
||||
*/
|
||||
//
|
||||
// 0.1 Create the MPI result type
|
||||
//
|
||||
Task_Result result;
|
||||
int tasks_size;
|
||||
MPI_Datatype MPI_Result;
|
||||
MPI_Datatype type[5] = { MPI_UNSIGNED, MPI_UNSIGNED, MPI_INT, MPI_DOUBLE, MPI_DOUBLE };
|
||||
int blocklen[5] = { 1, 1, 1, 1, 1 };
|
||||
MPI_Aint disp[5];
|
||||
disp[0] = offsetof(Task_Result, idx_dataset);
|
||||
disp[1] = offsetof(Task_Result, idx_combination);
|
||||
disp[2] = offsetof(Task_Result, n_fold);
|
||||
disp[3] = offsetof(Task_Result, score);
|
||||
disp[4] = offsetof(Task_Result, time);
|
||||
MPI_Type_create_struct(5, blocklen, disp, type, &MPI_Result);
|
||||
MPI_Type_commit(&MPI_Result);
|
||||
//
|
||||
// 0.2 Manager creates the tasks
|
||||
//
|
||||
char* msg;
|
||||
json tasks;
|
||||
if (config_mpi.rank == config_mpi.manager) {
|
||||
timer.start();
|
||||
tasks = build_tasks_mpi(config_mpi.rank);
|
||||
auto tasks_str = tasks.dump();
|
||||
tasks_size = tasks_str.size();
|
||||
msg = new char[tasks_size + 1];
|
||||
strcpy(msg, tasks_str.c_str());
|
||||
}
|
||||
//
|
||||
// 1. Manager will broadcast the tasks to all the processes
|
||||
//
|
||||
MPI_Bcast(&tasks_size, 1, MPI_INT, config_mpi.manager, MPI_COMM_WORLD);
|
||||
if (config_mpi.rank != config_mpi.manager) {
|
||||
msg = new char[tasks_size + 1];
|
||||
}
|
||||
MPI_Bcast(msg, tasks_size + 1, MPI_CHAR, config_mpi.manager, MPI_COMM_WORLD);
|
||||
tasks = json::parse(msg);
|
||||
delete[] msg;
|
||||
auto datasets = Datasets(config.discretize, Paths::datasets());
|
||||
if (config_mpi.rank == config_mpi.manager) {
|
||||
//
|
||||
// 2a. Producer delivers the tasks to the consumers
|
||||
//
|
||||
auto datasets_names = filterDatasets(datasets);
|
||||
json all_results = producer(datasets_names, tasks, config_mpi, MPI_Result);
|
||||
std::cout << get_color_rank(config_mpi.rank) << "|" << std::endl;
|
||||
//
|
||||
// 3. Manager select the bests sccores for each dataset
|
||||
//
|
||||
auto results = initializeResults();
|
||||
select_best_results_folds(results, all_results, config.model);
|
||||
//
|
||||
// 3.2 Save the results
|
||||
//
|
||||
save(results);
|
||||
} else {
|
||||
//
|
||||
// 2b. Consumers process the tasks and send the results to the producer
|
||||
//
|
||||
consumer(datasets, tasks, config, config_mpi, MPI_Result);
|
||||
}
|
||||
}
|
||||
/// Returns the results to start from: the "results" section of the previous
/// grid output file when continuing a run, otherwise an empty json value.
json GridSearch::initializeResults()
{
    json results;
    const bool resume = config.continue_from != NO_CONTINUE();
    if (!resume) {
        return results;
    }
    // Load previous results if continue is set
    if (!config.quiet) {
        std::cout << "* Loading previous results" << std::endl;
    }
    try {
        std::ifstream previous(Paths::grid_output(config.model));
        if (previous.is_open()) {
            json stored = json::parse(previous);
            results = stored["results"];
        }
    }
    catch (const std::exception&) {
        // Unreadable or malformed file: start from scratch
        std::cerr << "* There were no previous results" << std::endl;
        std::cerr << "* Initizalizing new results" << std::endl;
        results = json();
    }
    return results;
}
|
||||
/// Writes the run configuration together with the given results to the grid
/// output file of the configured model, as pretty-printed json.
///
/// @param results per-dataset best results to store under the "results" key
void GridSearch::save(json& results)
{
    std::ofstream file(Paths::grid_output(config.model));
    json output;
    // Run configuration
    output["model"] = config.model;
    output["score"] = config.score;
    output["discretize"] = config.discretize;
    output["stratified"] = config.stratified;
    output["n_folds"] = config.n_folds;
    output["seeds"] = config.seeds;
    output["date"] = get_date() + " " + get_time();
    output["nested"] = config.nested;
    output["platform"] = config.platform;
    output["duration"] = timer.getDurationString(true);
    // Payload
    output["results"] = results;
    file << output.dump(4);
}
|
||||
} /* namespace platform */
|
@@ -1,213 +0,0 @@
|
||||
#include "ManageResults.h"
|
||||
#include "CommandParser.h"
|
||||
#include <filesystem>
|
||||
#include <tuple>
|
||||
#include "Colors.h"
|
||||
#include "CLocale.h"
|
||||
#include "Paths.h"
|
||||
#include "ReportConsole.h"
|
||||
#include "ReportExcel.h"
|
||||
|
||||
namespace platform {
|
||||
|
||||
/// Loads the stored results matching the given filters and prepares the
/// interactive state. A `numFiles` of 0 means "show every result found".
ManageResults::ManageResults(int numFiles, const std::string& model, const std::string& score, bool complete, bool partial, bool compare) :
    numFiles{ numFiles },
    indexList{ true },
    openExcel{ false },
    complete{ complete },
    partial{ partial },
    compare{ compare },
    results(Results(Paths::results(), model, score, complete, partial)),
    workbook(NULL)
{
    // `numFiles` here is the constructor argument, not the member
    if (numFiles == 0) {
        this->numFiles = results.size();
    }
}
|
||||
void ManageResults::doMenu()
|
||||
{
|
||||
if (results.empty()) {
|
||||
std::cout << Colors::MAGENTA() << "No results found!" << Colors::RESET() << std::endl;
|
||||
return;
|
||||
}
|
||||
results.sortDate();
|
||||
list();
|
||||
menu();
|
||||
if (openExcel) {
|
||||
workbook_close(workbook);
|
||||
}
|
||||
std::cout << Colors::RESET() << "Done!" << std::endl;
|
||||
}
|
||||
void ManageResults::list()
|
||||
{
|
||||
auto temp = ConfigLocale();
|
||||
std::string suffix = numFiles != results.size() ? " of " + std::to_string(results.size()) : "";
|
||||
std::stringstream oss;
|
||||
oss << "Results on screen: " << numFiles << suffix;
|
||||
std::cout << Colors::GREEN() << oss.str() << std::endl;
|
||||
std::cout << std::string(oss.str().size(), '-') << std::endl;
|
||||
if (complete) {
|
||||
std::cout << Colors::MAGENTA() << "Only listing complete results" << std::endl;
|
||||
}
|
||||
if (partial) {
|
||||
std::cout << Colors::MAGENTA() << "Only listing partial results" << std::endl;
|
||||
}
|
||||
auto i = 0;
|
||||
int maxModel = results.maxModelSize();
|
||||
std::cout << Colors::GREEN() << " # Date " << std::setw(maxModel) << std::left << "Model" << " Score Name Score C/P Duration Title" << std::endl;
|
||||
std::cout << "=== ========== " << std::string(maxModel, '=') << " =========== =========== === ========= =============================================================" << std::endl;
|
||||
bool odd = true;
|
||||
for (auto& result : results) {
|
||||
auto color = odd ? Colors::BLUE() : Colors::CYAN();
|
||||
std::cout << color << std::setw(3) << std::fixed << std::right << i++ << " ";
|
||||
std::cout << result.to_string(maxModel) << std::endl;
|
||||
if (i == numFiles) {
|
||||
break;
|
||||
}
|
||||
odd = !odd;
|
||||
}
|
||||
}
|
||||
bool ManageResults::confirmAction(const std::string& intent, const std::string& fileName) const
|
||||
{
|
||||
std::string color;
|
||||
if (intent == "delete") {
|
||||
color = Colors::RED();
|
||||
} else {
|
||||
color = Colors::YELLOW();
|
||||
}
|
||||
std::string line;
|
||||
bool finished = false;
|
||||
while (!finished) {
|
||||
std::cout << color << "Really want to " << intent << " " << fileName << "? (y/n): ";
|
||||
getline(std::cin, line);
|
||||
finished = line.size() == 1 && (tolower(line[0]) == 'y' || tolower(line[0] == 'n'));
|
||||
}
|
||||
if (tolower(line[0]) == 'y') {
|
||||
return true;
|
||||
}
|
||||
std::cout << "Not done!" << std::endl;
|
||||
return false;
|
||||
}
|
||||
void ManageResults::report(const int index, const bool excelReport)
|
||||
{
|
||||
std::cout << Colors::YELLOW() << "Reporting " << results.at(index).getFilename() << std::endl;
|
||||
auto data = results.at(index).load();
|
||||
if (excelReport) {
|
||||
ReportExcel reporter(data, compare, workbook);
|
||||
reporter.show();
|
||||
openExcel = true;
|
||||
workbook = reporter.getWorkbook();
|
||||
std::cout << "Adding sheet to " << Paths::excel() + Paths::excelResults() << std::endl;
|
||||
} else {
|
||||
ReportConsole reporter(data, compare);
|
||||
reporter.show();
|
||||
}
|
||||
}
|
||||
void ManageResults::showIndex(const int index, const int idx)
|
||||
{
|
||||
// Show a dataset result inside a report
|
||||
auto data = results.at(index).load();
|
||||
std::cout << Colors::YELLOW() << "Showing " << results.at(index).getFilename() << std::endl;
|
||||
ReportConsole reporter(data, compare, idx);
|
||||
reporter.show();
|
||||
}
|
||||
void ManageResults::sortList()
|
||||
{
|
||||
std::cout << Colors::YELLOW() << "Choose sorting field (date='d', score='s', duration='u', model='m'): ";
|
||||
std::string line;
|
||||
char option;
|
||||
getline(std::cin, line);
|
||||
if (line.size() == 0)
|
||||
return;
|
||||
if (line.size() > 1) {
|
||||
std::cout << "Invalid option" << std::endl;
|
||||
return;
|
||||
}
|
||||
option = line[0];
|
||||
switch (option) {
|
||||
case 'd':
|
||||
results.sortDate();
|
||||
break;
|
||||
case 's':
|
||||
results.sortScore();
|
||||
break;
|
||||
case 'u':
|
||||
results.sortDuration();
|
||||
break;
|
||||
case 'm':
|
||||
results.sortModel();
|
||||
break;
|
||||
default:
|
||||
std::cout << "Invalid option" << std::endl;
|
||||
}
|
||||
}
|
||||
void ManageResults::menu()
|
||||
{
|
||||
char option;
|
||||
int index, subIndex;
|
||||
bool finished = false;
|
||||
std::string filename;
|
||||
// tuple<Option, digit, requires value>
|
||||
std::vector<std::tuple<std::string, char, bool>> mainOptions = {
|
||||
{"quit", 'q', false},
|
||||
{"list", 'l', false},
|
||||
{"delete", 'd', true},
|
||||
{"hide", 'h', true},
|
||||
{"sort", 's', false},
|
||||
{"report", 'r', true},
|
||||
{"excel", 'e', true}
|
||||
};
|
||||
std::vector<std::tuple<std::string, char, bool>> listOptions = {
|
||||
{"report", 'r', true},
|
||||
{"list", 'l', false},
|
||||
{"quit", 'q', false}
|
||||
};
|
||||
auto parser = CommandParser();
|
||||
while (!finished) {
|
||||
if (indexList) {
|
||||
std::tie(option, index) = parser.parse(Colors::GREEN(), mainOptions, 'r', numFiles - 1);
|
||||
} else {
|
||||
std::tie(option, subIndex) = parser.parse(Colors::MAGENTA(), listOptions, 'r', results.at(index).load()["results"].size() - 1);
|
||||
}
|
||||
switch (option) {
|
||||
case 'q':
|
||||
finished = true;
|
||||
break;
|
||||
case 'l':
|
||||
list();
|
||||
indexList = true;
|
||||
break;
|
||||
case 'd':
|
||||
filename = results.at(index).getFilename();
|
||||
if (!confirmAction("delete", filename))
|
||||
break;
|
||||
std::cout << "Deleting " << filename << std::endl;
|
||||
results.deleteResult(index);
|
||||
std::cout << "File: " + filename + " deleted!" << std::endl;
|
||||
list();
|
||||
break;
|
||||
case 'h':
|
||||
filename = results.at(index).getFilename();
|
||||
if (!confirmAction("hide", filename))
|
||||
break;
|
||||
filename = results.at(index).getFilename();
|
||||
std::cout << "Hiding " << filename << std::endl;
|
||||
results.hideResult(index, Paths::hiddenResults());
|
||||
std::cout << "File: " + filename + " hidden! (moved to " << Paths::hiddenResults() << ")" << std::endl;
|
||||
list();
|
||||
break;
|
||||
case 's':
|
||||
sortList();
|
||||
list();
|
||||
break;
|
||||
case 'r':
|
||||
if (indexList) {
|
||||
report(index, false);
|
||||
indexList = false;
|
||||
} else {
|
||||
showIndex(index, subIndex);
|
||||
}
|
||||
break;
|
||||
case 'e':
|
||||
report(index, true);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} /* namespace platform */
|
@@ -1,31 +0,0 @@
|
||||
#ifndef MANAGE_RESULTS_H
|
||||
#define MANAGE_RESULTS_H
|
||||
#include "Results.h"
|
||||
#include "xlsxwriter.h"
|
||||
|
||||
namespace platform {
|
||||
class ManageResults {
|
||||
public:
|
||||
ManageResults(int numFiles, const std::string& model, const std::string& score, bool complete, bool partial, bool compare);
|
||||
~ManageResults() = default;
|
||||
void doMenu();
|
||||
private:
|
||||
void list();
|
||||
bool confirmAction(const std::string& intent, const std::string& fileName) const;
|
||||
void report(const int index, const bool excelReport);
|
||||
void showIndex(const int index, const int idx);
|
||||
void sortList();
|
||||
void menu();
|
||||
int numFiles;
|
||||
bool indexList;
|
||||
bool openExcel;
|
||||
bool complete;
|
||||
bool partial;
|
||||
bool compare;
|
||||
Results results;
|
||||
lxw_workbook* workbook;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif /* MANAGE_RESULTS_H */
|
@@ -1,41 +0,0 @@
|
||||
#ifndef MODELS_H
|
||||
#define MODELS_H
|
||||
#include <map>
|
||||
#include "BaseClassifier.h"
|
||||
#include "AODE.h"
|
||||
#include "TAN.h"
|
||||
#include "KDB.h"
|
||||
#include "SPODE.h"
|
||||
#include "TANLd.h"
|
||||
#include "KDBLd.h"
|
||||
#include "SPODELd.h"
|
||||
#include "AODELd.h"
|
||||
#include "BoostAODE.h"
|
||||
#include "STree.h"
|
||||
#include "ODTE.h"
|
||||
#include "SVC.h"
|
||||
#include "RandomForest.h"
|
||||
namespace platform {
|
||||
class Models {
|
||||
private:
|
||||
map<std::string, function<bayesnet::BaseClassifier* (void)>> functionRegistry;
|
||||
static Models* factory; //singleton
|
||||
Models() {};
|
||||
public:
|
||||
Models(Models&) = delete;
|
||||
void operator=(const Models&) = delete;
|
||||
// Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory
|
||||
static Models* instance();
|
||||
shared_ptr<bayesnet::BaseClassifier> create(const std::string& name);
|
||||
void registerFactoryFunction(const std::string& name,
|
||||
function<bayesnet::BaseClassifier* (void)> classFactoryFunction);
|
||||
std::vector<string> getNames();
|
||||
std::string tostring();
|
||||
|
||||
};
|
||||
class Registrar {
|
||||
public:
|
||||
Registrar(const std::string& className, function<bayesnet::BaseClassifier* (void)> classFactoryFunction);
|
||||
};
|
||||
}
|
||||
#endif
|
@@ -1,39 +0,0 @@
|
||||
#ifndef PATHS_H
|
||||
#define PATHS_H
|
||||
#include <string>
|
||||
#include <filesystem>
|
||||
#include "DotEnv.h"
|
||||
namespace platform {
|
||||
class Paths {
|
||||
public:
|
||||
static std::string results() { return "results/"; }
|
||||
static std::string hiddenResults() { return "hidden_results/"; }
|
||||
static std::string excel() { return "excel/"; }
|
||||
static std::string grid() { return "grid/"; }
|
||||
static std::string datasets()
|
||||
{
|
||||
auto env = platform::DotEnv();
|
||||
return env.get("source_data");
|
||||
}
|
||||
static void createPath(const std::string& path)
|
||||
{
|
||||
// Create directory if it does not exist
|
||||
try {
|
||||
std::filesystem::create_directory(path);
|
||||
}
|
||||
catch (std::exception& e) {
|
||||
throw std::runtime_error("Could not create directory " + path);
|
||||
}
|
||||
}
|
||||
static std::string excelResults() { return "some_results.xlsx"; }
|
||||
static std::string grid_input(const std::string& model)
|
||||
{
|
||||
return grid() + "grid_" + model + "_input.json";
|
||||
}
|
||||
static std::string grid_output(const std::string& model)
|
||||
{
|
||||
return grid() + "grid_" + model + "_output.json";
|
||||
}
|
||||
};
|
||||
}
|
||||
#endif
|
@@ -1,114 +0,0 @@
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <locale>
|
||||
#include "ReportConsole.h"
|
||||
#include "BestScore.h"
|
||||
#include "CLocale.h"
|
||||
|
||||
namespace platform {
|
||||
std::string ReportConsole::headerLine(const std::string& text, int utf = 0)
|
||||
{
|
||||
int n = MAXL - text.length() - 3;
|
||||
n = n < 0 ? 0 : n;
|
||||
return "* " + text + std::string(n + utf, ' ') + "*\n";
|
||||
}
|
||||
|
||||
void ReportConsole::header()
|
||||
{
|
||||
std::stringstream oss;
|
||||
std::cout << Colors::MAGENTA() << std::string(MAXL, '*') << std::endl;
|
||||
std::cout << headerLine(
|
||||
"Report " + data["model"].get<std::string>() + " ver. " + data["version"].get<std::string>()
|
||||
+ " with " + std::to_string(data["folds"].get<int>()) + " Folds cross validation and " + std::to_string(data["seeds"].size())
|
||||
+ " random seeds. " + data["date"].get<std::string>() + " " + data["time"].get<std::string>()
|
||||
);
|
||||
std::cout << headerLine(data["title"].get<std::string>());
|
||||
std::cout << headerLine("Random seeds: " + fromVector("seeds") + " Stratified: " + (data["stratified"].get<bool>() ? "True" : "False"));
|
||||
oss << "Execution took " << std::setprecision(2) << std::fixed << data["duration"].get<float>()
|
||||
<< " seconds, " << data["duration"].get<float>() / 3600 << " hours, on " << data["platform"].get<std::string>();
|
||||
std::cout << headerLine(oss.str());
|
||||
std::cout << headerLine("Score is " + data["score_name"].get<std::string>());
|
||||
std::cout << std::string(MAXL, '*') << std::endl;
|
||||
std::cout << std::endl;
|
||||
}
|
||||
void ReportConsole::body()
|
||||
{
|
||||
auto tmp = ConfigLocale();
|
||||
int maxHyper = 15;
|
||||
int maxDataset = 7;
|
||||
for (const auto& r : data["results"]) {
|
||||
maxHyper = std::max(maxHyper, (int)r["hyperparameters"].dump().size());
|
||||
maxDataset = std::max(maxDataset, (int)r["dataset"].get<std::string>().size());
|
||||
|
||||
}
|
||||
std::cout << Colors::GREEN() << " # " << std::setw(maxDataset) << std::left << "Dataset" << " Sampl. Feat. Cls Nodes Edges States Score Time Hyperparameters" << std::endl;
|
||||
std::cout << "=== " << std::string(maxDataset, '=') << " ====== ===== === ========= ========= ========= =============== =================== " << std::string(maxHyper, '=') << std::endl;
|
||||
json lastResult;
|
||||
double totalScore = 0.0;
|
||||
bool odd = true;
|
||||
int index = 0;
|
||||
for (const auto& r : data["results"]) {
|
||||
if (selectedIndex != -1 && index != selectedIndex) {
|
||||
index++;
|
||||
continue;
|
||||
}
|
||||
auto color = odd ? Colors::CYAN() : Colors::BLUE();
|
||||
std::cout << color;
|
||||
std::cout << std::setw(3) << std::right << index++ << " ";
|
||||
std::cout << std::setw(maxDataset) << std::left << r["dataset"].get<std::string>() << " ";
|
||||
std::cout << std::setw(6) << std::right << r["samples"].get<int>() << " ";
|
||||
std::cout << std::setw(5) << std::right << r["features"].get<int>() << " ";
|
||||
std::cout << std::setw(3) << std::right << r["classes"].get<int>() << " ";
|
||||
std::cout << std::setw(9) << std::setprecision(2) << std::fixed << r["nodes"].get<float>() << " ";
|
||||
std::cout << std::setw(9) << std::setprecision(2) << std::fixed << r["leaves"].get<float>() << " ";
|
||||
std::cout << std::setw(9) << std::setprecision(2) << std::fixed << r["depth"].get<float>() << " ";
|
||||
std::cout << std::setw(8) << std::right << std::setprecision(6) << std::fixed << r["score"].get<double>() << "±" << std::setw(6) << std::setprecision(4) << std::fixed << r["score_std"].get<double>();
|
||||
const std::string status = compareResult(r["dataset"].get<std::string>(), r["score"].get<double>());
|
||||
std::cout << status;
|
||||
std::cout << std::setw(12) << std::right << std::setprecision(6) << std::fixed << r["time"].get<double>() << "±" << std::setw(6) << std::setprecision(4) << std::fixed << r["time_std"].get<double>() << " ";
|
||||
std::cout << r["hyperparameters"].dump();
|
||||
std::cout << std::endl;
|
||||
std::cout << std::flush;
|
||||
lastResult = r;
|
||||
totalScore += r["score"].get<double>();
|
||||
odd = !odd;
|
||||
}
|
||||
if (data["results"].size() == 1 || selectedIndex != -1) {
|
||||
std::cout << std::string(MAXL, '*') << std::endl;
|
||||
std::cout << headerLine(fVector("Train scores: ", lastResult["scores_train"], 14, 12));
|
||||
std::cout << headerLine(fVector("Test scores: ", lastResult["scores_test"], 14, 12));
|
||||
std::cout << headerLine(fVector("Train times: ", lastResult["times_train"], 10, 3));
|
||||
std::cout << headerLine(fVector("Test times: ", lastResult["times_test"], 10, 3));
|
||||
std::cout << std::string(MAXL, '*') << std::endl;
|
||||
} else {
|
||||
footer(totalScore);
|
||||
}
|
||||
}
|
||||
void ReportConsole::showSummary()
|
||||
{
|
||||
for (const auto& item : summary) {
|
||||
std::stringstream oss;
|
||||
oss << std::setw(3) << std::left << item.first;
|
||||
oss << std::setw(3) << std::right << item.second << " ";
|
||||
oss << std::left << meaning.at(item.first);
|
||||
std::cout << headerLine(oss.str(), 2);
|
||||
}
|
||||
}
|
||||
|
||||
void ReportConsole::footer(double totalScore)
|
||||
{
|
||||
std::cout << Colors::MAGENTA() << std::string(MAXL, '*') << std::endl;
|
||||
showSummary();
|
||||
auto score = data["score_name"].get<std::string>();
|
||||
auto best = BestScore::getScore(score);
|
||||
if (best.first != "") {
|
||||
std::stringstream oss;
|
||||
oss << score << " compared to " << best.first << " .: " << totalScore / best.second;
|
||||
std::cout << headerLine(oss.str());
|
||||
}
|
||||
if (!getExistBestFile() && compare) {
|
||||
std::cout << headerLine("*** Best Results File not found. Couldn't compare any result!");
|
||||
}
|
||||
std::cout << std::string(MAXL, '*') << std::endl << Colors::RESET();
|
||||
}
|
||||
}
|
@@ -1,22 +0,0 @@
|
||||
#ifndef REPORTCONSOLE_H
|
||||
#define REPORTCONSOLE_H
|
||||
#include <string>
|
||||
#include "ReportBase.h"
|
||||
#include "Colors.h"
|
||||
|
||||
namespace platform {
|
||||
const int MAXL = 133;
|
||||
class ReportConsole : public ReportBase {
|
||||
public:
|
||||
explicit ReportConsole(json data_, bool compare = false, int index = -1) : ReportBase(data_, compare), selectedIndex(index) {};
|
||||
virtual ~ReportConsole() = default;
|
||||
private:
|
||||
int selectedIndex;
|
||||
std::string headerLine(const std::string& text, int utf);
|
||||
void header() override;
|
||||
void body() override;
|
||||
void footer(double totalScore);
|
||||
void showSummary() override;
|
||||
};
|
||||
};
|
||||
#endif
|
@@ -1,180 +0,0 @@
|
||||
#include <sstream>
|
||||
#include <locale>
|
||||
#include "ReportExcel.h"
|
||||
#include "BestScore.h"
|
||||
|
||||
|
||||
namespace platform {
|
||||
|
||||
/// Prepares an Excel report for `data_`, reusing the given workbook and
/// worksheet when they are not null.
ReportExcel::ReportExcel(json data_, bool compare, lxw_workbook* workbook, lxw_worksheet* worksheet)
    : ReportBase(data_, compare)
    , ExcelFile(workbook, worksheet)
{
    // Make sure there is a workbook and a sheet to write into
    createFile();
}
|
||||
|
||||
void ReportExcel::formatColumns()
|
||||
{
|
||||
worksheet_freeze_panes(worksheet, 6, 1);
|
||||
std::vector<int> columns_sizes = { 22, 10, 9, 7, 12, 12, 12, 12, 12, 3, 15, 12, 23 };
|
||||
for (int i = 0; i < columns_sizes.size(); ++i) {
|
||||
worksheet_set_column(worksheet, i, i, columns_sizes.at(i), NULL);
|
||||
}
|
||||
}
|
||||
void ReportExcel::createWorksheet()
|
||||
{
|
||||
const std::string name = data["model"].get<std::string>();
|
||||
std::string suffix = "";
|
||||
std::string efectiveName;
|
||||
int num = 1;
|
||||
// Create a sheet with the name of the model
|
||||
while (true) {
|
||||
efectiveName = name + suffix;
|
||||
if (workbook_get_worksheet_by_name(workbook, efectiveName.c_str())) {
|
||||
suffix = std::to_string(++num);
|
||||
} else {
|
||||
worksheet = workbook_add_worksheet(workbook, efectiveName.c_str());
|
||||
break;
|
||||
}
|
||||
if (num > 100) {
|
||||
throw std::invalid_argument("Couldn't create sheet " + efectiveName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ReportExcel::createFile()
|
||||
{
|
||||
if (workbook == NULL) {
|
||||
workbook = workbook_new((Paths::excel() + Paths::excelResults()).c_str());
|
||||
}
|
||||
if (worksheet == NULL) {
|
||||
createWorksheet();
|
||||
}
|
||||
setProperties(data["title"].get<std::string>());
|
||||
createFormats();
|
||||
formatColumns();
|
||||
}
|
||||
|
||||
void ReportExcel::closeFile()
|
||||
{
|
||||
workbook_close(workbook);
|
||||
}
|
||||
|
||||
void ReportExcel::header()
|
||||
{
|
||||
std::locale mylocale(std::cout.getloc(), new separated);
|
||||
std::locale::global(mylocale);
|
||||
std::cout.imbue(mylocale);
|
||||
std::stringstream oss;
|
||||
std::string message = data["model"].get<std::string>() + " ver. " + data["version"].get<std::string>() + " " +
|
||||
data["language"].get<std::string>() + " ver. " + data["language_version"].get<std::string>() +
|
||||
" with " + std::to_string(data["folds"].get<int>()) + " Folds cross validation and " + std::to_string(data["seeds"].size()) +
|
||||
" random seeds. " + data["date"].get<std::string>() + " " + data["time"].get<std::string>();
|
||||
worksheet_merge_range(worksheet, 0, 0, 0, 12, message.c_str(), styles["headerFirst"]);
|
||||
worksheet_merge_range(worksheet, 1, 0, 1, 12, data["title"].get<std::string>().c_str(), styles["headerRest"]);
|
||||
worksheet_merge_range(worksheet, 2, 0, 3, 0, ("Score is " + data["score_name"].get<std::string>()).c_str(), styles["headerRest"]);
|
||||
worksheet_merge_range(worksheet, 2, 1, 3, 3, "Execution time", styles["headerRest"]);
|
||||
oss << std::setprecision(2) << std::fixed << data["duration"].get<float>() << " s";
|
||||
worksheet_merge_range(worksheet, 2, 4, 2, 5, oss.str().c_str(), styles["headerRest"]);
|
||||
oss.str("");
|
||||
oss.clear();
|
||||
oss << std::setprecision(2) << std::fixed << data["duration"].get<float>() / 3600 << " h";
|
||||
worksheet_merge_range(worksheet, 3, 4, 3, 5, oss.str().c_str(), styles["headerRest"]);
|
||||
worksheet_merge_range(worksheet, 2, 6, 3, 7, "Platform", styles["headerRest"]);
|
||||
worksheet_merge_range(worksheet, 2, 8, 3, 9, data["platform"].get<std::string>().c_str(), styles["headerRest"]);
|
||||
worksheet_merge_range(worksheet, 2, 10, 2, 12, ("Random seeds: " + fromVector("seeds")).c_str(), styles["headerSmall"]);
|
||||
oss.str("");
|
||||
oss.clear();
|
||||
oss << "Stratified: " << (data["stratified"].get<bool>() ? "True" : "False");
|
||||
worksheet_merge_range(worksheet, 3, 10, 3, 11, oss.str().c_str(), styles["headerSmall"]);
|
||||
oss.str("");
|
||||
oss.clear();
|
||||
oss << "Discretized: " << (data["discretized"].get<bool>() ? "True" : "False");
|
||||
worksheet_write_string(worksheet, 3, 12, oss.str().c_str(), styles["headerSmall"]);
|
||||
}
|
||||
|
||||
void ReportExcel::body()
|
||||
{
|
||||
auto head = std::vector<std::string>(
|
||||
{ "Dataset", "Samples", "Features", "Classes", "Nodes", "Edges", "States", "Score", "Score Std.", "St.", "Time",
|
||||
"Time Std.", "Hyperparameters" });
|
||||
int col = 0;
|
||||
for (const auto& item : head) {
|
||||
writeString(5, col++, item, "bodyHeader");
|
||||
}
|
||||
row = 6;
|
||||
col = 0;
|
||||
int hypSize = 22;
|
||||
json lastResult;
|
||||
double totalScore = 0.0;
|
||||
std::string hyperparameters;
|
||||
for (const auto& r : data["results"]) {
|
||||
writeString(row, col, r["dataset"].get<std::string>(), "text");
|
||||
writeInt(row, col + 1, r["samples"].get<int>(), "ints");
|
||||
writeInt(row, col + 2, r["features"].get<int>(), "ints");
|
||||
writeInt(row, col + 3, r["classes"].get<int>(), "ints");
|
||||
writeDouble(row, col + 4, r["nodes"].get<float>(), "floats");
|
||||
writeDouble(row, col + 5, r["leaves"].get<float>(), "floats");
|
||||
writeDouble(row, col + 6, r["depth"].get<double>(), "floats");
|
||||
writeDouble(row, col + 7, r["score"].get<double>(), "result");
|
||||
writeDouble(row, col + 8, r["score_std"].get<double>(), "result");
|
||||
const std::string status = compareResult(r["dataset"].get<std::string>(), r["score"].get<double>());
|
||||
writeString(row, col + 9, status, "textCentered");
|
||||
writeDouble(row, col + 10, r["time"].get<double>(), "time");
|
||||
writeDouble(row, col + 11, r["time_std"].get<double>(), "time");
|
||||
hyperparameters = r["hyperparameters"].dump();
|
||||
if (hyperparameters.size() > hypSize) {
|
||||
hypSize = hyperparameters.size();
|
||||
}
|
||||
writeString(row, col + 12, hyperparameters, "text");
|
||||
lastResult = r;
|
||||
totalScore += r["score"].get<double>();
|
||||
row++;
|
||||
}
|
||||
// Set the right column width of hyperparameters with the maximum length
|
||||
worksheet_set_column(worksheet, 12, 12, hypSize + 5, NULL);
|
||||
// Show totals if only one dataset is present in the result
|
||||
if (data["results"].size() == 1) {
|
||||
for (const std::string& group : { "scores_train", "scores_test", "times_train", "times_test" }) {
|
||||
row++;
|
||||
col = 1;
|
||||
writeString(row, col, group, "text");
|
||||
for (double item : lastResult[group]) {
|
||||
std::string style = group.find("scores") != std::string::npos ? "result" : "time";
|
||||
writeDouble(row, ++col, item, style);
|
||||
}
|
||||
}
|
||||
// Set with of columns to show those totals completely
|
||||
worksheet_set_column(worksheet, 1, 1, 12, NULL);
|
||||
for (int i = 2; i < 7; ++i) {
|
||||
// doesn't work with from col to col, so...
|
||||
worksheet_set_column(worksheet, i, i, 15, NULL);
|
||||
}
|
||||
} else {
|
||||
footer(totalScore, row);
|
||||
}
|
||||
}
|
||||
|
||||
void ReportExcel::showSummary()
|
||||
{
|
||||
for (const auto& item : summary) {
|
||||
worksheet_write_string(worksheet, row + 2, 1, item.first.c_str(), styles["summaryStyle"]);
|
||||
worksheet_write_number(worksheet, row + 2, 2, item.second, styles["summaryStyle"]);
|
||||
worksheet_merge_range(worksheet, row + 2, 3, row + 2, 5, meaning.at(item.first).c_str(), styles["summaryStyle"]);
|
||||
row += 1;
|
||||
}
|
||||
}
|
||||
|
||||
void ReportExcel::footer(double totalScore, int row)
|
||||
{
|
||||
showSummary();
|
||||
row += 4 + summary.size();
|
||||
auto score = data["score_name"].get<std::string>();
|
||||
auto best = BestScore::getScore(score);
|
||||
if (best.first != "") {
|
||||
worksheet_merge_range(worksheet, row, 1, row, 5, (score + " compared to " + best.first + " .:").c_str(), efectiveStyle("text"));
|
||||
writeDouble(row, 6, totalScore / best.second, "result");
|
||||
}
|
||||
if (!getExistBestFile() && compare) {
|
||||
worksheet_write_string(worksheet, row + 1, 0, "*** Best Results File not found. Couldn't compare any result!", styles["summaryStyle"]);
|
||||
}
|
||||
}
|
||||
}
|
@@ -1,58 +0,0 @@
|
||||
#include "Result.h"
|
||||
#include "BestScore.h"
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include "Colors.h"
|
||||
#include "DotEnv.h"
|
||||
#include "CLocale.h"
|
||||
|
||||
namespace platform {
|
||||
/**
 * Loads the result file `path`/`filename` and caches its summary fields.
 *
 * The score is the sum of the "score" of every entry in "results", divided by the
 * reference value of BestScore::getScore when one exists for the score name.
 * A result counts as complete when it holds more than one entry.
 */
Result::Result(const std::string& path, const std::string& filename)
    : path(path)
    , filename(filename)
{
    auto contents = load();
    date = contents["date"];
    double total = 0;
    for (const auto& entry : contents["results"]) {
        total += entry["score"].get<double>();
    }
    score = total;
    scoreName = contents["score_name"];
    const auto reference = BestScore::getScore(scoreName);
    if (reference.first != "") {
        score /= reference.second;
    }
    title = contents["title"];
    duration = contents["duration"];
    model = contents["model"];
    complete = contents["results"].size() > 1;
}
|
||||
|
||||
/**
 * Parses the result file and returns its contents.
 *
 * @return the parsed json document
 * @throws std::invalid_argument when the file cannot be opened
 */
json Result::load() const
{
    std::ifstream input(path + "/" + filename);
    if (!input.is_open()) {
        throw std::invalid_argument("Unable to open result file. [" + path + "/" + filename + "]");
    }
    json contents = json::parse(input);
    return contents;
}
|
||||
|
||||
std::string Result::to_string(int maxModel) const
|
||||
{
|
||||
auto tmp = ConfigLocale();
|
||||
std::stringstream oss;
|
||||
double durationShow = duration > 3600 ? duration / 3600 : duration > 60 ? duration / 60 : duration;
|
||||
std::string durationUnit = duration > 3600 ? "h" : duration > 60 ? "m" : "s";
|
||||
oss << date << " ";
|
||||
oss << std::setw(maxModel) << std::left << model << " ";
|
||||
oss << std::setw(11) << std::left << scoreName << " ";
|
||||
oss << std::right << std::setw(11) << std::setprecision(7) << std::fixed << score << " ";
|
||||
auto completeString = isComplete() ? "C" : "P";
|
||||
oss << std::setw(1) << " " << completeString << " ";
|
||||
oss << std::setw(7) << std::setprecision(2) << std::fixed << durationShow << " " << durationUnit << " ";
|
||||
oss << std::setw(50) << std::left << title << " ";
|
||||
return oss.str();
|
||||
}
|
||||
}
|
@@ -1,35 +0,0 @@
|
||||
#ifndef RESULT_H
|
||||
#define RESULT_H
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <nlohmann/json.hpp>
|
||||
namespace platform {
|
||||
using json = nlohmann::json;
|
||||
|
||||
class Result {
|
||||
public:
|
||||
Result(const std::string& path, const std::string& filename);
|
||||
json load() const;
|
||||
std::string to_string(int maxModel) const;
|
||||
std::string getFilename() const { return filename; };
|
||||
std::string getDate() const { return date; };
|
||||
double getScore() const { return score; };
|
||||
std::string getTitle() const { return title; };
|
||||
double getDuration() const { return duration; };
|
||||
std::string getModel() const { return model; };
|
||||
std::string getScoreName() const { return scoreName; };
|
||||
bool isComplete() const { return complete; };
|
||||
private:
|
||||
std::string path;
|
||||
std::string filename;
|
||||
std::string date;
|
||||
double score;
|
||||
std::string title;
|
||||
double duration;
|
||||
std::string model;
|
||||
std::string scoreName;
|
||||
bool complete;
|
||||
};
|
||||
};
|
||||
#endif
|
@@ -1,74 +0,0 @@
|
||||
#include "Results.h"
|
||||
#include <algorithm>
|
||||
|
||||
namespace platform {
|
||||
/**
 * Stores the filters, loads the matching result files from `path` and records the
 * length of the longest model name among them (0 when nothing was loaded).
 */
Results::Results(const std::string& path, const std::string& model, const std::string& score, bool complete, bool partial) :
    path(path), model(model), scoreName(score), complete(complete), partial(partial)
{
    load();
    std::size_t longest = 0;
    for (const Result& file : files) {
        const auto length = file.getModel().size();
        if (length > longest) {
            longest = length;
        }
    }
    maxModel = longest;
}
|
||||
void Results::load()
|
||||
{
|
||||
using std::filesystem::directory_iterator;
|
||||
for (const auto& file : directory_iterator(path)) {
|
||||
auto filename = file.path().filename().string();
|
||||
if (filename.find(".json") != std::string::npos && filename.find("results_") == 0) {
|
||||
auto result = Result(path, filename);
|
||||
bool addResult = true;
|
||||
if (model != "any" && result.getModel() != model || scoreName != "any" && scoreName != result.getScoreName() || complete && !result.isComplete() || partial && result.isComplete())
|
||||
addResult = false;
|
||||
if (addResult)
|
||||
files.push_back(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
void Results::hideResult(int index, const std::string& pathHidden)
|
||||
{
|
||||
auto filename = files.at(index).getFilename();
|
||||
rename((path + "/" + filename).c_str(), (pathHidden + "/" + filename).c_str());
|
||||
files.erase(files.begin() + index);
|
||||
}
|
||||
void Results::deleteResult(int index)
|
||||
{
|
||||
auto filename = files.at(index).getFilename();
|
||||
remove((path + "/" + filename).c_str());
|
||||
files.erase(files.begin() + index);
|
||||
}
|
||||
int Results::size() const
|
||||
{
|
||||
return files.size();
|
||||
}
|
||||
void Results::sortDate()
|
||||
{
|
||||
sort(files.begin(), files.end(), [](const Result& a, const Result& b) {
|
||||
return a.getDate() > b.getDate();
|
||||
});
|
||||
}
|
||||
void Results::sortModel()
|
||||
{
|
||||
sort(files.begin(), files.end(), [](const Result& a, const Result& b) {
|
||||
return a.getModel() > b.getModel();
|
||||
});
|
||||
}
|
||||
void Results::sortDuration()
|
||||
{
|
||||
sort(files.begin(), files.end(), [](const Result& a, const Result& b) {
|
||||
return a.getDuration() > b.getDuration();
|
||||
});
|
||||
}
|
||||
void Results::sortScore()
|
||||
{
|
||||
sort(files.begin(), files.end(), [](const Result& a, const Result& b) {
|
||||
return a.getScore() > b.getScore();
|
||||
});
|
||||
}
|
||||
bool Results::empty() const
|
||||
{
|
||||
return files.empty();
|
||||
}
|
||||
}
|
@@ -1,63 +0,0 @@
|
||||
#ifndef STATISTICS_H
|
||||
#define STATISTICS_H
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
namespace platform {
|
||||
// Win / tie / loss counters.
// Every scalar member below carries a default so that a default-constructed value is
// never read uninitialised; brace initialisation keeps working as before.
struct WTL {
    int win = 0;
    int tie = 0;
    int loss = 0;
};
// Values produced by the Friedman test.
struct FriedmanResult {
    double statistic = 0.0;
    double criticalValue = 0.0;
    long double pvalue = 0.0L;
    bool reject = false;
};
// One line of the Holm post-hoc report, describing a single model.
struct HolmLine {
    std::string model;
    long double pvalue = 0.0L;
    double rank = 0.0;
    WTL wtl;
    bool reject = false;
};
// Holm post-hoc result: a model name plus the list of report lines.
struct HolmResult {
    std::string model;
    std::vector<HolmLine> holmLines;
};
|
||||
class Statistics {
|
||||
public:
|
||||
Statistics(const std::vector<std::string>& models, const std::vector<std::string>& datasets, const json& data, double significance = 0.05, bool output = true);
|
||||
bool friedmanTest();
|
||||
void postHocHolmTest(bool friedmanResult);
|
||||
FriedmanResult& getFriedmanResult();
|
||||
HolmResult& getHolmResult();
|
||||
std::map<std::string, std::map<std::string, float>>& getRanks();
|
||||
private:
|
||||
void fit();
|
||||
void computeRanks();
|
||||
void computeWTL();
|
||||
const std::vector<std::string>& models;
|
||||
const std::vector<std::string>& datasets;
|
||||
const json& data;
|
||||
double significance;
|
||||
bool output;
|
||||
bool fitted = false;
|
||||
int nModels = 0;
|
||||
int nDatasets = 0;
|
||||
int controlIdx = 0;
|
||||
std::map<int, WTL> wtl;
|
||||
std::map<std::string, float> ranks;
|
||||
int maxModelName = 0;
|
||||
int maxDatasetName = 0;
|
||||
FriedmanResult friedmanResult;
|
||||
HolmResult holmResult;
|
||||
std::map<std::string, std::map<std::string, float>> ranksModels;
|
||||
};
|
||||
}
|
||||
#endif // !STATISTICS_H
|
@@ -1,30 +0,0 @@
|
||||
#ifndef UTILS_H
|
||||
#define UTILS_H
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
namespace platform {
|
||||
//static std::vector<std::string> split(const std::string& text, char delimiter);
|
||||
// Splits `text` at every `delimiter`. Empty pieces are kept, except a trailing one:
// "a,,b" gives {"a", "", "b"}, "a," gives {"a"} and "" gives {}.
static std::vector<std::string> split(const std::string& text, char delimiter)
{
    std::vector<std::string> pieces;
    std::string::size_type start = 0;
    while (start < text.size()) {
        const auto stop = text.find(delimiter, start);
        if (stop == std::string::npos) {
            pieces.emplace_back(text, start);
            break;
        }
        pieces.emplace_back(text, start, stop - start);
        start = stop + 1;
    }
    return pieces;
}
|
||||
// Returns `str` without its leading and trailing whitespace, as classified by std::isspace.
// Characters are handed to std::isspace as unsigned char: passing a negative value, which
// plain char yields for non-ASCII bytes, is undefined behaviour.
static std::string trim(const std::string& str)
{
    const auto notSpace = [](unsigned char ch) {
        return !std::isspace(ch);
    };
    const auto first = std::find_if(str.begin(), str.end(), notSpace);
    // Search backwards only down to `first`, so an all-blank string yields an empty range
    const auto last = std::find_if(str.rbegin(), std::string::const_reverse_iterator(first), notSpace).base();
    return std::string(first, last);
}
|
||||
}
|
||||
#endif
|
@@ -1,85 +0,0 @@
|
||||
#include <iostream>
|
||||
#include <argparse/argparse.hpp>
|
||||
#include "Paths.h"
|
||||
#include "BestResults.h"
|
||||
#include "Colors.h"
|
||||
#include "config.h"
|
||||
|
||||
/**
 * Declares the command line options of b_best on `program`.
 *
 * @param program parser that receives the option definitions
 * @param argc    not used, kept for the existing callers
 * @param argv    not used, kept for the existing callers
 */
void manageArguments(argparse::ArgumentParser& program, int argc, char** argv)
{
    program.add_argument("-m", "--model").default_value("").help("Filter results of the selected model) (any for all models)");
    program.add_argument("-s", "--score").default_value("").help("Filter results of the score name supplied");
    program.add_argument("--build").help("build best score results file").default_value(false).implicit_value(true);
    program.add_argument("--report").help("report of best score results file").default_value(false).implicit_value(true);
    program.add_argument("--friedman").help("Friedman test").default_value(false).implicit_value(true);
    program.add_argument("--excel").help("Output to excel").default_value(false).implicit_value(true);
    program.add_argument("--level").help("significance level").default_value(0.05).scan<'g', double>().action([](const std::string& value) {
        double k = 0.0;
        try {
            k = std::stod(value);
        }
        catch (...) {
            // The message used to talk about the number of folds (copy/paste slip)
            throw std::runtime_error("Significance level must be a decimal number");
        }
        // Written as a negated range test so that NaN is rejected as well
        if (!(k >= 0.01 && k <= 0.15)) {
            throw std::runtime_error("Significance level has to be a number in [0.01, 0.15]");
        }
        return k;
        });
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
argparse::ArgumentParser program("b_best", { project_version.begin(), project_version.end() });
|
||||
manageArguments(program, argc, argv);
|
||||
std::string model, score;
|
||||
bool build, report, friedman, excel;
|
||||
double level;
|
||||
try {
|
||||
program.parse_args(argc, argv);
|
||||
model = program.get<std::string>("model");
|
||||
score = program.get<std::string>("score");
|
||||
build = program.get<bool>("build");
|
||||
report = program.get<bool>("report");
|
||||
friedman = program.get<bool>("friedman");
|
||||
excel = program.get<bool>("excel");
|
||||
level = program.get<double>("level");
|
||||
if (model == "" || score == "") {
|
||||
throw std::runtime_error("Model and score name must be supplied");
|
||||
}
|
||||
if (friedman && model != "any") {
|
||||
std::cerr << "Friedman test can only be used with all models" << std::endl;
|
||||
std::cerr << program;
|
||||
exit(1);
|
||||
}
|
||||
if (!report && !build) {
|
||||
std::cerr << "Either build, report or both, have to be selected to do anything!" << std::endl;
|
||||
std::cerr << program;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
catch (const std::exception& err) {
|
||||
std::cerr << err.what() << std::endl;
|
||||
std::cerr << program;
|
||||
exit(1);
|
||||
}
|
||||
// Generate report
|
||||
auto results = platform::BestResults(platform::Paths::results(), score, model, friedman, level);
|
||||
if (build) {
|
||||
if (model == "any") {
|
||||
results.buildAll();
|
||||
} else {
|
||||
std::string fileName = results.build();
|
||||
std::cout << Colors::GREEN() << fileName << " created!" << Colors::RESET() << std::endl;
|
||||
}
|
||||
}
|
||||
if (report) {
|
||||
if (model == "any") {
|
||||
results.reportAll(excel);
|
||||
} else {
|
||||
results.reportSingle(excel);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
@@ -1,232 +0,0 @@
|
||||
#include <iostream>
|
||||
#include <argparse/argparse.hpp>
|
||||
#include <map>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <mpi.h>
|
||||
#include "DotEnv.h"
|
||||
#include "Models.h"
|
||||
#include "modelRegister.h"
|
||||
#include "GridSearch.h"
|
||||
#include "Paths.h"
|
||||
#include "Timer.h"
|
||||
#include "Colors.h"
|
||||
#include "config.h"
|
||||
|
||||
using json = nlohmann::json;
|
||||
const int MAXL = 133;
|
||||
|
||||
/**
 * Declares the command line options of b_grid on `program`. The defaults of discretize,
 * stratified, folds and seeds are read from the DotEnv configuration.
 */
void manageArguments(argparse::ArgumentParser& program)
{
    auto env = platform::DotEnv();
    // Parses a fold count; `label` starts both error messages
    const auto parseFolds = [](const std::string& value, const std::string& label) {
        int k = 0;
        try {
            k = std::stoi(value);
        }
        catch (...) {
            throw std::runtime_error(label + " must be an integer");
        }
        if (k < 2) {
            throw std::runtime_error(label + " must be greater than 1");
        }
        return k;
    };
    auto& group = program.add_mutually_exclusive_group(true);
    program.add_argument("-m", "--model")
        .help("Model to use " + platform::Models::instance()->tostring())
        .action([](const std::string& value) {
            static const std::vector<std::string> choices = platform::Models::instance()->getNames();
            if (std::find(choices.begin(), choices.end(), value) == choices.end()) {
                throw std::runtime_error("Model must be one of " + platform::Models::instance()->tostring());
            }
            return value;
        });
    group.add_argument("--dump").help("Show the grid combinations").default_value(false).implicit_value(true);
    group.add_argument("--report").help("Report the computed hyperparameters").default_value(false).implicit_value(true);
    group.add_argument("--compute").help("Perform computation of the grid output hyperparameters").default_value(false).implicit_value(true);
    program.add_argument("--discretize").help("Discretize input datasets").default_value((bool)std::stoi(env.get("discretize"))).implicit_value(true);
    program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value((bool)std::stoi(env.get("stratified"))).implicit_value(true);
    program.add_argument("--quiet").help("Don't display detailed progress").default_value(false).implicit_value(true);
    program.add_argument("--continue").help("Continue computing from that dataset").default_value(platform::GridSearch::NO_CONTINUE());
    program.add_argument("--only").help("Used with continue to compute that dataset only").default_value(false).implicit_value(true);
    program.add_argument("--exclude").default_value("[]").help("Datasets to exclude in json format, e.g. [\"dataset1\", \"dataset2\"]");
    program.add_argument("--nested").help("Set the double/nested cross validation number of folds").default_value(5).scan<'i', int>()
        .action([parseFolds](const std::string& value) {
            return parseFolds(value, "Number of nested folds");
        });
    program.add_argument("--score").help("Score used in gridsearch").default_value("accuracy");
    program.add_argument("-f", "--folds").help("Number of folds").default_value(std::stoi(env.get("n_folds"))).scan<'i', int>()
        .action([parseFolds](const std::string& value) {
            return parseFolds(value, "Number of folds");
        });
    auto seed_values = env.getSeeds();
    program.add_argument("-s", "--seeds").nargs(1, 10).help("Random seeds. Set to -1 to have pseudo random").scan<'i', int>().default_value(seed_values);
}
|
||||
|
||||
void list_dump(std::string& model)
|
||||
{
|
||||
auto data = platform::GridData(platform::Paths::grid_input(model));
|
||||
std::cout << Colors::MAGENTA() << "Listing configuration input file (Grid)" << std::endl << std::endl;
|
||||
int index = 0;
|
||||
int max_hyper = 15;
|
||||
int max_dataset = 7;
|
||||
auto combinations = data.getGridFile();
|
||||
for (auto const& item : combinations) {
|
||||
if (item.first.size() > max_dataset) {
|
||||
max_dataset = item.first.size();
|
||||
}
|
||||
if (item.second.dump().size() > max_hyper) {
|
||||
max_hyper = item.second.dump().size();
|
||||
}
|
||||
}
|
||||
std::cout << Colors::GREEN() << left << " # " << left << setw(max_dataset) << "Dataset" << " #Com. "
|
||||
<< setw(max_hyper) << "Hyperparameters" << std::endl;
|
||||
std::cout << "=== " << string(max_dataset, '=') << " ===== " << string(max_hyper, '=') << std::endl;
|
||||
bool odd = true;
|
||||
for (auto const& item : combinations) {
|
||||
auto color = odd ? Colors::CYAN() : Colors::BLUE();
|
||||
std::cout << color;
|
||||
auto num_combinations = data.getNumCombinations(item.first);
|
||||
std::cout << setw(3) << fixed << right << ++index << left << " " << setw(max_dataset) << item.first
|
||||
<< " " << setw(5) << right << num_combinations << " " << setw(max_hyper) << item.second.dump() << std::endl;
|
||||
odd = !odd;
|
||||
}
|
||||
std::cout << Colors::RESET() << std::endl;
|
||||
}
|
||||
std::string headerLine(const std::string& text, int utf = 0)
|
||||
{
|
||||
int n = MAXL - text.length() - 3;
|
||||
n = n < 0 ? 0 : n;
|
||||
return "* " + text + std::string(n + utf, ' ') + "*\n";
|
||||
}
|
||||
/**
 * Prints the stored grid search output of `model`: a framed header with the run settings
 * followed by one line per dataset with date, duration, score and hyperparameters.
 */
void list_results(json& results, std::string& model)
{
    const std::string frame(MAXL, '*');
    std::cout << Colors::MAGENTA() << frame << std::endl;
    std::cout << headerLine("Listing computed hyperparameters for model " + model);
    std::cout << headerLine("Date & time: " + results["date"].get<std::string>() + " Duration: " + results["duration"].get<std::string>());
    std::cout << headerLine("Score: " + results["score"].get<std::string>());
    std::cout << headerLine(
        "Random seeds: " + results["seeds"].dump()
        + " Discretized: " + (results["discretize"].get<bool>() ? "True" : "False")
        + " Stratified: " + (results["stratified"].get<bool>() ? "True" : "False")
        + " #Folds: " + std::to_string(results["n_folds"].get<int>())
        + " Nested: " + (results["nested"].get<int>() == 0 ? "False" : std::to_string(results["nested"].get<int>()))
    );
    std::cout << frame << std::endl;
    // Column widths: at least as wide as the captions, grown to the longest entry
    int spaces = 7;
    int hyperparameters_spaces = 15;
    for (const auto& item : results["results"].items()) {
        const auto nameLength = item.key().size();
        if (nameLength > spaces) {
            spaces = nameLength;
        }
        auto entry = item.value();
        const auto hyperLength = entry["hyperparameters"].dump().size();
        if (hyperLength > hyperparameters_spaces) {
            hyperparameters_spaces = hyperLength;
        }
    }
    std::cout << Colors::GREEN() << " # " << std::left << std::setw(spaces) << "Dataset" << " " << std::setw(19) << "Date" << " "
        << "Duration " << std::setw(8) << "Score" << " " << "Hyperparameters" << std::endl;
    std::cout << "=== " << std::string(spaces, '=') << " " << std::string(19, '=') << " " << std::string(8, '=') << " "
        << std::string(8, '=') << " " << std::string(hyperparameters_spaces, '=') << std::endl;
    int index = 0;
    bool odd = true;
    for (const auto& item : results["results"].items()) {
        auto entry = item.value();
        std::cout << (odd ? Colors::CYAN() : Colors::BLUE());
        std::cout << std::setw(3) << std::right << index++ << " ";
        std::cout << std::left << std::setw(spaces) << item.key() << " " << entry["date"].get<std::string>();
        std::cout << " " << std::setw(8) << std::right << entry["duration"].get<std::string>();
        std::cout << " " << std::setw(8) << std::setprecision(6) << std::fixed << std::right << entry["score"].get<double>();
        std::cout << " " << entry["hyperparameters"].dump() << std::endl;
        odd = !odd;
    }
    std::cout << Colors::RESET() << std::endl;
}
|
||||
|
||||
/*
|
||||
* Main
|
||||
*/
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
argparse::ArgumentParser program("b_grid", { project_version.begin(), project_version.end() });
|
||||
manageArguments(program);
|
||||
struct platform::ConfigGrid config;
|
||||
bool dump, compute;
|
||||
try {
|
||||
program.parse_args(argc, argv);
|
||||
config.model = program.get<std::string>("model");
|
||||
config.score = program.get<std::string>("score");
|
||||
config.discretize = program.get<bool>("discretize");
|
||||
config.stratified = program.get<bool>("stratified");
|
||||
config.n_folds = program.get<int>("folds");
|
||||
config.quiet = program.get<bool>("quiet");
|
||||
config.only = program.get<bool>("only");
|
||||
config.seeds = program.get<std::vector<int>>("seeds");
|
||||
config.nested = program.get<int>("nested");
|
||||
config.continue_from = program.get<std::string>("continue");
|
||||
if (config.continue_from == platform::GridSearch::NO_CONTINUE() && config.only) {
|
||||
throw std::runtime_error("Cannot use --only without --continue");
|
||||
}
|
||||
dump = program.get<bool>("dump");
|
||||
compute = program.get<bool>("compute");
|
||||
if (dump && (config.continue_from != platform::GridSearch::NO_CONTINUE() || config.only)) {
|
||||
throw std::runtime_error("Cannot use --dump with --continue or --only");
|
||||
}
|
||||
auto excluded = program.get<std::string>("exclude");
|
||||
config.excluded = json::parse(excluded);
|
||||
}
|
||||
catch (const exception& err) {
|
||||
cerr << err.what() << std::endl;
|
||||
cerr << program;
|
||||
exit(1);
|
||||
}
|
||||
/*
|
||||
* Begin Processing
|
||||
*/
|
||||
auto env = platform::DotEnv();
|
||||
config.platform = env.get("platform");
|
||||
platform::Paths::createPath(platform::Paths::grid());
|
||||
auto grid_search = platform::GridSearch(config);
|
||||
platform::Timer timer;
|
||||
timer.start();
|
||||
if (dump) {
|
||||
list_dump(config.model);
|
||||
} else {
|
||||
if (compute) {
|
||||
struct platform::ConfigMPI mpi_config;
|
||||
mpi_config.manager = 0; // which process is the manager
|
||||
MPI_Init(&argc, &argv);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_config.rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &mpi_config.n_procs);
|
||||
if (mpi_config.n_procs < 2) {
|
||||
throw std::runtime_error("Cannot use --compute with less than 2 mpi processes, try mpirun -np 2 ...");
|
||||
}
|
||||
grid_search.go(mpi_config);
|
||||
if (mpi_config.rank == mpi_config.manager) {
|
||||
auto results = grid_search.loadResults();
|
||||
list_results(results, config.model);
|
||||
std::cout << "Process took " << timer.getDurationString() << std::endl;
|
||||
}
|
||||
MPI_Finalize();
|
||||
} else {
|
||||
// List results
|
||||
auto results = grid_search.loadResults();
|
||||
if (results.empty()) {
|
||||
std::cout << "** No results found" << std::endl;
|
||||
} else {
|
||||
list_results(results, config.model);
|
||||
}
|
||||
}
|
||||
}
|
||||
std::cout << "Done!" << std::endl;
|
||||
return 0;
|
||||
}
|
@@ -1,56 +0,0 @@
|
||||
#include <iostream>
|
||||
#include <locale>
|
||||
#include "Paths.h"
|
||||
#include "Colors.h"
|
||||
#include "Datasets.h"
|
||||
|
||||
const int BALANCE_LENGTH = 75;
|
||||
|
||||
// Number punctuation facet: ',' as decimal point and '.' between groups of three digits.
// The base class is spelled std::numpunct so that the struct does not depend on a
// using-directive leaking in from another header.
struct separated : std::numpunct<char> {
    char do_decimal_point() const override { return ','; }
    char do_thousands_sep() const override { return '.'; }
    std::string do_grouping() const override { return "\03"; }
};
|
||||
|
||||
void outputBalance(const std::string& balance)
|
||||
{
|
||||
auto temp = std::string(balance);
|
||||
while (temp.size() > BALANCE_LENGTH - 1) {
|
||||
auto part = temp.substr(0, BALANCE_LENGTH);
|
||||
std::cout << part << std::endl;
|
||||
std::cout << setw(48) << " ";
|
||||
temp = temp.substr(BALANCE_LENGTH);
|
||||
}
|
||||
std::cout << temp << std::endl;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
auto data = platform::Datasets(false, platform::Paths::datasets());
|
||||
locale mylocale(std::cout.getloc(), new separated);
|
||||
locale::global(mylocale);
|
||||
std::cout.imbue(mylocale);
|
||||
std::cout << Colors::GREEN() << "Dataset Sampl. Feat. Cls. Balance" << std::endl;
|
||||
std::string balanceBars = std::string(BALANCE_LENGTH, '=');
|
||||
std::cout << "============================== ====== ===== === " << balanceBars << std::endl;
|
||||
bool odd = true;
|
||||
for (const auto& dataset : data.getNames()) {
|
||||
auto color = odd ? Colors::CYAN() : Colors::BLUE();
|
||||
std::cout << color << setw(30) << left << dataset << " ";
|
||||
data.loadDataset(dataset);
|
||||
auto nSamples = data.getNSamples(dataset);
|
||||
std::cout << setw(6) << right << nSamples << " ";
|
||||
std::cout << setw(5) << right << data.getFeatures(dataset).size() << " ";
|
||||
std::cout << setw(3) << right << data.getNClasses(dataset) << " ";
|
||||
std::stringstream oss;
|
||||
std::string sep = "";
|
||||
for (auto number : data.getClassesCounts(dataset)) {
|
||||
oss << sep << std::setprecision(2) << fixed << (float)number / nSamples * 100.0 << "% (" << number << ")";
|
||||
sep = " / ";
|
||||
}
|
||||
outputBalance(oss.str());
|
||||
odd = !odd;
|
||||
}
|
||||
std::cout << Colors::RESET() << std::endl;
|
||||
return 0;
|
||||
}
|
@@ -1,135 +0,0 @@
|
||||
#include <iostream>
|
||||
#include <argparse/argparse.hpp>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include "Experiment.h"
|
||||
#include "Datasets.h"
|
||||
#include "DotEnv.h"
|
||||
#include "Models.h"
|
||||
#include "modelRegister.h"
|
||||
#include "Paths.h"
|
||||
#include "config.h"
|
||||
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
/**
 * Declares the command line options of b_main on `program`. The defaults of discretize,
 * stratified, folds and seeds are read from the DotEnv configuration.
 */
void manageArguments(argparse::ArgumentParser& program)
{
    auto env = platform::DotEnv();
    // Accepts only the names of the registered models
    const auto checkModel = [](const std::string& value) {
        static const std::vector<std::string> choices = platform::Models::instance()->getNames();
        if (std::find(choices.begin(), choices.end(), value) == choices.end()) {
            throw std::runtime_error("Model must be one of " + platform::Models::instance()->tostring());
        }
        return value;
    };
    // Accepts integers greater than 1
    const auto checkFolds = [](const std::string& value) {
        int k = 0;
        try {
            k = std::stoi(value);
        }
        catch (...) {
            throw std::runtime_error("Number of folds must be an integer");
        }
        if (k < 2) {
            throw std::runtime_error("Number of folds must be greater than 1");
        }
        return k;
    };
    program.add_argument("-d", "--dataset").default_value("").help("Dataset file name");
    program.add_argument("--hyperparameters").default_value("{}").help("Hyperparameters passed to the model in Experiment");
    program.add_argument("--hyper-file").default_value("").help("Hyperparameters file name."
        "Mutually exclusive with hyperparameters. This file should contain hyperparameters for each dataset in json format.");
    program.add_argument("-m", "--model")
        .help("Model to use " + platform::Models::instance()->tostring())
        .action(checkModel);
    program.add_argument("--title").default_value("").help("Experiment title");
    program.add_argument("--discretize").help("Discretize input dataset").default_value((bool)std::stoi(env.get("discretize"))).implicit_value(true);
    program.add_argument("--quiet").help("Don't display detailed progress").default_value(false).implicit_value(true);
    program.add_argument("--save").help("Save result (always save if no dataset is supplied)").default_value(false).implicit_value(true);
    program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value((bool)std::stoi(env.get("stratified"))).implicit_value(true);
    program.add_argument("-f", "--folds").help("Number of folds").default_value(std::stoi(env.get("n_folds"))).scan<'i', int>().action(checkFolds);
    auto seed_values = env.getSeeds();
    program.add_argument("-s", "--seeds").nargs(1, 10).help("Random seeds. Set to -1 to have pseudo random").scan<'i', int>().default_value(seed_values);
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
argparse::ArgumentParser program("b_main", { project_version.begin(), project_version.end() });
|
||||
manageArguments(program);
|
||||
std::string file_name, model_name, title, hyperparameters_file;
|
||||
json hyperparameters_json;
|
||||
bool discretize_dataset, stratified, saveResults, quiet;
|
||||
std::vector<int> seeds;
|
||||
std::vector<std::string> filesToTest;
|
||||
int n_folds;
|
||||
try {
|
||||
program.parse_args(argc, argv);
|
||||
file_name = program.get<std::string>("dataset");
|
||||
model_name = program.get<std::string>("model");
|
||||
discretize_dataset = program.get<bool>("discretize");
|
||||
stratified = program.get<bool>("stratified");
|
||||
quiet = program.get<bool>("quiet");
|
||||
n_folds = program.get<int>("folds");
|
||||
seeds = program.get<std::vector<int>>("seeds");
|
||||
auto hyperparameters = program.get<std::string>("hyperparameters");
|
||||
hyperparameters_json = json::parse(hyperparameters);
|
||||
hyperparameters_file = program.get<std::string>("hyper-file");
|
||||
if (hyperparameters_file != "" && hyperparameters != "{}") {
|
||||
throw runtime_error("hyperparameters and hyper_file are mutually exclusive");
|
||||
}
|
||||
title = program.get<std::string>("title");
|
||||
if (title == "" && file_name == "") {
|
||||
throw runtime_error("title is mandatory if dataset is not provided");
|
||||
}
|
||||
saveResults = program.get<bool>("save");
|
||||
}
|
||||
catch (const exception& err) {
|
||||
cerr << err.what() << std::endl;
|
||||
cerr << program;
|
||||
exit(1);
|
||||
}
|
||||
auto datasets = platform::Datasets(discretize_dataset, platform::Paths::datasets());
|
||||
if (file_name != "") {
|
||||
if (!datasets.isDataset(file_name)) {
|
||||
cerr << "Dataset " << file_name << " not found" << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
if (title == "") {
|
||||
title = "Test " + file_name + " " + model_name + " " + to_string(n_folds) + " folds";
|
||||
}
|
||||
filesToTest.push_back(file_name);
|
||||
} else {
|
||||
filesToTest = datasets.getNames();
|
||||
saveResults = true;
|
||||
}
|
||||
platform::HyperParameters test_hyperparams;
|
||||
if (hyperparameters_file != "") {
|
||||
test_hyperparams = platform::HyperParameters(datasets.getNames(), hyperparameters_file);
|
||||
} else {
|
||||
test_hyperparams = platform::HyperParameters(datasets.getNames(), hyperparameters_json);
|
||||
}
|
||||
|
||||
/*
|
||||
* Begin Processing
|
||||
*/
|
||||
auto env = platform::DotEnv();
|
||||
auto experiment = platform::Experiment();
|
||||
experiment.setTitle(title).setLanguage("cpp").setLanguageVersion("14.0.3");
|
||||
experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform(env.get("platform"));
|
||||
experiment.setStratified(stratified).setNFolds(n_folds).setScoreName("accuracy");
|
||||
experiment.setHyperparameters(test_hyperparams);
|
||||
for (auto seed : seeds) {
|
||||
experiment.addRandomSeed(seed);
|
||||
}
|
||||
platform::Timer timer;
|
||||
timer.start();
|
||||
experiment.go(filesToTest, quiet);
|
||||
experiment.setDuration(timer.getDuration());
|
||||
if (saveResults) {
|
||||
experiment.save(platform::Paths::results());
|
||||
}
|
||||
if (!quiet)
|
||||
experiment.report();
|
||||
std::cout << "Done!" << std::endl;
|
||||
return 0;
|
||||
}
|
@@ -1,49 +0,0 @@
|
||||
#include <iostream>
|
||||
#include <argparse/argparse.hpp>
|
||||
#include "ManageResults.h"
|
||||
#include "config.h"
|
||||
|
||||
|
||||
// Declares the command line options of b_manage on `program`, parses
// argc/argv and validates the result.
//
// @param program parser that receives the option definitions and the parsed values
// @param argc    argument count as received by main
// @param argv    argument vector as received by main
//
// On any parsing or validation error the message and the usage text are
// written to std::cerr and the process exits with status 1.
void manageArguments(argparse::ArgumentParser& program, int argc, char** argv)
{
    program.add_argument("-n", "--number").default_value(0).help("Number of results to show (0 = all)").scan<'i', int>();
    program.add_argument("-m", "--model").default_value("any").help("Filter results of the selected model");
    program.add_argument("-s", "--score").default_value("any").help("Filter results of the score name supplied");
    program.add_argument("--complete").help("Show only results with all datasets").default_value(false).implicit_value(true);
    program.add_argument("--partial").help("Show only partial results").default_value(false).implicit_value(true);
    program.add_argument("--compare").help("Compare with best results").default_value(false).implicit_value(true);
    try {
        program.parse_args(argc, argv);
        const auto number = program.get<int>("number");
        if (number < 0) {
            throw std::runtime_error("Number of results must be greater than or equal to 0");
        }
        // The remaining options are only fetched so that any lookup/type error
        // is reported here, together with the usage text; the values are
        // read again by the caller.
        static_cast<void>(program.get<std::string>("model"));
        static_cast<void>(program.get<std::string>("score"));
        static_cast<void>(program.get<bool>("complete"));
        static_cast<void>(program.get<bool>("partial"));
        static_cast<void>(program.get<bool>("compare"));
    }
    catch (const std::exception& err) {
        std::cerr << err.what() << std::endl;
        std::cerr << program;
        exit(1);
    }
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
auto program = argparse::ArgumentParser("b_manage", { project_version.begin(), project_version.end() });
|
||||
manageArguments(program, argc, argv);
|
||||
int number = program.get<int>("number");
|
||||
std::string model = program.get<std::string>("model");
|
||||
std::string score = program.get<std::string>("score");
|
||||
auto complete = program.get<bool>("complete");
|
||||
auto partial = program.get<bool>("partial");
|
||||
auto compare = program.get<bool>("compare");
|
||||
if (complete)
|
||||
partial = false;
|
||||
auto manager = platform::ManageResults(number, model, score, complete, partial, compare);
|
||||
manager.doMenu();
|
||||
return 0;
|
||||
}
|
@@ -1,29 +0,0 @@
|
||||
#ifndef MODEL_REGISTER_H
|
||||
#define MODEL_REGISTER_H
|
||||
// Model registrations: each static Registrar associates a model name with a
// factory lambda that returns a newly allocated classifier.

// bayesnet classifiers
static platform::Registrar registrarT("TAN", []() -> bayesnet::BaseClassifier* { return new bayesnet::TAN(); });
static platform::Registrar registrarTLD("TANLd", []() -> bayesnet::BaseClassifier* { return new bayesnet::TANLd(); });
static platform::Registrar registrarS("SPODE", []() -> bayesnet::BaseClassifier* { return new bayesnet::SPODE(2); });
static platform::Registrar registrarSLD("SPODELd", []() -> bayesnet::BaseClassifier* { return new bayesnet::SPODELd(2); });
static platform::Registrar registrarK("KDB", []() -> bayesnet::BaseClassifier* { return new bayesnet::KDB(2); });
static platform::Registrar registrarKLD("KDBLd", []() -> bayesnet::BaseClassifier* { return new bayesnet::KDBLd(2); });
static platform::Registrar registrarA("AODE", []() -> bayesnet::BaseClassifier* { return new bayesnet::AODE(); });
static platform::Registrar registrarALD("AODELd", []() -> bayesnet::BaseClassifier* { return new bayesnet::AODELd(); });
static platform::Registrar registrarBA("BoostAODE", []() -> bayesnet::BaseClassifier* { return new bayesnet::BoostAODE(); });

// pywrap classifiers
static platform::Registrar registrarSt("STree", []() -> bayesnet::BaseClassifier* { return new pywrap::STree(); });
static platform::Registrar registrarOdte("Odte", []() -> bayesnet::BaseClassifier* { return new pywrap::ODTE(); });
static platform::Registrar registrarSvc("SVC", []() -> bayesnet::BaseClassifier* { return new pywrap::SVC(); });
static platform::Registrar registrarRaF("RandomForest", []() -> bayesnet::BaseClassifier* { return new pywrap::RandomForest(); });
|
||||
#endif
|
@@ -4,12 +4,17 @@
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <algorithm>
|
||||
#include "BestResults.h"
|
||||
#include "Result.h"
|
||||
#include "Colors.h"
|
||||
#include "Statistics.h"
|
||||
#include <cctype>
|
||||
#include "common/Colors.h"
|
||||
#include "common/CLocale.h"
|
||||
#include "common/Paths.h"
|
||||
#include "common/Utils.h" // compute_std
|
||||
#include "results/Result.h"
|
||||
#include "BestResultsExcel.h"
|
||||
#include "CLocale.h"
|
||||
#include "BestResultsTex.h"
|
||||
#include "BestResultsMd.h"
|
||||
#include "best/Statistics.h"
|
||||
#include "BestResults.h"
|
||||
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
@@ -37,41 +42,36 @@ namespace platform {
|
||||
json bests;
|
||||
for (const auto& file : files) {
|
||||
auto result = Result(path, file);
|
||||
auto data = result.load();
|
||||
auto data = result.getJson();
|
||||
for (auto const& item : data.at("results")) {
|
||||
bool update = false;
|
||||
// Check if results file contains only one dataset
|
||||
bool update = true;
|
||||
auto datasetName = item.at("dataset").get<std::string>();
|
||||
if (bests.contains(datasetName)) {
|
||||
if (item.at("score").get<double>() > bests[datasetName].at(0).get<double>()) {
|
||||
update = true;
|
||||
if (dataset != "any" && dataset != datasetName) {
|
||||
continue;
|
||||
}
|
||||
if (bests.contains(datasetName)) {
|
||||
if (item.at("score").get<double>() < bests[datasetName].at(0).get<double>()) {
|
||||
update = false;
|
||||
}
|
||||
} else {
|
||||
update = true;
|
||||
}
|
||||
if (update) {
|
||||
bests[datasetName] = { item.at("score").get<double>(), item.at("hyperparameters"), file };
|
||||
bests[datasetName] = { item.at("score").get<double>(), item.at("hyperparameters"), file, item.at("score_std").get<double>() };
|
||||
}
|
||||
}
|
||||
}
|
||||
std::string bestFileName = path + bestResultFile();
|
||||
if (FILE* fileTest = fopen(bestFileName.c_str(), "r")) {
|
||||
fclose(fileTest);
|
||||
std::cout << Colors::MAGENTA() << "File " << bestFileName << " already exists and it shall be overwritten." << Colors::RESET() << std::endl;
|
||||
if (bests.empty()) {
|
||||
std::cerr << Colors::MAGENTA() << "No results found for model " << model << " and score " << score << Colors::RESET() << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
std::string bestFileName = path + Paths::bestResultsFile(score, model);
|
||||
std::ofstream file(bestFileName);
|
||||
file << bests;
|
||||
file.close();
|
||||
return bestFileName;
|
||||
}
|
||||
std::string BestResults::bestResultFile()
|
||||
{
|
||||
return "best_results_" + score + "_" + model + ".json";
|
||||
}
|
||||
std::pair<std::string, std::string> getModelScore(std::string name)
|
||||
{
|
||||
// results_accuracy_BoostAODE_MacBookpro16_2023-09-06_12:27:00_1.json
|
||||
int i = 0;
|
||||
auto pos = name.find("_");
|
||||
auto pos2 = name.find("_", pos + 1);
|
||||
std::string score = name.substr(pos + 1, pos2 - pos - 1);
|
||||
@@ -93,6 +93,7 @@ namespace platform {
|
||||
}
|
||||
}
|
||||
}
|
||||
std::sort(files.begin(), files.end());
|
||||
return files;
|
||||
}
|
||||
json BestResults::loadFile(const std::string& fileName)
|
||||
@@ -121,29 +122,44 @@ namespace platform {
|
||||
models.insert(fileModel);
|
||||
}
|
||||
result = std::vector<std::string>(models.begin(), models.end());
|
||||
maxModelName = (*max_element(result.begin(), result.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
|
||||
maxModelName = std::max(minLength, maxModelName);
|
||||
return result;
|
||||
}
|
||||
// Returns a copy of `data` with every character passed through std::tolower.
// Each character is converted to unsigned char first, as std::tolower requires
// a value representable as unsigned char.
std::string toLower(std::string data)
{
    for (char& ch : data) {
        ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
    }
    return data;
}
|
||||
std::vector<std::string> BestResults::getDatasets(json table)
|
||||
{
|
||||
std::vector<std::string> datasets;
|
||||
for (const auto& dataset : table.items()) {
|
||||
datasets.push_back(dataset.key());
|
||||
for (const auto& dataset_ : table.items()) {
|
||||
datasets.push_back(dataset_.key());
|
||||
}
|
||||
std::stable_sort(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) {
|
||||
return toLower(a) < toLower(b);
|
||||
});
|
||||
maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
|
||||
maxDatasetName = std::max(7, maxDatasetName);
|
||||
return datasets;
|
||||
}
|
||||
void BestResults::buildAll()
|
||||
{
|
||||
auto models = getModels();
|
||||
std::cout << "Building best results for model: ";
|
||||
for (const auto& model : models) {
|
||||
std::cout << "Building best results for model: " << model << std::endl;
|
||||
this->model = model;
|
||||
std::cout << model << ", ";
|
||||
build();
|
||||
}
|
||||
std::cout << "end." << std::endl << std::endl;
|
||||
model = "any";
|
||||
}
|
||||
void BestResults::listFile()
|
||||
{
|
||||
std::string bestFileName = path + bestResultFile();
|
||||
std::string bestFileName = path + Paths::bestResultsFile(score, model);
|
||||
if (FILE* fileTest = fopen(bestFileName.c_str(), "r")) {
|
||||
fclose(fileTest);
|
||||
} else {
|
||||
@@ -154,7 +170,6 @@ namespace platform {
|
||||
auto date = ftime_to_string(std::filesystem::last_write_time(bestFileName));
|
||||
auto data = loadFile(bestFileName);
|
||||
auto datasets = getDatasets(data);
|
||||
int maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
|
||||
int maxFileName = 0;
|
||||
int maxHyper = 15;
|
||||
for (auto const& item : data.items()) {
|
||||
@@ -168,10 +183,9 @@ namespace platform {
|
||||
std::cout << Colors::GREEN() << " # " << std::setw(maxDatasetName + 1) << std::left << "Dataset" << "Score " << std::setw(maxFileName) << "File" << " Hyperparameters" << std::endl;
|
||||
std::cout << "=== " << std::string(maxDatasetName, '=') << " =========== " << std::string(maxFileName, '=') << " " << std::string(maxHyper, '=') << std::endl;
|
||||
auto i = 0;
|
||||
bool odd = true;
|
||||
double total = 0;
|
||||
for (auto const& item : data.items()) {
|
||||
auto color = odd ? Colors::BLUE() : Colors::CYAN();
|
||||
auto color = (i % 2) ? Colors::BLUE() : Colors::CYAN();
|
||||
double value = item.value().at(0).get<double>();
|
||||
std::cout << color << std::setw(3) << std::fixed << std::right << i++ << " ";
|
||||
std::cout << std::setw(maxDatasetName) << std::left << item.key() << " ";
|
||||
@@ -180,10 +194,10 @@ namespace platform {
|
||||
std::cout << item.value().at(1) << " ";
|
||||
std::cout << std::endl;
|
||||
total += value;
|
||||
odd = !odd;
|
||||
}
|
||||
std::cout << Colors::GREEN() << "=== " << std::string(maxDatasetName, '=') << " ===========" << std::endl;
|
||||
std::cout << std::setw(5 + maxDatasetName) << "Total.................. " << std::setw(11) << std::setprecision(8) << std::fixed << total << std::endl;
|
||||
std::cout << Colors::GREEN() << " Total" << std::string(maxDatasetName - 5, '.') << " " << std::setw(11) << std::setprecision(8) << std::fixed << total << std::endl;
|
||||
|
||||
}
|
||||
json BestResults::buildTableResults(std::vector<std::string> models)
|
||||
{
|
||||
@@ -191,7 +205,7 @@ namespace platform {
|
||||
auto maxDate = std::filesystem::file_time_type::max();
|
||||
for (const auto& model : models) {
|
||||
this->model = model;
|
||||
std::string bestFileName = path + bestResultFile();
|
||||
std::string bestFileName = path + Paths::bestResultsFile(score, model);
|
||||
if (FILE* fileTest = fopen(bestFileName.c_str(), "r")) {
|
||||
fclose(fileTest);
|
||||
} else {
|
||||
@@ -208,13 +222,20 @@ namespace platform {
|
||||
table["dateTable"] = ftime_to_string(maxDate);
|
||||
return table;
|
||||
}
|
||||
void BestResults::printTableResults(std::vector<std::string> models, json table)
|
||||
|
||||
void BestResults::printTableResults(std::vector<std::string> models, json table, bool tex, bool index)
|
||||
{
|
||||
std::stringstream oss;
|
||||
oss << Colors::GREEN() << "Best results for " << score << " as of " << table.at("dateTable").get<std::string>() << std::endl;
|
||||
std::cout << oss.str();
|
||||
std::cout << std::string(oss.str().size() - 8, '-') << std::endl;
|
||||
std::cout << Colors::GREEN() << " # " << std::setw(maxDatasetName + 1) << std::left << std::string("Dataset");
|
||||
auto bestResultsTex = BestResultsTex(score);
|
||||
auto bestResultsMd = BestResultsMd();
|
||||
if (tex) {
|
||||
bestResultsTex.results_header(models, table.at("dateTable").get<std::string>(), index);
|
||||
bestResultsMd.results_header(models, table.at("dateTable").get<std::string>());
|
||||
}
|
||||
for (const auto& model : models) {
|
||||
std::cout << std::setw(maxModelName) << std::left << model << " ";
|
||||
}
|
||||
@@ -225,21 +246,27 @@ namespace platform {
|
||||
}
|
||||
std::cout << std::endl;
|
||||
auto i = 0;
|
||||
bool odd = true;
|
||||
std::map<std::string, double> totals;
|
||||
std::map<std::string, std::vector<double>> totals;
|
||||
int nDatasets = table.begin().value().size();
|
||||
for (const auto& model : models) {
|
||||
totals[model] = 0.0;
|
||||
}
|
||||
auto datasets = getDatasets(table.begin().value());
|
||||
for (auto const& dataset : datasets) {
|
||||
auto color = odd ? Colors::BLUE() : Colors::CYAN();
|
||||
if (tex) {
|
||||
bestResultsTex.results_body(datasets, table, index);
|
||||
bestResultsMd.results_body(datasets, table);
|
||||
}
|
||||
for (auto const& dataset_ : datasets) {
|
||||
auto color = (i % 2) ? Colors::BLUE() : Colors::CYAN();
|
||||
std::cout << color << std::setw(3) << std::fixed << std::right << i++ << " ";
|
||||
std::cout << std::setw(maxDatasetName) << std::left << dataset << " ";
|
||||
std::cout << std::setw(maxDatasetName) << std::left << dataset_ << " ";
|
||||
double maxValue = 0;
|
||||
// Find out the max value for this dataset
|
||||
for (const auto& model : models) {
|
||||
double value = table[model].at(dataset).at(0).get<double>();
|
||||
double value;
|
||||
try {
|
||||
value = table[model].at(dataset_).at(0).get<double>();
|
||||
}
|
||||
catch (nlohmann::json_abi_v3_11_3::detail::out_of_range err) {
|
||||
value = -1.0;
|
||||
}
|
||||
if (value > maxValue) {
|
||||
maxValue = value;
|
||||
}
|
||||
@@ -247,34 +274,53 @@ namespace platform {
|
||||
// Print the row with red colors on max values
|
||||
for (const auto& model : models) {
|
||||
std::string efectiveColor = color;
|
||||
double value = table[model].at(dataset).at(0).get<double>();
|
||||
double value, std;
|
||||
try {
|
||||
value = table[model].at(dataset_).at(0).get<double>();
|
||||
std = table[model].at(dataset_).at(3).get<double>();
|
||||
}
|
||||
catch (nlohmann::json_abi_v3_11_3::detail::out_of_range err) {
|
||||
value = -1.0;
|
||||
std = -1.0;
|
||||
}
|
||||
if (value == maxValue) {
|
||||
efectiveColor = Colors::RED();
|
||||
}
|
||||
totals[model] += value;
|
||||
std::cout << efectiveColor << std::setw(maxModelName) << std::setprecision(maxModelName - 2) << std::fixed << value << " ";
|
||||
if (value == -1) {
|
||||
std::cout << Colors::YELLOW() << std::setw(maxModelName) << std::right << "N/A" << " ";
|
||||
} else {
|
||||
totals[model].push_back(value);
|
||||
std::cout << efectiveColor << std::setw(maxModelName - 6) << std::setprecision(maxModelName - 8) << std::fixed << value;
|
||||
std::cout << efectiveColor << "±" << std::setw(5) << std::setprecision(3) << std::fixed << std << " ";
|
||||
}
|
||||
}
|
||||
std::cout << std::endl;
|
||||
odd = !odd;
|
||||
}
|
||||
std::cout << Colors::GREEN() << "=== " << std::string(maxDatasetName, '=') << " ";
|
||||
for (const auto& model : models) {
|
||||
std::cout << std::string(maxModelName, '=') << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
std::cout << Colors::GREEN() << std::setw(5 + maxDatasetName) << " Totals...................";
|
||||
double max = 0.0;
|
||||
std::cout << Colors::GREEN() << " Average" << std::string(maxDatasetName - 7, '.') << " ";
|
||||
double max_value = 0.0;
|
||||
std::string best_model = "";
|
||||
for (const auto& total : totals) {
|
||||
if (total.second > max) {
|
||||
max = total.second;
|
||||
auto actual = std::reduce(total.second.begin(), total.second.end());
|
||||
if (actual > max_value) {
|
||||
max_value = actual;
|
||||
best_model = total.first;
|
||||
}
|
||||
}
|
||||
if (tex) {
|
||||
bestResultsTex.results_footer(totals, best_model);
|
||||
bestResultsMd.results_footer(totals, best_model);
|
||||
}
|
||||
for (const auto& model : models) {
|
||||
std::string efectiveColor = Colors::GREEN();
|
||||
if (totals[model] == max) {
|
||||
efectiveColor = Colors::RED();
|
||||
}
|
||||
std::cout << efectiveColor << std::right << std::setw(maxModelName) << std::setprecision(maxModelName - 4) << std::fixed << totals[model] << " ";
|
||||
std::string efectiveColor = model == best_model ? Colors::RED() : Colors::GREEN();
|
||||
double value = std::reduce(totals[model].begin(), totals[model].end()) / nDatasets;
|
||||
double std = compute_std(totals[model], value);
|
||||
std::cout << efectiveColor << std::right << std::setw(maxModelName - 6) << std::setprecision(maxModelName - 8) << std::fixed << value;
|
||||
std::cout << efectiveColor << "±" << std::setw(5) << std::setprecision(3) << std::fixed << std << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
@@ -286,58 +332,53 @@ namespace platform {
|
||||
// Build the table of results
|
||||
json table = buildTableResults(models);
|
||||
std::vector<std::string> datasets = getDatasets(table.begin().value());
|
||||
BestResultsExcel excel(score, datasets);
|
||||
excel.reportSingle(model, path + bestResultFile());
|
||||
messageExcelFile(excel.getFileName());
|
||||
BestResultsExcel excel_report(path, score, datasets);
|
||||
excel_report.reportSingle(model, path + Paths::bestResultsFile(score, model));
|
||||
messageOutputFile("Excel", excel_report.getFileName());
|
||||
excelFileName = excel_report.getFileName();
|
||||
}
|
||||
}
|
||||
void BestResults::reportAll(bool excel)
|
||||
void BestResults::reportAll(bool excel, bool tex, bool index)
|
||||
{
|
||||
auto models = getModels();
|
||||
// Build the table of results
|
||||
json table = buildTableResults(models);
|
||||
std::vector<std::string> datasets = getDatasets(table.begin().value());
|
||||
maxModelName = (*max_element(models.begin(), models.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
|
||||
maxModelName = std::max(12, maxModelName);
|
||||
maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
|
||||
maxDatasetName = std::max(25, maxDatasetName);
|
||||
// Print the table of results
|
||||
printTableResults(models, table);
|
||||
printTableResults(models, table, tex, index);
|
||||
// Compute the Friedman test
|
||||
std::map<std::string, std::map<std::string, float>> ranksModels;
|
||||
if (friedman) {
|
||||
Statistics stats(models, datasets, table, significance);
|
||||
Statistics stats(score, models, datasets, table, significance);
|
||||
auto result = stats.friedmanTest();
|
||||
stats.postHocHolmTest(result);
|
||||
stats.postHocTest();
|
||||
stats.postHocTestReport(result, tex);
|
||||
ranksModels = stats.getRanks();
|
||||
}
|
||||
if (tex) {
|
||||
messageOutputFile("TeX", Paths::tex() + Paths::tex_output());
|
||||
messageOutputFile("MarkDown", Paths::tex() + Paths::md_output());
|
||||
if (friedman) {
|
||||
messageOutputFile("TeX", Paths::tex() + Paths::tex_post_hoc());
|
||||
messageOutputFile("MarkDown", Paths::tex() + Paths::md_post_hoc());
|
||||
}
|
||||
}
|
||||
if (excel) {
|
||||
BestResultsExcel excel(score, datasets);
|
||||
BestResultsExcel excel(path, score, datasets);
|
||||
excel.reportAll(models, table, ranksModels, friedman, significance);
|
||||
if (friedman) {
|
||||
int idx = -1;
|
||||
double min = 2000;
|
||||
// Find out the control model
|
||||
auto totals = std::vector<double>(models.size(), 0.0);
|
||||
for (const auto& dataset : datasets) {
|
||||
for (int i = 0; i < models.size(); ++i) {
|
||||
totals[i] += ranksModels[dataset][models[i]];
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < models.size(); ++i) {
|
||||
if (totals[i] < min) {
|
||||
min = totals[i];
|
||||
idx = i;
|
||||
}
|
||||
}
|
||||
Statistics stats(score, models, datasets, table, significance);
|
||||
int idx = stats.getControlIdx();
|
||||
model = models.at(idx);
|
||||
excel.reportSingle(model, path + bestResultFile());
|
||||
excel.reportSingle(model, path + Paths::bestResultsFile(score, model));
|
||||
}
|
||||
messageExcelFile(excel.getFileName());
|
||||
messageOutputFile("Excel", excel.getFileName());
|
||||
excelFileName = excel.getFileName();
|
||||
}
|
||||
}
|
||||
void BestResults::messageExcelFile(const std::string& fileName)
|
||||
void BestResults::messageOutputFile(const std::string& title, const std::string& fileName)
|
||||
{
|
||||
std::cout << Colors::YELLOW() << "** Excel file generated: " << fileName << Colors::RESET() << std::endl;
|
||||
std::cout << Colors::YELLOW() << "** " << std::setw(8) << std::left << title
|
||||
<< " file generated: " << fileName << Colors::RESET() << std::endl;
|
||||
}
|
||||
}
|
@@ -2,35 +2,39 @@
|
||||
#define BESTRESULTS_H
|
||||
#include <string>
|
||||
#include <nlohmann/json.hpp>
|
||||
using json = nlohmann::json;
|
||||
namespace platform {
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
class BestResults {
|
||||
public:
|
||||
explicit BestResults(const std::string& path, const std::string& score, const std::string& model, bool friedman, double significance = 0.05)
|
||||
: path(path), score(score), model(model), friedman(friedman), significance(significance)
|
||||
explicit BestResults(const std::string& path, const std::string& score, const std::string& model, const std::string& dataset, bool friedman, double significance = 0.05)
|
||||
: path(path), score(score), model(model), dataset(dataset), friedman(friedman), significance(significance)
|
||||
{
|
||||
}
|
||||
std::string build();
|
||||
void reportSingle(bool excel);
|
||||
void reportAll(bool excel);
|
||||
void reportAll(bool excel, bool tex, bool index);
|
||||
void buildAll();
|
||||
std::string getExcelFileName() const { return excelFileName; }
|
||||
private:
|
||||
std::vector<std::string> getModels();
|
||||
std::vector<std::string> getDatasets(json table);
|
||||
std::vector<std::string> loadResultFiles();
|
||||
void messageExcelFile(const std::string& fileName);
|
||||
void messageOutputFile(const std::string& title, const std::string& fileName);
|
||||
json buildTableResults(std::vector<std::string> models);
|
||||
void printTableResults(std::vector<std::string> models, json table);
|
||||
std::string bestResultFile();
|
||||
void printTableResults(std::vector<std::string> models, json table, bool tex, bool index);
|
||||
json loadFile(const std::string& fileName);
|
||||
void listFile();
|
||||
std::string path;
|
||||
std::string score;
|
||||
std::string model;
|
||||
std::string dataset;
|
||||
bool friedman;
|
||||
double significance;
|
||||
int maxModelName = 0;
|
||||
int maxDatasetName = 0;
|
||||
int minLength = 13; // Minimum length for scores
|
||||
std::string excelFileName;
|
||||
};
|
||||
}
|
||||
#endif //BESTRESULTS_H
|
||||
#endif
|
@@ -1,10 +1,10 @@
|
||||
#include <sstream>
|
||||
#include "BestResultsExcel.h"
|
||||
#include "Paths.h"
|
||||
#include <map>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include "Statistics.h"
|
||||
#include "ReportExcel.h"
|
||||
#include "common/Paths.h"
|
||||
#include "reports/ReportExcel.h"
|
||||
#include "best/Statistics.h"
|
||||
#include "BestResultsExcel.h"
|
||||
|
||||
namespace platform {
|
||||
json loadResultData(const std::string& fileName)
|
||||
@@ -30,9 +30,10 @@ namespace platform {
|
||||
}
|
||||
return columnName;
|
||||
}
|
||||
BestResultsExcel::BestResultsExcel(const std::string& score, const std::vector<std::string>& datasets) : score(score), datasets(datasets)
|
||||
BestResultsExcel::BestResultsExcel(const std::string& path, const std::string& score, const std::vector<std::string>& datasets) : path(path), score(score), datasets(datasets)
|
||||
{
|
||||
workbook = workbook_new((Paths::excel() + fileName).c_str());
|
||||
file_name = Paths::bestResultsExcel(score);
|
||||
workbook = workbook_new(getFileName().c_str());
|
||||
setProperties("Best Results");
|
||||
int maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
|
||||
datasetNameSize = std::max(datasetNameSize, maxDatasetName);
|
||||
@@ -63,19 +64,21 @@ namespace platform {
|
||||
json data = loadResultData(fileName);
|
||||
|
||||
std::string title = "Best results for " + model;
|
||||
worksheet_merge_range(worksheet, 0, 0, 0, 4, title.c_str(), styles["headerFirst"]);
|
||||
worksheet_merge_range(worksheet, 0, 0, 0, 5, title.c_str(), styles["headerFirst"]);
|
||||
// Body header
|
||||
row = 3;
|
||||
int col = 1;
|
||||
writeString(row, 0, "Nº", "bodyHeader");
|
||||
writeString(row, 0, "#", "bodyHeader");
|
||||
writeString(row, 1, "Dataset", "bodyHeader");
|
||||
writeString(row, 2, "Score", "bodyHeader");
|
||||
writeString(row, 3, "File", "bodyHeader");
|
||||
writeString(row, 4, "Hyperparameters", "bodyHeader");
|
||||
writeString(row, 5, "F", "bodyHeader");
|
||||
auto i = 0;
|
||||
std::string hyperparameters;
|
||||
int hypSize = 22;
|
||||
std::map<std::string, std::string> files; // map of files imported and their tabs
|
||||
int numLines = data.size();
|
||||
for (auto const& item : data.items()) {
|
||||
row++;
|
||||
writeInt(row, 0, i++, "ints");
|
||||
@@ -89,7 +92,7 @@ namespace platform {
|
||||
catch (const std::out_of_range& oor) {
|
||||
auto tabName = "table_" + std::to_string(i);
|
||||
auto worksheetNew = workbook_add_worksheet(workbook, tabName.c_str());
|
||||
json data = loadResultData(Paths::results() + fileName);
|
||||
json data = loadResultData(path + fileName);
|
||||
auto report = ReportExcel(data, false, workbook, worksheetNew);
|
||||
report.show();
|
||||
hyperlink = "#table_" + std::to_string(i);
|
||||
@@ -103,6 +106,8 @@ namespace platform {
|
||||
hypSize = hyperparameters.size();
|
||||
}
|
||||
writeString(row, 4, hyperparameters, "text");
|
||||
std::string countHyperparameters = "=COUNTIF(e5:e" + std::to_string(numLines + 4) + ", e" + std::to_string(row + 1) + ")";
|
||||
worksheet_write_formula(worksheet, row, 5, countHyperparameters.c_str(), efectiveStyle("ints"));
|
||||
}
|
||||
row++;
|
||||
// Set Totals
|
||||
@@ -159,6 +164,7 @@ namespace platform {
|
||||
addConditionalFormat("max");
|
||||
footer(false);
|
||||
if (friedman) {
|
||||
if (score == "accuracy") {
|
||||
// Create Sheet with ranks
|
||||
worksheet = workbook_add_worksheet(workbook, "Ranks");
|
||||
formatColumns();
|
||||
@@ -166,14 +172,12 @@ namespace platform {
|
||||
body(true);
|
||||
addConditionalFormat("min");
|
||||
footer(true);
|
||||
}
|
||||
// Create Sheet with Friedman Test
|
||||
doFriedman();
|
||||
}
|
||||
}
|
||||
std::string BestResultsExcel::getFileName()
|
||||
{
|
||||
return Paths::excel() + fileName;
|
||||
}
|
||||
|
||||
void BestResultsExcel::header(bool ranks)
|
||||
{
|
||||
row = 0;
|
||||
@@ -182,7 +186,7 @@ namespace platform {
|
||||
// Body header
|
||||
row = 3;
|
||||
int col = 1;
|
||||
writeString(row, 0, "Nº", "bodyHeader");
|
||||
writeString(row, 0, "#", "bodyHeader");
|
||||
writeString(row, 1, "Dataset", "bodyHeader");
|
||||
for (const auto& model : models) {
|
||||
writeString(row, ++col, model.c_str(), "bodyHeader");
|
||||
@@ -237,14 +241,15 @@ namespace platform {
|
||||
for (int i = 0; i < columns_sizes.size(); ++i) {
|
||||
worksheet_set_column(worksheet, i, i, columns_sizes.at(i), NULL);
|
||||
}
|
||||
worksheet_merge_range(worksheet, 0, 0, 0, 1 + models.size(), "Friedman Test", styles["headerFirst"]);
|
||||
worksheet_merge_range(worksheet, 0, 0, 0, 7, "Friedman Test", styles["headerFirst"]);
|
||||
row = 2;
|
||||
Statistics stats(models, datasets, table, significance, false);
|
||||
Statistics stats(score, models, datasets, table, significance, false); // No output
|
||||
auto result = stats.friedmanTest();
|
||||
stats.postHocHolmTest(result);
|
||||
stats.postHocTest();
|
||||
stats.postHocTestReport(result, false); // No tex output
|
||||
auto friedmanResult = stats.getFriedmanResult();
|
||||
auto holmResult = stats.getHolmResult();
|
||||
worksheet_merge_range(worksheet, row, 0, row, 1 + models.size(), "Null hypothesis: H0 'There is no significant differences between all the classifiers.'", styles["headerSmall"]);
|
||||
auto postHocResults = stats.getPostHocResults();
|
||||
worksheet_merge_range(worksheet, row, 0, row, 7, "Null hypothesis: H0 'There is no significant differences between all the classifiers.'", styles["headerSmall"]);
|
||||
row += 2;
|
||||
writeString(row, 1, "Friedman Q", "bodyHeader");
|
||||
writeDouble(row, 2, friedmanResult.statistic, "bodyHeader");
|
||||
@@ -258,11 +263,11 @@ namespace platform {
|
||||
writeDouble(row, 4, significance, "bodyHeader");
|
||||
writeString(row, 5, friedmanResult.reject ? "Reject H0" : "Accept H0", "bodyHeader");
|
||||
row += 3;
|
||||
worksheet_merge_range(worksheet, row, 0, row, 1 + models.size(), "Holm Test", styles["headerFirst"]);
|
||||
worksheet_merge_range(worksheet, row, 0, row, 7, "Holm Test", styles["headerFirst"]);
|
||||
row += 2;
|
||||
worksheet_merge_range(worksheet, row, 0, row, 1 + models.size(), "Null hypothesis: H0 'There is no significant differences between the control model and the other models.'", styles["headerSmall"]);
|
||||
worksheet_merge_range(worksheet, row, 0, row, 7, "Null hypothesis: H0 'There is no significant differences between the control model and the other models.'", styles["headerSmall"]);
|
||||
row += 2;
|
||||
std::string controlModel = "Control Model: " + holmResult.model;
|
||||
std::string controlModel = "Control Model: " + postHocResults.at(0).model;
|
||||
worksheet_merge_range(worksheet, row, 1, row, 7, controlModel.c_str(), styles["bodyHeader_odd"]);
|
||||
row++;
|
||||
writeString(row, 1, "Model", "bodyHeader");
|
||||
@@ -274,7 +279,7 @@ namespace platform {
|
||||
writeString(row, 7, "Reject H0", "bodyHeader");
|
||||
row++;
|
||||
bool first = true;
|
||||
for (const auto& item : holmResult.holmLines) {
|
||||
for (const auto& item : postHocResults) {
|
||||
writeString(row, 1, item.model, "text");
|
||||
if (first) {
|
||||
// Control model info
|
||||
@@ -296,5 +301,8 @@ namespace platform {
|
||||
}
|
||||
row++;
|
||||
}
|
||||
// set column width for the 5th and the 7th column
|
||||
worksheet_set_column(worksheet, 4, 5, 10, NULL);
|
||||
worksheet_set_column(worksheet, 6, 7, 10, NULL);
|
||||
}
|
||||
}
|
@@ -1,21 +1,19 @@
|
||||
#ifndef BESTRESULTS_EXCEL_H
|
||||
#define BESTRESULTS_EXCEL_H
|
||||
#include "ExcelFile.h"
|
||||
#ifndef BESTRESULTSEXCEL_H
|
||||
#define BESTRESULTSEXCEL_H
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include "reports/ExcelFile.h"
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
namespace platform {
|
||||
|
||||
class BestResultsExcel : ExcelFile {
|
||||
using json = nlohmann::ordered_json;
|
||||
class BestResultsExcel : public ExcelFile {
|
||||
public:
|
||||
BestResultsExcel(const std::string& score, const std::vector<std::string>& datasets);
|
||||
BestResultsExcel(const std::string& path, const std::string& score, const std::vector<std::string>& datasets);
|
||||
~BestResultsExcel();
|
||||
void reportAll(const std::vector<std::string>& models, const json& table, const std::map<std::string, std::map<std::string, float>>& ranks, bool friedman, double significance);
|
||||
void reportSingle(const std::string& model, const std::string& fileName);
|
||||
std::string getFileName();
|
||||
private:
|
||||
void build();
|
||||
void header(bool ranks);
|
||||
@@ -24,7 +22,7 @@ namespace platform {
|
||||
void formatColumns();
|
||||
void doFriedman();
|
||||
void addConditionalFormat(std::string formula);
|
||||
const std::string fileName = "BestResults.xlsx";
|
||||
std::string path;
|
||||
std::string score;
|
||||
std::vector<std::string> models;
|
||||
std::vector<std::string> datasets;
|
||||
@@ -36,4 +34,4 @@ namespace platform {
|
||||
int datasetNameSize = 25; // Min size of the column
|
||||
};
|
||||
}
|
||||
#endif //BESTRESULTS_EXCEL_H
|
||||
#endif
|
105
src/best/BestResultsMd.cpp
Normal file
105
src/best/BestResultsMd.cpp
Normal file
@@ -0,0 +1,105 @@
|
||||
#include <iostream>
|
||||
#include "BestResultsMd.h"
|
||||
#include "common/Utils.h" // compute_std
|
||||
|
||||
namespace platform {
|
||||
using json = nlohmann::ordered_json;
|
||||
void BestResultsMd::openMdFile(const std::string& name)
|
||||
{
|
||||
handler.open(name);
|
||||
if (!handler.is_open()) {
|
||||
std::cerr << "Error opening file " << name << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
void BestResultsMd::results_header(const std::vector<std::string>& models, const std::string& date)
|
||||
{
|
||||
this->models = models;
|
||||
auto file_name = Paths::tex() + Paths::md_output();
|
||||
openMdFile(file_name);
|
||||
handler << "<!-- This file has been generated by the platform program" << std::endl;
|
||||
handler << " Date: " << date.c_str() << std::endl;
|
||||
handler << "" << std::endl;
|
||||
handler << " Table of results" << std::endl;
|
||||
handler << "-->" << std::endl;
|
||||
handler << "| # | Dataset |";
|
||||
for (const auto& model : models) {
|
||||
handler << " " << model.c_str() << " |";
|
||||
}
|
||||
handler << std::endl;
|
||||
handler << "|--: | :--- |";
|
||||
for (const auto& model : models) {
|
||||
handler << " :---: |";
|
||||
}
|
||||
handler << std::endl;
|
||||
}
|
||||
void BestResultsMd::results_body(const std::vector<std::string>& datasets, json& table)
|
||||
{
|
||||
int i = 0;
|
||||
for (auto const& dataset : datasets) {
|
||||
// Find out max value for this dataset
|
||||
double max_value = 0;
|
||||
// Find out the max value for this dataset
|
||||
for (const auto& model : models) {
|
||||
double value;
|
||||
try {
|
||||
value = table[model].at(dataset).at(0).get<double>();
|
||||
}
|
||||
catch (nlohmann::json_abi_v3_11_3::detail::out_of_range err) {
|
||||
value = -1.0;
|
||||
}
|
||||
if (value > max_value) {
|
||||
max_value = value;
|
||||
}
|
||||
}
|
||||
handler << "| " << ++i << " | " << dataset.c_str() << " | ";
|
||||
for (const auto& model : models) {
|
||||
double value = table[model].at(dataset).at(0).get<double>();
|
||||
double std_value = table[model].at(dataset).at(3).get<double>();
|
||||
const char* bold = value == max_value ? "**" : "";
|
||||
handler << bold << std::setprecision(4) << std::fixed << value << "±" << std::setprecision(3) << std_value << bold << " | ";
|
||||
}
|
||||
handler << std::endl;
|
||||
}
|
||||
}
|
||||
void BestResultsMd::results_footer(const std::map<std::string, std::vector<double>>& totals, const std::string& best_model)
|
||||
{
|
||||
handler << "| | **Average Score** | ";
|
||||
int nDatasets = totals.begin()->second.size();
|
||||
for (const auto& model : models) {
|
||||
double value = std::reduce(totals.at(model).begin(), totals.at(model).end()) / nDatasets;
|
||||
double std_value = compute_std(totals.at(model), value);
|
||||
const char* bold = model == best_model ? "**" : "";
|
||||
handler << bold << std::setprecision(4) << std::fixed << value << "±" << std::setprecision(3) << std::fixed << std_value << bold << " | ";
|
||||
}
|
||||
|
||||
handler.close();
|
||||
}
|
||||
void BestResultsMd::postHoc_test(std::vector<PostHocLine>& postHocResults, const std::string& kind, const std::string& date)
|
||||
{
|
||||
auto file_name = Paths::tex() + Paths::md_post_hoc();
|
||||
openMdFile(file_name);
|
||||
handler << "<!-- This file has been generated by the platform program" << std::endl;
|
||||
handler << " Date: " << date.c_str() << std::endl;
|
||||
handler << std::endl;
|
||||
handler << " Post-hoc handler test" << std::endl;
|
||||
handler << "-->" << std::endl;
|
||||
handler << "Post-hoc " << kind << " test: H<sub>0</sub>: There is no significant differences between the control model and the other models." << std::endl << std::endl;
|
||||
handler << "| classifier | pvalue | rank | win | tie | loss | H<sub>0</sub> |" << std::endl;
|
||||
handler << "| :-- | --: | --: | --:| --: | --: | :--: |" << std::endl;
|
||||
bool first = true;
|
||||
for (auto const& line : postHocResults) {
|
||||
auto textStatus = !line.reject ? "**" : " ";
|
||||
if (first) {
|
||||
handler << "| " << line.model << " | - | " << std::fixed << std::setprecision(2) << line.rank << " | - | - | - |" << std::endl;
|
||||
first = false;
|
||||
} else {
|
||||
handler << "| " << line.model << " | " << textStatus << std::scientific << std::setprecision(4) << line.pvalue << textStatus << " |";
|
||||
handler << std::fixed << std::setprecision(2) << line.rank << " | " << line.wtl.win << " | " << line.wtl.tie << " | " << line.wtl.loss << " |";
|
||||
handler << (line.reject ? "rejected" : "**accepted**") << " |" << std::endl;
|
||||
}
|
||||
}
|
||||
handler << std::endl;
|
||||
handler.close();
|
||||
}
|
||||
}
|
24
src/best/BestResultsMd.h
Normal file
24
src/best/BestResultsMd.h
Normal file
@@ -0,0 +1,24 @@
|
||||
#ifndef BEST_RESULTS_MD_H
|
||||
#define BEST_RESULTS_MD_H
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include "common/Paths.h"
|
||||
#include "Statistics.h"
|
||||
namespace platform {
|
||||
using json = nlohmann::ordered_json;
|
||||
class BestResultsMd {
|
||||
public:
|
||||
BestResultsMd() = default;
|
||||
~BestResultsMd() = default;
|
||||
void results_header(const std::vector<std::string>& models, const std::string& date);
|
||||
void results_body(const std::vector<std::string>& datasets, json& table);
|
||||
void results_footer(const std::map<std::string, std::vector<double>>& totals, const std::string& best_model);
|
||||
void postHoc_test(std::vector<PostHocLine>& postHocResults, const std::string& kind, const std::string& date);
|
||||
private:
|
||||
void openMdFile(const std::string& name);
|
||||
std::ofstream handler;
|
||||
std::vector<std::string> models;
|
||||
};
|
||||
}
|
||||
#endif
|
124
src/best/BestResultsTex.cpp
Normal file
124
src/best/BestResultsTex.cpp
Normal file
@@ -0,0 +1,124 @@
|
||||
#include <iostream>
|
||||
#include "BestResultsTex.h"
|
||||
#include "common/Utils.h" // compute_std
|
||||
|
||||
namespace platform {
|
||||
using json = nlohmann::ordered_json;
|
||||
void BestResultsTex::openTexFile(const std::string& name)
|
||||
{
|
||||
handler.open(name);
|
||||
if (!handler.is_open()) {
|
||||
std::cerr << "Error opening file " << name << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
void BestResultsTex::results_header(const std::vector<std::string>& models, const std::string& date, bool index)
|
||||
{
|
||||
this->models = models;
|
||||
auto file_name = Paths::tex() + Paths::tex_output();
|
||||
openTexFile(file_name);
|
||||
handler << "%% This file has been generated by the platform program" << std::endl;
|
||||
handler << "%% Date: " << date.c_str() << std::endl;
|
||||
handler << "%%" << std::endl;
|
||||
handler << "%% Table of results" << std::endl;
|
||||
handler << "%%" << std::endl;
|
||||
handler << "\\begin{table}[htbp] " << std::endl;
|
||||
handler << "\\centering " << std::endl;
|
||||
handler << "\\tiny " << std::endl;
|
||||
handler << "\\renewcommand{\\arraystretch }{1.2} " << std::endl;
|
||||
handler << "\\renewcommand{\\tabcolsep }{0.07cm} " << std::endl;
|
||||
auto umetric = score;
|
||||
umetric[0] = toupper(umetric[0]);
|
||||
handler << "\\caption{" << umetric << " results(mean $\\pm$ std) for all the algorithms and datasets} " << std::endl;
|
||||
handler << "\\label{tab:results_" << score << "}" << std::endl;
|
||||
std::string header_dataset_name = index ? "r" : "l";
|
||||
handler << "\\begin{tabular} {{" << header_dataset_name << std::string(models.size(), 'c').c_str() << "}}" << std::endl;
|
||||
handler << "\\hline " << std::endl;
|
||||
handler << "" << std::endl;
|
||||
for (const auto& model : models) {
|
||||
handler << "& " << model.c_str();
|
||||
}
|
||||
handler << "\\\\" << std::endl;
|
||||
handler << "\\hline" << std::endl;
|
||||
}
|
||||
void BestResultsTex::results_body(const std::vector<std::string>& datasets, json& table, bool index)
|
||||
{
|
||||
int i = 0;
|
||||
for (auto const& dataset : datasets) {
|
||||
// Find out max value for this dataset
|
||||
double max_value = 0;
|
||||
for (const auto& model : models) {
|
||||
double value;
|
||||
try {
|
||||
value = table[model].at(dataset).at(0).get<double>();
|
||||
}
|
||||
catch (nlohmann::json_abi_v3_11_3::detail::out_of_range err) {
|
||||
value = -1.0;
|
||||
}
|
||||
if (value > max_value) {
|
||||
max_value = value;
|
||||
}
|
||||
}
|
||||
if (index)
|
||||
handler << ++i << " ";
|
||||
else
|
||||
handler << dataset << " ";
|
||||
for (const auto& model : models) {
|
||||
double value = table[model].at(dataset).at(0).get<double>();
|
||||
double std_value = table[model].at(dataset).at(3).get<double>();
|
||||
const char* bold = value == max_value ? "\\bfseries" : "";
|
||||
handler << "& " << bold << std::setprecision(4) << std::fixed << value << "$\\pm$" << std::setprecision(3) << std_value;
|
||||
}
|
||||
handler << "\\\\" << std::endl;
|
||||
}
|
||||
}
|
||||
void BestResultsTex::results_footer(const std::map<std::string, std::vector<double>>& totals, const std::string& best_model)
|
||||
{
|
||||
handler << "\\hline" << std::endl;
|
||||
handler << "Average ";
|
||||
int nDatasets = totals.begin()->second.size();
|
||||
for (const auto& model : models) {
|
||||
double value = std::reduce(totals.at(model).begin(), totals.at(model).end()) / nDatasets;
|
||||
double std_value = compute_std(totals.at(model), value);
|
||||
const char* bold = model == best_model ? "\\bfseries" : "";
|
||||
handler << "& " << bold << std::setprecision(4) << std::fixed << value << "$\\pm$" << std::setprecision(3) << std::fixed << std_value;
|
||||
}
|
||||
handler << "\\\\" << std::endl;
|
||||
handler << "\\hline " << std::endl;
|
||||
handler << "\\end{tabular}" << std::endl;
|
||||
handler << "\\end{table}" << std::endl;
|
||||
handler.close();
|
||||
}
|
||||
void BestResultsTex::postHoc_test(std::vector<PostHocLine>& postHocResults, const std::string& kind, const std::string& date)
|
||||
{
|
||||
auto file_name = Paths::tex() + Paths::tex_post_hoc();
|
||||
openTexFile(file_name);
|
||||
handler << "%% This file has been generated by the platform program" << std::endl;
|
||||
handler << "%% Date: " << date.c_str() << std::endl;
|
||||
handler << "%%" << std::endl;
|
||||
handler << "%% Post-hoc " << kind << " test" << std::endl;
|
||||
handler << "%%" << std::endl;
|
||||
handler << "\\begin{table}[htbp]" << std::endl;
|
||||
handler << "\\centering" << std::endl;
|
||||
handler << "\\caption{Results of the post-hoc " << kind << " test for the mean " << score << " of the algorithms.}\\label{ tab:tests }" << std::endl;
|
||||
handler << "\\begin{tabular}{lrrrrr}" << std::endl;
|
||||
handler << "\\hline" << std::endl;
|
||||
handler << "classifier & pvalue & rank & win & tie & loss\\\\" << std::endl;
|
||||
handler << "\\hline" << std::endl;
|
||||
bool first = true;
|
||||
for (auto const& line : postHocResults) {
|
||||
auto textStatus = !line.reject ? "\\bf " : " ";
|
||||
if (first) {
|
||||
handler << line.model << " & - & " << std::fixed << std::setprecision(2) << line.rank << " & - & - & - \\\\" << std::endl;
|
||||
first = false;
|
||||
} else {
|
||||
handler << line.model << " & " << textStatus << std::scientific << std::setprecision(4) << line.pvalue << " & ";
|
||||
handler << std::fixed << std::setprecision(2) << line.rank << " & " << line.wtl.win << " & " << line.wtl.tie << " & " << line.wtl.loss << "\\\\" << std::endl;
|
||||
}
|
||||
}
|
||||
handler << "\\hline " << std::endl;
|
||||
handler << "\\end{tabular}" << std::endl;
|
||||
handler << "\\end{table}" << std::endl;
|
||||
handler.close();
|
||||
}
|
||||
}
|
26
src/best/BestResultsTex.h
Normal file
26
src/best/BestResultsTex.h
Normal file
@@ -0,0 +1,26 @@
|
||||
#ifndef BEST_RESULTS_TEX_H
|
||||
#define BEST_RESULTS_TEX_H
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include "common/Paths.h"
|
||||
#include "Statistics.h"
|
||||
namespace platform {
|
||||
using json = nlohmann::ordered_json;
|
||||
class BestResultsTex {
|
||||
public:
|
||||
BestResultsTex(const std::string score, bool dataset_name = true) : score{ score }, dataset_name{ dataset_name } {};
|
||||
~BestResultsTex() = default;
|
||||
void results_header(const std::vector<std::string>& models, const std::string& date, bool index);
|
||||
void results_body(const std::vector<std::string>& datasets, json& table, bool index);
|
||||
void results_footer(const std::map<std::string, std::vector<double>>& totals, const std::string& best_model);
|
||||
void postHoc_test(std::vector<PostHocLine>& postHocResults, const std::string& kind, const std::string& date);
|
||||
private:
|
||||
std::string score;
|
||||
bool dataset_name;
|
||||
void openTexFile(const std::string& name);
|
||||
std::ofstream handler;
|
||||
std::vector<std::string> models;
|
||||
};
|
||||
}
|
||||
#endif
|
@@ -3,7 +3,7 @@
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <utility>
|
||||
#include "DotEnv.h"
|
||||
#include "common/DotEnv.h"
|
||||
namespace platform {
|
||||
class BestScore {
|
||||
public:
|
||||
@@ -24,5 +24,4 @@ namespace platform {
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,22 +1,31 @@
|
||||
#include <sstream>
|
||||
#include "Statistics.h"
|
||||
#include "Colors.h"
|
||||
#include "Symbols.h"
|
||||
#include <boost/math/distributions/chi_squared.hpp>
|
||||
#include <boost/math/distributions/normal.hpp>
|
||||
#include "CLocale.h"
|
||||
#include "common/Colors.h"
|
||||
#include "common/Symbols.h"
|
||||
#include "common/CLocale.h"
|
||||
#include "BestResultsTex.h"
|
||||
#include "BestResultsMd.h"
|
||||
#include "Statistics.h"
|
||||
#include "WilcoxonTest.hpp"
|
||||
|
||||
|
||||
namespace platform {
|
||||
|
||||
Statistics::Statistics(const std::vector<std::string>& models, const std::vector<std::string>& datasets, const json& data, double significance, bool output) :
|
||||
models(models), datasets(datasets), data(data), significance(significance), output(output)
|
||||
Statistics::Statistics(const std::string& score, const std::vector<std::string>& models, const std::vector<std::string>& datasets, const json& data, double significance, bool output) :
|
||||
score(score), models(models), datasets(datasets), data(data), significance(significance), output(output)
|
||||
{
|
||||
if (score == "accuracy") {
|
||||
postHocType = "Holm";
|
||||
hlen = 85;
|
||||
} else {
|
||||
postHocType = "Wilcoxon";
|
||||
hlen = 88;
|
||||
}
|
||||
nModels = models.size();
|
||||
nDatasets = datasets.size();
|
||||
auto temp = ConfigLocale();
|
||||
};
|
||||
|
||||
}
|
||||
void Statistics::fit()
|
||||
{
|
||||
if (nModels < 3 || nDatasets < 3) {
|
||||
@@ -25,9 +34,11 @@ namespace platform {
|
||||
throw std::runtime_error("Can't make the Friedman test with less than 3 models and/or less than 3 datasets.");
|
||||
}
|
||||
ranksModels.clear();
|
||||
computeRanks();
|
||||
computeRanks(); // compute greaterAverage and ranks
|
||||
// Set the control model as the one with the lowest average rank
|
||||
controlIdx = distance(ranks.begin(), min_element(ranks.begin(), ranks.end(), [](const auto& l, const auto& r) { return l.second < r.second; }));
|
||||
controlIdx = score == "accuracy" ?
|
||||
distance(ranks.begin(), min_element(ranks.begin(), ranks.end(), [](const auto& l, const auto& r) { return l.second < r.second; }))
|
||||
: greaterAverage; // The model with the greater average score
|
||||
computeWTL();
|
||||
maxModelName = (*std::max_element(models.begin(), models.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
|
||||
maxDatasetName = (*std::max_element(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
|
||||
@@ -64,11 +75,16 @@ namespace platform {
|
||||
void Statistics::computeRanks()
|
||||
{
|
||||
std::map<std::string, float> ranksLine;
|
||||
std::map<std::string, float> averages;
|
||||
for (const auto& model : models) {
|
||||
averages[model] = 0;
|
||||
}
|
||||
for (const auto& dataset : datasets) {
|
||||
std::vector<std::pair<std::string, double>> ranksOrder;
|
||||
for (const auto& model : models) {
|
||||
double value = data[model].at(dataset).at(0).get<double>();
|
||||
ranksOrder.push_back({ model, value });
|
||||
averages[model] += value;
|
||||
}
|
||||
// Assign the ranks
|
||||
ranksLine = assignRanks(ranksOrder);
|
||||
@@ -86,10 +102,17 @@ namespace platform {
|
||||
for (const auto& rank : ranks) {
|
||||
ranks[rank.first] /= nDatasets;
|
||||
}
|
||||
// Average the scores
|
||||
for (const auto& average : averages) {
|
||||
averages[average.first] /= nDatasets;
|
||||
}
|
||||
// Get the model with the greater average score
|
||||
greaterAverage = distance(averages.begin(), max_element(averages.begin(), averages.end(), [](const auto& l, const auto& r) { return l.second < r.second; }));
|
||||
}
|
||||
void Statistics::computeWTL()
|
||||
{
|
||||
// Compute the WTL matrix
|
||||
const double practical_threshold = 0.0005;
|
||||
// Compute the WTL matrix (Win Tie Loss)
|
||||
for (int i = 0; i < nModels; ++i) {
|
||||
wtl[i] = { 0, 0, 0 };
|
||||
}
|
||||
@@ -102,23 +125,85 @@ namespace platform {
|
||||
continue;
|
||||
}
|
||||
double value = data[models[i]].at(item.key()).at(0).get<double>();
|
||||
if (value < controlValue) {
|
||||
wtl[i].win++;
|
||||
} else if (value == controlValue) {
|
||||
double diff = controlValue - value; // control − comparison
|
||||
if (std::fabs(diff) <= practical_threshold) {
|
||||
wtl[i].tie++;
|
||||
} else if (diff < 0) {
|
||||
wtl[i].win++;
|
||||
} else {
|
||||
wtl[i].loss++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Statistics::postHocHolmTest(bool friedmanResult)
|
||||
int Statistics::getControlIdx()
|
||||
{
|
||||
if (!fitted) {
|
||||
fit();
|
||||
}
|
||||
return controlIdx;
|
||||
}
|
||||
void Statistics::postHocTest()
|
||||
{
|
||||
if (score == "accuracy") {
|
||||
postHocHolmTest();
|
||||
} else {
|
||||
postHocWilcoxonTest();
|
||||
}
|
||||
}
|
||||
void Statistics::postHocWilcoxonTest()
|
||||
{
|
||||
if (!fitted) {
|
||||
fit();
|
||||
}
|
||||
// Reference: Wilcoxon, F. (1945). “Individual Comparisons by Ranking Methods”. Biometrics Bulletin, 1(6), 80-83.
|
||||
auto wilcoxon = WilcoxonTest(models, datasets, data, significance);
|
||||
controlIdx = wilcoxon.getControlIdx();
|
||||
postHocResults = wilcoxon.getPostHocResults();
|
||||
setResultsOrder();
|
||||
// Fill the ranks info
|
||||
for (const auto& item : postHocResults) {
|
||||
ranks[item.model] = item.rank;
|
||||
}
|
||||
Holm_Bonferroni();
|
||||
restoreResultsOrder();
|
||||
}
|
||||
void Statistics::Holm_Bonferroni()
|
||||
{
|
||||
// The algorithm need the p-values sorted from the lowest to the highest
|
||||
// Sort the models by p-value
|
||||
std::sort(postHocResults.begin(), postHocResults.end(), [](const PostHocLine& a, const PostHocLine& b) {
|
||||
return a.pvalue < b.pvalue;
|
||||
});
|
||||
// Holm adjustment
|
||||
for (int i = 0; i < postHocResults.size(); ++i) {
|
||||
auto item = postHocResults.at(i);
|
||||
double before = i == 0 ? 0.0 : postHocResults.at(i - 1).pvalue;
|
||||
double p_value = std::min((long double)1.0, item.pvalue * (nModels - i));
|
||||
p_value = std::max(before, p_value);
|
||||
postHocResults[i].pvalue = p_value;
|
||||
}
|
||||
}
|
||||
void Statistics::setResultsOrder()
|
||||
{
|
||||
int c = 0;
|
||||
for (auto& item : postHocResults) {
|
||||
item.idx = c++;
|
||||
}
|
||||
|
||||
}
|
||||
void Statistics::restoreResultsOrder()
|
||||
{
|
||||
// Restore the order of the results
|
||||
std::sort(postHocResults.begin(), postHocResults.end(), [](const PostHocLine& a, const PostHocLine& b) {
|
||||
return a.idx < b.idx;
|
||||
});
|
||||
}
|
||||
void Statistics::postHocHolmTest()
|
||||
{
|
||||
if (!fitted) {
|
||||
fit();
|
||||
}
|
||||
std::stringstream oss;
|
||||
// Reference https://link.springer.com/article/10.1007/s44196-022-00083-8
|
||||
// Post-hoc Holm test
|
||||
// Calculate the p-value for the models paired with the control model
|
||||
@@ -126,75 +211,67 @@ namespace platform {
|
||||
boost::math::normal dist(0.0, 1.0);
|
||||
double diff = sqrt(nModels * (nModels + 1) / (6.0 * nDatasets));
|
||||
for (int i = 0; i < nModels; i++) {
|
||||
PostHocLine line;
|
||||
line.model = models[i];
|
||||
line.rank = ranks.at(models[i]);
|
||||
line.wtl = wtl.at(i);
|
||||
line.reject = false;
|
||||
if (i == controlIdx) {
|
||||
stats[i] = 0.0;
|
||||
postHocResults.push_back(line);
|
||||
continue;
|
||||
}
|
||||
double z = abs(ranks.at(models[controlIdx]) - ranks.at(models[i])) / diff;
|
||||
double p_value = (long double)2 * (1 - cdf(dist, z));
|
||||
stats[i] = p_value;
|
||||
double z = std::abs(ranks.at(models[controlIdx]) - ranks.at(models[i])) / diff;
|
||||
line.pvalue = (long double)2 * (1 - cdf(dist, z));
|
||||
line.reject = (line.pvalue < significance);
|
||||
postHocResults.push_back(line);
|
||||
}
|
||||
// Sort the models by p-value
|
||||
std::vector<std::pair<int, double>> statsOrder;
|
||||
for (const auto& stat : stats) {
|
||||
statsOrder.push_back({ stat.first, stat.second });
|
||||
}
|
||||
std::sort(statsOrder.begin(), statsOrder.end(), [](const std::pair<int, double>& a, const std::pair<int, double>& b) {
|
||||
return a.second < b.second;
|
||||
std::sort(postHocResults.begin(), postHocResults.end(), [](const PostHocLine& a, const PostHocLine& b) {
|
||||
return a.rank < b.rank;
|
||||
});
|
||||
|
||||
// Holm adjustment
|
||||
for (int i = 0; i < statsOrder.size(); ++i) {
|
||||
auto item = statsOrder.at(i);
|
||||
double before = i == 0 ? 0.0 : statsOrder.at(i - 1).second;
|
||||
double p_value = std::min((double)1.0, item.second * (nModels - i));
|
||||
p_value = std::max(before, p_value);
|
||||
statsOrder[i] = { item.first, p_value };
|
||||
setResultsOrder();
|
||||
Holm_Bonferroni();
|
||||
restoreResultsOrder();
|
||||
}
|
||||
holmResult.model = models.at(controlIdx);
|
||||
|
||||
void Statistics::postHocTestReport(bool friedmanResult, bool tex)
|
||||
{
|
||||
|
||||
std::stringstream oss;
|
||||
auto color = friedmanResult ? Colors::CYAN() : Colors::YELLOW();
|
||||
oss << color;
|
||||
oss << " *************************************************************************************************************" << std::endl;
|
||||
oss << " Post-hoc Holm test: H0: 'There is no significant differences between the control model and the other models.'" << std::endl;
|
||||
oss << " " << std::string(hlen + 25, '*') << std::endl;
|
||||
oss << " Post-hoc " << postHocType << " test: H0: 'There is no significant differences between the control model and the other models.'" << std::endl;
|
||||
oss << " Control model: " << models.at(controlIdx) << std::endl;
|
||||
oss << " " << std::left << std::setw(maxModelName) << std::string("Model") << " p-value rank win tie loss Status" << std::endl;
|
||||
oss << " " << std::string(maxModelName, '=') << " ============ ========= === === ==== =============" << std::endl;
|
||||
// sort ranks from lowest to highest
|
||||
std::vector<std::pair<std::string, float>> ranksOrder;
|
||||
for (const auto& rank : ranks) {
|
||||
ranksOrder.push_back({ rank.first, rank.second });
|
||||
}
|
||||
std::sort(ranksOrder.begin(), ranksOrder.end(), [](const std::pair<std::string, float>& a, const std::pair<std::string, float>& b) {
|
||||
return a.second < b.second;
|
||||
});
|
||||
// Show the control model info.
|
||||
oss << " " << Colors::BLUE() << std::left << std::setw(maxModelName) << ranksOrder.at(0).first << " ";
|
||||
oss << std::setw(12) << " " << std::setprecision(7) << std::fixed << " " << ranksOrder.at(0).second << std::endl;
|
||||
for (const auto& item : ranksOrder) {
|
||||
auto idx = distance(models.begin(), find(models.begin(), models.end(), item.first));
|
||||
double pvalue = 0.0;
|
||||
for (const auto& stat : statsOrder) {
|
||||
if (stat.first == idx) {
|
||||
pvalue = stat.second;
|
||||
}
|
||||
}
|
||||
holmResult.holmLines.push_back({ item.first, pvalue, item.second, wtl.at(idx), pvalue < significance });
|
||||
if (item.first == models.at(controlIdx)) {
|
||||
bool first = true;
|
||||
for (const auto& item : postHocResults) {
|
||||
if (first) {
|
||||
oss << " " << Colors::BLUE() << std::left << std::setw(maxModelName) << item.model << " ";
|
||||
oss << std::setw(12) << " " << std::setprecision(7) << std::fixed << " " << item.rank << std::endl;
|
||||
first = false;
|
||||
continue;
|
||||
}
|
||||
auto pvalue = item.pvalue;
|
||||
auto colorStatus = pvalue > significance ? Colors::GREEN() : Colors::MAGENTA();
|
||||
auto status = pvalue > significance ? Symbols::check_mark : Symbols::cross;
|
||||
auto textStatus = pvalue > significance ? " accepted H0" : " rejected H0";
|
||||
oss << " " << colorStatus << std::left << std::setw(maxModelName) << item.first << " ";
|
||||
oss << std::setprecision(6) << std::scientific << pvalue << std::setprecision(7) << std::fixed << " " << item.second;
|
||||
oss << " " << std::right << std::setw(3) << wtl.at(idx).win << " " << std::setw(3) << wtl.at(idx).tie << " " << std::setw(4) << wtl.at(idx).loss;
|
||||
oss << " " << colorStatus << std::left << std::setw(maxModelName) << item.model << " ";
|
||||
oss << std::setprecision(6) << std::scientific << pvalue << std::setprecision(7) << std::fixed << " " << item.rank;
|
||||
oss << " " << std::right << std::setw(3) << item.wtl.win << " " << std::setw(3) << item.wtl.tie << " " << std::setw(4) << item.wtl.loss;
|
||||
oss << " " << status << textStatus << std::endl;
|
||||
}
|
||||
oss << color << " *************************************************************************************************************" << std::endl;
|
||||
oss << color << " " << std::string(hlen + 25, '*') << std::endl;
|
||||
oss << Colors::RESET();
|
||||
if (output) {
|
||||
std::cout << oss.str();
|
||||
}
|
||||
if (tex) {
|
||||
BestResultsTex bestResultsTex(score);
|
||||
BestResultsMd bestResultsMd;
|
||||
bestResultsTex.postHoc_test(postHocResults, postHocType, get_date() + " " + get_time());
|
||||
bestResultsMd.postHoc_test(postHocResults, postHocType, get_date() + " " + get_time());
|
||||
}
|
||||
}
|
||||
bool Statistics::friedmanTest()
|
||||
{
|
||||
@@ -205,7 +282,7 @@ namespace platform {
|
||||
// Friedman test
|
||||
// Calculate the Friedman statistic
|
||||
oss << Colors::BLUE() << std::endl;
|
||||
oss << "***************************************************************************************************************" << std::endl;
|
||||
oss << std::string(hlen, '*') << std::endl;
|
||||
oss << Colors::GREEN() << "Friedman test: H0: 'There is no significant differences between all the classifiers.'" << Colors::BLUE() << std::endl;
|
||||
double degreesOfFreedom = nModels - 1.0;
|
||||
double sumSquared = 0;
|
||||
@@ -230,23 +307,11 @@ namespace platform {
|
||||
oss << Colors::YELLOW() << "The null hypothesis H0 is accepted. Computed p-values will not be significant." << std::endl;
|
||||
result = false;
|
||||
}
|
||||
oss << Colors::BLUE() << "***************************************************************************************************************" << Colors::RESET() << std::endl;
|
||||
oss << Colors::BLUE() << std::string(hlen, '*') << Colors::RESET() << std::endl;
|
||||
if (output) {
|
||||
std::cout << oss.str();
|
||||
}
|
||||
friedmanResult = { friedmanQ, criticalValue, p_value, result };
|
||||
return result;
|
||||
}
|
||||
FriedmanResult& Statistics::getFriedmanResult()
|
||||
{
|
||||
return friedmanResult;
|
||||
}
|
||||
HolmResult& Statistics::getHolmResult()
|
||||
{
|
||||
return holmResult;
|
||||
}
|
||||
std::map<std::string, std::map<std::string, float>>& Statistics::getRanks()
|
||||
{
|
||||
return ranksModels;
|
||||
}
|
||||
} // namespace platform
|
72
src/best/Statistics.h
Normal file
72
src/best/Statistics.h
Normal file
@@ -0,0 +1,72 @@
|
||||
#ifndef STATISTICS_H
|
||||
#define STATISTICS_H
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
namespace platform {
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
// All three records value-initialise their members. A default-constructed
// PostHocLine that is only partially filled in (e.g. a control-model line
// with no p-value assigned) therefore never exposes indeterminate values
// when it is later sorted or printed. The records stay aggregates (C++14 and
// later), so brace initialisation such as `{ 0, 0, 0 }` keeps working.
// `unsigned int` replaces the non-standard `uint` typedef; it is the same type.

/// Win / tie / loss counters of one model against the control model.
struct WTL {
    unsigned int win = 0;
    unsigned int tie = 0;
    unsigned int loss = 0;
};

/// Outcome of the Friedman test.
struct FriedmanResult {
    double statistic = 0.0;
    double criticalValue = 0.0;
    long double pvalue = 0.0L;
    bool reject = false; // true when H0 is rejected
};

/// One line of the post-hoc report (one model).
struct PostHocLine {
    unsigned int idx = 0; // index of the main order
    std::string model;
    long double pvalue = 0.0L;
    double rank = 0.0;
    WTL wtl;
    bool reject = false; // true when H0 is rejected for this model
};
|
||||
|
||||
class Statistics {
|
||||
public:
|
||||
Statistics(const std::string& score, const std::vector<std::string>& models, const std::vector<std::string>& datasets, const json& data, double significance = 0.05, bool output = true);
|
||||
bool friedmanTest();
|
||||
void postHocTest();
|
||||
void postHocTestReport(bool friedmanResult, bool tex);
|
||||
int getControlIdx();
|
||||
FriedmanResult& getFriedmanResult() { return friedmanResult; }
|
||||
std::vector<PostHocLine>& getPostHocResults() { return postHocResults; }
|
||||
std::map<std::string, std::map<std::string, float>>& getRanks() { return ranksModels; } // ranks of the models per dataset
|
||||
private:
|
||||
void fit();
|
||||
void postHocHolmTest();
|
||||
void postHocWilcoxonTest();
|
||||
void computeRanks();
|
||||
void computeWTL();
|
||||
void Holm_Bonferroni();
|
||||
void setResultsOrder(); // Set the order of the results based on the statistic analysis needed
|
||||
void restoreResultsOrder(); // Restore the order of the results after the Holm-Bonferroni adjustment
|
||||
const std::string& score;
|
||||
std::string postHocType;
|
||||
const std::vector<std::string>& models;
|
||||
const std::vector<std::string>& datasets;
|
||||
const json& data;
|
||||
double significance;
|
||||
bool output;
|
||||
bool fitted = false;
|
||||
int nModels = 0;
|
||||
int nDatasets = 0;
|
||||
int controlIdx = 0;
|
||||
int greaterAverage = -1; // The model with the greater average score
|
||||
std::map<int, WTL> wtl;
|
||||
std::map<std::string, float> ranks;
|
||||
int maxModelName = 0;
|
||||
int maxDatasetName = 0;
|
||||
int hlen; // length of the line
|
||||
FriedmanResult friedmanResult;
|
||||
std::vector<PostHocLine> postHocResults;
|
||||
std::map<std::string, std::map<std::string, float>> ranksModels;
|
||||
};
|
||||
}
|
||||
#endif
|
245
src/best/WilcoxonTest.hpp
Normal file
245
src/best/WilcoxonTest.hpp
Normal file
@@ -0,0 +1,245 @@
|
||||
#ifndef BEST_WILCOXON_TEST_HPP
|
||||
#define BEST_WILCOXON_TEST_HPP
|
||||
// WilcoxonTest.hpp
|
||||
// Stand‑alone class for paired Wilcoxon signed‑rank post‑hoc analysis
|
||||
// ------------------------------------------------------------------
|
||||
// * Constructor takes the *already‑loaded* nlohmann::json object plus the
|
||||
// vectors of model and dataset names.
|
||||
// * Internally selects a control model (highest average AUC) and builds all
|
||||
// statistics (ranks, W/T/L counts, Wilcoxon p‑values).
|
||||
// * Public API:
|
||||
// int getControlIdx() const;
|
||||
// const std::vector<PostHocLine>& getPostHocResults() const;
|
||||
//
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
#include <limits>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include "Statistics.h"
|
||||
|
||||
namespace platform {
|
||||
class WilcoxonTest {
|
||||
public:
|
||||
WilcoxonTest(const std::vector<std::string>& models, const std::vector<std::string>& datasets,
|
||||
const json& data, double alpha = 0.05) : models_(models), datasets_(datasets), data_(data), alpha_(alpha)
|
||||
{
|
||||
buildAUCTable(); // extracts all AUCs into a dense matrix
|
||||
computeAverageAUCs(); // per‑model mean (→ control selection)
|
||||
computeAverageRanks(); // Friedman‑style ranks per model
|
||||
selectControlModel(); // sets control_idx_
|
||||
buildPostHocResult(); // fills postHocResult_
|
||||
}
|
||||
|
||||
int getControlIdx() const noexcept { return control_idx_; }
|
||||
const std::vector<PostHocLine>& getPostHocResults() const noexcept { return postHocResults_; }
|
||||
|
||||
private:
|
||||
//-------------------------------------------------- helper structs ----
|
||||
// When a value is missing we keep NaN so that ordinary arithmetic still
|
||||
// works (NaN simply propagates and we can test with std::isnan).
|
||||
using Matrix = std::vector<std::vector<double>>; // [model][dataset]
|
||||
|
||||
//------------------------------------------------- implementation ----
|
||||
void buildAUCTable()
|
||||
{
|
||||
const std::size_t M = models_.size();
|
||||
const std::size_t D = datasets_.size();
|
||||
auc_.assign(M, std::vector<double>(D, std::numeric_limits<double>::quiet_NaN()));
|
||||
|
||||
for (std::size_t i = 0; i < M; ++i) {
|
||||
const auto& model = models_[i];
|
||||
for (std::size_t j = 0; j < D; ++j) {
|
||||
const auto& ds = datasets_[j];
|
||||
try {
|
||||
auc_[i][j] = data_.at(model).at(ds).at(0).get<double>();
|
||||
}
|
||||
catch (...) {
|
||||
// leave as NaN when value missing
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void computeAverageAUCs()
|
||||
{
|
||||
const std::size_t M = models_.size();
|
||||
avg_auc_.resize(M, std::numeric_limits<double>::quiet_NaN());
|
||||
|
||||
for (std::size_t i = 0; i < M; ++i) {
|
||||
double sum = 0.0;
|
||||
std::size_t cnt = 0;
|
||||
for (double v : auc_[i]) {
|
||||
if (!std::isnan(v)) { sum += v; ++cnt; }
|
||||
}
|
||||
avg_auc_[i] = cnt ? sum / cnt : std::numeric_limits<double>::quiet_NaN();
|
||||
}
|
||||
}
|
||||
|
||||
// Average rank across datasets (1 = best).
|
||||
void computeAverageRanks()
|
||||
{
|
||||
const std::size_t M = models_.size();
|
||||
const std::size_t D = datasets_.size();
|
||||
rank_sum_.assign(M, 0.0);
|
||||
rank_cnt_.assign(M, 0);
|
||||
|
||||
const double EPS = 1e-10;
|
||||
|
||||
for (std::size_t j = 0; j < D; ++j) {
|
||||
// Collect present values for this dataset
|
||||
std::vector<std::pair<double, std::size_t>> vals; // (auc, model_idx)
|
||||
vals.reserve(M);
|
||||
for (std::size_t i = 0; i < M; ++i) {
|
||||
if (!std::isnan(auc_[i][j]))
|
||||
vals.emplace_back(auc_[i][j], i);
|
||||
}
|
||||
if (vals.empty()) continue; // no info for this dataset
|
||||
|
||||
// Sort descending (higher AUC better)
|
||||
std::sort(vals.begin(), vals.end(), [](auto a, auto b) {
|
||||
return a.first > b.first;
|
||||
});
|
||||
|
||||
// Assign ranks with average for ties
|
||||
std::size_t k = 0;
|
||||
while (k < vals.size()) {
|
||||
std::size_t l = k + 1;
|
||||
while (l < vals.size() && std::fabs(vals[l].first - vals[k].first) < EPS) ++l;
|
||||
const double avg_rank = (k + 1 + l) * 0.5; // average of ranks (1‑based)
|
||||
for (std::size_t m = k; m < l; ++m) {
|
||||
const auto idx = vals[m].second;
|
||||
rank_sum_[idx] += avg_rank;
|
||||
++rank_cnt_[idx];
|
||||
}
|
||||
k = l;
|
||||
}
|
||||
}
|
||||
|
||||
// Final average
|
||||
avg_rank_.resize(M, std::numeric_limits<double>::quiet_NaN());
|
||||
for (std::size_t i = 0; i < M; ++i) {
|
||||
avg_rank_[i] = rank_cnt_[i] ? rank_sum_[i] / rank_cnt_[i]
|
||||
: std::numeric_limits<double>::quiet_NaN();
|
||||
}
|
||||
}
|
||||
|
||||
void selectControlModel()
|
||||
{
|
||||
// pick model with highest average AUC (ties → first)
|
||||
control_idx_ = 0;
|
||||
for (std::size_t i = 1; i < avg_auc_.size(); ++i) {
|
||||
if (avg_auc_[i] > avg_auc_[control_idx_]) control_idx_ = static_cast<int>(i);
|
||||
}
|
||||
}
|
||||
|
||||
void buildPostHocResult()
|
||||
{
|
||||
const std::size_t M = models_.size();
|
||||
const std::size_t D = datasets_.size();
|
||||
const std::string& control_name = models_[control_idx_];
|
||||
|
||||
const double practical_threshold = 0.0005; // same heuristic as original code
|
||||
|
||||
for (std::size_t i = 0; i < M; ++i) {
|
||||
PostHocLine line;
|
||||
line.model = models_[i];
|
||||
line.rank = avg_auc_[i];
|
||||
|
||||
WTL wtl = { 0, 0, 0 }; // win, tie, loss
|
||||
std::vector<double> differences;
|
||||
differences.reserve(D);
|
||||
|
||||
for (std::size_t j = 0; j < D; ++j) {
|
||||
double auc_control = auc_[control_idx_][j];
|
||||
double auc_other = auc_[i][j];
|
||||
if (std::isnan(auc_control) || std::isnan(auc_other)) continue;
|
||||
|
||||
double diff = auc_control - auc_other; // control − comparison
|
||||
if (std::fabs(diff) <= practical_threshold) {
|
||||
++wtl.tie;
|
||||
} else if (diff < 0) {
|
||||
++wtl.win; // comparison wins
|
||||
} else {
|
||||
++wtl.loss; // control wins
|
||||
}
|
||||
differences.push_back(diff);
|
||||
}
|
||||
|
||||
line.wtl = wtl;
|
||||
line.pvalue = differences.empty() ? 1.0L : static_cast<long double>(wilcoxonSignedRankTest(differences));
|
||||
line.reject = (line.pvalue < alpha_);
|
||||
|
||||
postHocResults_.push_back(std::move(line));
|
||||
}
|
||||
// Sort results by rank (descending)
|
||||
std::sort(postHocResults_.begin(), postHocResults_.end(), [](const PostHocLine& a, const PostHocLine& b) {
|
||||
return a.rank > b.rank;
|
||||
});
|
||||
}
|
||||
|
||||
// ------------------------------------------------ Wilcoxon (private) --
|
||||
static double wilcoxonSignedRankTest(const std::vector<double>& diffs)
|
||||
{
|
||||
if (diffs.empty()) return 1.0;
|
||||
|
||||
// Build |diff| + sign vector (exclude zeros)
|
||||
struct Node { double absval; int sign; };
|
||||
std::vector<Node> v;
|
||||
v.reserve(diffs.size());
|
||||
for (double d : diffs) {
|
||||
if (d != 0.0) v.push_back({ std::fabs(d), d > 0 ? 1 : -1 });
|
||||
}
|
||||
if (v.empty()) return 1.0;
|
||||
|
||||
// Sort by absolute value
|
||||
std::sort(v.begin(), v.end(), [](const Node& a, const Node& b) { return a.absval < b.absval; });
|
||||
|
||||
const double EPS = 1e-10;
|
||||
const std::size_t n = v.size();
|
||||
std::vector<double> ranks(n, 0.0);
|
||||
|
||||
std::size_t i = 0;
|
||||
while (i < n) {
|
||||
std::size_t j = i + 1;
|
||||
while (j < n && std::fabs(v[j].absval - v[i].absval) < EPS) ++j;
|
||||
double avg_rank = (i + 1 + j) * 0.5; // 1‑based ranks
|
||||
for (std::size_t k = i; k < j; ++k) ranks[k] = avg_rank;
|
||||
i = j;
|
||||
}
|
||||
|
||||
double w_plus = 0.0, w_minus = 0.0;
|
||||
for (std::size_t k = 0; k < n; ++k) {
|
||||
if (v[k].sign > 0) w_plus += ranks[k];
|
||||
else w_minus += ranks[k];
|
||||
}
|
||||
double w = std::min(w_plus, w_minus);
|
||||
double mean_w = n * (n + 1) / 4.0;
|
||||
double sd_w = std::sqrt(n * (n + 1) * (2 * n + 1) / 24.0);
|
||||
if (sd_w == 0.0) return 1.0; // degenerate (all diffs identical)
|
||||
|
||||
double z = (w - mean_w) / sd_w;
|
||||
double p_two = std::erfc(std::fabs(z) / std::sqrt(2.0)); // 2‑sided tail
|
||||
return p_two;
|
||||
}
|
||||
|
||||
//-------------------------------------------------------- data ----
|
||||
std::vector<std::string> models_;
|
||||
std::vector<std::string> datasets_;
|
||||
json data_;
|
||||
double alpha_;
|
||||
|
||||
Matrix auc_; // [model][dataset]
|
||||
std::vector<double> avg_auc_; // mean AUC per model
|
||||
std::vector<double> avg_rank_; // mean rank per model
|
||||
std::vector<double> rank_sum_; // helper for ranks
|
||||
std::vector<int> rank_cnt_; // datasets counted per model
|
||||
|
||||
int control_idx_ = -1;
|
||||
std::vector<PostHocLine> postHocResults_;
|
||||
};
|
||||
|
||||
} // namespace platform
|
||||
#endif // BEST_WILCOXON_TEST_HPP
|
91
src/commands/b_best.cpp
Normal file
91
src/commands/b_best.cpp
Normal file
@@ -0,0 +1,91 @@
|
||||
#include <iostream>
|
||||
#include <argparse/argparse.hpp>
|
||||
#include "main/Models.h"
|
||||
#include "main/modelRegister.h"
|
||||
#include "common/Paths.h"
|
||||
#include "common/Colors.h"
|
||||
#include "common/Utils.h"
|
||||
#include "best/BestResults.h"
|
||||
#include "common/DotEnv.h"
|
||||
#include "config_platform.h"
|
||||
|
||||
/// Declares the command line options of b_best on `program`.
/// Defaults for the results folder and the score name come from
/// platform::Paths and the .env configuration respectively.
void manageArguments(argparse::ArgumentParser& program)
{
    auto env = platform::DotEnv();
    program.add_argument("-m", "--model").help("Model to use or any").default_value("any");
    program.add_argument("--folder").help("Results folder to use").default_value(platform::Paths::results());
    program.add_argument("-d", "--dataset").default_value("any").help("Filter results by the selected dataset (any for all datasets)");
    program.add_argument("-s", "--score").default_value(env.get("score")).help("Filter results of the score name supplied");
    program.add_argument("--friedman").help("Friedman test").default_value(false).implicit_value(true);
    program.add_argument("--excel").help("Output to excel").default_value(false).implicit_value(true);
    program.add_argument("--tex").help("Output results to TeX & Markdown files").default_value(false).implicit_value(true);
    program.add_argument("--index").help("In tex output show the index of the dataset instead of the name to save space").default_value(false).implicit_value(true);
    program.add_argument("--level").help("significance level").default_value(0.05).scan<'g', double>().action([](const std::string& value) {
        double k = 0.0;
        try {
            k = std::stod(value);
        }
        catch (...) {
            // std::stod reports bad input with std::invalid_argument / std::out_of_range;
            // surface both as the runtime_error the caller already handles.
            throw std::runtime_error("Significance level must be a decimal number");
        }
        if (k < 0.01 || k > 0.15) {
            throw std::runtime_error("Significance level has to be a number in [0.01, 0.15]");
        }
        return k;
        });
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
argparse::ArgumentParser program("b_best", { platform_project_version.begin(), platform_project_version.end() });
|
||||
manageArguments(program);
|
||||
std::string model, dataset, score, folder;
|
||||
bool build, report, friedman, excel, tex, index;
|
||||
double level;
|
||||
try {
|
||||
program.parse_args(argc, argv);
|
||||
model = program.get<std::string>("model");
|
||||
folder = program.get<std::string>("folder");
|
||||
if (folder.back() != '/') {
|
||||
folder += '/';
|
||||
}
|
||||
dataset = program.get<std::string>("dataset");
|
||||
score = program.get<std::string>("score");
|
||||
friedman = program.get<bool>("friedman");
|
||||
excel = program.get<bool>("excel");
|
||||
tex = program.get<bool>("tex");
|
||||
index = program.get<bool>("index");
|
||||
level = program.get<double>("level");
|
||||
if (model == "" || score == "") {
|
||||
throw std::runtime_error("Model and score name must be supplied");
|
||||
}
|
||||
if (friedman && (model != "any" || dataset != "any")) {
|
||||
std::cerr << "Friedman test can only be used with all models and all the datasets" << std::endl;
|
||||
std::cerr << program;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
catch (const std::exception& err) {
|
||||
std::cerr << err.what() << std::endl;
|
||||
std::cerr << program;
|
||||
exit(1);
|
||||
}
|
||||
// Generate report
|
||||
auto results = platform::BestResults(folder, score, model, dataset, friedman, level);
|
||||
if (model == "any") {
|
||||
results.buildAll();
|
||||
results.reportAll(excel, tex, index);
|
||||
} else {
|
||||
std::string fileName = results.build();
|
||||
std::cout << Colors::GREEN() << fileName << " created!" << Colors::RESET() << std::endl;
|
||||
results.reportSingle(excel);
|
||||
}
|
||||
if (excel) {
|
||||
auto fileName = results.getExcelFileName();
|
||||
std::cout << "Opening " << fileName << std::endl;
|
||||
platform::openFile(fileName);
|
||||
}
|
||||
std::cout << Colors::RESET();
|
||||
return 0;
|
||||
}
|
318
src/commands/b_grid.cpp
Normal file
318
src/commands/b_grid.cpp
Normal file
@@ -0,0 +1,318 @@
|
||||
#include <iostream>
|
||||
#include <argparse/argparse.hpp>
|
||||
#include <map>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <mpi.h>
|
||||
#include "main/Models.h"
|
||||
#include "main/ArgumentsExperiment.h"
|
||||
#include "common/Paths.h"
|
||||
#include "common/Timer.hpp"
|
||||
#include "common/Colors.h"
|
||||
#include "common/DotEnv.h"
|
||||
#include "grid/GridSearch.h"
|
||||
#include "grid/GridExperiment.h"
|
||||
#include "config_platform.h"
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
const int MAXL = 133;
|
||||
|
||||
/// Adds the mandatory -m/--model option to `parser`; the supplied value must be
/// one of the names reported by the Models registry.
void assignModel(argparse::ArgumentParser& parser)
{
    auto models = platform::Models::instance();
    const auto helpText = "Model to use " + models->toString();
    auto validateModel = [models](const std::string& value) {
        // The list of valid names is fetched once, on the first validation.
        static const std::vector<std::string> choices = models->getNames();
        const bool known = std::find(choices.begin(), choices.end(), value) != choices.end();
        if (!known) {
            throw std::runtime_error("Model must be one of " + models->toString());
        }
        return value;
    };
    parser.add_argument("-m", "--model")
        .help(helpText)
        .required()
        .action(validateModel);
}
|
||||
/// Registers the options of the "search" sub-command; most defaults are read
/// from the .env configuration.
void add_search_args(argparse::ArgumentParser& program)
{
    auto env = platform::DotEnv();

    // Boolean switches
    const bool discretizeDefault = std::stoi(env.get("discretize")) != 0;
    program.add_argument("--discretize").help("Discretize input datasets").default_value(discretizeDefault).implicit_value(true);
    const bool stratifiedDefault = std::stoi(env.get("stratified")) != 0;
    program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(stratifiedDefault).implicit_value(true);
    program.add_argument("--quiet").help("Don't display detailed progress").default_value(false).implicit_value(true);

    // Dataset selection
    program.add_argument("--continue").help("Continue computing from that dataset").default_value(platform::GridSearch::NO_CONTINUE());
    program.add_argument("--only").help("Used with continue to search with that dataset only").default_value(false).implicit_value(true);
    program.add_argument("--exclude").default_value("[]").help("Datasets to exclude in json format, e.g. [\"dataset1\", \"dataset2\"]");

    // Smoothing strategy, restricted to the tokens the environment accepts
    auto valid_choices = env.valid_tokens("smooth_strat");
    auto& smooth_arg = program.add_argument("--smooth-strat").help("Smooth strategy used in Bayes Network node initialization. Valid values: " + env.valid_values("smooth_strat")).default_value(env.get("smooth_strat"));
    for (auto token : valid_choices) {
        smooth_arg.choices(token);
    }

    // Nested cross validation folds: an integer greater than 1
    auto parseNested = [](const std::string& value) {
        int k = 0;
        try {
            k = std::stoi(value);
        }
        catch (...) {
            throw std::runtime_error("Number of nested folds must be an integer");
        }
        if (k < 2) {
            throw std::runtime_error("Number of nested folds must be greater than 1");
        }
        return k;
    };
    program.add_argument("--nested").help("Set the double/nested cross validation number of folds").default_value(5).scan<'i', int>().action(parseNested);

    program.add_argument("--score").help("Score used in gridsearch").default_value("accuracy");

    // Cross validation folds: an integer greater than 1
    auto parseFolds = [](const std::string& value) {
        int k = 0;
        try {
            k = std::stoi(value);
        }
        catch (...) {
            throw std::runtime_error("Number of folds must be an integer");
        }
        if (k < 2) {
            throw std::runtime_error("Number of folds must be greater than 1");
        }
        return k;
    };
    program.add_argument("-f", "--folds").help("Number of folds").default_value(std::stoi(env.get("n_folds"))).scan<'i', int>().action(parseFolds);

    auto seed_values = env.getSeeds();
    program.add_argument("-s", "--seeds").nargs(1, 10).help("Random seeds. Set to -1 to have pseudo random").scan<'i', int>().default_value(seed_values);
}
|
||||
std::string headerLine(const std::string& text, int utf = 0)
|
||||
{
|
||||
int n = MAXL - text.length() - 3;
|
||||
n = n < 0 ? 0 : n;
|
||||
return "* " + text + std::string(n + utf, ' ') + "*\n";
|
||||
}
|
||||
void list_dump(std::string& model)
|
||||
{
|
||||
auto data = platform::GridData(platform::Paths::grid_input(model));
|
||||
std::cout << Colors::MAGENTA() << std::string(MAXL, '*') << std::endl;
|
||||
std::cout << headerLine("Listing configuration input file (Grid)");
|
||||
std::cout << headerLine("Model: " + model);
|
||||
std::cout << Colors::MAGENTA() << std::string(MAXL, '*') << std::endl;
|
||||
int index = 0;
|
||||
int max_hyper = 15;
|
||||
int max_dataset = 7;
|
||||
auto combinations = data.getGridFile();
|
||||
for (auto const& item : combinations) {
|
||||
if (item.first.size() > max_dataset) {
|
||||
max_dataset = item.first.size();
|
||||
}
|
||||
for (auto const& [key, value] : item.second.items()) {
|
||||
if (value.dump().size() > max_hyper) {
|
||||
max_hyper = value.dump().size();
|
||||
}
|
||||
}
|
||||
}
|
||||
std::cout << Colors::GREEN() << left << " # " << left << setw(max_dataset) << "Dataset" << " #Com. "
|
||||
<< setw(max_hyper) << "Hyperparameters" << std::endl;
|
||||
std::cout << "=== " << string(max_dataset, '=') << " ===== " << string(max_hyper, '=') << std::endl;
|
||||
int i = 0;
|
||||
for (auto const& item : combinations) {
|
||||
auto color = (i++ % 2) ? Colors::CYAN() : Colors::BLUE();
|
||||
std::cout << color;
|
||||
auto num_combinations = data.getNumCombinations(item.first);
|
||||
std::cout << setw(3) << fixed << right << ++index << left << " " << setw(max_dataset) << item.first
|
||||
<< " " << setw(5) << right << num_combinations << " ";
|
||||
std::string prefix = "";
|
||||
for (auto const& [key, value] : item.second.items()) {
|
||||
std::cout << prefix << setw(max_hyper) << std::left << value.dump() << std::endl;
|
||||
prefix = string(11 + max_dataset, ' ');
|
||||
}
|
||||
}
|
||||
std::cout << Colors::RESET() << std::endl;
|
||||
}
|
||||
/// Prints the hyperparameters computed by a grid search for `model`:
/// a banner with the run settings followed by one row per dataset.
void list_results(json& results, std::string& model)
{
    // Banner with the settings of the run
    std::cout << Colors::MAGENTA() << std::string(MAXL, '*') << std::endl;
    std::cout << headerLine("Listing computed hyperparameters for model " + model);
    std::cout << headerLine("Date & time: " + results["date"].get<std::string>() + " Duration: " + results["duration"].get<std::string>());
    std::cout << headerLine("Score: " + results["score"].get<std::string>());
    std::string settings = "Random seeds: " + results["seeds"].dump();
    settings += std::string(" Discretized: ") + (results["discretize"].get<bool>() ? "True" : "False");
    settings += std::string(" Stratified: ") + (results["stratified"].get<bool>() ? "True" : "False");
    settings += " #Folds: " + std::to_string(results["n_folds"].get<int>());
    const int nested = results["nested"].get<int>();
    settings += " Nested: " + (nested == 0 ? std::string("False") : std::to_string(nested));
    std::cout << headerLine(settings);
    std::cout << std::string(MAXL, '*') << std::endl;

    // Column widths
    int spaces = 7;
    int hyperparameters_spaces = 15;
    nlohmann::json temp = results["results"]; // To show in alphabetical order of the dataset
    for (const auto& item : temp.items()) {
        auto name = item.key();
        auto entry = item.value();
        if (name.size() > spaces) {
            spaces = name.size();
        }
        const auto dumped = entry["hyperparameters"].dump();
        if (dumped.size() > hyperparameters_spaces) {
            hyperparameters_spaces = dumped.size();
        }
    }

    // Table header
    std::cout << Colors::GREEN() << " # " << std::left << std::setw(spaces) << "Dataset" << " " << std::setw(19) << "Date" << " ";
    std::cout << "Duration " << std::setw(8) << "Score" << " " << "Hyperparameters" << std::endl;
    std::cout << "=== " << std::string(spaces, '=') << " " << std::string(19, '=') << " " << std::string(8, '=') << " ";
    std::cout << std::string(8, '=') << " " << std::string(hyperparameters_spaces, '=') << std::endl;

    // Table body, alternating row colours
    int index = 0;
    for (const auto& item : temp.items()) {
        auto entry = item.value();
        std::cout << ((index % 2) ? Colors::CYAN() : Colors::BLUE());
        std::cout << std::setw(3) << std::right << index << " ";
        ++index;
        std::cout << std::left << std::setw(spaces) << item.key() << " " << entry["date"].get<std::string>();
        std::cout << " " << std::setw(8) << std::right << entry["duration"].get<std::string>() << " ";
        std::cout << std::setw(8) << std::setprecision(6) << std::fixed << std::right << entry["score"].get<double>();
        std::cout << " " << entry["hyperparameters"].dump() << std::endl;
    }
    std::cout << Colors::RESET() << std::endl;
}
|
||||
|
||||
/*
|
||||
* Main
|
||||
*/
|
||||
/// "dump" sub-command: lists the hyperparameter grid of the selected model.
void dump(argparse::ArgumentParser& program)
{
    // list_dump takes a non-const reference, so the name needs a named local.
    std::string selected = program.get<std::string>("model");
    list_dump(selected);
}
|
||||
/// "report" sub-command: shows the stored grid search results of the selected
/// model, or a notice when there are none.
void report(argparse::ArgumentParser& program)
{
    platform::ConfigGrid config;
    config.model = program.get<std::string>("model");
    platform::GridSearch grid_search(config);
    auto results = grid_search.loadResults();
    if (!results.empty()) {
        list_results(results, config.model);
        return;
    }
    std::cout << "** No results found" << std::endl;
}
|
||||
/// "search" sub-command: runs the MPI grid search for the selected model and,
/// on the manager process, reports the computed hyperparameters.
/// @throws std::runtime_error when --only is used without --continue or when
///         fewer than 2 MPI processes are available
void search(argparse::ArgumentParser& program)
{
    struct platform::ConfigGrid config;
    config.model = program.get<std::string>("model");
    config.score = program.get<std::string>("score");
    config.discretize = program.get<bool>("discretize");
    config.stratified = program.get<bool>("stratified");
    config.smooth_strategy = program.get<std::string>("smooth-strat");
    config.n_folds = program.get<int>("folds");
    config.quiet = program.get<bool>("quiet");
    config.only = program.get<bool>("only");
    config.seeds = program.get<std::vector<int>>("seeds");
    config.nested = program.get<int>("nested");
    config.continue_from = program.get<std::string>("continue");
    if (config.continue_from == platform::GridSearch::NO_CONTINUE() && config.only) {
        throw std::runtime_error("Cannot use --only without --continue");
    }
    auto excluded = program.get<std::string>("exclude");
    config.excluded = json::parse(excluded);
    platform::Paths::createPath(platform::Paths::grid());
    auto grid_search = platform::GridSearch(config);
    platform::Timer timer;
    timer.start();
    struct platform::ConfigMPI mpi_config;
    mpi_config.manager = 0; // which process is the manager
    MPI_Init(nullptr, nullptr);
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_config.rank);
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_config.n_procs);
    if (mpi_config.n_procs < 2) {
        // Shut MPI down before bailing out; every process takes this branch,
        // so the collective MPI_Finalize cannot block.
        MPI_Finalize();
        throw std::runtime_error("Cannot use --search with less than 2 mpi processes, try mpirun -np 2 ...");
    }
    grid_search.go(mpi_config);
    if (mpi_config.rank == mpi_config.manager) {
        auto results = grid_search.loadResults();
        std::cout << Colors::RESET() << "* Report of the computed hyperparameters" << std::endl;
        list_results(results, config.model);
        std::cout << "Process took " << timer.getDurationString() << std::endl;
    }
    MPI_Finalize();
}
|
||||
/// "experiment" sub-command: runs an experiment over MPI and, on the manager
/// process, optionally saves and then reports the result.
/// @throws std::runtime_error when fewer than 2 MPI processes are available
void experiment(argparse::ArgumentParser& program)
{
    struct platform::ConfigGrid config;
    auto arguments = platform::ArgumentsExperiment(program, platform::experiment_t::GRID);
    arguments.parse();
    auto path_results = arguments.getPathResults();
    auto grid_experiment = platform::GridExperiment(arguments, config);
    platform::Timer timer;
    timer.start();
    struct platform::ConfigMPI mpi_config;
    mpi_config.manager = 0; // which process is the manager
    MPI_Init(nullptr, nullptr);
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_config.rank);
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_config.n_procs);
    if (mpi_config.n_procs < 2) {
        // Shut MPI down before bailing out; every process takes this branch,
        // so the collective MPI_Finalize cannot block.
        MPI_Finalize();
        throw std::runtime_error("Cannot use --experiment with less than 2 mpi processes, try mpirun -np 2 ...");
    }
    grid_experiment.go(mpi_config);
    if (mpi_config.rank == mpi_config.manager) {
        auto experiment = grid_experiment.getExperiment();
        std::cout << "* Report of the computed hyperparameters" << std::endl;
        auto duration = timer.getDuration();
        experiment.setDuration(duration);
        if (grid_experiment.haveToSaveResults()) {
            experiment.saveResult(path_results);
        }
        experiment.report();
        std::cout << "Process took " << duration << std::endl;
    }
    MPI_Finalize();
}
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
//
|
||||
// Manage arguments
|
||||
//
|
||||
argparse::ArgumentParser program("b_grid", { platform_project_version.begin(), platform_project_version.end() });
|
||||
// grid dump subparser
|
||||
argparse::ArgumentParser dump_command("dump");
|
||||
dump_command.add_description("Dump the combinations of hyperparameters of a model.");
|
||||
assignModel(dump_command);
|
||||
|
||||
// grid report subparser
|
||||
argparse::ArgumentParser report_command("report");
|
||||
assignModel(report_command);
|
||||
report_command.add_description("Report the computed hyperparameters of a model.");
|
||||
|
||||
// grid search subparser
|
||||
argparse::ArgumentParser search_command("search");
|
||||
search_command.add_description("Search using mpi the hyperparameters of a model.");
|
||||
assignModel(search_command);
|
||||
add_search_args(search_command);
|
||||
|
||||
// grid experiment subparser
|
||||
argparse::ArgumentParser experiment_command("experiment");
|
||||
experiment_command.add_description("Experiment like b_main using mpi.");
|
||||
auto arguments = platform::ArgumentsExperiment(experiment_command, platform::experiment_t::GRID);
|
||||
arguments.add_arguments();
|
||||
program.add_subparser(dump_command);
|
||||
program.add_subparser(report_command);
|
||||
program.add_subparser(search_command);
|
||||
program.add_subparser(experiment_command);
|
||||
|
||||
//
|
||||
// Process options
|
||||
//
|
||||
try {
|
||||
program.parse_args(argc, argv);
|
||||
bool found = false;
|
||||
map<std::string, void(*)(argparse::ArgumentParser&)> commands = { {"dump", &dump}, {"report", &report}, {"search", &search}, { "experiment",&experiment } };
|
||||
for (const auto& command : commands) {
|
||||
if (program.is_subcommand_used(command.first)) {
|
||||
std::invoke(command.second, program.at<argparse::ArgumentParser>(command.first));
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
throw std::runtime_error("You must specify one of the following commands: dump, experiment, report, search \n");
|
||||
}
|
||||
}
|
||||
catch (const exception& err) {
|
||||
cerr << err.what() << std::endl;
|
||||
cerr << program;
|
||||
exit(1);
|
||||
}
|
||||
std::cout << "Done!" << std::endl;
|
||||
return 0;
|
||||
}
|
119
src/commands/b_list.cpp
Normal file
119
src/commands/b_list.cpp
Normal file
@@ -0,0 +1,119 @@
|
||||
#include <iostream>
|
||||
#include <locale>
|
||||
#include <map>
|
||||
#include <argparse/argparse.hpp>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include "main/Models.h"
|
||||
#include "main/modelRegister.h"
|
||||
#include "common/Paths.h"
|
||||
#include "common/Colors.h"
|
||||
#include "common/Datasets.h"
|
||||
#include "common/Utils.h"
|
||||
#include "reports/DatasetsExcel.h"
|
||||
#include "reports/DatasetsConsole.h"
|
||||
#include "results/ResultsDatasetConsole.h"
|
||||
#include "results/ResultsDataset.h"
|
||||
#include "results/ResultsDatasetExcel.h"
|
||||
#include "config_platform.h"
|
||||
|
||||
|
||||
/// "datasets" sub-command: prints the list of datasets and, with --excel,
/// also writes it to an Excel file and opens that file.
void list_datasets(argparse::ArgumentParser& program)
{
    auto excel = program.get<bool>("excel");
    auto report = platform::DatasetsConsole();
    report.report();
    std::cout << report.getOutput();
    if (excel) {
        auto data = report.getData();
        // The Excel report lives only inside this lambda, so it is destroyed
        // before the file is opened and is released even if a call throws
        // (the former new/delete pair leaked it in that case).
        auto fileName = [&data]() {
            platform::DatasetsExcel ereport;
            ereport.report(data);
            std::cout << std::endl << Colors::GREEN() << "Output saved in " << ereport.getFileName() << std::endl;
            return ereport.getExcelFileName();
        }();
        std::cout << "Opening " << fileName << std::endl;
        platform::openFile(fileName);
    }
}
|
||||
|
||||
/// "results" sub-command: prints the results stored for a dataset (filtered by
/// score and model) and, with --excel, also writes them to an Excel file and
/// opens that file. Nothing is printed when the console report returns false.
void list_results(argparse::ArgumentParser& program)
{
    auto dataset = program.get<std::string>("dataset");
    auto score = program.get<std::string>("score");
    auto model = program.get<std::string>("model");
    auto excel = program.get<bool>("excel");
    auto report = platform::ResultsDatasetsConsole();
    if (!report.report(dataset, score, model))
        return;
    std::cout << report.getOutput();
    if (excel) {
        auto data = report.getData();
        // The Excel report lives only inside this lambda, so it is destroyed
        // before the file is opened and is released even if a call throws
        // (the former new/delete pair leaked it in that case).
        auto fileName = [&data]() {
            platform::ResultsDatasetExcel ereport;
            ereport.report(data);
            std::cout << std::endl << Colors::GREEN() << "Output saved in " << ereport.getFileName() << std::endl;
            return ereport.getExcelFileName();
        }();
        std::cout << "Opening " << fileName << std::endl;
        platform::openFile(fileName);
    }
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
argparse::ArgumentParser program("b_list", { platform_project_version.begin(), platform_project_version.end() });
|
||||
//
|
||||
// datasets subparser
|
||||
//
|
||||
argparse::ArgumentParser datasets_command("datasets");
|
||||
datasets_command.add_description("List datasets available in the platform.");
|
||||
datasets_command.add_argument("--excel").help("Output in Excel format").default_value(false).implicit_value(true);
|
||||
//
|
||||
// results subparser
|
||||
//
|
||||
argparse::ArgumentParser results_command("results");
|
||||
results_command.add_description("List the results of a given dataset.");
|
||||
auto datasets = platform::Datasets(false, platform::Paths::datasets());
|
||||
results_command.add_argument("-d", "--dataset")
|
||||
.help("Dataset to use " + datasets.toString())
|
||||
.required()
|
||||
.action([](const std::string& value) {
|
||||
auto datasets = platform::Datasets(false, platform::Paths::datasets());
|
||||
static const std::vector<std::string> choices = datasets.getNames();
|
||||
if (find(choices.begin(), choices.end(), value) != choices.end()) {
|
||||
return value;
|
||||
}
|
||||
throw std::runtime_error("Dataset must be one of " + datasets.toString());
|
||||
}
|
||||
);
|
||||
results_command.add_argument("-m", "--model")
|
||||
.help("Model to use or any")
|
||||
.default_value("any");
|
||||
results_command.add_argument("--excel").help("Output in Excel format").default_value(false).implicit_value(true);
|
||||
results_command.add_argument("-s", "--score").default_value("accuracy").help("Filter results of the score name supplied");
|
||||
|
||||
// Add subparsers
|
||||
program.add_subparser(datasets_command);
|
||||
program.add_subparser(results_command);
|
||||
// Parse command line and execute
|
||||
try {
|
||||
program.parse_args(argc, argv);
|
||||
bool found = false;
|
||||
map<std::string, void(*)(argparse::ArgumentParser&)> commands = { {"datasets", &list_datasets}, {"results", &list_results} };
|
||||
for (const auto& command : commands) {
|
||||
if (program.is_subcommand_used(command.first)) {
|
||||
std::invoke(command.second, program.at<argparse::ArgumentParser>(command.first));
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
throw std::runtime_error("You must specify one of the following commands: {datasets, results}\n");
|
||||
}
|
||||
}
|
||||
catch (const exception& err) {
|
||||
cerr << err.what() << std::endl;
|
||||
cerr << program;
|
||||
exit(1);
|
||||
}
|
||||
std::cout << Colors::RESET() << std::endl;
|
||||
return 0;
|
||||
}
|
37
src/commands/b_main.cpp
Normal file
37
src/commands/b_main.cpp
Normal file
@@ -0,0 +1,37 @@
|
||||
#include <argparse/argparse.hpp>
|
||||
#include "main/Experiment.h"
|
||||
#include "main/ArgumentsExperiment.h"
|
||||
#include "config_platform.h"
|
||||
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
argparse::ArgumentParser program("b_main", { platform_project_version.begin(), platform_project_version.end() });
|
||||
auto arguments = platform::ArgumentsExperiment(program, platform::experiment_t::NORMAL);
|
||||
arguments.add_arguments();
|
||||
arguments.parse_args(argc, argv);
|
||||
/*
|
||||
* Begin Processing
|
||||
*/
|
||||
// Initialize the experiment class with the command line arguments
|
||||
auto experiment = arguments.initializedExperiment();
|
||||
auto path_results = arguments.getPathResults();
|
||||
platform::Timer timer;
|
||||
timer.start();
|
||||
experiment.go();
|
||||
experiment.setDuration(timer.getDuration());
|
||||
if (!arguments.isQuiet()) {
|
||||
// Classification report if only one dataset is tested
|
||||
experiment.report();
|
||||
}
|
||||
if (arguments.haveToSaveResults()) {
|
||||
experiment.saveResult(path_results);
|
||||
}
|
||||
if (arguments.doGraph()) {
|
||||
experiment.saveGraph();
|
||||
}
|
||||
return 0;
|
||||
}
|
85
src/commands/b_manage.cpp
Normal file
85
src/commands/b_manage.cpp
Normal file
@@ -0,0 +1,85 @@
|
||||
|
||||
#include <utility>
|
||||
#include <iostream>
|
||||
#include <sys/ioctl.h>
|
||||
#include "common/Paths.h"
|
||||
#include <argparse/argparse.hpp>
|
||||
#include "manage/ManageScreen.h"
|
||||
#include <signal.h>
|
||||
#include "config_platform.h"
|
||||
|
||||
platform::ManageScreen* manager = nullptr;
|
||||
|
||||
// Declare the command line options of b_manage on `program` and parse argv.
// On a parsing error the message and the usage are written to stderr and the
// process exits with status 1.
void manageArguments(argparse::ArgumentParser& program, int argc, char** argv)
{
    program.add_argument("-m", "--model").default_value("any").help("Filter results of the selected model");
    program.add_argument("-s", "--score").default_value("any").help("Filter results of the score name supplied");
    program.add_argument("--folder").help("Results folder to use").default_value(platform::Paths::results());
    program.add_argument("--platform").default_value("any").help("Filter results of the selected platform");
    program.add_argument("--complete").help("Show only results with all datasets").default_value(false).implicit_value(true);
    program.add_argument("--partial").help("Show only partial results").default_value(false).implicit_value(true);
    program.add_argument("--compare").help("Compare with best results").default_value(false).implicit_value(true);
    try {
        // The option values themselves are read by the caller after this returns.
        program.parse_args(argc, argv);
    }
    catch (const std::exception& err) {
        std::cerr << err.what() << std::endl;
        std::cerr << program;
        exit(1);
    }
}
|
||||
|
||||
// Query the size of the terminal attached to standard input.
// Returns {rows, columns}. When the size cannot be obtained (stdin is not a terminal,
// the ioctl fails, the terminal reports a zero size, or no suitable ioctl exists on
// this system) the conventional 24x80 is returned instead of indeterminate values.
std::pair<int, int> numRowsCols()
{
    constexpr int defaultRows = 24;
    constexpr int defaultCols = 80;
    int rows = 0;
    int cols = 0;
#ifdef TIOCGSIZE
    struct ttysize ts {};
    if (ioctl(STDIN_FILENO, TIOCGSIZE, &ts) == 0) {
        rows = ts.ts_lines;
        cols = ts.ts_cols;
    }
#elif defined(TIOCGWINSZ)
    struct winsize ts {};
    if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ts) == 0) {
        rows = ts.ws_row;
        cols = ts.ws_col;
    }
#endif /* TIOCGSIZE */
    if (rows <= 0 || cols <= 0) {
        return { defaultRows, defaultCols };
    }
    return { rows, cols };
}
|
||||
void handleResize(int sig)
|
||||
{
|
||||
auto [rows, cols] = numRowsCols();
|
||||
manager->updateSize(rows, cols);
|
||||
}
|
||||
|
||||
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
auto program = argparse::ArgumentParser("b_manage", { platform_project_version.begin(), platform_project_version.end() });
|
||||
manageArguments(program, argc, argv);
|
||||
std::string model = program.get<std::string>("model");
|
||||
std::string path = program.get<std::string>("folder");
|
||||
if (path.back() != '/') {
|
||||
path += '/';
|
||||
}
|
||||
std::string score = program.get<std::string>("score");
|
||||
std::string platform = program.get<std::string>("platform");
|
||||
bool complete = program.get<bool>("complete");
|
||||
bool partial = program.get<bool>("partial");
|
||||
bool compare = program.get<bool>("compare");
|
||||
if (complete)
|
||||
partial = false;
|
||||
signal(SIGWINCH, handleResize);
|
||||
auto [rows, cols] = numRowsCols();
|
||||
manager = new platform::ManageScreen(path, rows, cols, model, score, platform, complete, partial, compare);
|
||||
manager->doMenu();
|
||||
auto fileName = manager->getExcelFileName();
|
||||
delete manager;
|
||||
if (!fileName.empty()) {
|
||||
std::cout << "Opening " << fileName << std::endl;
|
||||
platform::openFile(fileName);
|
||||
}
|
||||
return 0;
|
||||
}
|
102
src/commands/b_results.cpp
Normal file
102
src/commands/b_results.cpp
Normal file
@@ -0,0 +1,102 @@
|
||||
#include <iostream>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
#include "nlohmann/json.hpp"
|
||||
#include "argparse/argparse.hpp"
|
||||
#include "common/Paths.h"
|
||||
#include "results/JsonValidator.h"
|
||||
#include "results/SchemaV1_0.h"
|
||||
#include "config_platform.h"
|
||||
|
||||
using json = nlohmann::json;
|
||||
namespace fs = std::filesystem;
|
||||
// Print `message` on stdout inside a three-line banner.
// The top and bottom lines repeat the first character of `symbol` length + 11 times;
// the middle line shows the message left-aligned in a field of length + 7 characters,
// framed by `symbol` on both sides.
void header(const std::string& message, int length, const std::string& symbol)
{
    const std::string border(length + 11, symbol[0]);
    std::cout << border << std::endl;
    std::cout << symbol << " " << std::setw(length + 7) << std::left << message << " " << symbol << std::endl;
    std::cout << border << std::endl;
}
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
argparse::ArgumentParser program("b_results", { platform_project_version.begin(), platform_project_version.end() });
|
||||
program.add_description("Check the results files and optionally fixes them.");
|
||||
program.add_argument("--fix").help("Fix any errors in results").default_value(false).implicit_value(true);
|
||||
program.add_argument("--file").help("check only this results file").default_value("");
|
||||
std::string nameSuffix = "results_";
|
||||
std::string schemaVersion = "1.0";
|
||||
bool fix_it = false;
|
||||
std::string selected_file;
|
||||
try {
|
||||
program.parse_args(argc, argv);
|
||||
fix_it = program.get<bool>("fix");
|
||||
selected_file = program.get<std::string>("file");
|
||||
}
|
||||
catch (const std::exception& err) {
|
||||
std::cerr << err.what() << std::endl;
|
||||
std::cerr << program;
|
||||
exit(1);
|
||||
}
|
||||
//
|
||||
// Determine the files to process
|
||||
//
|
||||
std::vector<std::string> result_files;
|
||||
int max_length = 0;
|
||||
if (selected_file != "") {
|
||||
if (!selected_file.starts_with(platform::Paths::results())) {
|
||||
selected_file = platform::Paths::results() + selected_file;
|
||||
}
|
||||
// Only check the selected file
|
||||
result_files.push_back(selected_file);
|
||||
max_length = selected_file.length();
|
||||
} else {
|
||||
// Load the result files and find the longest file name
|
||||
for (const auto& entry : fs::directory_iterator(platform::Paths::results())) {
|
||||
if (entry.is_regular_file() && entry.path().filename().string().starts_with(nameSuffix) && entry.path().filename().string().ends_with(".json")) {
|
||||
std::string fileName = entry.path().string();
|
||||
if (fileName.length() > max_length) {
|
||||
max_length = fileName.length();
|
||||
}
|
||||
result_files.push_back(fileName);
|
||||
}
|
||||
}
|
||||
}
|
||||
//
|
||||
// Process the results files
|
||||
//
|
||||
if (result_files.empty()) {
|
||||
std::cerr << "Error: No result files found." << std::endl;
|
||||
return 1;
|
||||
}
|
||||
std::string header_message = "Processing " + std::to_string(result_files.size()) + " result files.";
|
||||
header(header_message, max_length, "*");
|
||||
platform::JsonValidator validator(platform::SchemaV1_0::schema);
|
||||
int n_errors = 0;
|
||||
std::vector<std::string> files_with_errors;
|
||||
for (const auto& file_name : result_files) {
|
||||
std::vector<std::string> errors = validator.validate_file(file_name);
|
||||
if (!errors.empty()) {
|
||||
n_errors++;
|
||||
std::cout << std::setw(max_length) << std::left << file_name << ": " << errors.size() << " Errors:" << std::endl;
|
||||
for (const auto& error : errors) {
|
||||
std::cout << " - " << error << std::endl;
|
||||
}
|
||||
if (fix_it) {
|
||||
validator.fix_it(file_name);
|
||||
std::cout << " -> File fixed." << std::endl;
|
||||
}
|
||||
files_with_errors.push_back(file_name);
|
||||
}
|
||||
}
|
||||
if (n_errors == 0) {
|
||||
header("All files are valid.", max_length, "*");
|
||||
} else {
|
||||
std::string $verb = (fix_it) ? "had" : "have";
|
||||
std::string msg = std::to_string(n_errors) + " files " + $verb + " errors.";
|
||||
header(msg, max_length, "*");
|
||||
for (const auto& file_name : files_with_errors) {
|
||||
std::cout << "- " << file_name << std::endl;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
@@ -1,5 +1,5 @@
|
||||
#ifndef LOCALE_H
|
||||
#define LOCALE_H
|
||||
#ifndef CLOCALE_H
|
||||
#define CLOCALE_H
|
||||
#include <locale>
|
||||
#include <iostream>
|
||||
#include <string>
|
30
src/common/Colors.h
Normal file
30
src/common/Colors.h
Normal file
@@ -0,0 +1,30 @@
|
||||
#ifndef COLORS_H
|
||||
#define COLORS_H
|
||||
#include <string>
|
||||
// ANSI escape sequences used to colour and style console output.
// Every function returns the complete sequence, ready to be written to a terminal.
class Colors {
public:
    // Foreground colours with the bold attribute ("1;3x").
    static std::string BLACK() { return sgr("1;30"); }
    static std::string BLUE() { return sgr("1;34"); }
    static std::string CYAN() { return sgr("1;36"); }
    static std::string GREEN() { return sgr("1;32"); }
    static std::string MAGENTA() { return sgr("1;35"); }
    static std::string RED() { return sgr("1;31"); }
    static std::string YELLOW() { return sgr("1;33"); }
    static std::string WHITE() { return sgr("1;37"); }
    // Bright foreground colours without the bold attribute ("0;9x").
    static std::string IBLACK() { return sgr("0;90"); }
    static std::string IBLUE() { return sgr("0;94"); }
    static std::string ICYAN() { return sgr("0;96"); }
    static std::string IGREEN() { return sgr("0;92"); }
    static std::string IMAGENTA() { return sgr("0;95"); }
    static std::string IRED() { return sgr("0;91"); }
    static std::string IYELLOW() { return sgr("0;93"); }
    static std::string IWHITE() { return sgr("0;97"); }
    // Text attributes.
    static std::string RESET() { return sgr("0"); }
    static std::string BOLD() { return sgr("1"); }
    static std::string UNDERLINE() { return sgr("4"); }
    static std::string BLINK() { return sgr("5"); }
    static std::string REVERSE() { return sgr("7"); }
    static std::string CONCEALED() { return sgr("8"); }
    // Erase the whole display and move the cursor to row 1, column 1.
    static std::string CLRSCR() { return "\033[2J\033[1;1H"; }
private:
    // Build "ESC [ <parameters> m" from the given parameter list.
    static std::string sgr(const char* parameters) { return std::string("\033[") + parameters + "m"; }
};
|
||||
#endif
|
278
src/common/Dataset.cpp
Normal file
278
src/common/Dataset.cpp
Normal file
@@ -0,0 +1,278 @@
|
||||
#include <ArffFiles.hpp>
|
||||
#include <fstream>
|
||||
#include "Dataset.h"
|
||||
namespace platform {
|
||||
const std::string message_dataset_not_loaded = "Dataset not loaded.";
|
||||
// Copy constructor: member-wise copy of the whole state of `dataset`.
// Besides the fields that were already copied, it now also copies the label list, the
// numeric-feature indices, the discretizer algorithm name and the y train/test tensors,
// so a copy no longer loses its labels or its discretization settings.
Dataset::Dataset(const Dataset& dataset) :
    path(dataset.path), name(dataset.name), fileType(dataset.fileType), className(dataset.className),
    n_samples(dataset.n_samples), n_features(dataset.n_features),
    numericFeaturesIdx(dataset.numericFeaturesIdx), discretizer_algorithm(dataset.discretizer_algorithm),
    numericFeatures(dataset.numericFeatures), features(dataset.features), labels(dataset.labels),
    states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize),
    X(dataset.X), y(dataset.y),
    X_train(dataset.X_train), X_test(dataset.X_test), y_train(dataset.y_train), y_test(dataset.y_test),
    Xv(dataset.Xv), yv(dataset.yv)
{
}
|
||||
std::string Dataset::getName() const
|
||||
{
|
||||
return name;
|
||||
}
|
||||
std::vector<std::string> Dataset::getFeatures() const
|
||||
{
|
||||
if (loaded) {
|
||||
return features;
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
int Dataset::getNFeatures() const
|
||||
{
|
||||
if (loaded) {
|
||||
return n_features;
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
int Dataset::getNSamples() const
|
||||
{
|
||||
if (loaded) {
|
||||
return n_samples;
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
std::string Dataset::getClassName() const
|
||||
{
|
||||
return className;
|
||||
}
|
||||
int Dataset::getNClasses() const
|
||||
{
|
||||
if (loaded) {
|
||||
return *std::max_element(yv.begin(), yv.end()) + 1;
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
std::vector<std::string> Dataset::getLabels() const
|
||||
{
|
||||
// Return the labels factorization result
|
||||
if (loaded) {
|
||||
return labels;
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
std::vector<int> Dataset::getClassesCounts() const
|
||||
{
|
||||
if (loaded) {
|
||||
std::vector<int> counts(*std::max_element(yv.begin(), yv.end()) + 1);
|
||||
for (auto y : yv) {
|
||||
counts[y]++;
|
||||
}
|
||||
return counts;
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
std::map<std::string, std::vector<int>> Dataset::getStates() const
|
||||
{
|
||||
if (loaded) {
|
||||
return states;
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
pair<std::vector<std::vector<float>>&, std::vector<int>&> Dataset::getVectors()
|
||||
{
|
||||
if (loaded) {
|
||||
return { Xv, yv };
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
|
||||
{
|
||||
if (loaded) {
|
||||
return { X, y };
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
void Dataset::load_csv()
|
||||
{
|
||||
ifstream file(path + "/" + name + ".csv");
|
||||
if (!file.is_open()) {
|
||||
throw std::invalid_argument("Unable to open dataset file.");
|
||||
}
|
||||
labels.clear();
|
||||
std::string line;
|
||||
getline(file, line);
|
||||
std::vector<std::string> tokens = split(line, ',');
|
||||
features = std::vector<std::string>(tokens.begin(), tokens.end() - 1);
|
||||
if (className == "-1") {
|
||||
className = tokens.back();
|
||||
}
|
||||
for (auto i = 0; i < features.size(); ++i) {
|
||||
Xv.push_back(std::vector<float>());
|
||||
}
|
||||
while (getline(file, line)) {
|
||||
tokens = split(line, ',');
|
||||
for (auto i = 0; i < features.size(); ++i) {
|
||||
Xv[i].push_back(stof(tokens[i]));
|
||||
}
|
||||
auto label = trim(tokens.back());
|
||||
if (find(labels.begin(), labels.end(), label) == labels.end()) {
|
||||
labels.push_back(label);
|
||||
}
|
||||
yv.push_back(stoi(label));
|
||||
}
|
||||
file.close();
|
||||
}
|
||||
void Dataset::computeStates()
|
||||
{
|
||||
for (int i = 0; i < features.size(); ++i) {
|
||||
auto [max_value, idx] = torch::max(X_train.index({ i, "..." }), 0);
|
||||
states[features[i]] = std::vector<int>(max_value.item<int>() + 1);
|
||||
iota(begin(states.at(features[i])), end(states.at(features[i])), 0);
|
||||
}
|
||||
auto [max_value, idx] = torch::max(y_train, 0);
|
||||
states[className] = std::vector<int>(max_value.item<int>() + 1);
|
||||
iota(begin(states.at(className)), end(states.at(className)), 0);
|
||||
}
|
||||
void Dataset::load_arff()
|
||||
{
|
||||
auto arff = ArffFiles();
|
||||
arff.load(path + "/" + name + ".arff", className);
|
||||
// Get Dataset X, y
|
||||
Xv = arff.getX();
|
||||
yv = arff.getY();
|
||||
// Get className & Features
|
||||
className = arff.getClassName();
|
||||
auto attributes = arff.getAttributes();
|
||||
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& attribute) { return attribute.first; });
|
||||
labels = arff.getLabels();
|
||||
}
|
||||
// Split `line` into tokens separated by runs of blanks (space, tab or newline).
// A run of blanks counts as a single separator and trailing blanks produce no token.
// Note that a line that starts with a blank yields an empty first token.
std::vector<std::string> tokenize(std::string line)
{
    static const char* const blanks = " \t\n";
    std::vector<std::string> tokens;
    std::string::size_type start = 0;
    for (;;) {
        const auto stop = line.find_first_of(blanks, start);
        if (stop == std::string::npos) {
            break; // no more separators: whatever is left is the last token
        }
        tokens.push_back(line.substr(start, stop - start));
        start = line.find_first_not_of(blanks, stop);
        if (start == std::string::npos) {
            return tokens; // only blanks were left
        }
    }
    if (start < line.size()) {
        tokens.push_back(line.substr(start));
    }
    return tokens;
}
|
||||
void Dataset::load_rdata()
|
||||
{
|
||||
ifstream file(path + "/" + name + "_R.dat");
|
||||
if (!file.is_open()) {
|
||||
throw std::invalid_argument("Unable to open dataset file.");
|
||||
}
|
||||
std::string line;
|
||||
labels.clear();
|
||||
getline(file, line);
|
||||
line = ArffFiles::trim(line);
|
||||
std::vector<std::string> tokens = tokenize(line);
|
||||
transform(tokens.begin(), tokens.end() - 1, back_inserter(features), [](const auto& attribute) { return ArffFiles::trim(attribute); });
|
||||
if (className == "-1") {
|
||||
className = ArffFiles::trim(tokens.back());
|
||||
}
|
||||
for (auto i = 0; i < features.size(); ++i) {
|
||||
Xv.push_back(std::vector<float>());
|
||||
}
|
||||
while (getline(file, line)) {
|
||||
tokens = tokenize(line);
|
||||
// We have to skip the first token, which is the instance number.
|
||||
for (auto i = 1; i < features.size() + 1; ++i) {
|
||||
const float value = stof(tokens[i]);
|
||||
Xv[i - 1].push_back(value);
|
||||
}
|
||||
auto label = trim(tokens.back());
|
||||
if (find(labels.begin(), labels.end(), label) == labels.end()) {
|
||||
labels.push_back(label);
|
||||
}
|
||||
yv.push_back(stoi(label));
|
||||
}
|
||||
file.close();
|
||||
}
|
||||
void Dataset::load()
|
||||
{
|
||||
if (loaded) {
|
||||
return;
|
||||
}
|
||||
if (fileType == CSV) {
|
||||
load_csv();
|
||||
} else if (fileType == ARFF) {
|
||||
load_arff();
|
||||
} else if (fileType == RDATA) {
|
||||
load_rdata();
|
||||
}
|
||||
n_samples = Xv[0].size();
|
||||
n_features = Xv.size();
|
||||
if (numericFeaturesIdx.size() == 0) {
|
||||
numericFeatures = std::vector<bool>(n_features, false);
|
||||
} else {
|
||||
if (numericFeaturesIdx.at(0) == -1) {
|
||||
numericFeatures = std::vector<bool>(n_features, true);
|
||||
} else {
|
||||
numericFeatures = std::vector<bool>(n_features, false);
|
||||
for (auto i : numericFeaturesIdx) {
|
||||
numericFeatures[i] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Build Tensors
|
||||
X = torch::zeros({ n_features, n_samples }, torch::kFloat32);
|
||||
for (int i = 0; i < features.size(); ++i) {
|
||||
X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
|
||||
}
|
||||
y = torch::tensor(yv, torch::kInt32);
|
||||
loaded = true;
|
||||
}
|
||||
std::tuple<torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&> Dataset::getTrainTestTensors(std::vector<int>& train, std::vector<int>& test)
|
||||
{
|
||||
if (!loaded) {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
auto train_t = torch::tensor(train);
|
||||
int samples_train = train.size();
|
||||
int samples_test = test.size();
|
||||
auto test_t = torch::tensor(test);
|
||||
X_train = X.index({ "...", train_t });
|
||||
y_train = y.index({ train_t });
|
||||
X_test = X.index({ "...", test_t });
|
||||
y_test = y.index({ test_t });
|
||||
if (discretize) {
|
||||
auto discretizer = Discretization::instance()->create(discretizer_algorithm);
|
||||
auto X_train_d = torch::zeros({ n_features, samples_train }, torch::kInt32);
|
||||
auto X_test_d = torch::zeros({ n_features, samples_test }, torch::kInt32);
|
||||
for (auto feature = 0; feature < n_features; ++feature) {
|
||||
if (numericFeatures[feature]) {
|
||||
auto feature_train = X_train.index({ feature, "..." });
|
||||
auto feature_test = X_test.index({ feature, "..." });
|
||||
auto feature_train_disc = discretizer->fit_transform_t(feature_train, y_train);
|
||||
auto feature_test_disc = discretizer->transform_t(feature_test);
|
||||
X_train_d.index_put_({ feature, "..." }, feature_train_disc);
|
||||
X_test_d.index_put_({ feature, "..." }, feature_test_disc);
|
||||
} else {
|
||||
X_train_d.index_put_({ feature, "..." }, X_train.index({ feature, "..." }).to(torch::kInt32));
|
||||
X_test_d.index_put_({ feature, "..." }, X_test.index({ feature, "..." }).to(torch::kInt32));
|
||||
}
|
||||
}
|
||||
X_train = X_train_d;
|
||||
X_test = X_test_d;
|
||||
assert(X_train.dtype() == torch::kInt32);
|
||||
assert(X_test.dtype() == torch::kInt32);
|
||||
computeStates();
|
||||
}
|
||||
assert(y_train.dtype() == torch::kInt32);
|
||||
assert(y_test.dtype() == torch::kInt32);
|
||||
return { X_train, X_test, y_train, y_test };
|
||||
}
|
||||
}
|
@@ -4,75 +4,57 @@
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "CPPFImdlp.h"
|
||||
#include <tuple>
|
||||
#include <common/DiscretizationRegister.h>
|
||||
#include "Utils.h"
|
||||
#include "SourceData.h"
|
||||
namespace platform {
|
||||
enum fileType_t { CSV, ARFF, RDATA };
|
||||
class SourceData {
|
||||
public:
|
||||
SourceData(std::string source)
|
||||
{
|
||||
if (source == "Surcov") {
|
||||
path = "datasets/";
|
||||
fileType = CSV;
|
||||
} else if (source == "Arff") {
|
||||
path = "datasets/";
|
||||
fileType = ARFF;
|
||||
} else if (source == "Tanveer") {
|
||||
path = "data/";
|
||||
fileType = RDATA;
|
||||
} else {
|
||||
throw std::invalid_argument("Unknown source.");
|
||||
}
|
||||
}
|
||||
std::string getPath()
|
||||
{
|
||||
return path;
|
||||
}
|
||||
fileType_t getFileType()
|
||||
{
|
||||
return fileType;
|
||||
}
|
||||
private:
|
||||
std::string path;
|
||||
fileType_t fileType;
|
||||
};
|
||||
class Dataset {
|
||||
public:
|
||||
Dataset(const std::string& path, const std::string& name, const std::string& className, bool discretize, fileType_t fileType, std::vector<int> numericFeaturesIdx, std::string discretizer_algo = "none") :
|
||||
path(path), name(name), className(className), discretize(discretize),
|
||||
loaded(false), fileType(fileType), numericFeaturesIdx(numericFeaturesIdx), discretizer_algorithm(discretizer_algo)
|
||||
{
|
||||
};
|
||||
explicit Dataset(const Dataset&);
|
||||
std::string getName() const;
|
||||
std::string getClassName() const;
|
||||
int getNClasses() const;
|
||||
std::vector<std::string> getLabels() const; // return the labels factorization result
|
||||
std::vector<int> getClassesCounts() const;
|
||||
std::vector<string> getFeatures() const;
|
||||
std::map<std::string, std::vector<int>> getStates() const;
|
||||
std::pair<vector<std::vector<float>>&, std::vector<int>&> getVectors();
|
||||
std::pair<torch::Tensor&, torch::Tensor&> getTensors();
|
||||
std::tuple<torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&> getTrainTestTensors(std::vector<int>& train, std::vector<int>& test);
|
||||
int getNFeatures() const;
|
||||
int getNSamples() const;
|
||||
std::vector<bool>& getNumericFeatures() { return numericFeatures; }
|
||||
void load();
|
||||
const bool inline isLoaded() const { return loaded; };
|
||||
private:
|
||||
std::string path;
|
||||
std::string name;
|
||||
fileType_t fileType;
|
||||
std::string className;
|
||||
int n_samples{ 0 }, n_features{ 0 };
|
||||
std::vector<int> numericFeaturesIdx;
|
||||
std::string discretizer_algorithm;
|
||||
std::vector<bool> numericFeatures; // true if feature is numeric
|
||||
std::vector<std::string> features;
|
||||
std::vector<std::string> labels;
|
||||
std::map<std::string, std::vector<int>> states;
|
||||
bool loaded;
|
||||
bool discretize;
|
||||
torch::Tensor X, y;
|
||||
torch::Tensor X_train, X_test, y_train, y_test;
|
||||
std::vector<std::vector<float>> Xv;
|
||||
std::vector<std::vector<int>> Xd;
|
||||
std::vector<int> yv;
|
||||
void buildTensors();
|
||||
void load_csv();
|
||||
void load_arff();
|
||||
void load_rdata();
|
||||
void computeStates();
|
||||
std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y);
|
||||
public:
|
||||
Dataset(const std::string& path, const std::string& name, const std::string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {};
|
||||
explicit Dataset(const Dataset&);
|
||||
std::string getName() const;
|
||||
std::string getClassName() const;
|
||||
std::vector<string> getFeatures() const;
|
||||
std::map<std::string, std::vector<int>> getStates() const;
|
||||
std::pair<vector<std::vector<float>>&, std::vector<int>&> getVectors();
|
||||
std::pair<vector<std::vector<int>>&, std::vector<int>&> getVectorsDiscretized();
|
||||
std::pair<torch::Tensor&, torch::Tensor&> getTensors();
|
||||
int getNFeatures() const;
|
||||
int getNSamples() const;
|
||||
void load();
|
||||
const bool inline isLoaded() const { return loaded; };
|
||||
};
|
||||
};
|
||||
|
||||
#endif
|
105
src/common/Datasets.cpp
Normal file
105
src/common/Datasets.cpp
Normal file
@@ -0,0 +1,105 @@
|
||||
#include <fstream>
|
||||
#include<algorithm>
|
||||
#include "Datasets.h"
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
namespace platform {
|
||||
using json = nlohmann::ordered_json;
|
||||
const std::string message_dataset_not_loaded = "dataset not loaded.";
|
||||
// Build the catalog of datasets for the source `sfileType` and load it right away.
// Throws std::runtime_error when discretization is requested but no algorithm is given
// (an empty name or "none").
Datasets::Datasets(bool discretize, std::string sfileType, std::string discretizer_algorithm) :
    discretize(discretize), sfileType(sfileType), discretizer_algorithm(discretizer_algorithm)
{
    const bool noAlgorithm = discretizer_algorithm == "none" || discretizer_algorithm == "";
    if (noAlgorithm && discretize) {
        throw std::runtime_error("Can't discretize without discretization algorithm");
    }
    load();
}
|
||||
void Datasets::load()
|
||||
{
|
||||
auto sd = SourceData(sfileType);
|
||||
fileType = sd.getFileType();
|
||||
path = sd.getPath();
|
||||
ifstream catalog(path + "all.txt");
|
||||
std::vector<int> numericFeaturesIdx;
|
||||
if (!catalog.is_open()) {
|
||||
throw std::invalid_argument("Unable to open catalog file. [" + path + "all.txt" + "]");
|
||||
}
|
||||
std::string line;
|
||||
std::vector<std::string> sorted_lines;
|
||||
while (getline(catalog, line)) {
|
||||
if (line.empty() || line[0] == '#') {
|
||||
continue;
|
||||
}
|
||||
sorted_lines.push_back(line);
|
||||
}
|
||||
sort(sorted_lines.begin(), sorted_lines.end(), [](const auto& lhs, const auto& rhs) {
|
||||
const auto result = mismatch(lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend(), [](const auto& lhs, const auto& rhs) {return tolower(lhs) == tolower(rhs);});
|
||||
|
||||
return result.second != rhs.cend() && (result.first == lhs.cend() || tolower(*result.first) < tolower(*result.second));
|
||||
});
|
||||
|
||||
for (const auto& line : sorted_lines) {
|
||||
std::vector<std::string> tokens = split(line, ';');
|
||||
std::string name = tokens[0];
|
||||
std::string className;
|
||||
numericFeaturesIdx.clear();
|
||||
int size = tokens.size();
|
||||
switch (size) {
|
||||
case 1:
|
||||
className = "-1";
|
||||
numericFeaturesIdx.push_back(-1);
|
||||
break;
|
||||
case 2:
|
||||
className = tokens[1];
|
||||
numericFeaturesIdx.push_back(-1);
|
||||
break;
|
||||
case 3:
|
||||
{
|
||||
className = tokens[1];
|
||||
auto numericFeatures = tokens[2];
|
||||
if (numericFeatures == "all") {
|
||||
numericFeaturesIdx.push_back(-1);
|
||||
} else {
|
||||
if (numericFeatures != "none") {
|
||||
auto features = json::parse(numericFeatures);
|
||||
for (auto& f : features) {
|
||||
numericFeaturesIdx.push_back(f);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw std::invalid_argument("Invalid catalog file format.");
|
||||
|
||||
}
|
||||
datasets[name] = make_unique<Dataset>(path, name, className, discretize, fileType, numericFeaturesIdx, discretizer_algorithm);
|
||||
}
|
||||
catalog.close();
|
||||
}
|
||||
std::vector<std::string> Datasets::getNames()
|
||||
{
|
||||
std::vector<std::string> result;
|
||||
transform(datasets.begin(), datasets.end(), back_inserter(result), [](const auto& d) { return d.first; });
|
||||
sort(result.begin(), result.end(), [](const auto& lhs, const auto& rhs) {
|
||||
const auto result = mismatch(lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend(), [](const auto& lhs, const auto& rhs) {return tolower(lhs) == tolower(rhs);});
|
||||
|
||||
return result.second != rhs.cend() && (result.first == lhs.cend() || tolower(*result.first) < tolower(*result.second));
|
||||
});
|
||||
return result;
|
||||
}
|
||||
bool Datasets::isDataset(const std::string& name) const
|
||||
{
|
||||
return datasets.find(name) != datasets.end();
|
||||
}
|
||||
std::string Datasets::toString() const
|
||||
{
|
||||
std::string result;
|
||||
std::string sep = "";
|
||||
for (const auto& d : datasets) {
|
||||
result += sep + d.first;
|
||||
sep = ", ";
|
||||
}
|
||||
return "{" + result + "}";
|
||||
}
|
||||
}
|
22
src/common/Datasets.h
Normal file
22
src/common/Datasets.h
Normal file
@@ -0,0 +1,22 @@
|
||||
#ifndef DATASETS_H
|
||||
#define DATASETS_H
|
||||
#include "Dataset.h"
|
||||
namespace platform {
|
||||
class Datasets {
|
||||
public:
|
||||
explicit Datasets(bool discretize, std::string sfileType, std::string discretizer_algorithm = "none");
|
||||
std::vector<std::string> getNames();
|
||||
bool isDataset(const std::string& name) const;
|
||||
Dataset& getDataset(const std::string& name) const { return *datasets.at(name); }
|
||||
std::string toString() const;
|
||||
private:
|
||||
std::string path;
|
||||
fileType_t fileType;
|
||||
std::string sfileType;
|
||||
std::string discretizer_algorithm;
|
||||
std::map<std::string, std::unique_ptr<Dataset>> datasets;
|
||||
bool discretize;
|
||||
void load(); // Loads the list of datasets
|
||||
};
|
||||
};
|
||||
#endif
|
55
src/common/Discretization.cpp
Normal file
55
src/common/Discretization.cpp
Normal file
@@ -0,0 +1,55 @@
|
||||
#include "Discretization.h"
|
||||
|
||||
namespace platform {
|
||||
// Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory
|
||||
// Storage for the singleton; stays null until instance() is first called.
Discretization* Discretization::factory = nullptr;

/// Returns the process-wide factory, creating it on first use.
/// The object is allocated with `new` and never deleted by this function.
Discretization* Discretization::instance()
{
    if (!factory) {
        factory = new Discretization();
    }
    return factory;
}
|
||||
void Discretization::registerFactoryFunction(const std::string& name,
|
||||
function<mdlp::Discretizer* (void)> classFactoryFunction)
|
||||
{
|
||||
// register the class factory function
|
||||
functionRegistry[name] = classFactoryFunction;
|
||||
}
|
||||
std::shared_ptr<mdlp::Discretizer> Discretization::create(const std::string& name)
|
||||
{
|
||||
mdlp::Discretizer* instance = nullptr;
|
||||
|
||||
// find name in the registry and call factory method.
|
||||
auto it = functionRegistry.find(name);
|
||||
if (it != functionRegistry.end())
|
||||
instance = it->second();
|
||||
// wrap instance in a shared ptr and return
|
||||
if (instance != nullptr)
|
||||
return std::unique_ptr<mdlp::Discretizer>(instance);
|
||||
else
|
||||
throw std::runtime_error("Discretizer not found: " + name);
|
||||
}
|
||||
std::vector<std::string> Discretization::getNames()
|
||||
{
|
||||
std::vector<std::string> names;
|
||||
transform(functionRegistry.begin(), functionRegistry.end(), back_inserter(names),
|
||||
[](const pair<std::string, function<mdlp::Discretizer* (void)>>& pair) { return pair.first; });
|
||||
return names;
|
||||
}
|
||||
std::string Discretization::toString()
|
||||
{
|
||||
std::string result = "";
|
||||
std::string sep = "";
|
||||
for (const auto& pair : functionRegistry) {
|
||||
result += sep + pair.first;
|
||||
sep = ", ";
|
||||
}
|
||||
return "{" + result + "}";
|
||||
}
|
||||
/// Registers `classFactoryFunction` under `name` in the Discretization singleton.
RegistrarDiscretization::RegistrarDiscretization(const std::string& name,
    std::function<mdlp::Discretizer* (void)> classFactoryFunction)
{
    Discretization* registry = Discretization::instance();
    registry->registerFactoryFunction(name, classFactoryFunction);
}
|
||||
}
|
33
src/common/Discretization.h
Normal file
33
src/common/Discretization.h
Normal file
@@ -0,0 +1,33 @@
|
||||
#ifndef DISCRETIZATION_H
|
||||
#define DISCRETIZATION_H
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <functional>
|
||||
#include <vector>
|
||||
#include <fimdlp/Discretizer.h>
|
||||
#include <fimdlp/BinDisc.h>
|
||||
#include <fimdlp/CPPFImdlp.h>
|
||||
namespace platform {
|
||||
class Discretization {
|
||||
public:
|
||||
Discretization(Discretization&) = delete;
|
||||
void operator=(const Discretization&) = delete;
|
||||
// Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory
|
||||
static Discretization* instance();
|
||||
std::shared_ptr<mdlp::Discretizer> create(const std::string& name);
|
||||
void registerFactoryFunction(const std::string& name,
|
||||
function<mdlp::Discretizer* (void)> classFactoryFunction);
|
||||
std::vector<string> getNames();
|
||||
std::string toString();
|
||||
private:
|
||||
map<std::string, function<mdlp::Discretizer* (void)>> functionRegistry;
|
||||
static Discretization* factory; //singleton
|
||||
Discretization() {};
|
||||
};
|
||||
class RegistrarDiscretization {
|
||||
public:
|
||||
RegistrarDiscretization(const std::string& className, function<mdlp::Discretizer* (void)> classFactoryFunction);
|
||||
};
|
||||
}
|
||||
#endif
|
45
src/common/DiscretizationRegister.h
Normal file
45
src/common/DiscretizationRegister.h
Normal file
@@ -0,0 +1,45 @@
|
||||
#ifndef DISCRETIZATIONREGISTER_H
|
||||
#define DISCRETIZATIONREGISTER_H
|
||||
#include <common/Discretization.h>
|
||||
#include <limits>
|
||||
// Registration of every discretizer offered by the platform. Each static object
// registers one factory function under the name given as its first argument.
// std::numeric_limits is fully qualified so that this header does not rely on a
// using-directive coming from another header.

// MDLP based discretizers
static platform::RegistrarDiscretization registrarM("mdlp",
    [](void) -> mdlp::Discretizer* { return new mdlp::CPPFImdlp(); });
static platform::RegistrarDiscretization registrarM3("mdlp3",
    [](void) -> mdlp::Discretizer* { return new mdlp::CPPFImdlp(3, std::numeric_limits<int>::max(), 3); });
static platform::RegistrarDiscretization registrarM4("mdlp4",
    [](void) -> mdlp::Discretizer* { return new mdlp::CPPFImdlp(3, std::numeric_limits<int>::max(), 4); });
static platform::RegistrarDiscretization registrarM5("mdlp5",
    [](void) -> mdlp::Discretizer* { return new mdlp::CPPFImdlp(3, std::numeric_limits<int>::max(), 5); });

// Binning discretizers: "bin<N>u" uses the UNIFORM strategy, "bin<N>q" the QUANTILE one
static platform::RegistrarDiscretization registrarBU3("bin3u",
    [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(3, mdlp::strategy_t::UNIFORM); });
static platform::RegistrarDiscretization registrarBQ3("bin3q",
    [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(3, mdlp::strategy_t::QUANTILE); });
static platform::RegistrarDiscretization registrarBU4("bin4u",
    [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(4, mdlp::strategy_t::UNIFORM); });
static platform::RegistrarDiscretization registrarBQ4("bin4q",
    [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(4, mdlp::strategy_t::QUANTILE); });
static platform::RegistrarDiscretization registrarBU5("bin5u",
    [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(5, mdlp::strategy_t::UNIFORM); });
static platform::RegistrarDiscretization registrarBQ5("bin5q",
    [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(5, mdlp::strategy_t::QUANTILE); });
static platform::RegistrarDiscretization registrarBU6("bin6u",
    [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(6, mdlp::strategy_t::UNIFORM); });
static platform::RegistrarDiscretization registrarBQ6("bin6q",
    [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(6, mdlp::strategy_t::QUANTILE); });
static platform::RegistrarDiscretization registrarBU7("bin7u",
    [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(7, mdlp::strategy_t::UNIFORM); });
static platform::RegistrarDiscretization registrarBQ7("bin7q",
    [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(7, mdlp::strategy_t::QUANTILE); });
static platform::RegistrarDiscretization registrarBU8("bin8u",
    [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(8, mdlp::strategy_t::UNIFORM); });
static platform::RegistrarDiscretization registrarBQ8("bin8q",
    [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(8, mdlp::strategy_t::QUANTILE); });
static platform::RegistrarDiscretization registrarBU9("bin9u",
    [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(9, mdlp::strategy_t::UNIFORM); });
static platform::RegistrarDiscretization registrarBQ9("bin9q",
    [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(9, mdlp::strategy_t::QUANTILE); });
static platform::RegistrarDiscretization registrarBU10("bin10u",
    [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(10, mdlp::strategy_t::UNIFORM); });
static platform::RegistrarDiscretization registrarBQ10("bin10q",
    [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(10, mdlp::strategy_t::QUANTILE); });
|
||||
#endif
|
151
src/common/DotEnv.h
Normal file
151
src/common/DotEnv.h
Normal file
@@ -0,0 +1,151 @@
|
||||
#ifndef DOTENV_H
|
||||
#define DOTENV_H
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include "Utils.h"
|
||||
|
||||
//#include "Dataset.h"
|
||||
namespace platform {
|
||||
class DotEnv {
|
||||
private:
|
||||
std::map<std::string, std::string> env;
|
||||
std::map<std::string, std::vector<std::string>> valid;
|
||||
public:
|
||||
DotEnv(bool create = false)
|
||||
{
|
||||
valid =
|
||||
{
|
||||
{"depth", {"any"}},
|
||||
{"discretize", {"0", "1"}},
|
||||
{"discretize_algo", {"mdlp", "mdlp3", "mdlp4", "mdlp5", "bin3u", "bin3q", "bin4u", "bin4q", "bin5q", "bin5u", "bin6q", "bin6u", "bin7q", "bin7u", "bin8q", "bin8u", "bin9q", "bin9u", "bin10q", "bin10u"}},
|
||||
{"experiment", {"discretiz", "odte", "covid", "Test"}},
|
||||
{"fit_features", {"0", "1"}},
|
||||
{"framework", {"bulma", "bootstrap"}},
|
||||
{"ignore_nan", {"0", "1"}},
|
||||
{"leaves", {"any"}},
|
||||
{"margin", {"0.1", "0.2", "0.3"}},
|
||||
{"model", {"any"}},
|
||||
{"n_folds", {"5", "10"}},
|
||||
{"nodes", {"any"}},
|
||||
{"platform", {"any"}},
|
||||
{"stratified", {"0", "1"}},
|
||||
{"score", {"accuracy", "roc-auc-ovr"}},
|
||||
{"seeds", {"any"}},
|
||||
{"smooth_strat", {"ORIGINAL", "LAPLACE", "CESTNIK"}},
|
||||
{"source_data", {"Arff", "Tanveer", "Surcov", "Test"}},
|
||||
};
|
||||
if (create) {
|
||||
// For testing purposes
|
||||
std::ofstream file(".env");
|
||||
file << "experiment=Test" << std::endl;
|
||||
file << "source_data=Test" << std::endl;
|
||||
file << "margin=0.1" << std::endl;
|
||||
file << "score=accuracy" << std::endl;
|
||||
file << "platform=um790Linux" << std::endl;
|
||||
file << "n_folds=5" << std::endl;
|
||||
file << "discretize_algo=mdlp" << std::endl;
|
||||
file << "smooth_strat=ORIGINAL" << std::endl;
|
||||
file << "stratified=0" << std::endl;
|
||||
file << "model=TAN" << std::endl;
|
||||
file << "seeds=[271]" << std::endl;
|
||||
file << "discretize=0" << std::endl;
|
||||
file << "ignore_nan=0" << std::endl;
|
||||
file << "nodes=Nodes" << std::endl;
|
||||
file << "leaves=Edges" << std::endl;
|
||||
file << "depth=States" << std::endl;
|
||||
file << "fit_features=0" << std::endl;
|
||||
file << "framework=bulma" << std::endl;
|
||||
file << "margin=0.1" << std::endl;
|
||||
file.close();
|
||||
}
|
||||
std::ifstream file(".env");
|
||||
if (!file.is_open()) {
|
||||
std::cerr << "File .env not found" << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
std::string line;
|
||||
while (std::getline(file, line)) {
|
||||
line = trim(line);
|
||||
if (line.empty() || line[0] == '#') {
|
||||
continue;
|
||||
}
|
||||
std::istringstream iss(line);
|
||||
std::string key, value;
|
||||
if (std::getline(iss, key, '=') && std::getline(iss, value)) {
|
||||
key = trim(key);
|
||||
value = trim(value);
|
||||
parse(key, value);
|
||||
env[key] = value;
|
||||
}
|
||||
}
|
||||
parseEnv();
|
||||
}
|
||||
void parse(const std::string& key, const std::string& value)
|
||||
{
|
||||
if (valid.find(key) == valid.end()) {
|
||||
std::cerr << "Invalid key in .env: " << key << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
if (valid[key].front() == "any") {
|
||||
return;
|
||||
}
|
||||
if (std::find(valid[key].begin(), valid[key].end(), value) == valid[key].end()) {
|
||||
std::cerr << "Invalid value in .env: " << key << " = " << value << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
std::vector<std::string> valid_tokens(const std::string& key)
|
||||
{
|
||||
if (valid.find(key) == valid.end()) {
|
||||
return {};
|
||||
}
|
||||
return valid.at(key);
|
||||
}
|
||||
std::string valid_values(const std::string& key)
|
||||
{
|
||||
std::string valid_values = "{", sep = "";
|
||||
if (valid.find(key) == valid.end()) {
|
||||
return "{}";
|
||||
}
|
||||
for (const auto& value : valid.at(key)) {
|
||||
valid_values += sep + value;
|
||||
sep = ", ";
|
||||
}
|
||||
return valid_values + "}";
|
||||
}
|
||||
void parseEnv()
|
||||
{
|
||||
for (auto& [key, values] : valid) {
|
||||
if (env.find(key) == env.end()) {
|
||||
std::cerr << "Key not found in .env: " << key << ", valid values: " << valid_values(key) << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
std::string get(const std::string& key)
|
||||
{
|
||||
if (env.find(key) == env.end()) {
|
||||
std::cerr << "Key not found in .env: " << key << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
return env.at(key);
|
||||
}
|
||||
std::vector<int> getSeeds()
|
||||
{
|
||||
auto seeds = std::vector<int>();
|
||||
auto seeds_str = env["seeds"];
|
||||
seeds_str = trim(seeds_str);
|
||||
seeds_str = seeds_str.substr(1, seeds_str.size() - 2);
|
||||
auto seeds_str_split = split(seeds_str, ',');
|
||||
transform(seeds_str_split.begin(), seeds_str_split.end(), back_inserter(seeds), [](const std::string& str) {
|
||||
return stoi(str);
|
||||
});
|
||||
return seeds;
|
||||
}
|
||||
};
|
||||
}
|
||||
#endif
|
80
src/common/Paths.h
Normal file
80
src/common/Paths.h
Normal file
@@ -0,0 +1,80 @@
|
||||
#ifndef PATHS_H
|
||||
#define PATHS_H
|
||||
#include <string>
|
||||
#include <filesystem>
|
||||
#include "DotEnv.h"
|
||||
namespace platform {
|
||||
class Paths {
|
||||
public:
|
||||
static std::string createIfNotExists(const std::string& folder)
|
||||
{
|
||||
if (!std::filesystem::exists(folder)) {
|
||||
std::filesystem::create_directory(folder);
|
||||
}
|
||||
return folder;
|
||||
}
|
||||
static std::string results() { return createIfNotExists("results/"); }
|
||||
static std::string hiddenResults() { return createIfNotExists("hidden_results/"); }
|
||||
static std::string excel() { return createIfNotExists("excel/"); }
|
||||
static std::string grid() { return createIfNotExists("grid/"); }
|
||||
static std::string graphs() { return createIfNotExists("graphs/"); }
|
||||
static std::string tex() { return createIfNotExists("tex/"); }
|
||||
static std::string datasets()
|
||||
{
|
||||
auto env = platform::DotEnv();
|
||||
return env.get("source_data");
|
||||
}
|
||||
static std::string experiment_file(const std::string& fileName, bool discretize, bool stratified, int seed, int nfold)
|
||||
{
|
||||
std::string disc = discretize ? "_disc_" : "_ndisc_";
|
||||
std::string strat = stratified ? "strat_" : "nstrat_";
|
||||
return "datasets_experiment/" + fileName + disc + strat + std::to_string(seed) + "_" + std::to_string(nfold) + ".json";
|
||||
}
|
||||
static void createPath(const std::string& path)
|
||||
{
|
||||
// Create directory if it does not exist
|
||||
try {
|
||||
std::filesystem::create_directory(path);
|
||||
}
|
||||
catch (std::exception& e) {
|
||||
throw std::runtime_error("Could not create directory " + path);
|
||||
}
|
||||
}
|
||||
static std::string bestResultsFile(const std::string& score, const std::string& model)
|
||||
{
|
||||
return "best_results_" + score + "_" + model + ".json";
|
||||
}
|
||||
static std::string bestResultsExcel(const std::string& score)
|
||||
{
|
||||
return "BestResults_" + score + ".xlsx";
|
||||
}
|
||||
static std::string excelResults() { return "some_results.xlsx"; }
|
||||
static std::string excelDatasets() { return "datasets.xlsx"; }
|
||||
static std::string grid_input(const std::string& model)
|
||||
{
|
||||
return grid() + "grid_" + model + "_input.json";
|
||||
}
|
||||
static std::string grid_output(const std::string& model)
|
||||
{
|
||||
return grid() + "grid_" + model + "_output.json";
|
||||
}
|
||||
static std::string tex_output()
|
||||
{
|
||||
return "results.tex";
|
||||
}
|
||||
static std::string md_output()
|
||||
{
|
||||
return "results.md";
|
||||
}
|
||||
static std::string tex_post_hoc()
|
||||
{
|
||||
return "post_hoc.tex";
|
||||
}
|
||||
static std::string md_post_hoc()
|
||||
{
|
||||
return "post_hoc.md";
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
#endif
|
38
src/common/SourceData.h.in
Normal file
38
src/common/SourceData.h.in
Normal file
@@ -0,0 +1,38 @@
|
||||
#ifndef SOURCEDATA_H
|
||||
#define SOURCEDATA_H
|
||||
namespace platform {
|
||||
/// File formats a dataset can be stored in.
enum fileType_t { CSV, ARFF, RDATA };

/// Maps the name of a data source to the folder that holds its files and to the
/// format of those files.
class SourceData {
public:
    /// @param source one of "Surcov", "Arff", "Tanveer" or "Test".
    /// @throws std::invalid_argument for any other value.
    SourceData(const std::string& source)
    {
        if (source == "Surcov") {
            path = "datasets/";
            fileType = CSV;
        } else if (source == "Arff") {
            path = "datasets/";
            fileType = ARFF;
        } else if (source == "Tanveer") {
            path = "data/";
            fileType = RDATA;
        } else if (source == "Test") {
            // Placeholder substituted when this template header is configured
            path = "@TEST_DATA_PATH@/";
            fileType = ARFF;
        } else {
            throw std::invalid_argument("Unknown source.");
        }
    }

    /// Folder (with trailing '/') that contains the files of the source.
    std::string getPath() const
    {
        return path;
    }

    /// Format of the files of the source.
    fileType_t getFileType() const
    {
        return fileType;
    }

private:
    std::string path;
    fileType_t fileType{ CSV };
};
|
||||
}
|
||||
#endif
|
@@ -9,9 +9,13 @@ namespace platform {
|
||||
inline static const std::string black_star{ "\u2605" };
|
||||
inline static const std::string cross{ "\u2717" };
|
||||
inline static const std::string upward_arrow{ "\u27B6" };
|
||||
inline static const std::string down_arrow{ "\u27B4" };
|
||||
inline static const std::string downward_arrow{ "\u27B4" };
|
||||
inline static const std::string up_arrow{ "\u2B06" };
|
||||
inline static const std::string down_arrow{ "\u2B07" };
|
||||
inline static const std::string ellipsis{ "\u2026" };
|
||||
inline static const std::string equal_best{ check_mark };
|
||||
inline static const std::string better_best{ black_star };
|
||||
inline static const std::string notebook{ "\U0001F5C8" };
|
||||
};
|
||||
}
|
||||
#endif // !SYMBOLS_H
|
||||
#endif
|
106
src/common/TensorUtils.hpp
Normal file
106
src/common/TensorUtils.hpp
Normal file
@@ -0,0 +1,106 @@
|
||||
#ifndef TENSORUTILS_HPP
|
||||
#define TENSORUTILS_HPP
|
||||
#include <torch/torch.h>
|
||||
#include <vector>
|
||||
namespace platform {
|
||||
class TensorUtils {
|
||||
public:
|
||||
template <typename T>
|
||||
static std::vector<T> tensorToVector(const torch::Tensor& tensor)
|
||||
{
|
||||
torch::Tensor contig_tensor = tensor.contiguous();
|
||||
auto num_elements = contig_tensor.numel();
|
||||
const T* tensor_data = contig_tensor.data_ptr<T>();
|
||||
std::vector<T> result(tensor_data, tensor_data + num_elements);
|
||||
return result;
|
||||
}
|
||||
static std::vector<std::vector<int>> to_matrix(const torch::Tensor& X)
|
||||
{
|
||||
// Ensure tensor is contiguous in memory
|
||||
auto X_contig = X.contiguous();
|
||||
|
||||
// Access tensor data pointer directly
|
||||
auto data_ptr = X_contig.data_ptr<int>();
|
||||
|
||||
// IF you are using int64_t as the data type, use the following line
|
||||
//auto data_ptr = X_contig.data_ptr<int64_t>();
|
||||
//std::vector<std::vector<int64_t>> data(X.size(0), std::vector<int64_t>(X.size(1)));
|
||||
|
||||
// Prepare output container
|
||||
std::vector<std::vector<int>> data(X.size(0), std::vector<int>(X.size(1)));
|
||||
|
||||
// Fill the 2D vector in a single loop using pointer arithmetic
|
||||
int rows = X.size(0);
|
||||
int cols = X.size(1);
|
||||
for (int i = 0; i < rows; ++i) {
|
||||
std::copy(data_ptr + i * cols, data_ptr + (i + 1) * cols, data[i].begin());
|
||||
}
|
||||
return data;
|
||||
}
|
||||
template <typename T>
|
||||
static std::vector<T> to_vector(const torch::Tensor& y)
|
||||
{
|
||||
// Ensure the tensor is contiguous in memory
|
||||
auto y_contig = y.contiguous();
|
||||
|
||||
// Access data pointer
|
||||
auto data_ptr = y_contig.data_ptr<T>();
|
||||
|
||||
// Prepare output container
|
||||
std::vector<T> data(y.size(0));
|
||||
|
||||
// Copy data efficiently
|
||||
std::copy(data_ptr, data_ptr + y.size(0), data.begin());
|
||||
|
||||
return data;
|
||||
}
|
||||
static torch::Tensor to_matrix(const std::vector<std::vector<int>>& data)
|
||||
{
|
||||
if (data.empty()) return torch::empty({ 0, 0 }, torch::kInt64);
|
||||
size_t rows = data.size();
|
||||
size_t cols = data[0].size();
|
||||
torch::Tensor tensor = torch::empty({ static_cast<long>(rows), static_cast<long>(cols) }, torch::kInt64);
|
||||
for (size_t i = 0; i < rows; ++i) {
|
||||
for (size_t j = 0; j < cols; ++j) {
|
||||
tensor.index_put_({static_cast<int64_t>(i), static_cast<int64_t>(j)}, torch::scalar_tensor(data[i][j]));
|
||||
}
|
||||
}
|
||||
return tensor;
|
||||
}
|
||||
};
|
||||
/// Prints `name` followed by every row of `vec` as "[v1 v2 ... ]" on std::cout.
static void dumpVector(const std::vector<std::vector<int>>& vec, const std::string& name)
{
    std::cout << name << ": " << std::endl;
    for (std::size_t r = 0; r < vec.size(); ++r) {
        std::cout << "[";
        for (std::size_t c = 0; c < vec[r].size(); ++c) {
            std::cout << vec[r][c] << " ";
        }
        std::cout << "]" << std::endl;
    }
    std::cout << std::endl;
}
|
||||
/// Prints `name` followed by every row of a two-dimensional tensor as
/// "[v1 v2 ... ]" on std::cout, reading each cell with item<int>().
static void dumpTensor(const torch::Tensor& tensor, const std::string& name)
{
    std::cout << name << ": " << std::endl;
    int row = 0;
    while (row < tensor.size(0)) {
        std::cout << "[";
        int col = 0;
        while (col < tensor.size(1)) {
            std::cout << tensor[row][col].item<int>() << " ";
            ++col;
        }
        std::cout << "]" << std::endl;
        ++row;
    }
    std::cout << std::endl;
}
|
||||
/// Prints `name` followed by the elements along the first dimension of `tensor`
/// as a single "[v1 v2 ... ]" line on std::cout, reading each with item<int>().
static void dumpTensorV(const torch::Tensor& tensor, const std::string& name)
{
    std::cout << name << ": " << std::endl;
    std::cout << "[";
    int pos = 0;
    while (pos < tensor.size(0)) {
        std::cout << tensor[pos].item<int>() << " ";
        ++pos;
    }
    std::cout << "]" << std::endl;
}
|
||||
}
|
||||
|
||||
#endif // TENSORUTILS_HPP
|
@@ -40,4 +40,4 @@ namespace platform {
|
||||
}
|
||||
};
|
||||
} /* namespace platform */
|
||||
#endif /* TIMER_H */
|
||||
#endif
|
129
src/common/Utils.h
Normal file
129
src/common/Utils.h
Normal file
@@ -0,0 +1,129 @@
|
||||
#ifndef UTILS_H
|
||||
#define UTILS_H
|
||||
|
||||
#include <unistd.h>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <cmath>
|
||||
#include <ctime>
|
||||
#include <iomanip>
|
||||
#include <string.h>
|
||||
|
||||
extern char** environ;
|
||||
|
||||
namespace platform {
|
||||
/// Returns a copy of `str` without leading and trailing whitespace, as
/// classified by std::isspace. An all-whitespace or empty input yields "".
static std::string trim(const std::string& str)
{
    // std::isspace is undefined for negative values other than EOF, so each
    // character reaches it as unsigned char (bytes >= 0x80 would otherwise be
    // sign-extended on platforms where char is signed).
    const auto is_space = [](unsigned char ch) { return std::isspace(ch) != 0; };
    const auto first = std::find_if_not(str.begin(), str.end(), is_space);
    // Scan backwards, but never past the first kept character
    const auto last = std::find_if_not(str.rbegin(), std::string::const_reverse_iterator(first), is_space).base();
    return std::string(first, last);
}
|
||||
static std::vector<std::string> split(const std::string& text, char delimiter)
|
||||
{
|
||||
std::vector<std::string> result;
|
||||
std::stringstream ss(text);
|
||||
std::string token;
|
||||
while (std::getline(ss, token, delimiter)) {
|
||||
result.push_back(trim(token));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
/// Population standard deviation of `values` around the supplied `mean`
/// (the sum of squared deviations is divided by the number of values).
///
/// @param values samples; taken by const reference to avoid copying the vector
///               on every call.
/// @param mean   value the deviations are measured from.
/// @return the standard deviation; NaN when `values` is empty (0/0).
inline double compute_std(const std::vector<double>& values, double mean)
{
    double sum = 0.0;
    for (const double value : values) {
        const double deviation = value - mean;
        sum += deviation * deviation;
    }
    const double variance = sum / values.size();
    return std::sqrt(variance);
}
|
||||
/// Current local date formatted as "YYYY-MM-DD".
inline std::string get_date()
{
    const std::time_t now = std::time(nullptr);
    std::ostringstream formatted;
    formatted << std::put_time(std::localtime(&now), "%Y-%m-%d");
    return formatted.str();
}
|
||||
/// Current local time formatted as "HH:MM:SS".
inline std::string get_time()
{
    const std::time_t now = std::time(nullptr);
    std::ostringstream formatted;
    formatted << std::put_time(std::localtime(&now), "%H:%M:%S");
    return formatted.str();
}
|
||||
/// Opens `fileName` with the desktop's default application
/// ("/usr/bin/open" on macOS, "/usr/bin/xdg-open" on Linux).
///
/// NOTE: this calls execve, which replaces the calling process image; on
/// success it never returns. If execve fails, the error is reported with perror
/// and the process ends through _exit(EXIT_FAILURE). On any other platform a
/// message is written to std::cerr and the function returns.
static void openFile(const std::string& fileName)
{
#if defined(__APPLE__) || defined(__linux__)
#ifdef __APPLE__
    const char* tool = "/usr/bin/open";
#else
    const char* tool = "/usr/bin/xdg-open";
#endif
    // Argument vector for execve: the tool itself and the file to open
    std::vector<char*> argv;
    argv.push_back(const_cast<char*>(tool));
    argv.push_back(const_cast<char*>(fileName.c_str()));
    argv.push_back(nullptr);

    // Environment for the tool: the current one without the BASH_FUNC_*
    // variables. The entries of `environ` are referenced directly; they stay
    // valid until execve is called.
    std::vector<char*> envp;
    for (char** env = environ; *env != nullptr; ++env) {
        if (strncmp(*env, "BASH_FUNC_", 10) == 0) {
            continue;
        }
        envp.push_back(*env);
    }
    envp.push_back(nullptr);

    execve(tool, argv.data(), envp.data());

    // Only reached when execve failed
    perror("execve failed");
    _exit(EXIT_FAILURE);
#else
    (void)fileName;
    std::cerr << "Unsupported platform." << std::endl;
#endif
}
|
||||
}
|
||||
#endif
|
492
src/experimental_clfs/AdaBoost.cpp
Normal file
492
src/experimental_clfs/AdaBoost.cpp
Normal file
@@ -0,0 +1,492 @@
|
||||
// ***************************************************************
|
||||
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
|
||||
// SPDX-FileType: SOURCE
|
||||
// SPDX-License-Identifier: MIT
|
||||
// ***************************************************************
|
||||
|
||||
#include "AdaBoost.h"
|
||||
#include "DecisionTree.h"
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
#include <sstream>
|
||||
#include <iomanip>
|
||||
#include "common/TensorUtils.hpp"
|
||||
|
||||
// Conditional debug macro for performance-critical sections
|
||||
// Branch hint used by DEBUG_LOG: __builtin_expect only exists on GCC-compatible
// compilers, so fall back to the plain condition everywhere else.
#if defined(__GNUC__) || defined(__clang__)
#define DEBUG_LOG_UNLIKELY(expr) __builtin_expect(static_cast<bool>(expr), 0)
#else
#define DEBUG_LOG_UNLIKELY(expr) static_cast<bool>(expr)
#endif

// Streams the remaining arguments to std::cout, followed by std::endl, when
// `condition` is true. The arguments form the right-hand side of an `<<` chain
// and are not evaluated when the condition is false.
#define DEBUG_LOG(condition, ...)                          \
    do {                                                   \
        if (DEBUG_LOG_UNLIKELY(condition)) {               \
            std::cout << __VA_ARGS__ << std::endl;         \
        }                                                  \
    } while(0)
|
||||
|
||||
namespace bayesnet {
|
||||
|
||||
/// Stores the boosting settings and declares the hyperparameters accepted by
/// the classifier; `n` and `n_classes` start at zero.
///
/// @param n_estimators number of boosting rounds stored in `n_estimators`.
/// @param max_depth    depth limit stored in `base_max_depth`.
AdaBoost::AdaBoost(int n_estimators, int max_depth)
    : Ensemble(true),
      n_estimators(n_estimators),
      base_max_depth(max_depth),
      n(0),
      n_classes(0)
{
    validHyperparameters = { "n_estimators", "base_max_depth" };
}
|
||||
|
||||
// Optimized version of buildModel.
|
||||
|
||||
/// Trains the ensemble with the SAMME multi-class AdaBoost scheme.
///
/// Every round fits a base estimator on the current sample weights, measures
/// its weighted error and derives its vote `alpha`. Boosting stops early when
/// an estimator is no better than random guessing or when it classifies the
/// training data (almost) perfectly.
///
/// @param weights optional initial sample weights; when defined and non-empty
///                they must have one entry per sample and are normalized
///                before use. Otherwise uniform weights are used.
/// @throws std::runtime_error when `weights` has the wrong length.
void AdaBoost::buildModel(const torch::Tensor& weights)
{
    // Reset the state left by any previous training
    models.clear();
    alphas.clear();
    training_errors.clear();

    // The dataset holds one row per feature plus a final row with the labels
    n = dataset.size(0) - 1;
    n_classes = states[className].size();

    // Uniform starting weights. They are created as double because the weighted
    // error is computed with torch::dot against a double tensor, and torch::dot
    // requires both operands to share the same dtype.
    int n_samples = dataset.size(1);
    sample_weights = torch::ones({ n_samples }, torch::kDouble) / n_samples;

    // If initial weights are provided, use them instead
    if (weights.defined() && weights.numel() > 0) {
        if (weights.size(0) != n_samples) {
            throw std::runtime_error("weights must have the same length as number of samples");
        }
        sample_weights = weights.clone();
        normalizeWeights();
    }

    DEBUG_LOG(debug, "Starting AdaBoost training with " << n_estimators << " estimators\n"
        << "Number of classes: " << n_classes << "\n"
        << "Number of features: " << n << "\n"
        << "Number of samples: " << n_samples);

    // Error of a classifier that guesses uniformly among the classes
    const double random_guess_error = 1.0 - (1.0 / static_cast<double>(n_classes));

    for (int iter = 0; iter < n_estimators; ++iter) {
        // Train base estimator with current sample weights
        auto estimator = trainBaseEstimator(sample_weights);

        const double weighted_error = calculateWeightedError(estimator.get(), sample_weights);
        training_errors.push_back(weighted_error);

        // SAMME needs an error strictly below that of random guessing
        if (weighted_error >= random_guess_error) {
            DEBUG_LOG(debug, "Error >= random guess (" << random_guess_error << "), stopping");
            // Keep the very first estimator, with zero weight, so the ensemble is not empty
            if (models.empty()) {
                models.push_back(std::move(estimator));
                alphas.push_back(0.0);
            }
            break;
        }

        // Perfect classification: checked before computing alpha, which would diverge
        if (weighted_error <= 1e-10) {
            DEBUG_LOG(debug, "Perfect classification achieved (error=" << weighted_error << ")");

            // Large but finite vote for the perfect classifier
            const double alpha = 10.0 + std::log(static_cast<double>(n_classes - 1));
            models.push_back(std::move(estimator));
            alphas.push_back(alpha);

            DEBUG_LOG(debug, "Iteration " << iter << ":\n"
                << "  Weighted error: " << weighted_error << "\n"
                << "  Alpha (finite): " << alpha << "\n"
                << "  Random guess error: " << random_guess_error);

            break;
        }

        // SAMME vote: alpha = log((1 - err) / err) + log(K - 1).
        // The error is clamped away from 0 and 1 to keep the logarithm finite.
        const double clamped_error = std::max(1e-15, std::min(1.0 - 1e-15, weighted_error));
        double alpha = std::log((1.0 - clamped_error) / clamped_error) +
            std::log(static_cast<double>(n_classes - 1));

        // Keep alpha within reasonable bounds to avoid numerical issues
        alpha = std::max(-10.0, std::min(10.0, alpha));

        models.push_back(std::move(estimator));
        alphas.push_back(alpha);

        // The weights produced after the last round would never be used
        if (iter < n_estimators - 1) {
            updateSampleWeights(models.back().get(), alpha);
            normalizeWeights();
        }

        DEBUG_LOG(debug, "Iteration " << iter << ":\n"
            << "  Weighted error: " << weighted_error << "\n"
            << "  Alpha: " << alpha << "\n"
            << "  Random guess error: " << random_guess_error);
    }

    // Number of models actually trained
    n_models = models.size();
    DEBUG_LOG(debug, "AdaBoost training completed with " << n_models << " models");
}
|
||||
|
||||
// Train the ensemble: delegates the boosting work to buildModel() and then
// flags the classifier as fitted.
//
// weights   - sample weights, forwarded unchanged to buildModel().
// smoothing - not read by this function.
void AdaBoost::trainModel(const torch::Tensor& weights, const Smoothing_t smoothing)
{
    static_cast<void>(smoothing);  // parameter is intentionally unused here

    buildModel(weights);
    fitted = true;
}
|
||||
|
||||
// Build and fit a single weak learner on the training data held by this object.
//
// weights - per-sample weights; they are divided by their sum before being
//           handed to the tree.
// Returns an owning pointer to the fitted DecisionTree.
std::unique_ptr<Classifier> AdaBoost::trainBaseEstimator(const torch::Tensor& weights)
{
    // Depth-limited tree used as the base estimator.
    auto learner = std::make_unique<DecisionTree>(base_max_depth);

    // Rescale the weights so that they sum to one.
    const auto weight_total = weights.sum();
    auto unit_weights = weights / weight_total;

    // Fit on the current weighting, with smoothing disabled.
    learner->fit(dataset, features, className, states, unit_weights, Smoothing_t::NONE);
    return learner;
}
|
||||
|
||||
// Weighted misclassification rate of `estimator` on the training data.
//
// estimator - classifier whose predictions are evaluated.
// weights   - per-sample weights combined with the 0/1 miss indicator through
//             a dot product.
// Returns the weighted error, clamped to [1e-15, 1 - 1e-15].
double AdaBoost::calculateWeightedError(Classifier* estimator, const torch::Tensor& weights)
{
    using torch::indexing::Slice;

    // Every row but the last is fed to the estimator; the last row is used as
    // the true labels.
    const auto label_row = dataset.size(0) - 1;
    auto inputs = dataset.index({ Slice(0, label_row), Slice() });
    auto targets = dataset.index({ -1, Slice() });

    auto predicted = estimator->predict(inputs);

    // 1.0 where the prediction differs from the label, 0.0 elsewhere.
    auto miss_mask = (predicted != targets).to(torch::kDouble);

    const double raw_error = torch::dot(miss_mask, weights).item<double>();

    // Keep the result strictly inside (0, 1).
    constexpr double kEps = 1e-15;
    return std::clamp(raw_error, kEps, 1.0 - kEps);
}
|
||||
|
||||
// Re-weight the training samples after an estimator has been added.
//
// Each sample the estimator misclassifies has its weight multiplied by
// exp(alpha); correctly classified samples are multiplied by exp(0) = 1.
// The resulting weights are then clamped to [1e-15, 1e15].
//
// estimator - classifier whose predictions decide which samples are boosted.
// alpha     - weight assigned to that estimator.
void AdaBoost::updateSampleWeights(Classifier* estimator, double alpha)
{
    using torch::indexing::Slice;

    // Every row but the last is the estimator input; the last row holds the labels.
    auto inputs = dataset.index({ Slice(0, dataset.size(0) - 1), Slice() });
    auto targets = dataset.index({ -1, Slice() });
    auto predicted = estimator->predict(inputs);

    // 1.0 for a miss, 0.0 for a hit.
    auto miss_mask = (predicted != targets).to(torch::kDouble);

    // Multiplicative update followed by clamping for numerical stability.
    auto boost = torch::exp(alpha * miss_mask);
    sample_weights *= boost;
    sample_weights = torch::clamp(sample_weights, 1e-15, 1e15);
}
|
||||
|
||||
void AdaBoost::normalizeWeights()
|
||||
{
|
||||
// Single-pass normalization using PyTorch operations
|
||||
double sum_weights = torch::sum(sample_weights).item<double>();
|
||||
|
||||
if (__builtin_expect(sum_weights <= 0, 0)) {
|
||||
// Reset to uniform if all weights are zero/negative (rare case)
|
||||
sample_weights = torch::ones_like(sample_weights) / sample_weights.size(0);
|
||||
} else {
|
||||
// Vectorized normalization
|
||||
sample_weights /= sum_weights;
|
||||
|
||||
// Vectorized minimum weight enforcement
|
||||
sample_weights = torch::clamp_min(sample_weights, 1e-15);
|
||||
|
||||
// Renormalize after clamping (if any weights were clamped)
|
||||
double new_sum = torch::sum(sample_weights).item<double>();
|
||||
if (new_sum != 1.0) {
|
||||
sample_weights /= new_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> AdaBoost::graph(const std::string& title) const
|
||||
{
|
||||
// Create a graph representation of the AdaBoost ensemble
|
||||
std::vector<std::string> graph_lines;
|
||||
|
||||
// Header
|
||||
graph_lines.push_back("digraph AdaBoost {");
|
||||
graph_lines.push_back(" rankdir=TB;");
|
||||
graph_lines.push_back(" node [shape=box];");
|
||||
|
||||
if (!title.empty()) {
|
||||
graph_lines.push_back(" label=\"" + title + "\";");
|
||||
graph_lines.push_back(" labelloc=t;");
|
||||
}
|
||||
|
||||
// Add input node
|
||||
graph_lines.push_back(" Input [shape=ellipse, label=\"Input Features\"];");
|
||||
|
||||
// Add base estimators
|
||||
for (size_t i = 0; i < models.size(); ++i) {
|
||||
std::stringstream ss;
|
||||
ss << " Estimator" << i << " [label=\"Base Estimator " << i + 1
|
||||
<< "\\nα = " << std::fixed << std::setprecision(3) << alphas[i] << "\"];";
|
||||
graph_lines.push_back(ss.str());
|
||||
|
||||
// Connect input to estimator
|
||||
ss.str("");
|
||||
ss << " Input -> Estimator" << i << ";";
|
||||
graph_lines.push_back(ss.str());
|
||||
}
|
||||
|
||||
// Add combination node
|
||||
graph_lines.push_back(" Combination [shape=diamond, label=\"Weighted Vote\"];");
|
||||
|
||||
// Connect estimators to combination
|
||||
for (size_t i = 0; i < models.size(); ++i) {
|
||||
std::stringstream ss;
|
||||
ss << " Estimator" << i << " -> Combination;";
|
||||
graph_lines.push_back(ss.str());
|
||||
}
|
||||
|
||||
// Add output node
|
||||
graph_lines.push_back(" Output [shape=ellipse, label=\"Final Prediction\"];");
|
||||
graph_lines.push_back(" Combination -> Output;");
|
||||
|
||||
// Close graph
|
||||
graph_lines.push_back("}");
|
||||
|
||||
return graph_lines;
|
||||
}
|
||||
|
||||
void AdaBoost::checkValues() const
|
||||
{
|
||||
if (n_estimators <= 0) {
|
||||
throw std::invalid_argument("n_estimators must be positive");
|
||||
}
|
||||
if (base_max_depth <= 0) {
|
||||
throw std::invalid_argument("base_max_depth must be positive");
|
||||
}
|
||||
}
|
||||
|
||||
// Apply hyperparameters supplied as JSON.
//
// The keys "n_estimators" and "base_max_depth" are consumed here (read as int
// and removed from a private copy of the input); the values are then validated
// with checkValues() and whatever keys remain are passed on to
// Ensemble::setHyperparameters().
//
// hyperparameters_ - JSON object with the settings; it is not modified.
void AdaBoost::setHyperparameters(const nlohmann::json& hyperparameters_)
{
    // Work on a copy so the consumed keys can be stripped.
    nlohmann::json remaining = hyperparameters_;

    if (auto entry = remaining.find("n_estimators"); entry != remaining.end()) {
        n_estimators = entry->get<int>();
        remaining.erase("n_estimators");
    }

    if (auto entry = remaining.find("base_max_depth"); entry != remaining.end()) {
        base_max_depth = entry->get<int>();
        remaining.erase("base_max_depth");
    }

    checkValues();
    Ensemble::setHyperparameters(remaining);
}
|
||||
|
||||
// Predict the class of a single sample by alpha-weighted voting.
//
// Each stored model whose alpha is finite and strictly positive votes for the
// class it predicts, with weight alpha. Predictions outside [0, n_classes) are
// ignored. The class with the largest accumulated weight wins; on ties the
// lowest class index is returned.
//
// x - the sample, passed straight to each tree's predictSample().
// Throws std::runtime_error(CLASSIFIER_NOT_FITTED) when the model is not
// fitted or holds no estimators.
int AdaBoost::predictSample(const torch::Tensor& x) const
{
    if (!fitted || models.empty()) {
        throw std::runtime_error(CLASSIFIER_NOT_FITTED);
    }

    // Per-thread scratch buffer, reused across calls and zeroed on every call.
    static thread_local std::vector<double> tally;
    tally.assign(static_cast<size_t>(n_classes), 0.0);

    const size_t learner_count = models.size();
    for (size_t k = 0; k < learner_count; ++k) {
        const double weight = alphas[k];
        const bool usable = std::isfinite(weight) && weight > 0;
        if (!usable) {
            continue;
        }

        // The stored models are treated as DecisionTree instances.
        auto* tree = static_cast<DecisionTree*>(models[k].get());
        const int label = tree->predictSample(x);

        // Only in-range labels may vote.
        if (__builtin_expect(label >= 0 && label < n_classes, 1)) {
            tally[label] += weight;
        }
    }

    // Index of the first maximum.
    const auto winner = std::max_element(tally.cbegin(), tally.cend());
    return static_cast<int>(std::distance(tally.cbegin(), winner));
}
|
||||
|
||||
// Class-probability estimate for a single sample.
//
// Each stored model whose alpha is finite and strictly positive votes for its
// predicted class with weight alpha; out-of-range predictions are ignored. The
// per-class totals are divided by the sum of all counted votes. When no vote
// was counted, a uniform distribution is returned.
//
// x - the sample, passed straight to each tree's predictSample().
// Returns a 1-D float32 tensor with n_classes entries.
// Throws std::runtime_error(CLASSIFIER_NOT_FITTED) when the model is not
// fitted or holds no estimators.
torch::Tensor AdaBoost::predictProbaSample(const torch::Tensor& x) const
{
    if (!fitted || models.empty()) {
        throw std::runtime_error(CLASSIFIER_NOT_FITTED);
    }

    // Vote buffer: a fixed local array when it is large enough, a vector otherwise.
    constexpr int kInlineCapacity = 32;
    double inline_buf[kInlineCapacity];
    std::vector<double> overflow_buf;
    double* tally = inline_buf;

    if (n_classes > kInlineCapacity) {
        overflow_buf.assign(n_classes, 0.0);
        tally = overflow_buf.data();
    } else {
        std::fill_n(tally, n_classes, 0.0);
    }

    double vote_sum = 0.0;

    const size_t learner_count = models.size();
    for (size_t k = 0; k < learner_count; ++k) {
        const double weight = alphas[k];
        const bool usable = std::isfinite(weight) && weight > 0;
        if (!usable) {
            continue;
        }

        auto* tree = static_cast<DecisionTree*>(models[k].get());
        const int label = tree->predictSample(x);

        // Only in-range labels contribute to the tally and to the total.
        if (__builtin_expect(label >= 0 && label < n_classes, 1)) {
            tally[label] += weight;
            vote_sum += weight;
        }
    }

    torch::Tensor probs = torch::empty({ n_classes }, torch::TensorOptions().dtype(torch::kFloat32));
    auto out = probs.accessor<float, 1>();

    if (__builtin_expect(vote_sum > 0.0, 1)) {
        // Multiply by the reciprocal once instead of dividing per class.
        const double scale = 1.0 / vote_sum;
        for (int c = 0; c < n_classes; ++c) {
            out[c] = static_cast<float>(tally[c] * scale);
        }
    } else {
        // Nobody voted: fall back to equal probabilities.
        const float equal_share = 1.0f / n_classes;
        for (int c = 0; c < n_classes; ++c) {
            out[c] = equal_share;
        }
    }

    return probs;
}
|
||||
|
||||
// Class-probability estimates for a batch of samples.
//
// X - 2-D input whose first dimension indexes features (it must equal n) and
//     whose second dimension indexes samples. The caller's tensor is left
//     untouched.
// Returns a float32 tensor of shape {n_samples, n_classes}; row i is the result
// of predictProbaSample() for sample i.
// Throws std::runtime_error when the model is not fitted / has no estimators,
// or when X has the wrong number of features.
torch::Tensor AdaBoost::predict_proba(torch::Tensor& X)
{
    if (!fitted || models.empty()) {
        throw std::runtime_error(CLASSIFIER_NOT_FITTED);
    }

    if (X.size(0) != n) {
        throw std::runtime_error("Input has wrong number of features. Expected " +
            std::to_string(n) + " but got " + std::to_string(X.size(0)));
    }

    const int n_samples = X.size(1);

    torch::Tensor probabilities = torch::empty({ n_samples, n_classes },
        torch::TensorOptions().dtype(torch::kFloat32));

    // Use a local contiguous handle instead of reassigning X: X is a non-const
    // reference, so assigning to it would rebind the caller's tensor as a side
    // effect. contiguous() hands back the same tensor when no copy is needed.
    const torch::Tensor X_contig = X.contiguous();

    for (int i = 0; i < n_samples; ++i) {
        // Column i is one sample.
        probabilities[i] = predictProbaSample(X_contig.select(1, i));
    }

    return probabilities;
}
|
||||
|
||||
std::vector<std::vector<double>> AdaBoost::predict_proba(std::vector<std::vector<int>>& X)
|
||||
{
|
||||
const size_t n_samples = X[0].size();
|
||||
|
||||
// Pre-allocate result with exact size
|
||||
std::vector<std::vector<double>> result;
|
||||
result.reserve(n_samples);
|
||||
|
||||
// Avoid repeated allocations
|
||||
for (size_t i = 0; i < n_samples; ++i) {
|
||||
result.emplace_back(n_classes, 0.0);
|
||||
}
|
||||
|
||||
// Convert to tensor only once (batch conversion is more efficient)
|
||||
torch::Tensor X_tensor = platform::TensorUtils::to_matrix(X);
|
||||
torch::Tensor proba_tensor = predict_proba(X_tensor);
|
||||
|
||||
// Optimized tensor-to-vector conversion
|
||||
auto proba_accessor = proba_tensor.accessor<float, 2>();
|
||||
for (size_t i = 0; i < n_samples; ++i) {
|
||||
for (int j = 0; j < n_classes; ++j) {
|
||||
result[i][j] = static_cast<double>(proba_accessor[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Predict the class of every sample in a batch.
//
// X - 2-D input whose first dimension indexes features (it must equal n) and
//     whose second dimension indexes samples. The caller's tensor is left
//     untouched.
// Returns an int32 tensor with one predicted class per sample, each obtained
// from predictSample().
// Throws std::runtime_error when the model is not fitted / has no estimators,
// or when X has the wrong number of features.
torch::Tensor AdaBoost::predict(torch::Tensor& X)
{
    if (!fitted || models.empty()) {
        throw std::runtime_error(CLASSIFIER_NOT_FITTED);
    }

    if (X.size(0) != n) {
        throw std::runtime_error("Input has wrong number of features. Expected " +
            std::to_string(n) + " but got " + std::to_string(X.size(0)));
    }

    const int n_samples = X.size(1);

    torch::Tensor predictions = torch::empty({ n_samples }, torch::TensorOptions().dtype(torch::kInt32));
    auto pred_accessor = predictions.accessor<int32_t, 1>();

    // Use a local contiguous handle instead of reassigning X: X is a non-const
    // reference, so assigning to it would rebind the caller's tensor as a side
    // effect. contiguous() hands back the same tensor when no copy is needed.
    const torch::Tensor X_contig = X.contiguous();

    for (int i = 0; i < n_samples; ++i) {
        // Column i is one sample.
        pred_accessor[i] = predictSample(X_contig.select(1, i));
    }

    return predictions;
}
|
||||
|
||||
std::vector<int> AdaBoost::predict(std::vector<std::vector<int>>& X)
|
||||
{
|
||||
// Single tensor conversion for batch processing
|
||||
torch::Tensor X_tensor = platform::TensorUtils::to_matrix(X);
|
||||
torch::Tensor predictions_tensor = predict(X_tensor);
|
||||
|
||||
// Optimized tensor-to-vector conversion
|
||||
std::vector<int> result = platform::TensorUtils::to_vector<int>(predictions_tensor);
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace bayesnet
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user