Compare commits

270 commits: 67f1feb71f ... main
@@ -4,8 +4,8 @@ diagrams:
  Platform:
    type: class
    glob:
      - src/*.cc
      - src/modules/*.cc
      - src/*.cpp
      - src/modules/*.cpp
    using_namespace: platform
    include:
      namespaces:
@@ -17,7 +17,7 @@ diagrams:
  sequence:
    type: sequence
    glob:
      - src/b_main.cc
      - src/b_main.cpp
    combine_free_functions_into_file_participants: true
    using_namespace:
      - std
.gitignore (vendored): 4 changes
@@ -41,3 +41,7 @@ puml/**
*.dot
diagrams/html/**
diagrams/latex/**
.cache
vcpkg_installed
.claude/settings.local.json
CMakeUserPresets.json
.gitmodules (vendored): 19 changes
@@ -1,19 +0,0 @@
[submodule "lib/catch2"]
    path = lib/catch2
    url = https://github.com/catchorg/Catch2.git
[submodule "lib/argparse"]
    path = lib/argparse
    url = https://github.com/p-ranav/argparse
[submodule "lib/json"]
    path = lib/json
    url = https://github.com/nlohmann/json
[submodule "lib/libxlsxwriter"]
    path = lib/libxlsxwriter
    url = https://github.com/jmcnamara/libxlsxwriter.git
[submodule "lib/mdlp"]
    path = lib/mdlp
    url = https://github.com/rmontanana/mdlp
    update = merge
[submodule "lib/PyClassifiers"]
    path = lib/PyClassifiers
    url = git@github.com:rmontanana/PyClassifiers
.vscode/c_cpp_properties.json (vendored): 13 changes
@@ -11,7 +11,18 @@
            ],
            "cStandard": "c17",
            "cppStandard": "c++17",
            "compileCommands": "${workspaceFolder}/cmake-build-release/compile_commands.json"
            "compileCommands": "${workspaceFolder}/cmake-build-release/compile_commands.json",
            "configurationProvider": "ms-vscode.cmake-tools"
        },
        {
            "name": "Linux",
            "includePath": [
                "${workspaceFolder}/**"
            ],
            "defines": [],
            "cStandard": "c17",
            "cppStandard": "c++17",
            "configurationProvider": "ms-vscode.cmake-tools"
        }
    ],
    "version": 4
.vscode/launch.json (vendored): 15 changes
@@ -62,9 +62,9 @@
                "--stratified",
                "--discretize",
                "-d",
                "iris",
                "glass",
                "--hyperparameters",
                "{\"repeatSparent\": true, \"maxModels\": 12}"
                "{\"block_update\": true}"
            ],
            "cwd": "/home/rmontanana/Code/discretizbench",
        },
@@ -99,7 +99,9 @@
            "request": "launch",
            "program": "${workspaceFolder}/build_debug/src/b_list",
            "args": [
                "--excel"
                "results",
                "-d",
                "mfeat-morphological"
            ],
            //"cwd": "/Users/rmontanana/Code/discretizbench",
            "cwd": "${workspaceFolder}/../discretizbench",
@@ -108,12 +110,13 @@
            "name": "test",
            "type": "lldb",
            "request": "launch",
            "program": "${workspaceFolder}/build_debug/tests/unit_tests",
            "program": "${workspaceFolder}/build_debug/tests/unit_tests_platform",
            "args": [
                "-c=\"Metrics Test\"",
                "[Scores]",
                // "-c=\"Metrics Test\"",
                // "-s",
            ],
            "cwd": "${workspaceFolder}/build/tests",
            "cwd": "${workspaceFolder}/build_debug/tests",
        },
        {
            "name": "Build & debug active file",
CHANGELOG.md (new file): 93 lines
@@ -0,0 +1,93 @@
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Changed

- **BREAKING**: Migrated dependency management from vcpkg to Conan
- Updated build system to use Conan toolchain files instead of vcpkg
- Updated `make init` command to use `conan install` instead of `vcpkg install`
- Modified CMakeLists.txt to use Conan's find_package mechanism
- Updated documentation in CLAUDE.md to reflect Conan usage

### Added

- `conanfile.py` - Conan recipe for dependency management with all required dependencies
- CMakeUserPresets.json (generated by Conan)
- Support for Conan build profiles (Release/Debug)

### Removed

- `vcpkg.json` - vcpkg manifest file
- `vcpkg-configuration.json` - vcpkg registry configuration
- vcpkg toolchain dependency in build system

### Notes

- The migration maintains compatibility with existing make targets and workflow
- All dependencies now managed through Conan package manager

## [1.1.0] - 2025-07-02

### Added

- **AdaBoost Implementation**: Complete multi-class SAMME AdaBoost classifier with optimization
  - Optimized AdaBoostPredict with 100 estimators as default
  - Enhanced predictProbaSample functionality
  - Full predict_proba support for probabilistic predictions
- **Decision Tree Classifier**: New base classifier implementation with comprehensive tests
- **XA1DE Model Family**: Extended Averaged One-Dependence Estimators
  - XA1DE, XBAODE, XSPODE variants with threading support
  - Complete integration with memory optimization
  - Prior probability computation in prediction
- **Wilcoxon Statistical Test**: Statistical significance testing for model comparison
- **Folder Management**: Enhanced file organization with folder parameter support across tools
  - Added folder parameter to b_best, b_grid, b_main, and b_manage
- **vcpkg Integration**: Package management system integration (now migrated to Conan)

### Enhanced

- **Grid Search System**: Complete refactoring with MPI parallelization
  - Grid experiment functionality with conditional result saving
  - Fixed smoothing problems and dataset ordering
  - Enhanced reporting and summary generation
- **Excel Reporting**: Advanced Excel export capabilities
  - ReportExcelCompared class for side-by-side result comparison
  - Enhanced formatting with colors and fixed headers
  - Automatic file opening after generation
- **Results Management**: Comprehensive result handling and validation
  - JSON schema validation for result format integrity
  - Improved console reporting with classification reports
  - Pagination support for large result sets
- **Statistical Analysis**: Enhanced statistical testing and reporting
  - AUC (Area Under Curve) computation and reporting
  - Confusion matrix generation and visualization
  - Classification reports with color coding

### Performance Improvements

- Optimized AdaBoost training and prediction algorithms
- Enhanced memory management in XA1DE implementations
- Improved discretization algorithms with MDLP integration
- Faster ROC-AUC computation for binary classification problems

### Developer Experience

- **Testing Framework**: Comprehensive test suite with Catch2
- **Build System**: Streamlined CMake configuration with dependency management
- **Documentation**: Enhanced project documentation and build instructions
- **Code Quality**: Refactored codebase with improved error handling and logging

### Bug Fixes

- Fixed predict_proba implementations across multiple classifiers
- Resolved grid search dataset ordering issues
- Fixed Excel report formatting and column width problems
- Corrected time output formatting in various tools
- Fixed memory leaks and stability issues in model implementations

## [1.0.0] - 2024-01-09

### Initial Release

- **Core Framework**: Machine learning experimentation platform for Bayesian Networks
- **Basic Classifiers**: Initial set of Bayesian network classifiers
- **Experiment Management**: Basic experiment orchestration and result storage
- **Dataset Support**: ARFF file format support with discretization
- **Build System**: CMake-based build system with external library integration
- **Command Line Tools**: Initial versions of b_main, b_best, b_list utilities
CLAUDE.md (new file): 139 lines
@@ -0,0 +1,139 @@
# CLAUDE.md

This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.

## Project Overview

Platform is a C++ machine learning framework for running experiments with Bayesian Networks and other classifiers. It supports both research-focused experimental classifiers and production-ready models through a unified interface.

## Build System

The project uses CMake with Make as the primary build system:

- **Release build**: `make release` (creates `build_Release/` directory)
- **Debug build**: `make debug` (creates `build_Debug/` directory with testing and coverage enabled)
- **Install binaries**: `make install` (copies executables to `~/bin` by default)
- **Clean project**: `make clean` (removes build directories)
- **Initialize dependencies**: `make init` (runs conan install for both Release and Debug)

### Testing

- **Run tests**: `make test` (builds debug version and runs all tests)
- **Coverage report**: `make coverage` (runs tests and generates coverage with gcovr)
- **Single test with options**: `make test opt="-s"` (verbose) or `make test opt="-c='Test Name'"` (specific test)

### Build Targets

Main executables (built from `src/commands/`):

- `b_main`: Main experiment runner
- `b_grid`: Grid search over hyperparameters
- `b_best`: Best results analysis and comparison
- `b_list`: Dataset listing and properties
- `b_manage`: Results management interface
- `b_results`: Results processing

## Dependencies

The project uses Conan for package management with these key dependencies:

- **libtorch**: PyTorch C++ backend for tensor operations
- **nlohmann_json**: JSON processing
- **catch2**: Unit testing framework
- **cli11**: Command-line argument parsing (replacement for argparse)

Custom dependencies (not available in ConanCenter):

- **fimdlp**: MDLP discretization library (needs manual integration)
- **folding**: Cross-validation utilities (needs manual integration)
- **arff-files**: ARFF dataset file handling (needs manual integration)

External dependencies (managed separately):

- **BayesNet**: Core Bayesian network classifiers (from `../lib/`)
- **PyClassifiers**: Python classifier wrappers (from `../lib/`)
- **MPI**: Message Passing Interface for parallel processing
- **Boost**: Python integration and utilities

**Note**: Some dependencies (fimdlp, folding, arff-files) are not available in ConanCenter and need to be:

- Built as custom Conan packages, or
- Integrated using CMake FetchContent, or
- Built separately and found via find_package

## Architecture

### Core Components

**Experiment Framework** (`src/main/`):

- `Experiment.cpp/h`: Main experiment orchestration
- `Models.cpp/h`: Classifier factory and registration system
- `Scores.cpp/h`: Performance metrics calculation
- `HyperParameters.cpp/h`: Parameter management
- `ArgumentsExperiment.cpp/h`: Command-line argument handling

**Data Handling** (`src/common/`):

- `Dataset.cpp/h`: Individual dataset representation
- `Datasets.cpp/h`: Dataset collection management
- `Discretization.cpp/h`: Data discretization utilities

**Classifiers** (`src/experimental_clfs/`):

- `AdaBoost.cpp/h`: Multi-class SAMME AdaBoost implementation
- `DecisionTree.cpp/h`: Decision tree base classifier
- `XA1DE.cpp/h`: Extended AODE variants
- Experimental implementations of Bayesian network classifiers

**Grid Search** (`src/grid/`):

- `GridSearch.cpp/h`: Hyperparameter optimization
- `GridExperiment.cpp/h`: Grid search experiment management
- Uses MPI for parallel hyperparameter evaluation

**Results & Reporting** (`src/results/`, `src/reports/`):

- JSON-based result storage with schema validation
- Excel export capabilities via libxlsxwriter
- Console and paginated result display

### Model Registration System

The framework uses a factory pattern with automatic registration:

- All classifiers inherit from `bayesnet::BaseClassifier`
- Registration happens in `src/main/modelRegister.h`
- Factory creates instances by string name via `Models::create()` (see the sketch below)
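How this factory-by-name pattern fits together can be pictured with a small, self-contained sketch. It is only an illustration of the mechanism described above: the real classes live in `src/main/Models.cpp/h` and `src/main/modelRegister.h`, and the `BaseClassifier` stand-in, the `add()` helper, and the `main()` driver below are assumptions, not the repository's code.

```cpp
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct BaseClassifier {                      // stand-in for bayesnet::BaseClassifier
    virtual ~BaseClassifier() = default;
    virtual std::string name() const = 0;
};

class Models {                               // registry keyed by model name
public:
    static Models& instance() { static Models m; return m; }
    void add(const std::string& name, std::function<std::unique_ptr<BaseClassifier>()> factory) {
        registry[name] = std::move(factory); // called once per model at start-up
    }
    std::unique_ptr<BaseClassifier> create(const std::string& name) {
        return registry.at(name)();          // throws if the name was never registered
    }
private:
    std::map<std::string, std::function<std::unique_ptr<BaseClassifier>()>> registry;
};

struct TAN : BaseClassifier {                // example model
    std::string name() const override { return "TAN"; }
};

int main() {
    Models::instance().add("TAN", [] { return std::make_unique<TAN>(); });
    std::cout << Models::instance().create("TAN")->name() << "\n";  // prints "TAN"
}
```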
## Configuration

**Environment Configuration** (`.env` file):

- `experiment`: Experiment name/type
- `n_folds`: Cross-validation folds (default: 5)
- `seeds`: Random seeds for reproducibility
- `model`: Default classifier name
- `score`: Primary evaluation metric
- `platform`: System identifier for results

**Grid Search Configuration**:

- `grid_<model_name>_input.json`: Hyperparameter search space
- `grid_<model_name>_output.json`: Search results

## Data Format

**Dataset Requirements**:

- ARFF format files in `datasets/` directory
- `all.txt` file listing datasets: `<name>,<class_name>,<real_features>`
- Supports both discrete and continuous features
- Automatic discretization available via MDLP

**Experimental Data**:

- Results stored in JSON format with versioned schemas
- Test data in `tests/data/` for unit testing
- Sample datasets: iris, diabetes, ecoli, glass, etc.

## Development Workflow

1. **Setup**: Run `make init` to install dependencies via Conan
2. **Development**: Use `make debug` for development builds with testing
3. **Testing**: Run `make test` after changes
4. **Release**: Use `make release` for optimized builds
5. **Experiments**: Use `.env` configuration and run `b_main` with appropriate flags

## Key Features

- **Multi-threaded**: Uses MPI for parallel grid search and experiments
- **Cross-platform**: Supports Linux and macOS via vcpkg
- **Extensible**: Easy classifier registration and integration
- **Research-focused**: Designed for machine learning experimentation
- **Visualization**: DOT graph generation for decision trees and networks
@@ -1,95 +1,99 @@
cmake_minimum_required(VERSION 3.20)

project(Platform
    VERSION 1.0.2
    VERSION 1.1.0
    DESCRIPTION "Platform to run Experiments with classifiers."
    HOMEPAGE_URL "https://github.com/rmontanana/platform"
    LANGUAGES CXX
)

if (CODE_COVERAGE AND NOT ENABLE_TESTING)
    MESSAGE(FATAL_ERROR "Code coverage requires testing enabled")
endif (CODE_COVERAGE AND NOT ENABLE_TESTING)

find_package(Torch REQUIRED)

if (POLICY CMP0135)
    cmake_policy(SET CMP0135 NEW)
endif ()

# Global CMake variables
# ----------------------
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")

# Options
# -------
option(ENABLE_CLANG_TIDY "Enable to add clang tidy." OFF)
option(ENABLE_TESTING "Unit testing build" OFF)
option(CODE_COVERAGE "Collect coverage from test library" OFF)

# CMakes modules
# --------------
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH})

# MPI
find_package(MPI REQUIRED)
message("MPI_CXX_LIBRARIES=${MPI_CXX_LIBRARIES}")
message("MPI_CXX_INCLUDE_DIRS=${MPI_CXX_INCLUDE_DIRS}")

# Boost Library
cmake_policy(SET CMP0135 NEW)
cmake_policy(SET CMP0167 NEW) # For FindBoost
set(Boost_USE_STATIC_LIBS OFF)
set(Boost_USE_MULTITHREADED ON)
set(Boost_USE_STATIC_RUNTIME OFF)

find_package(Boost 1.66.0 REQUIRED COMPONENTS python3 numpy3)

# # Python
find_package(Python3 REQUIRED COMPONENTS Development)

# # Boost Python
# find_package(boost_python${Python3_VERSION_MAJOR}${Python3_VERSION_MINOR} CONFIG REQUIRED COMPONENTS python${Python3_VERSION_MAJOR}${Python3_VERSION_MINOR})
# # target_link_libraries(MyTarget PRIVATE Boost::python${Python3_VERSION_MAJOR}${Python3_VERSION_MINOR})

if(Boost_FOUND)
    message("Boost_INCLUDE_DIRS=${Boost_INCLUDE_DIRS}")
    message("Boost_LIBRARIES=${Boost_LIBRARIES}")
    message("Boost_VERSION=${Boost_VERSION}")
    include_directories(${Boost_INCLUDE_DIRS})
endif()

# Python
find_package(Python3 3.11...3.11.9 COMPONENTS Interpreter Development REQUIRED)
message("Python3_LIBRARIES=${Python3_LIBRARIES}")

# CMakes modules
# --------------
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH})
include(AddGitSubmodule)

if (CODE_COVERAGE)
    enable_testing()
    include(CodeCoverage)
    MESSAGE("Code coverage enabled")
    set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage -O0 -g")
    SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage")
endif (CODE_COVERAGE)

if (ENABLE_CLANG_TIDY)
    include(StaticAnalyzers) # clang-tidy
endif (ENABLE_CLANG_TIDY)

# External libraries - dependencies of Platform
# ---------------------------------------------
add_git_submodule("lib/PyClassifiers")
add_git_submodule("lib/argparse")

find_library(XLSXWRITER_LIB NAMES libxlsxwriter.dylib libxlsxwriter.so PATHS ${Platform_SOURCE_DIR}/lib/libxlsxwriter/lib)
message("XLSXWRITER_LIB=${XLSXWRITER_LIB}")
find_package(nlohmann_json CONFIG REQUIRED)
find_package(argparse CONFIG REQUIRED)
find_package(Torch CONFIG REQUIRED)
find_package(arff-files CONFIG REQUIRED)
find_package(fimdlp CONFIG REQUIRED)
find_package(folding CONFIG REQUIRED)
find_package(bayesnet CONFIG REQUIRED)
find_package(pyclassifiers CONFIG REQUIRED)
find_package(libxlsxwriter CONFIG REQUIRED)
find_package(Boost REQUIRED COMPONENTS python)

# Subdirectories
# --------------
## Configure test data path
cmake_path(SET TEST_DATA_PATH "${CMAKE_CURRENT_SOURCE_DIR}/tests/data")
configure_file(src/common/SourceData.h.in "${CMAKE_BINARY_DIR}/configured_files/include/SourceData.h")
add_subdirectory(config)
add_subdirectory(src)
add_subdirectory(sample)
file(GLOB Platform_SOURCES CONFIGURE_DEPENDS ${Platform_SOURCE_DIR}/src/*.cc)
# add_subdirectory(sample)
file(GLOB Platform_SOURCES CONFIGURE_DEPENDS ${Platform_SOURCE_DIR}/src/*.cpp)

# Testing
# -------
if (ENABLE_TESTING)
    MESSAGE("Testing enabled")
    if (NOT TARGET Catch2::Catch2)
        add_git_submodule("lib/catch2")
    endif (NOT TARGET Catch2::Catch2)
    set(CMAKE_CXX_FLAGS_DEBUG " ${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage -O0 -g")
    enable_testing()
    find_package(Catch2 CONFIG REQUIRED)
    set(CODE_COVERAGE ON)
    include(CTest)
    add_subdirectory(tests)
endif (ENABLE_TESTING)
if (CODE_COVERAGE)
    MESSAGE("Code coverage enabled")
    include(CodeCoverage)
    SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage")
endif (CODE_COVERAGE)
Doxyfile: 4 changes
@@ -976,7 +976,7 @@ INPUT_FILE_ENCODING =
# Note the list of default checked file patterns might differ from the list of
# default file extension mappings.
#
# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
# If left blank the following patterns are tested:*.c, *.cpp, *.cxx, *.cpp,
# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
# *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml,
# *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C
@@ -984,7 +984,7 @@ INPUT_FILE_ENCODING =
# *.vhdl, *.ucf, *.qsf and *.ice.

FILE_PATTERNS = *.c \
    *.cc \
    *.cpp \
    *.cxx \
    *.cpp \
    *.c++ \
LICENSE: 2 changes
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2024 rmontanana
Copyright (c) 2024 Ricardo Montañana Gómez

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
Makefile: 102 changes
@@ -1,12 +1,18 @@
SHELL := /bin/bash
.DEFAULT_GOAL := help
.PHONY: coverage setup help build test clean debug release submodules buildr buildd install dependency testp testb clang-uml
.PHONY: init clean coverage setup help build test clean debug release buildr buildd install dependency testp testb clang-uml example

f_release = build_release
f_debug = build_debug
app_targets = b_best b_list b_main b_manage b_grid
test_targets = unit_tests_bayesnet unit_tests_platform
n_procs = -j 16
f_release = build_Release
f_debug = build_Debug
app_targets = b_best b_list b_main b_manage b_grid b_results
test_targets = unit_tests_platform
# Set the number of parallel jobs to the number of available processors minus 7
CPUS := $(shell getconf _NPROCESSORS_ONLN 2>/dev/null \
    || nproc --all 2>/dev/null \
    || sysctl -n hw.ncpu)

# --- Your desired job count: CPUs – 7, but never less than 1 --------------
JOBS := $(shell n=$(CPUS); [ $${n} -gt 7 ] && echo $$((n-7)) || echo 1)

define ClearTests
    @for t in $(test_targets); do \
@@ -21,14 +27,43 @@ define ClearTests
    fi ;
endef

define build_target
    @echo ">>> Building the project for $(1)..."
    @if [ -d $(2) ]; then rm -fr $(2); fi
    @conan install . --build=missing -of $(2) -s build_type=$(1)
    @cmake -S . -B $(2) -DCMAKE_TOOLCHAIN_FILE=$(2)/build/$(1)/generators/conan_toolchain.cmake -DCMAKE_BUILD_TYPE=$(1) -D$(3)
    @echo ">>> Will build using $(JOBS) parallel jobs"
    echo ">>> Done"
endef

sub-init: ## Initialize submodules
    @git submodule update --init --recursive
define compile_target
    @echo ">>> Compiling for $(1)..."
    if [ "$(3)" != "" ]; then \
        target="-t$(3)"; \
    else \
        target=""; \
    fi
    @cmake --build $(2) --config $(1) --parallel $(JOBS) $(target)
    @echo ">>> Done"
endef

sub-update: ## Initialize submodules
    @git submodule update --remote --merge
    @git submodule foreach git pull origin master
init: ## Initialize the project installing dependencies
    @echo ">>> Installing dependencies with Conan"
    @conan install . --output-folder=build --build=missing -s build_type=Release
    @conan install . --output-folder=build_debug --build=missing -s build_type=Debug
    @echo ">>> Done"

clean: ## Clean the project
    @echo ">>> Cleaning the project..."
    @if test -f CMakeCache.txt ; then echo "- Deleting CMakeCache.txt"; rm -f CMakeCache.txt; fi
    @for folder in $(f_release) $(f_debug) build build_debug install_test ; do \
        if test -d "$$folder" ; then \
            echo "- Deleting $$folder folder" ; \
            rm -rf "$$folder"; \
        fi; \
    done
    $(call ClearTests)
    @echo ">>> Done";
setup: ## Install dependencies for tests and coverage
    @if [ "$(shell uname)" = "Darwin" ]; then \
        brew install gcovr; \
@@ -41,13 +76,15 @@ setup: ## Install dependencies for tests and coverage
dest ?= ${HOME}/bin
install: ## Copy binary files to bin folder
    @echo "Destination folder: $(dest)"
    make buildr
    @make buildr
    @echo "*******************************************"
    @echo ">>> Copying files to $(dest)"
    @echo "*******************************************"
    @for item in $(app_targets); do \
        echo ">>> Copying $$item" ; \
        cp $(f_release)/src/$$item $(dest) ; \
        cp $(f_release)/src/$$item $(dest) || { \
            echo "*** Error copying $$item" ; \
        } ; \
    done

dependency: ## Create a dependency graph diagram of the project (build/dependency.png)
@@ -56,38 +93,27 @@ dependency: ## Create a dependency graph diagram of the project (build/dependenc
    cd $(f_debug) && cmake .. --graphviz=dependency.dot && dot -Tpng dependency.dot -o dependency.png

buildd: ## Build the debug targets
    cmake --build $(f_debug) -t $(app_targets) PlatformSample $(n_procs)
    @$(call compile_target,"Debug","$(f_debug)")

buildr: ## Build the release targets
    cmake --build $(f_release) -t $(app_targets) $(n_procs)

clean: ## Clean the tests info
    @echo ">>> Cleaning Debug Platform tests...";
    $(call ClearTests)
    @echo ">>> Done";
    @$(call compile_target,"Release","$(f_release)")

clang-uml: ## Create uml class and sequence diagrams
    clang-uml -p --add-compile-flag -I /usr/lib/gcc/x86_64-redhat-linux/8/include/

debug: ## Build a debug version of the project
    @echo ">>> Building Debug Platform...";
    @if [ -d ./$(f_debug) ]; then rm -rf ./$(f_debug); fi
    @mkdir $(f_debug);
    @cmake -S . -B $(f_debug) -D CMAKE_BUILD_TYPE=Debug -D ENABLE_TESTING=ON -D CODE_COVERAGE=ON
    @echo ">>> Done";
debug: ## Build a debug version of the project with Conan
    @$(call build_target,"Debug","$(f_debug)", "ENABLE_TESTING=ON")

release: ## Build a Release version of the project with Conan
    @$(call build_target,"Release","$(f_release)", "ENABLE_TESTING=OFF")

release: ## Build a Release version of the project
    @echo ">>> Building Release Platform...";
    @if [ -d ./$(f_release) ]; then rm -rf ./$(f_release); fi
    @mkdir $(f_release);
    @cmake -S . -B $(f_release) -D CMAKE_BUILD_TYPE=Release
    @echo ">>> Done";

opt = ""
test: ## Run tests (opt="-s") to verbose output the tests, (opt="-c='Test Maximum Spanning Tree'") to run only that section
    @echo ">>> Running Platform tests...";
    @$(MAKE) clean
    @cmake --build $(f_debug) -t $(test_targets) $(n_procs)
    @$(MAKE) debug
    @$(call "Compile_target", "Debug", "$(f_debug)", $(test_targets))
    @for t in $(test_targets); do \
        if [ -f $(f_debug)/tests/$$t ]; then \
            cd $(f_debug)/tests ; \
@@ -96,6 +122,14 @@ test: ## Run tests (opt="-s") to verbose output the tests, (opt="-c='Test Maximu
    done
    @echo ">>> Done";

fname = iris
example: ## Build sample
    @echo ">>> Building Sample...";
    @cmake --build $(f_release) -t sample
    $(f_release)/sample/PlatformSample --model BoostAODE --dataset $(fname) --discretize --stratified
    @echo ">>> Done";

coverage: ## Run tests and generate coverage report (build/index.html)
    @echo ">>> Building tests with coverage..."
    @$(MAKE) test
@@ -105,7 +139,7 @@ coverage: ## Run tests and generate coverage report (build/index.html)

help: ## Show help message
    @IFS=$$'\n' ; \
    help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
    help_lines=(`grep -Fh "##" $(MAKEFILE_LIST) | grep -Fv fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
    printf "%s\n\n" "Usage: make [task]"; \
    printf "%-20s %s\n" "task" "help" ; \
    printf "%-20s %s\n" "------" "----" ; \
README.md: 86 changes
@@ -1,10 +1,9 @@
# Platform
# <img src="logo.png" alt="logo" width="50"/> Platform

Platform to run Bayesian Networks and Machine Learning Classifiers experiments.

# Platform

[](https://opensource.org/licenses/MIT)

[](<https://opensource.org/licenses/MIT>)
[](https://deepwiki.com/rmontanana/Platform)


Platform to run Bayesian Networks and Machine Learning Classifiers experiments.

@@ -22,11 +21,18 @@ In Linux sometimes the library libstdc++ is mistaken from the miniconda installa
libstdc++.so.6: version `GLIBCXX_3.4.32' not found (required by b_xxxx)
```

The solution is to erase the libstdc++ library from the miniconda installation:
The solution is to erase the libstdc++ library from the miniconda installation and no further compilation is needed.

### MPI

In Linux just install openmpi & openmpi-devel packages. Only if cmake can't find openmpi installation (like in Oracle Linux) set the following variable:
In Linux just install openmpi & openmpi-devel packages.

```bash
source /etc/profile.d/modules.sh
module load mpi/openmpi-x86_64
```

If cmake can't find openmpi installation (like in Oracle Linux) set the following variable:

```bash
export MPI_HOME="/usr/lib64/openmpi"
@@ -35,7 +41,7 @@ export MPI_HOME="/usr/lib64/openmpi"
In Mac OS X, install mpich with brew and if cmake doesn't find it, edit mpicxx wrapper to remove the ",-commons,use_dylibs" from final_ldflags

```bash
vi /opt/homebrew/bin/mpicx
vi /opt/homebrew/bin/mpicxx
```

### boost library

@@ -86,4 +92,64 @@ make release
make debug
```

## 1. Introduction
### Configuration

The configuration file is named .env and it should be located in the folder where the experiments should be run. In the root folder of the project there is a file named .env.example that can be used as a template.

## 1. Commands

### b_list

List all the datasets and their properties. The datasets are located in the _datasets_ folder under the experiments root folder. A special file called all.txt with the names of the datasets has to be created. This all file is built with lines of the form:
<name>,<class_name>,<real_features>

where <real_features> can be either the word _all_ or a list of numbers separated by commas, i.e. [0,3,6,7]

### b_grid

Run a grid search over the parameters of the classifiers. The parameters are defined in the file _grid.txt_ located in the grid folder of the experiments. The file has to be created with the following format:

```json
{
    "all": [
        <set of hyperparams>, ...
    ],
    "<dataset_name>": [
        <specific set of hyperparams for <dataset_name>>, ...
    ],
}
```

The file has to be named _grid_<model_name>_input.json_

As a result it builds a file named _grid_<model_name>_output.json_ with the results of the grid search.

The computation is done in parallel using MPI.



### b_main

Run the main experiment. There are several hyperparameters that can be set in the command line:

- -d, -\-dataset <dataset_name> : Name of the dataset to run the experiment with. If no dataset is specified the experiment will run with all the datasets in the all.txt file.
- -m, -\-model <classifier_name> : Name of the classifier to run the experiment with (i.e. BoostAODE, TAN, Odte, etc.).
- -\-discretize: Discretize the dataset before running the experiment.
- -\-stratified: Use stratified cross validation.
- -\-folds <folds>: Number of folds for cross validation (optional, default value is in .env file).
- -s, -\-seeds <seed>: Seeds for the random number generator (optional, default values are in .env file).
- -\-no-train-score: Do not calculate the train score (optional), this is useful when the dataset is big and the training score is not needed.
- -\-hyperparameters <hyperparameters>: Hyperparameters for the experiment in json format (see the parsing sketch below).
- -\-hyper-file <hyperparameters_file>: File with the hyperparameters for the experiment in json format. This file uses the output format of the b_grid command.
- -\-title <title_text>: Title of the experiment (optional if only one dataset is specified).
- -\-quiet: Don't display detailed progress and result of the experiment.
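A hyperparameters value such as {"block_update": true} (the string used in the VS Code launch configuration earlier in this diff) is plain JSON, and the project already depends on nlohmann_json. The fragment below is only an illustration of parsing such a string, not the actual b_main code:

```cpp
// Illustrative parsing of a --hyperparameters JSON string with nlohmann::json;
// not the b_main implementation.
#include <iostream>
#include <nlohmann/json.hpp>

int main() {
    auto hyperparameters = nlohmann::json::parse(R"({"block_update": true})");
    bool block_update = hyperparameters.value("block_update", false);  // default if key missing
    std::cout << "block_update = " << std::boolalpha << block_update << "\n";
}
```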
### b_manage

Manage the results of the experiments.

### b_best

Get and optionally compare the best results of the experiments. The results can be stored in an MS Excel file.


@@ -137,7 +137,7 @@

include(CMakeParseArguments)

option(CODE_COVERAGE_VERBOSE "Verbose information" FALSE)
option(CODE_COVERAGE_VERBOSE "Verbose information" TRUE)

# Check prereqs
find_program( GCOV_PATH gcov )
@@ -160,7 +160,11 @@ foreach(LANG ${LANGUAGES})
    endif()
elseif(NOT "${CMAKE_${LANG}_COMPILER_ID}" MATCHES "GNU"
        AND NOT "${CMAKE_${LANG}_COMPILER_ID}" MATCHES "(LLVM)?[Ff]lang")
    message(FATAL_ERROR "Compiler is not GNU or Flang! Aborting...")
    if ("${LANG}" MATCHES "CUDA")
        message(STATUS "Ignoring CUDA")
    else()
        message(FATAL_ERROR "Compiler is not GNU or Flang! Aborting...")
    endif()
endif()
endforeach()
conanfile.py (new file): 42 lines
@@ -0,0 +1,42 @@
from conan import ConanFile
from conan.tools.cmake import CMakeToolchain, CMakeDeps, cmake_layout


class PlatformConan(ConanFile):
    name = "platform"
    version = "1.1.0"

    # Binary configuration
    settings = "os", "compiler", "build_type", "arch"

    # Sources are located in the same place as this recipe, copy them to the recipe
    exports_sources = "CMakeLists.txt", "src/*", "tests/*", "config/*", "cmake/*"

    def requirements(self):
        # Core dependencies from vcpkg.json
        self.requires("argparse/3.2")
        self.requires("libtorch/2.7.1")
        self.requires("nlohmann_json/3.11.3")
        self.requires("folding/1.1.2")
        self.requires("fimdlp/2.1.1")
        self.requires("arff-files/1.2.1")
        self.requires("bayesnet/1.2.1")
        self.requires("pyclassifiers/1.0.3")
        self.requires("libxlsxwriter/1.2.2")

    def build_requirements(self):
        self.tool_requires("cmake/[>=3.30]")
        self.test_requires("catch2/3.8.1")

    def layout(self):
        cmake_layout(self)

    def generate(self):
        deps = CMakeDeps(self)
        deps.generate()
        tc = CMakeToolchain(self)
        tc.generate()

    def configure(self):
        # C++20 requirement
        self.settings.compiler.cppstd = "20"
@@ -1,4 +1,4 @@
configure_file(
    "config.h.in"
    "${CMAKE_BINARY_DIR}/configured_files/include/config.h" ESCAPE_QUOTES
    "${CMAKE_BINARY_DIR}/configured_files/include/config_platform.h" ESCAPE_QUOTES
)
@@ -1,14 +1,11 @@
#pragma once

#ifndef PLATFORM_H
#define PLATFORM_H
#include <string>
#include <string_view>

#define PROJECT_VERSION_MAJOR @PROJECT_VERSION_MAJOR @
#define PROJECT_VERSION_MINOR @PROJECT_VERSION_MINOR @
#define PROJECT_VERSION_PATCH @PROJECT_VERSION_PATCH @

static constexpr std::string_view project_name = "@PROJECT_NAME@";
static constexpr std::string_view project_version = "@PROJECT_VERSION@";
static constexpr std::string_view project_description = "@PROJECT_DESCRIPTION@";
static constexpr std::string_view git_sha = "@GIT_SHA@";
static constexpr std::string_view data_path = "@Platform_SOURCE_DIR@/tests/data/";
static constexpr std::string_view platform_project_name = "@PROJECT_NAME@";
static constexpr std::string_view platform_project_version = "@PROJECT_VERSION@";
static constexpr std::string_view platform_project_description = "@PROJECT_DESCRIPTION@";
static constexpr std::string_view platform_git_sha = "@GIT_SHA@";
static constexpr std::string_view platform_data_path = "@Platform_SOURCE_DIR@/tests/data/";
#endif
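The constants keep their values but gain a platform_ prefix, so code that consumes the generated config_platform.h refers to the prefixed names. A minimal sketch of a consumer, assuming only the header shown above (the call site itself is hypothetical):

```cpp
// Hypothetical consumer of the generated config_platform.h shown above.
#include <iostream>
#include "config_platform.h"

int main() {
    std::cout << platform_project_name << " " << platform_project_version
              << " (" << platform_git_sha << ")\n";
    std::cout << "test data: " << platform_data_path << "\n";
}
```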
@@ -1,4 +1,4 @@
filter = src/
exclude-directories = build/lib/
exclude-directories = build_debug/lib/
print-summary = yes
sort-percentage = yes
gitmodules: 31 changes
@@ -1,31 +0,0 @@
[submodule "lib/mdlp"]
    path = lib/mdlp
    url = https://github.com/rmontanana/mdlp
    main = main
    update = merge
[submodule "lib/catch2"]
    path = lib/catch2
    main = v2.x
    update = merge
    url = https://github.com/catchorg/Catch2.git
[submodule "lib/argparse"]
    path = lib/argparse
    url = https://github.com/p-ranav/argparse
    master = master
    update = merge
[submodule "lib/json"]
    path = lib/json
    url = https://github.com/nlohmann/json.git
    master = master
    update = merge
[submodule "lib/libxlsxwriter"]
    path = lib/libxlsxwriter
    url = https://github.com/jmcnamara/libxlsxwriter.git
    main = main
    update = merge
[submodule "lib/PyClassifiers"]
    path = lib/PyClassifiers
    url = https://github.com/rmontanana/PyClassifiers
[submodule "lib/folding"]
    path = lib/folding
    url = https://github.com/rmontanana/Folding
img/bbest.gif (new binary file): 1.9 MiB
img/bgrid.gif (new binary file): 349 KiB
img/blist.gif (new binary file): 1.7 MiB
img/bmain.gif (new binary file): 3.3 MiB
img/bmanage.gif (new binary file): 8.7 MiB
@@ -1,168 +0,0 @@
#include "ArffFiles.h"
#include <fstream>
#include <sstream>
#include <map>
#include <iostream>

ArffFiles::ArffFiles() = default;

std::vector<std::string> ArffFiles::getLines() const
{
    return lines;
}

unsigned long int ArffFiles::getSize() const
{
    return lines.size();
}

std::vector<std::pair<std::string, std::string>> ArffFiles::getAttributes() const
{
    return attributes;
}

std::string ArffFiles::getClassName() const
{
    return className;
}

std::string ArffFiles::getClassType() const
{
    return classType;
}

std::vector<std::vector<float>>& ArffFiles::getX()
{
    return X;
}

std::vector<int>& ArffFiles::getY()
{
    return y;
}

void ArffFiles::loadCommon(std::string fileName)
{
    std::ifstream file(fileName);
    if (!file.is_open()) {
        throw std::invalid_argument("Unable to open file");
    }
    std::string line;
    std::string keyword;
    std::string attribute;
    std::string type;
    std::string type_w;
    while (getline(file, line)) {
        if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
            continue;
        }
        if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) {
            std::stringstream ss(line);
            ss >> keyword >> attribute;
            type = "";
            while (ss >> type_w)
                type += type_w + " ";
            attributes.emplace_back(trim(attribute), trim(type));
            continue;
        }
        if (line[0] == '@') {
            continue;
        }
        lines.push_back(line);
    }
    file.close();
    if (attributes.empty())
        throw std::invalid_argument("No attributes found");
}

void ArffFiles::load(const std::string& fileName, bool classLast)
{
    int labelIndex;
    loadCommon(fileName);
    if (classLast) {
        className = std::get<0>(attributes.back());
        classType = std::get<1>(attributes.back());
        attributes.pop_back();
        labelIndex = static_cast<int>(attributes.size());
    } else {
        className = std::get<0>(attributes.front());
        classType = std::get<1>(attributes.front());
        attributes.erase(attributes.begin());
        labelIndex = 0;
    }
    generateDataset(labelIndex);
}
void ArffFiles::load(const std::string& fileName, const std::string& name)
{
    int labelIndex;
    loadCommon(fileName);
    bool found = false;
    for (int i = 0; i < attributes.size(); ++i) {
        if (attributes[i].first == name) {
            className = std::get<0>(attributes[i]);
            classType = std::get<1>(attributes[i]);
            attributes.erase(attributes.begin() + i);
            labelIndex = i;
            found = true;
            break;
        }
    }
    if (!found) {
        throw std::invalid_argument("Class name not found");
    }
    generateDataset(labelIndex);
}

void ArffFiles::generateDataset(int labelIndex)
{
    X = std::vector<std::vector<float>>(attributes.size(), std::vector<float>(lines.size()));
    auto yy = std::vector<std::string>(lines.size(), "");
    auto removeLines = std::vector<int>(); // Lines with missing values
    for (size_t i = 0; i < lines.size(); i++) {
        std::stringstream ss(lines[i]);
        std::string value;
        int pos = 0;
        int xIndex = 0;
        while (getline(ss, value, ',')) {
            if (pos++ == labelIndex) {
                yy[i] = value;
            } else {
                if (value == "?") {
                    X[xIndex++][i] = -1;
                    removeLines.push_back(i);
                } else
                    X[xIndex++][i] = stof(value);
            }
        }
    }
    for (auto i : removeLines) {
        yy.erase(yy.begin() + i);
        for (auto& x : X) {
            x.erase(x.begin() + i);
        }
    }
    y = factorize(yy);
}

std::string ArffFiles::trim(const std::string& source)
{
    std::string s(source);
    s.erase(0, s.find_first_not_of(" '\n\r\t"));
    s.erase(s.find_last_not_of(" '\n\r\t") + 1);
    return s;
}

std::vector<int> ArffFiles::factorize(const std::vector<std::string>& labels_t)
{
    std::vector<int> yy;
    yy.reserve(labels_t.size());
    std::map<std::string, int> labelMap;
    int i = 0;
    for (const std::string& label : labels_t) {
        if (labelMap.find(label) == labelMap.end()) {
            labelMap[label] = i++;
        }
        yy.push_back(labelMap[label]);
    }
    return yy;
}
@@ -1,32 +0,0 @@
#ifndef ARFFFILES_H
#define ARFFFILES_H

#include <string>
#include <vector>

class ArffFiles {
private:
    std::vector<std::string> lines;
    std::vector<std::pair<std::string, std::string>> attributes;
    std::string className;
    std::string classType;
    std::vector<std::vector<float>> X;
    std::vector<int> y;
    void generateDataset(int);
    void loadCommon(std::string);
public:
    ArffFiles();
    void load(const std::string&, bool = true);
    void load(const std::string&, const std::string&);
    std::vector<std::string> getLines() const;
    unsigned long int getSize() const;
    std::string getClassName() const;
    std::string getClassType() const;
    static std::string trim(const std::string&);
    std::vector<std::vector<float>>& getX();
    std::vector<int>& getY();
    std::vector<std::pair<std::string, std::string>> getAttributes() const;
    static std::vector<int> factorize(const std::vector<std::string>& labels_t);
};

#endif
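Before its removal this helper was driven roughly as follows. The calls mirror the interface above and the usage in sample/sample.cc shown later in this diff; the "iris.arff" path is a placeholder, not a file guaranteed to exist:

```cpp
// Sketch of typical ArffFiles usage (placeholder path; see sample/sample.cc below).
#include "ArffFiles.h"
#include <iostream>

int main() {
    auto handler = ArffFiles();
    handler.load("iris.arff", true);   // class attribute is the last column
    auto& X = handler.getX();          // features as vector<vector<float>>
    auto& y = handler.getY();          // labels already factorized to ints
    std::cout << handler.getClassName() << ": " << handler.getSize() << " rows, "
              << handler.getAttributes().size() << " attributes\n";
    std::cout << "first label id: " << (y.empty() ? -1 : y[0])
              << ", features: " << X.size() << "\n";
}
```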
@@ -1 +0,0 @@
add_library(ArffFiles ArffFiles.cc)
Submodule lib/PyClassifiers deleted from 0608c0a52a
Submodule lib/argparse deleted from 1b3abd9b92
Submodule lib/catch2 deleted from ed6ac8a629
Submodule lib/json deleted from 0457de21cf
Submodule lib/libxlsxwriter deleted from b0c76b3396
Submodule lib/mdlp deleted from 5708dc3de9
remove_submodules.sh (new file): 14 lines
@@ -0,0 +1,14 @@
git config --file .gitmodules --get-regexp path | awk '{ print $2 }' | while read line; do
    echo "Removing $line"
    # Deinit the submodule
    git submodule deinit -f "$line"

    # Remove the submodule from the working tree
    git rm -f "$line"

    # Remove the submodule from .git/modules
    rm -rf ".git/modules/$line"
done

# Remove the .gitmodules file
git rm -f .gitmodules
@@ -1,15 +1,11 @@
include_directories(
    ${TORCH_INCLUDE_DIRS}
    ${Platform_SOURCE_DIR}/src/common
    ${Platform_SOURCE_DIR}/src/main
    ${Platform_SOURCE_DIR}/lib/PyClassifiers/src
    ${Python3_INCLUDE_DIRS}
    ${Platform_SOURCE_DIR}/lib/Files
    ${Platform_SOURCE_DIR}/lib/argparse/include
    ${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/src
    ${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/folding
    ${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/mdlp
    ${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/json/include
    ${CMAKE_BINARY_DIR}/configured_files/include
    ${PyClassifiers_INCLUDE_DIRS}
    ${bayesnet_INCLUDE_DIRS}
)
add_executable(PlatformSample sample.cc ${Platform_SOURCE_DIR}/src/main/Models.cc)
target_link_libraries(PlatformSample PyClassifiers ArffFiles mdlp "${TORCH_LIBRARIES}")
add_executable(PlatformSample sample.cpp ${Platform_SOURCE_DIR}/src/main/Models.cpp)
target_link_libraries(PlatformSample "${PyClassifiers}" "${BayesNet}" fimdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} ${Boost_LIBRARIES})
sample/sample.cc: 240 changes
@@ -1,240 +0,0 @@
#include <iostream>
#include <torch/torch.h>
#include <string>
#include <map>
#include <argparse/argparse.hpp>
#include <nlohmann/json.hpp>
#include "ArffFiles.h"
#include "BayesMetrics.h"
#include "CPPFImdlp.h"
#include "folding.hpp"
#include "Models.h"
#include "modelRegister.h"
#include <fstream>
#include "config.h"

const std::string PATH = { data_path.begin(), data_path.end() };

pair<std::vector<mdlp::labels_t>, map<std::string, int>> discretize(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y, std::vector<std::string> features)
{
    std::vector<mdlp::labels_t>Xd;
    map<std::string, int> maxes;

    auto fimdlp = mdlp::CPPFImdlp();
    for (int i = 0; i < X.size(); i++) {
        fimdlp.fit(X[i], y);
        mdlp::labels_t& xd = fimdlp.transform(X[i]);
        maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
        Xd.push_back(xd);
    }
    return { Xd, maxes };
}

bool file_exists(const std::string& name)
{
    if (FILE* file = fopen(name.c_str(), "r")) {
        fclose(file);
        return true;
    } else {
        return false;
    }
}
pair<std::vector<std::vector<int>>, std::vector<int>> extract_indices(std::vector<int> indices, std::vector<std::vector<int>> X, std::vector<int> y)
{
    std::vector<std::vector<int>> Xr; // nxm
    std::vector<int> yr;
    for (int col = 0; col < X.size(); ++col) {
        Xr.push_back(std::vector<int>());
    }
    for (auto index : indices) {
        for (int col = 0; col < X.size(); ++col) {
            Xr[col].push_back(X[col][index]);
        }
        yr.push_back(y[index]);
    }
    return { Xr, yr };
}

int main(int argc, char** argv)
{
    map<std::string, bool> datasets = {
        {"diabetes", true},
        {"ecoli", true},
        {"glass", true},
        {"iris", true},
        {"kdd_JapaneseVowels", false},
        {"letter", true},
        {"liver-disorders", true},
        {"mfeat-factors", true},
    };
    auto valid_datasets = std::vector<std::string>();
    transform(datasets.begin(), datasets.end(), back_inserter(valid_datasets),
        [](const pair<std::string, bool>& pair) { return pair.first; });
    argparse::ArgumentParser program("PlatformSample");
    program.add_argument("-d", "--dataset")
        .help("Dataset file name")
        .action([valid_datasets](const std::string& value) {
            if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) {
                return value;
            }
            throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}");
        }
    );
    program.add_argument("-p", "--path")
        .help(" folder where the data files are located, default")
        .default_value(std::string{ PATH }
    );
    program.add_argument("-m", "--model")
        .help("Model to use " + platform::Models::instance()->tostring())
        .action([](const std::string& value) {
            static const std::vector<std::string> choices = platform::Models::instance()->getNames();
            if (find(choices.begin(), choices.end(), value) != choices.end()) {
                return value;
            }
            throw runtime_error("Model must be one of " + platform::Models::instance()->tostring());
        }
    );
    program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true);
    program.add_argument("--dumpcpt").help("Dump CPT Tables").default_value(false).implicit_value(true);
    program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true);
    program.add_argument("--tensors").help("Use tensors to store samples").default_value(false).implicit_value(true);
    program.add_argument("-f", "--folds").help("Number of folds").default_value(5).scan<'i', int>().action([](const std::string& value) {
        try {
            auto k = stoi(value);
            if (k < 2) {
                throw runtime_error("Number of folds must be greater than 1");
            }
            return k;
        }
        catch (const runtime_error& err) {
            throw runtime_error(err.what());
        }
        catch (...) {
            throw runtime_error("Number of folds must be an integer");
        }});
    program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>();
    bool class_last, stratified, tensors, dump_cpt;
    std::string model_name, file_name, path, complete_file_name;
    int nFolds, seed;
    try {
        program.parse_args(argc, argv);
        file_name = program.get<std::string>("dataset");
        path = program.get<std::string>("path");
        model_name = program.get<std::string>("model");
        complete_file_name = path + file_name + ".arff";
        stratified = program.get<bool>("stratified");
        tensors = program.get<bool>("tensors");
        nFolds = program.get<int>("folds");
        seed = program.get<int>("seed");
        dump_cpt = program.get<bool>("dumpcpt");
        class_last = datasets[file_name];
        if (!file_exists(complete_file_name)) {
            throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist");
        }
    }
    catch (const exception& err) {
        cerr << err.what() << std::endl;
        cerr << program;
        exit(1);
    }

    /*
    * Begin Processing
    */
    auto handler = ArffFiles();
    handler.load(complete_file_name, class_last);
    // Get Dataset X, y
    std::vector<mdlp::samples_t>& X = handler.getX();
    mdlp::labels_t& y = handler.getY();
    // Get className & Features
    auto className = handler.getClassName();
    std::vector<std::string> features;
    auto attributes = handler.getAttributes();
    transform(attributes.begin(), attributes.end(), back_inserter(features),
        [](const pair<std::string, std::string>& item) { return item.first; });
    // Discretize Dataset
    auto [Xd, maxes] = discretize(X, y, features);
    maxes[className] = *max_element(y.begin(), y.end()) + 1;
    map<std::string, std::vector<int>> states;
    for (auto feature : features) {
        states[feature] = std::vector<int>(maxes[feature]);
    }
    states[className] = std::vector<int>(maxes[className]);
    auto clf = platform::Models::instance()->create(model_name);
    clf->fit(Xd, y, features, className, states);
    if (dump_cpt) {
        std::cout << "--- CPT Tables ---" << std::endl;
        clf->dump_cpt();
    }
    auto lines = clf->show();
    for (auto line : lines) {
        std::cout << line << std::endl;
    }
    std::cout << "--- Topological Order ---" << std::endl;
    auto order = clf->topological_order();
    for (auto name : order) {
        std::cout << name << ", ";
    }
    std::cout << "end." << std::endl;
    auto score = clf->score(Xd, y);
    std::cout << "Score: " << score << std::endl;
    auto graph = clf->graph();
    auto dot_file = model_name + "_" + file_name;
    ofstream file(dot_file + ".dot");
    file << graph;
    file.close();
    std::cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << std::endl;
    std::cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << std::endl;
    std::string stratified_string = stratified ? " Stratified" : "";
    std::cout << nFolds << " Folds" << stratified_string << " Cross validation" << std::endl;
    std::cout << "==========================================" << std::endl;
|
||||
torch::Tensor Xt = torch::zeros({ static_cast<int>(Xd.size()), static_cast<int>(Xd[0].size()) }, torch::kInt32);
|
||||
torch::Tensor yt = torch::tensor(y, torch::kInt32);
|
||||
for (int i = 0; i < features.size(); ++i) {
|
||||
Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
|
||||
}
|
||||
float total_score = 0, total_score_train = 0, score_train, score_test;
|
||||
folding::Fold* fold;
|
||||
double nodes = 0.0;
|
||||
if (stratified)
|
||||
fold = new folding::StratifiedKFold(nFolds, y, seed);
|
||||
else
|
||||
fold = new folding::KFold(nFolds, y.size(), seed);
|
||||
for (auto i = 0; i < nFolds; ++i) {
|
||||
auto [train, test] = fold->getFold(i);
|
||||
std::cout << "Fold: " << i + 1 << std::endl;
|
||||
if (tensors) {
|
||||
auto ttrain = torch::tensor(train, torch::kInt64);
|
||||
auto ttest = torch::tensor(test, torch::kInt64);
|
||||
torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain);
|
||||
torch::Tensor ytraint = yt.index({ ttrain });
|
||||
torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest);
|
||||
torch::Tensor ytestt = yt.index({ ttest });
|
||||
clf->fit(Xtraint, ytraint, features, className, states);
|
||||
auto temp = clf->predict(Xtraint);
|
||||
score_train = clf->score(Xtraint, ytraint);
|
||||
score_test = clf->score(Xtestt, ytestt);
|
||||
} else {
|
||||
auto [Xtrain, ytrain] = extract_indices(train, Xd, y);
|
||||
auto [Xtest, ytest] = extract_indices(test, Xd, y);
|
||||
clf->fit(Xtrain, ytrain, features, className, states);
|
||||
std::cout << "Nodes: " << clf->getNumberOfNodes() << std::endl;
|
||||
nodes += clf->getNumberOfNodes();
|
||||
score_train = clf->score(Xtrain, ytrain);
|
||||
score_test = clf->score(Xtest, ytest);
|
||||
}
|
||||
if (dump_cpt) {
|
||||
std::cout << "--- CPT Tables ---" << std::endl;
|
||||
clf->dump_cpt();
|
||||
}
|
||||
total_score_train += score_train;
|
||||
total_score += score_test;
|
||||
std::cout << "Score Train: " << score_train << std::endl;
|
||||
std::cout << "Score Test : " << score_test << std::endl;
|
||||
std::cout << "-------------------------------------------------------------------------------" << std::endl;
|
||||
}
|
||||
std::cout << "Nodes: " << nodes / nFolds << std::endl;
|
||||
std::cout << "**********************************************************************************" << std::endl;
|
||||
std::cout << "Average Score Train: " << total_score_train / nFolds << std::endl;
|
||||
std::cout << "Average Score Test : " << total_score / nFolds << std::endl;return 0;
|
||||
}
|
279
sample/sample.cpp
Normal file
@@ -0,0 +1,279 @@
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <fstream>
|
||||
#include <torch/torch.h>
|
||||
#include <argparse/argparse.hpp>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <ArffFiles.hpp>
|
||||
#include <fimdlp/CPPFImdlp.h>
|
||||
#include <folding.hpp>
|
||||
#include <bayesnet/utils/BayesMetrics.h>
|
||||
#include <bayesnet/classifiers/SPODE.h>
|
||||
#include "Models.h"
|
||||
#include "modelRegister.h"
|
||||
#include "config_platform.h"
|
||||
|
||||
const std::string PATH = { platform_data_path.begin(), platform_data_path.end() };
|
||||
|
||||
pair<std::vector<mdlp::labels_t>, map<std::string, int>> discretize(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y, std::vector<std::string> features)
|
||||
{
|
||||
std::vector<mdlp::labels_t>Xd;
|
||||
map<std::string, int> maxes;
|
||||
|
||||
auto fimdlp = mdlp::CPPFImdlp();
|
||||
for (int i = 0; i < X.size(); i++) {
|
||||
fimdlp.fit(X[i], y);
|
||||
mdlp::labels_t& xd = fimdlp.transform(X[i]);
|
||||
maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
|
||||
Xd.push_back(xd);
|
||||
}
|
||||
return { Xd, maxes };
|
||||
}
|
||||
|
||||
bool file_exists(const std::string& name)
|
||||
{
|
||||
if (FILE* file = fopen(name.c_str(), "r")) {
|
||||
fclose(file);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
pair<std::vector<std::vector<int>>, std::vector<int>> extract_indices(std::vector<int> indices, std::vector<std::vector<int>> X, std::vector<int> y)
|
||||
{
|
||||
std::vector<std::vector<int>> Xr; // nxm
|
||||
std::vector<int> yr;
|
||||
for (int col = 0; col < X.size(); ++col) {
|
||||
Xr.push_back(std::vector<int>());
|
||||
}
|
||||
for (auto index : indices) {
|
||||
for (int col = 0; col < X.size(); ++col) {
|
||||
Xr[col].push_back(X[col][index]);
|
||||
}
|
||||
yr.push_back(y[index]);
|
||||
}
|
||||
return { Xr, yr };
|
||||
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
map<std::string, bool> datasets = {
|
||||
{"diabetes", true},
|
||||
{"ecoli", true},
|
||||
{"glass", true},
|
||||
{"iris", true},
|
||||
{"kdd_JapaneseVowels", false},
|
||||
{"letter", true},
|
||||
{"liver-disorders", true},
|
||||
{"mfeat-factors", true},
|
||||
};
|
||||
auto valid_datasets = std::vector<std::string>();
|
||||
transform(datasets.begin(), datasets.end(), back_inserter(valid_datasets),
|
||||
[](const pair<std::string, bool>& pair) { return pair.first; });
|
||||
argparse::ArgumentParser program("PlatformSample");
|
||||
program.add_argument("-d", "--dataset")
|
||||
.help("Dataset file name")
|
||||
.action([valid_datasets](const std::string& value) {
|
||||
if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) {
|
||||
return value;
|
||||
}
|
||||
throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}");
|
||||
}
|
||||
);
|
||||
program.add_argument("-p", "--path")
|
||||
.help(" folder where the data files are located, default")
|
||||
.default_value(std::string{ PATH }
|
||||
);
|
||||
program.add_argument("-m", "--model")
|
||||
.help("Model to use " + platform::Models::instance()->toString())
|
||||
.action([](const std::string& value) {
|
||||
static const std::vector<std::string> choices = platform::Models::instance()->getNames();
|
||||
if (find(choices.begin(), choices.end(), value) != choices.end()) {
|
||||
return value;
|
||||
}
|
||||
throw runtime_error("Model must be one of " + platform::Models::instance()->toString());
|
||||
}
|
||||
);
|
||||
program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true);
|
||||
program.add_argument("--dumpcpt").help("Dump CPT Tables").default_value(false).implicit_value(true);
|
||||
program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true);
|
||||
program.add_argument("--tensors").help("Use tensors to store samples").default_value(false).implicit_value(true);
|
||||
program.add_argument("-f", "--folds").help("Number of folds").default_value(5).scan<'i', int>().action([](const std::string& value) {
|
||||
try {
|
||||
auto k = stoi(value);
|
||||
if (k < 2) {
|
||||
throw runtime_error("Number of folds must be greater than 1");
|
||||
}
|
||||
return k;
|
||||
}
|
||||
catch (const runtime_error& err) {
|
||||
throw runtime_error(err.what());
|
||||
}
|
||||
catch (...) {
|
||||
throw runtime_error("Number of folds must be an integer");
|
||||
}});
|
||||
program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>();
|
||||
bool class_last, stratified, tensors, dump_cpt;
|
||||
std::string model_name, file_name, path, complete_file_name;
|
||||
int nFolds, seed;
|
||||
try {
|
||||
program.parse_args(argc, argv);
|
||||
file_name = program.get<std::string>("dataset");
|
||||
path = program.get<std::string>("path");
|
||||
model_name = program.get<std::string>("model");
|
||||
complete_file_name = path + file_name + ".arff";
|
||||
stratified = program.get<bool>("stratified");
|
||||
tensors = program.get<bool>("tensors");
|
||||
nFolds = program.get<int>("folds");
|
||||
seed = program.get<int>("seed");
|
||||
dump_cpt = program.get<bool>("dumpcpt");
|
||||
class_last = datasets[file_name];
|
||||
if (!file_exists(complete_file_name)) {
|
||||
throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist");
|
||||
}
|
||||
}
|
||||
catch (const exception& err) {
|
||||
cerr << err.what() << std::endl;
|
||||
cerr << program;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Begin Processing
|
||||
*/
|
||||
auto handler = ArffFiles();
|
||||
handler.load(complete_file_name, class_last);
|
||||
// Get Dataset X, y
|
||||
std::vector<mdlp::samples_t>& X = handler.getX();
|
||||
mdlp::labels_t& y = handler.getY();
|
||||
// Get className & Features
|
||||
auto className = handler.getClassName();
|
||||
std::vector<std::string> features;
|
||||
auto attributes = handler.getAttributes();
|
||||
transform(attributes.begin(), attributes.end(), back_inserter(features),
|
||||
[](const pair<std::string, std::string>& item) { return item.first; });
|
||||
// Discretize Dataset
|
||||
auto [Xd, maxes] = discretize(X, y, features);
|
||||
maxes[className] = *max_element(y.begin(), y.end()) + 1;
|
||||
map<std::string, std::vector<int>> states;
|
||||
for (auto feature : features) {
|
||||
states[feature] = std::vector<int>(maxes[feature]);
|
||||
}
|
||||
states[className] = std::vector<int>(maxes[className]);
|
||||
// Output the states
|
||||
std::cout << std::string(80, '-') << std::endl;
|
||||
std::cout << "States" << std::endl;
|
||||
for (auto feature : features) {
|
||||
std::cout << feature << ": " << states[feature].size() << std::endl;
|
||||
}
|
||||
std::cout << std::string(80, '-') << std::endl;
|
||||
//auto clf = platform::Models::instance()->create("SPODE");
|
||||
auto clf = bayesnet::SPODE(2);
|
||||
|
||||
bayesnet::Smoothing_t smoothing = bayesnet::Smoothing_t::ORIGINAL;
|
||||
clf.fit(Xd, y, features, className, states, smoothing);
|
||||
if (dump_cpt) {
|
||||
std::cout << "--- CPT Tables ---" << std::endl;
|
||||
std::cout << clf.dump_cpt();
|
||||
}
|
||||
std::cout << "--- Datos predicción ---" << std::endl;
|
||||
std::cout << "Orden de variables: " << std::endl;
|
||||
for (auto feature : features) {
|
||||
std::cout << feature << ", ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
std::cout << "X[0]: ";
|
||||
for (int i = 0; i < Xd.size(); ++i) {
|
||||
std::cout << Xd[i][0] << ", ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
std::cout << std::string(80, '-') << std::endl;
|
||||
|
||||
auto lines = clf.show();
|
||||
for (auto line : lines) {
|
||||
std::cout << line << std::endl;
|
||||
}
|
||||
std::cout << "--- Topological Order ---" << std::endl;
|
||||
auto order = clf.topological_order();
|
||||
for (auto name : order) {
|
||||
std::cout << name << ", ";
|
||||
}
|
||||
auto predict_proba = clf.predict_proba(Xd);
|
||||
std::cout << "Instances predict_proba: ";
|
||||
for (int i = 0; i < predict_proba.size(); i++) {
|
||||
std::cout << "Instance " << i << ": ";
|
||||
for (int j = 0; j < Xd.size(); ++j) {
|
||||
std::cout << Xd[j][i] << ", ";
|
||||
}
|
||||
std::cout << ": ";
|
||||
for (auto score : predict_proba[i]) {
|
||||
std::cout << score << ", ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
// std::cout << std::endl;
|
||||
// std::cout << "end." << std::endl;
|
||||
// auto score = clf->score(Xd, y);
|
||||
// std::cout << "Score: " << score << std::endl;
|
||||
// auto graph = clf->graph();
|
||||
// auto dot_file = model_name + "_" + file_name;
|
||||
// ofstream file(dot_file + ".dot");
|
||||
// file << graph;
|
||||
// file.close();
|
||||
// std::cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << std::endl;
|
||||
// std::cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << std::endl;
|
||||
// std::string stratified_string = stratified ? " Stratified" : "";
|
||||
// std::cout << nFolds << " Folds" << stratified_string << " Cross validation" << std::endl;
|
||||
// std::cout << "==========================================" << std::endl;
|
||||
// torch::Tensor Xt = torch::zeros({ static_cast<int>(Xd.size()), static_cast<int>(Xd[0].size()) }, torch::kInt32);
|
||||
// torch::Tensor yt = torch::tensor(y, torch::kInt32);
|
||||
// for (int i = 0; i < features.size(); ++i) {
|
||||
// Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
|
||||
// }
|
||||
// float total_score = 0, total_score_train = 0, score_train, score_test;
|
||||
// folding::Fold* fold;
|
||||
// double nodes = 0.0;
|
||||
// if (stratified)
|
||||
// fold = new folding::StratifiedKFold(nFolds, y, seed);
|
||||
// else
|
||||
// fold = new folding::KFold(nFolds, y.size(), seed);
|
||||
// for (auto i = 0; i < nFolds; ++i) {
|
||||
// auto [train, test] = fold->getFold(i);
|
||||
// std::cout << "Fold: " << i + 1 << std::endl;
|
||||
// if (tensors) {
|
||||
// auto ttrain = torch::tensor(train, torch::kInt64);
|
||||
// auto ttest = torch::tensor(test, torch::kInt64);
|
||||
// torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain);
|
||||
// torch::Tensor ytraint = yt.index({ ttrain });
|
||||
// torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest);
|
||||
// torch::Tensor ytestt = yt.index({ ttest });
|
||||
// clf->fit(Xtraint, ytraint, features, className, states, smoothing);
|
||||
// auto temp = clf->predict(Xtraint);
|
||||
// score_train = clf->score(Xtraint, ytraint);
|
||||
// score_test = clf->score(Xtestt, ytestt);
|
||||
// } else {
|
||||
// auto [Xtrain, ytrain] = extract_indices(train, Xd, y);
|
||||
// auto [Xtest, ytest] = extract_indices(test, Xd, y);
|
||||
// clf->fit(Xtrain, ytrain, features, className, states, smoothing);
|
||||
// std::cout << "Nodes: " << clf->getNumberOfNodes() << std::endl;
|
||||
// nodes += clf->getNumberOfNodes();
|
||||
// score_train = clf->score(Xtrain, ytrain);
|
||||
// score_test = clf->score(Xtest, ytest);
|
||||
// }
|
||||
// // if (dump_cpt) {
|
||||
// // std::cout << "--- CPT Tables ---" << std::endl;
|
||||
// // std::cout << clf->dump_cpt();
|
||||
// // }
|
||||
// total_score_train += score_train;
|
||||
// total_score += score_test;
|
||||
// std::cout << "Score Train: " << score_train << std::endl;
|
||||
// std::cout << "Score Test : " << score_test << std::endl;
|
||||
// std::cout << "-------------------------------------------------------------------------------" << std::endl;
|
||||
// }
|
||||
|
||||
// std::cout << "Nodes: " << nodes / nFolds << std::endl;
|
||||
// std::cout << "**********************************************************************************" << std::endl;
|
||||
// std::cout << "Average Score Train: " << total_score_train / nFolds << std::endl;
|
||||
// std::cout << "Average Score Test : " << total_score / nFolds << std::endl;return 0;
|
||||
}
|
@@ -1,53 +1,80 @@
|
||||
include_directories(
|
||||
## Libs
|
||||
${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/src
|
||||
${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/folding
|
||||
${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/mdlp
|
||||
${Platform_SOURCE_DIR}/lib/PyClassifiers/lib/BayesNet/lib/json/include
|
||||
${Platform_SOURCE_DIR}/lib/PyClassifiers/src
|
||||
${Platform_SOURCE_DIR}/lib/Files
|
||||
${Platform_SOURCE_DIR}/lib/mdlp
|
||||
${Platform_SOURCE_DIR}/lib/argparse/include
|
||||
${Platform_SOURCE_DIR}/lib/json/include
|
||||
${Platform_SOURCE_DIR}/lib/libxlsxwriter/include
|
||||
${Python3_INCLUDE_DIRS}
|
||||
${MPI_CXX_INCLUDE_DIRS}
|
||||
${CMAKE_BINARY_DIR}/configured_files/include
|
||||
## Platform
|
||||
${Platform_SOURCE_DIR}/src/common
|
||||
${Platform_SOURCE_DIR}/src/best
|
||||
${Platform_SOURCE_DIR}/src/grid
|
||||
${Platform_SOURCE_DIR}/src/main
|
||||
${Platform_SOURCE_DIR}/src/manage
|
||||
${Platform_SOURCE_DIR}/src/reports
|
||||
${Platform_SOURCE_DIR}/src
|
||||
)
|
||||
|
||||
# b_best
|
||||
set(best_sources b_best.cc BestResults.cc Statistics.cc BestResultsExcel.cc)
|
||||
list(TRANSFORM best_sources PREPEND best/)
|
||||
add_executable(b_best ${best_sources} main/Result.cc reports/ReportExcel.cc reports/ReportBase.cc reports/ExcelFile.cc common/Datasets.cc common/Dataset.cc)
|
||||
target_link_libraries(b_best Boost::boost "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" ArffFiles mdlp)
|
||||
add_executable(
|
||||
b_best commands/b_best.cpp best/Statistics.cpp
|
||||
best/BestResultsExcel.cpp best/BestResultsTex.cpp best/BestResultsMd.cpp best/BestResults.cpp
|
||||
common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
|
||||
main/Models.cpp main/Scores.cpp
|
||||
reports/ReportExcel.cpp reports/ReportBase.cpp reports/ExcelFile.cpp
|
||||
results/Result.cpp
|
||||
experimental_clfs/XA1DE.cpp
|
||||
experimental_clfs/ExpClf.cpp
|
||||
experimental_clfs/DecisionTree.cpp
|
||||
experimental_clfs/AdaBoost.cpp
|
||||
)
|
||||
target_link_libraries(b_best Boost::boost pyclassifiers::pyclassifiers bayesnet::bayesnet argparse::argparse fimdlp::fimdlp ${Python3_LIBRARIES} torch::torch Boost::python Boost::numpy libxlsxwriter::libxlsxwriter)
|
||||
|
||||
# b_grid
|
||||
set(grid_sources b_grid.cc GridSearch.cc GridData.cc)
|
||||
set(grid_sources GridSearch.cpp GridData.cpp GridExperiment.cpp GridBase.cpp )
|
||||
list(TRANSFORM grid_sources PREPEND grid/)
|
||||
add_executable(b_grid ${grid_sources} main/HyperParameters.cc main/Models.cc common/Datasets.cc common/Dataset.cc)
|
||||
target_link_libraries(b_grid PyClassifiers ${MPI_CXX_LIBRARIES} ArffFiles)
|
||||
add_executable(b_grid commands/b_grid.cpp ${grid_sources}
|
||||
common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
|
||||
main/HyperParameters.cpp main/Models.cpp main/Experiment.cpp main/Scores.cpp main/ArgumentsExperiment.cpp
|
||||
reports/ReportConsole.cpp reports/ReportBase.cpp
|
||||
results/Result.cpp
|
||||
experimental_clfs/XA1DE.cpp
|
||||
experimental_clfs/ExpClf.cpp
|
||||
experimental_clfs/DecisionTree.cpp
|
||||
experimental_clfs/AdaBoost.cpp
|
||||
)
|
||||
target_link_libraries(b_grid ${MPI_CXX_LIBRARIES} pyclassifiers::pyclassifiers bayesnet::bayesnet argparse::argparse fimdlp::fimdlp ${Python3_LIBRARIES} torch::torch Boost::python Boost::numpy)
|
||||
|
||||
# b_list
|
||||
set(list_sources b_list.cc DatasetsExcel.cc)
|
||||
list(TRANSFORM list_sources PREPEND list/)
|
||||
add_executable(b_list ${list_sources} common/Datasets.cc common/Dataset.cc reports/ReportExcel.cc reports/ExcelFile.cc reports/ReportBase.cc)
|
||||
target_link_libraries(b_list "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" ArffFiles mdlp)
|
||||
add_executable(b_list commands/b_list.cpp
|
||||
common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
|
||||
main/Models.cpp main/Scores.cpp
|
||||
reports/ReportExcel.cpp reports/ExcelFile.cpp reports/ReportBase.cpp reports/DatasetsExcel.cpp reports/DatasetsConsole.cpp reports/ReportsPaged.cpp
|
||||
results/Result.cpp results/ResultsDatasetExcel.cpp results/ResultsDataset.cpp results/ResultsDatasetConsole.cpp
|
||||
experimental_clfs/XA1DE.cpp
|
||||
experimental_clfs/ExpClf.cpp
|
||||
experimental_clfs/DecisionTree.cpp
|
||||
experimental_clfs/AdaBoost.cpp
|
||||
)
|
||||
target_link_libraries(b_list pyclassifiers::pyclassifiers bayesnet::bayesnet argparse::argparse fimdlp::fimdlp ${Python3_LIBRARIES} torch::torch Boost::python Boost::numpy libxlsxwriter::libxlsxwriter)
|
||||
|
||||
# b_main
|
||||
set(main_sources b_main.cc Experiment.cc Models.cc HyperParameters.cc)
|
||||
set(main_sources Experiment.cpp Models.cpp HyperParameters.cpp Scores.cpp ArgumentsExperiment.cpp)
|
||||
list(TRANSFORM main_sources PREPEND main/)
|
||||
add_executable(b_main ${main_sources} common/Datasets.cc common/Dataset.cc reports/ReportConsole.cc reports/ReportBase.cc main/Result.cc)
|
||||
target_link_libraries(b_main PyClassifiers BayesNet ArffFiles mdlp)
|
||||
add_executable(b_main commands/b_main.cpp ${main_sources}
|
||||
common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
|
||||
reports/ReportConsole.cpp reports/ReportBase.cpp
|
||||
results/Result.cpp
|
||||
experimental_clfs/XA1DE.cpp
|
||||
experimental_clfs/ExpClf.cpp
|
||||
experimental_clfs/DecisionTree.cpp
|
||||
experimental_clfs/AdaBoost.cpp
|
||||
)
|
||||
target_link_libraries(b_main PRIVATE nlohmann_json::nlohmann_json pyclassifiers::pyclassifiers bayesnet::bayesnet argparse::argparse fimdlp::fimdlp ${Python3_LIBRARIES} torch::torch Boost::python Boost::numpy)
|
||||
|
||||
# b_manage
|
||||
set(manage_sources b_manage.cc ManageResults.cc CommandParser.cc Results.cc)
|
||||
set(manage_sources ManageScreen.cpp OptionsMenu.cpp ResultsManager.cpp)
|
||||
list(TRANSFORM manage_sources PREPEND manage/)
|
||||
add_executable(b_manage ${manage_sources} main/Result.cc reports/ReportConsole.cc reports/ReportExcel.cc reports/ReportBase.cc reports/ExcelFile.cc common/Datasets.cc common/Dataset.cc)
|
||||
target_link_libraries(b_manage "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" ArffFiles mdlp)
|
||||
add_executable(
|
||||
b_manage commands/b_manage.cpp ${manage_sources}
|
||||
common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
|
||||
reports/ReportConsole.cpp reports/ReportExcel.cpp reports/ReportExcelCompared.cpp reports/ReportBase.cpp reports/ExcelFile.cpp reports/DatasetsConsole.cpp reports/ReportsPaged.cpp
|
||||
results/Result.cpp results/ResultsDataset.cpp results/ResultsDatasetConsole.cpp
|
||||
main/Scores.cpp
|
||||
)
|
||||
target_link_libraries(b_manage torch::torch libxlsxwriter::libxlsxwriter fimdlp::fimdlp bayesnet::bayesnet argparse::argparse)
|
||||
|
||||
# b_results
|
||||
add_executable(b_results commands/b_results.cpp)
|
||||
target_link_libraries(b_results torch::torch libxlsxwriter::libxlsxwriter fimdlp::fimdlp bayesnet::bayesnet argparse::argparse)
|
||||
|
@@ -4,12 +4,17 @@
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <algorithm>
|
||||
#include "BestResults.h"
|
||||
#include "Result.h"
|
||||
#include "Colors.h"
|
||||
#include "Statistics.h"
|
||||
#include <cctype>
|
||||
#include "common/Colors.h"
|
||||
#include "common/CLocale.h"
|
||||
#include "common/Paths.h"
|
||||
#include "common/Utils.h" // compute_std
|
||||
#include "results/Result.h"
|
||||
#include "BestResultsExcel.h"
|
||||
#include "CLocale.h"
|
||||
#include "BestResultsTex.h"
|
||||
#include "BestResultsMd.h"
|
||||
#include "best/Statistics.h"
|
||||
#include "BestResults.h"
|
||||
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
@@ -42,26 +47,29 @@ namespace platform {
|
||||
for (auto const& item : data.at("results")) {
|
||||
bool update = true;
|
||||
auto datasetName = item.at("dataset").get<std::string>();
|
||||
if (dataset != "any" && dataset != datasetName) {
|
||||
continue;
|
||||
}
|
||||
if (bests.contains(datasetName)) {
|
||||
if (item.at("score").get<double>() < bests[datasetName].at(0).get<double>()) {
|
||||
update = false;
|
||||
}
|
||||
}
|
||||
if (update) {
|
||||
bests[datasetName] = { item.at("score").get<double>(), item.at("hyperparameters"), file };
|
||||
bests[datasetName] = { item.at("score").get<double>(), item.at("hyperparameters"), file, item.at("score_std").get<double>() };
|
||||
}
|
||||
}
|
||||
}
|
||||
std::string bestFileName = path + bestResultFile();
|
||||
if (bests.empty()) {
|
||||
std::cerr << Colors::MAGENTA() << "No results found for model " << model << " and score " << score << Colors::RESET() << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
std::string bestFileName = path + Paths::bestResultsFile(score, model);
|
||||
std::ofstream file(bestFileName);
|
||||
file << bests;
|
||||
file.close();
|
||||
return bestFileName;
|
||||
}
|
||||
std::string BestResults::bestResultFile()
|
||||
{
|
||||
return "best_results_" + score + "_" + model + ".json";
|
||||
}
|
||||
std::pair<std::string, std::string> getModelScore(std::string name)
|
||||
{
|
||||
// results_accuracy_BoostAODE_MacBookpro16_2023-09-06_12:27:00_1.json
|
||||
@@ -116,15 +124,24 @@ namespace platform {
|
||||
}
|
||||
result = std::vector<std::string>(models.begin(), models.end());
|
||||
maxModelName = (*max_element(result.begin(), result.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
|
||||
maxModelName = std::max(12, maxModelName);
|
||||
maxModelName = std::max(minLength, maxModelName);
|
||||
return result;
|
||||
}
|
||||
std::string toLower(std::string data)
|
||||
{
|
||||
std::transform(data.begin(), data.end(), data.begin(),
|
||||
[](unsigned char c) { return std::tolower(c); });
|
||||
return data;
|
||||
}
|
||||
std::vector<std::string> BestResults::getDatasets(json table)
|
||||
{
|
||||
std::vector<std::string> datasets;
|
||||
for (const auto& dataset : table.items()) {
|
||||
datasets.push_back(dataset.key());
|
||||
for (const auto& dataset_ : table.items()) {
|
||||
datasets.push_back(dataset_.key());
|
||||
}
|
||||
std::stable_sort(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) {
|
||||
return toLower(a) < toLower(b);
|
||||
});
|
||||
maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
|
||||
maxDatasetName = std::max(7, maxDatasetName);
|
||||
return datasets;
|
||||
@@ -143,7 +160,7 @@ namespace platform {
|
||||
}
|
||||
void BestResults::listFile()
|
||||
{
|
||||
std::string bestFileName = path + bestResultFile();
|
||||
std::string bestFileName = path + Paths::bestResultsFile(score, model);
|
||||
if (FILE* fileTest = fopen(bestFileName.c_str(), "r")) {
|
||||
fclose(fileTest);
|
||||
} else {
|
||||
@@ -167,10 +184,9 @@ namespace platform {
|
||||
std::cout << Colors::GREEN() << " # " << std::setw(maxDatasetName + 1) << std::left << "Dataset" << "Score " << std::setw(maxFileName) << "File" << " Hyperparameters" << std::endl;
|
||||
std::cout << "=== " << std::string(maxDatasetName, '=') << " =========== " << std::string(maxFileName, '=') << " " << std::string(maxHyper, '=') << std::endl;
|
||||
auto i = 0;
|
||||
bool odd = true;
|
||||
double total = 0;
|
||||
for (auto const& item : data.items()) {
|
||||
auto color = odd ? Colors::BLUE() : Colors::CYAN();
|
||||
auto color = (i % 2) ? Colors::BLUE() : Colors::CYAN();
|
||||
double value = item.value().at(0).get<double>();
|
||||
std::cout << color << std::setw(3) << std::fixed << std::right << i++ << " ";
|
||||
std::cout << std::setw(maxDatasetName) << std::left << item.key() << " ";
|
||||
@@ -179,7 +195,6 @@ namespace platform {
|
||||
std::cout << item.value().at(1) << " ";
|
||||
std::cout << std::endl;
|
||||
total += value;
|
||||
odd = !odd;
|
||||
}
|
||||
std::cout << Colors::GREEN() << "=== " << std::string(maxDatasetName, '=') << " ===========" << std::endl;
|
||||
std::cout << Colors::GREEN() << " Total" << std::string(maxDatasetName - 5, '.') << " " << std::setw(11) << std::setprecision(8) << std::fixed << total << std::endl;
|
||||
@@ -191,7 +206,7 @@ namespace platform {
|
||||
auto maxDate = std::filesystem::file_time_type::max();
|
||||
for (const auto& model : models) {
|
||||
this->model = model;
|
||||
std::string bestFileName = path + bestResultFile();
|
||||
std::string bestFileName = path + Paths::bestResultsFile(score, model);
|
||||
if (FILE* fileTest = fopen(bestFileName.c_str(), "r")) {
|
||||
fclose(fileTest);
|
||||
} else {
|
||||
@@ -208,13 +223,20 @@ namespace platform {
|
||||
table["dateTable"] = ftime_to_string(maxDate);
|
||||
return table;
|
||||
}
|
||||
void BestResults::printTableResults(std::vector<std::string> models, json table)
|
||||
|
||||
void BestResults::printTableResults(std::vector<std::string> models, json table, bool tex, bool index)
|
||||
{
|
||||
std::stringstream oss;
|
||||
oss << Colors::GREEN() << "Best results for " << score << " as of " << table.at("dateTable").get<std::string>() << std::endl;
|
||||
std::cout << oss.str();
|
||||
std::cout << std::string(oss.str().size() - 8, '-') << std::endl;
|
||||
std::cout << Colors::GREEN() << " # " << std::setw(maxDatasetName + 1) << std::left << std::string("Dataset");
|
||||
auto bestResultsTex = BestResultsTex(score);
|
||||
auto bestResultsMd = BestResultsMd();
|
||||
if (tex) {
|
||||
bestResultsTex.results_header(models, table.at("dateTable").get<std::string>(), index);
|
||||
bestResultsMd.results_header(models, table.at("dateTable").get<std::string>());
|
||||
}
|
||||
for (const auto& model : models) {
|
||||
std::cout << std::setw(maxModelName) << std::left << model << " ";
|
||||
}
|
||||
@@ -225,23 +247,23 @@ namespace platform {
|
||||
}
|
||||
std::cout << std::endl;
|
||||
auto i = 0;
|
||||
bool odd = true;
|
||||
std::map<std::string, double> totals;
|
||||
std::map<std::string, std::vector<double>> totals;
|
||||
int nDatasets = table.begin().value().size();
|
||||
for (const auto& model : models) {
|
||||
totals[model] = 0.0;
|
||||
}
|
||||
auto datasets = getDatasets(table.begin().value());
|
||||
for (auto const& dataset : datasets) {
|
||||
auto color = odd ? Colors::BLUE() : Colors::CYAN();
|
||||
if (tex) {
|
||||
bestResultsTex.results_body(datasets, table, index);
|
||||
bestResultsMd.results_body(datasets, table);
|
||||
}
|
||||
for (auto const& dataset_ : datasets) {
|
||||
auto color = (i % 2) ? Colors::BLUE() : Colors::CYAN();
|
||||
std::cout << color << std::setw(3) << std::fixed << std::right << i++ << " ";
|
||||
std::cout << std::setw(maxDatasetName) << std::left << dataset << " ";
|
||||
std::cout << std::setw(maxDatasetName) << std::left << dataset_ << " ";
|
||||
double maxValue = 0;
|
||||
// Find out the max value for this dataset
|
||||
for (const auto& model : models) {
|
||||
double value;
|
||||
try {
|
||||
value = table[model].at(dataset).at(0).get<double>();
|
||||
value = table[model].at(dataset_).at(0).get<double>();
|
||||
}
|
||||
catch (nlohmann::json_abi_v3_11_3::detail::out_of_range err) {
|
||||
value = -1.0;
|
||||
@@ -253,12 +275,14 @@ namespace platform {
|
||||
// Print the row with red colors on max values
|
||||
for (const auto& model : models) {
|
||||
std::string efectiveColor = color;
|
||||
double value;
|
||||
double value, std;
|
||||
try {
|
||||
value = table[model].at(dataset).at(0).get<double>();
|
||||
value = table[model].at(dataset_).at(0).get<double>();
|
||||
std = table[model].at(dataset_).at(3).get<double>();
|
||||
}
|
||||
catch (nlohmann::json_abi_v3_11_3::detail::out_of_range err) {
|
||||
value = -1.0;
|
||||
std = -1.0;
|
||||
}
|
||||
if (value == maxValue) {
|
||||
efectiveColor = Colors::RED();
|
||||
@@ -266,31 +290,38 @@ namespace platform {
|
||||
if (value == -1) {
|
||||
std::cout << Colors::YELLOW() << std::setw(maxModelName) << std::right << "N/A" << " ";
|
||||
} else {
|
||||
totals[model] += value;
|
||||
std::cout << efectiveColor << std::setw(maxModelName) << std::setprecision(maxModelName - 2) << std::fixed << value << " ";
|
||||
totals[model].push_back(value);
|
||||
std::cout << efectiveColor << std::setw(maxModelName - 6) << std::setprecision(maxModelName - 8) << std::fixed << value;
|
||||
std::cout << efectiveColor << "±" << std::setw(5) << std::setprecision(3) << std::fixed << std << " ";
|
||||
}
|
||||
}
|
||||
std::cout << std::endl;
|
||||
odd = !odd;
|
||||
}
|
||||
std::cout << Colors::GREEN() << "=== " << std::string(maxDatasetName, '=') << " ";
|
||||
for (const auto& model : models) {
|
||||
std::cout << std::string(maxModelName, '=') << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
std::cout << Colors::GREEN() << " Totals" << std::string(maxDatasetName - 6, '.') << " ";
|
||||
std::cout << Colors::GREEN() << " Average" << std::string(maxDatasetName - 7, '.') << " ";
|
||||
double max_value = 0.0;
|
||||
std::string best_model = "";
|
||||
for (const auto& total : totals) {
|
||||
if (total.second > max_value) {
|
||||
max_value = total.second;
|
||||
auto actual = std::reduce(total.second.begin(), total.second.end());
|
||||
if (actual > max_value) {
|
||||
max_value = actual;
|
||||
best_model = total.first;
|
||||
}
|
||||
}
|
||||
if (tex) {
|
||||
bestResultsTex.results_footer(totals, best_model);
|
||||
bestResultsMd.results_footer(totals, best_model);
|
||||
}
|
||||
for (const auto& model : models) {
|
||||
std::string efectiveColor = Colors::GREEN();
|
||||
if (totals[model] == max_value) {
|
||||
efectiveColor = Colors::RED();
|
||||
}
|
||||
std::cout << efectiveColor << std::right << std::setw(maxModelName) << std::setprecision(maxModelName - 4) << std::fixed << totals[model] << " ";
|
||||
std::string efectiveColor = model == best_model ? Colors::RED() : Colors::GREEN();
|
||||
double value = std::reduce(totals[model].begin(), totals[model].end()) / nDatasets;
|
||||
double std = compute_std(totals[model], value);
|
||||
std::cout << efectiveColor << std::right << std::setw(maxModelName - 6) << std::setprecision(maxModelName - 8) << std::fixed << value;
|
||||
std::cout << efectiveColor << "±" << std::setw(5) << std::setprecision(3) << std::fixed << std << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
@@ -302,54 +333,53 @@ namespace platform {
|
||||
// Build the table of results
|
||||
json table = buildTableResults(models);
|
||||
std::vector<std::string> datasets = getDatasets(table.begin().value());
|
||||
BestResultsExcel excel_report(score, datasets);
|
||||
excel_report.reportSingle(model, path + bestResultFile());
|
||||
messageExcelFile(excel_report.getFileName());
|
||||
BestResultsExcel excel_report(path, score, datasets);
|
||||
excel_report.reportSingle(model, path + Paths::bestResultsFile(score, model));
|
||||
messageOutputFile("Excel", excel_report.getFileName());
|
||||
excelFileName = excel_report.getFileName();
|
||||
}
|
||||
}
|
||||
void BestResults::reportAll(bool excel)
|
||||
void BestResults::reportAll(bool excel, bool tex, bool index)
|
||||
{
|
||||
auto models = getModels();
|
||||
// Build the table of results
|
||||
json table = buildTableResults(models);
|
||||
std::vector<std::string> datasets = getDatasets(table.begin().value());
|
||||
// Print the table of results
|
||||
printTableResults(models, table);
|
||||
printTableResults(models, table, tex, index);
|
||||
// Compute the Friedman test
|
||||
std::map<std::string, std::map<std::string, float>> ranksModels;
|
||||
if (friedman) {
|
||||
Statistics stats(models, datasets, table, significance);
|
||||
Statistics stats(score, models, datasets, table, significance);
|
||||
auto result = stats.friedmanTest();
|
||||
stats.postHocHolmTest(result);
|
||||
stats.postHocTest();
|
||||
stats.postHocTestReport(result, tex);
|
||||
ranksModels = stats.getRanks();
|
||||
}
|
||||
if (tex) {
|
||||
messageOutputFile("TeX", Paths::tex() + Paths::tex_output());
|
||||
messageOutputFile("MarkDown", Paths::tex() + Paths::md_output());
|
||||
if (friedman) {
|
||||
messageOutputFile("TeX", Paths::tex() + Paths::tex_post_hoc());
|
||||
messageOutputFile("MarkDown", Paths::tex() + Paths::md_post_hoc());
|
||||
}
|
||||
}
|
||||
if (excel) {
|
||||
BestResultsExcel excel(score, datasets);
|
||||
BestResultsExcel excel(path, score, datasets);
|
||||
excel.reportAll(models, table, ranksModels, friedman, significance);
|
||||
if (friedman) {
|
||||
int idx = -1;
|
||||
double min = 2000;
|
||||
// Find out the control model
|
||||
auto totals = std::vector<double>(models.size(), 0.0);
|
||||
for (const auto& dataset : datasets) {
|
||||
for (int i = 0; i < models.size(); ++i) {
|
||||
totals[i] += ranksModels[dataset][models[i]];
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < models.size(); ++i) {
|
||||
if (totals[i] < min) {
|
||||
min = totals[i];
|
||||
idx = i;
|
||||
}
|
||||
}
|
||||
Statistics stats(score, models, datasets, table, significance);
|
||||
int idx = stats.getControlIdx();
|
||||
model = models.at(idx);
|
||||
excel.reportSingle(model, path + bestResultFile());
|
||||
excel.reportSingle(model, path + Paths::bestResultsFile(score, model));
|
||||
}
|
||||
messageExcelFile(excel.getFileName());
|
||||
messageOutputFile("Excel", excel.getFileName());
|
||||
excelFileName = excel.getFileName();
|
||||
}
|
||||
}
|
||||
void BestResults::messageExcelFile(const std::string& fileName)
|
||||
void BestResults::messageOutputFile(const std::string& title, const std::string& fileName)
|
||||
{
|
||||
std::cout << Colors::YELLOW() << "** Excel file generated: " << fileName << Colors::RESET() << std::endl;
|
||||
std::cout << Colors::YELLOW() << "** " << std::setw(8) << std::left << title
|
||||
<< " file generated: " << fileName << Colors::RESET() << std::endl;
|
||||
}
|
||||
}
|
@@ -2,35 +2,39 @@
|
||||
#define BESTRESULTS_H
|
||||
#include <string>
|
||||
#include <nlohmann/json.hpp>
|
||||
using json = nlohmann::json;
|
||||
namespace platform {
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
class BestResults {
|
||||
public:
|
||||
explicit BestResults(const std::string& path, const std::string& score, const std::string& model, bool friedman, double significance = 0.05)
|
||||
: path(path), score(score), model(model), friedman(friedman), significance(significance)
|
||||
explicit BestResults(const std::string& path, const std::string& score, const std::string& model, const std::string& dataset, bool friedman, double significance = 0.05)
|
||||
: path(path), score(score), model(model), dataset(dataset), friedman(friedman), significance(significance)
|
||||
{
|
||||
}
|
||||
std::string build();
|
||||
void reportSingle(bool excel);
|
||||
void reportAll(bool excel);
|
||||
void reportAll(bool excel, bool tex, bool index);
|
||||
void buildAll();
|
||||
std::string getExcelFileName() const { return excelFileName; }
|
||||
private:
|
||||
std::vector<std::string> getModels();
|
||||
std::vector<std::string> getDatasets(json table);
|
||||
std::vector<std::string> loadResultFiles();
|
||||
void messageExcelFile(const std::string& fileName);
|
||||
void messageOutputFile(const std::string& title, const std::string& fileName);
|
||||
json buildTableResults(std::vector<std::string> models);
|
||||
void printTableResults(std::vector<std::string> models, json table);
|
||||
std::string bestResultFile();
|
||||
void printTableResults(std::vector<std::string> models, json table, bool tex, bool index);
|
||||
json loadFile(const std::string& fileName);
|
||||
void listFile();
|
||||
std::string path;
|
||||
std::string score;
|
||||
std::string model;
|
||||
std::string dataset;
|
||||
bool friedman;
|
||||
double significance;
|
||||
int maxModelName = 0;
|
||||
int maxDatasetName = 0;
|
||||
int minLength = 13; // Minimum length for scores
|
||||
std::string excelFileName;
|
||||
};
|
||||
}
|
||||
#endif //BESTRESULTS_H
|
||||
#endif
|
@@ -1,10 +1,10 @@
|
||||
#include <sstream>
|
||||
#include "BestResultsExcel.h"
|
||||
#include "Paths.h"
|
||||
#include <map>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include "Statistics.h"
|
||||
#include "ReportExcel.h"
|
||||
#include "common/Paths.h"
|
||||
#include "reports/ReportExcel.h"
|
||||
#include "best/Statistics.h"
|
||||
#include "BestResultsExcel.h"
|
||||
|
||||
namespace platform {
|
||||
json loadResultData(const std::string& fileName)
|
||||
@@ -30,9 +30,9 @@ namespace platform {
|
||||
}
|
||||
return columnName;
|
||||
}
|
||||
BestResultsExcel::BestResultsExcel(const std::string& score, const std::vector<std::string>& datasets) : score(score), datasets(datasets)
|
||||
BestResultsExcel::BestResultsExcel(const std::string& path, const std::string& score, const std::vector<std::string>& datasets) : path(path), score(score), datasets(datasets)
|
||||
{
|
||||
file_name = "BestResults.xlsx";
|
||||
file_name = Paths::bestResultsExcel(score);
|
||||
workbook = workbook_new(getFileName().c_str());
|
||||
setProperties("Best Results");
|
||||
int maxDatasetName = (*max_element(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
|
||||
@@ -64,19 +64,21 @@ namespace platform {
|
||||
json data = loadResultData(fileName);
|
||||
|
||||
std::string title = "Best results for " + model;
|
||||
worksheet_merge_range(worksheet, 0, 0, 0, 4, title.c_str(), styles["headerFirst"]);
|
||||
worksheet_merge_range(worksheet, 0, 0, 0, 5, title.c_str(), styles["headerFirst"]);
|
||||
// Body header
|
||||
row = 3;
|
||||
int col = 1;
|
||||
writeString(row, 0, "Nº", "bodyHeader");
|
||||
writeString(row, 0, "#", "bodyHeader");
|
||||
writeString(row, 1, "Dataset", "bodyHeader");
|
||||
writeString(row, 2, "Score", "bodyHeader");
|
||||
writeString(row, 3, "File", "bodyHeader");
|
||||
writeString(row, 4, "Hyperparameters", "bodyHeader");
|
||||
writeString(row, 5, "F", "bodyHeader");
|
||||
auto i = 0;
|
||||
std::string hyperparameters;
|
||||
int hypSize = 22;
|
||||
std::map<std::string, std::string> files; // map of files imported and their tabs
|
||||
int numLines = data.size();
|
||||
for (auto const& item : data.items()) {
|
||||
row++;
|
||||
writeInt(row, 0, i++, "ints");
|
||||
@@ -90,7 +92,7 @@ namespace platform {
|
||||
catch (const std::out_of_range& oor) {
|
||||
auto tabName = "table_" + std::to_string(i);
|
||||
auto worksheetNew = workbook_add_worksheet(workbook, tabName.c_str());
|
||||
json data = loadResultData(Paths::results() + fileName);
|
||||
json data = loadResultData(path + fileName);
|
||||
auto report = ReportExcel(data, false, workbook, worksheetNew);
|
||||
report.show();
|
||||
hyperlink = "#table_" + std::to_string(i);
|
||||
@@ -104,6 +106,8 @@ namespace platform {
|
||||
hypSize = hyperparameters.size();
|
||||
}
|
||||
writeString(row, 4, hyperparameters, "text");
|
||||
std::string countHyperparameters = "=COUNTIF(e5:e" + std::to_string(numLines + 4) + ", e" + std::to_string(row + 1) + ")";
|
||||
worksheet_write_formula(worksheet, row, 5, countHyperparameters.c_str(), efectiveStyle("ints"));
|
||||
}
|
||||
row++;
|
||||
// Set Totals
|
||||
@@ -160,13 +164,15 @@ namespace platform {
|
||||
addConditionalFormat("max");
|
||||
footer(false);
|
||||
if (friedman) {
|
||||
// Create Sheet with ranks
|
||||
worksheet = workbook_add_worksheet(workbook, "Ranks");
|
||||
formatColumns();
|
||||
header(true);
|
||||
body(true);
|
||||
addConditionalFormat("min");
|
||||
footer(true);
|
||||
if (score == "accuracy") {
|
||||
// Create Sheet with ranks
|
||||
worksheet = workbook_add_worksheet(workbook, "Ranks");
|
||||
formatColumns();
|
||||
header(true);
|
||||
body(true);
|
||||
addConditionalFormat("min");
|
||||
footer(true);
|
||||
}
|
||||
// Create Sheet with Friedman Test
|
||||
doFriedman();
|
||||
}
|
||||
@@ -180,7 +186,7 @@ namespace platform {
|
||||
// Body header
|
||||
row = 3;
|
||||
int col = 1;
|
||||
writeString(row, 0, "Nº", "bodyHeader");
|
||||
writeString(row, 0, "#", "bodyHeader");
|
||||
writeString(row, 1, "Dataset", "bodyHeader");
|
||||
for (const auto& model : models) {
|
||||
writeString(row, ++col, model.c_str(), "bodyHeader");
|
||||
@@ -237,11 +243,12 @@ namespace platform {
|
||||
}
|
||||
worksheet_merge_range(worksheet, 0, 0, 0, 7, "Friedman Test", styles["headerFirst"]);
|
||||
row = 2;
|
||||
Statistics stats(models, datasets, table, significance, false);
|
||||
Statistics stats(score, models, datasets, table, significance, false); // No output
|
||||
auto result = stats.friedmanTest();
|
||||
stats.postHocHolmTest(result);
|
||||
stats.postHocTest();
|
||||
stats.postHocTestReport(result, false); // No tex output
|
||||
auto friedmanResult = stats.getFriedmanResult();
|
||||
auto holmResult = stats.getHolmResult();
|
||||
auto postHocResults = stats.getPostHocResults();
|
||||
worksheet_merge_range(worksheet, row, 0, row, 7, "Null hypothesis: H0 'There is no significant differences between all the classifiers.'", styles["headerSmall"]);
|
||||
row += 2;
|
||||
writeString(row, 1, "Friedman Q", "bodyHeader");
|
||||
@@ -260,7 +267,7 @@ namespace platform {
|
||||
row += 2;
|
||||
worksheet_merge_range(worksheet, row, 0, row, 7, "Null hypothesis: H0 'There is no significant differences between the control model and the other models.'", styles["headerSmall"]);
|
||||
row += 2;
|
||||
std::string controlModel = "Control Model: " + holmResult.model;
|
||||
std::string controlModel = "Control Model: " + postHocResults.at(0).model;
|
||||
worksheet_merge_range(worksheet, row, 1, row, 7, controlModel.c_str(), styles["bodyHeader_odd"]);
|
||||
row++;
|
||||
writeString(row, 1, "Model", "bodyHeader");
|
||||
@@ -272,7 +279,7 @@ namespace platform {
|
||||
writeString(row, 7, "Reject H0", "bodyHeader");
|
||||
row++;
|
||||
bool first = true;
|
||||
for (const auto& item : holmResult.holmLines) {
|
||||
for (const auto& item : postHocResults) {
|
||||
writeString(row, 1, item.model, "text");
|
||||
if (first) {
|
||||
// Control model info
|
@@ -1,17 +1,16 @@
|
||||
#ifndef BESTRESULTS_EXCEL_H
|
||||
#define BESTRESULTS_EXCEL_H
|
||||
#include "ExcelFile.h"
|
||||
#ifndef BESTRESULTSEXCEL_H
|
||||
#define BESTRESULTSEXCEL_H
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include "reports/ExcelFile.h"
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
namespace platform {
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
class BestResultsExcel : public ExcelFile {
|
||||
public:
|
||||
BestResultsExcel(const std::string& score, const std::vector<std::string>& datasets);
|
||||
BestResultsExcel(const std::string& path, const std::string& score, const std::vector<std::string>& datasets);
|
||||
~BestResultsExcel();
|
||||
void reportAll(const std::vector<std::string>& models, const json& table, const std::map<std::string, std::map<std::string, float>>& ranks, bool friedman, double significance);
|
||||
void reportSingle(const std::string& model, const std::string& fileName);
|
||||
@@ -23,6 +22,7 @@ namespace platform {
|
||||
void formatColumns();
|
||||
void doFriedman();
|
||||
void addConditionalFormat(std::string formula);
|
||||
std::string path;
|
||||
std::string score;
|
||||
std::vector<std::string> models;
|
||||
std::vector<std::string> datasets;
|
||||
@@ -34,4 +34,4 @@ namespace platform {
|
||||
int datasetNameSize = 25; // Min size of the column
|
||||
};
|
||||
}
|
||||
#endif //BESTRESULTS_EXCEL_H
|
||||
#endif
|
105
src/best/BestResultsMd.cpp
Normal file
@@ -0,0 +1,105 @@
|
||||
#include <iostream>
|
||||
#include "BestResultsMd.h"
|
||||
#include "common/Utils.h" // compute_std
|
||||
|
||||
namespace platform {
|
||||
using json = nlohmann::ordered_json;
|
||||
void BestResultsMd::openMdFile(const std::string& name)
|
||||
{
|
||||
handler.open(name);
|
||||
if (!handler.is_open()) {
|
||||
std::cerr << "Error opening file " << name << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
void BestResultsMd::results_header(const std::vector<std::string>& models, const std::string& date)
|
||||
{
|
||||
this->models = models;
|
||||
auto file_name = Paths::tex() + Paths::md_output();
|
||||
openMdFile(file_name);
|
||||
handler << "<!-- This file has been generated by the platform program" << std::endl;
|
||||
handler << " Date: " << date.c_str() << std::endl;
|
||||
handler << "" << std::endl;
|
||||
handler << " Table of results" << std::endl;
|
||||
handler << "-->" << std::endl;
|
||||
handler << "| # | Dataset |";
|
||||
for (const auto& model : models) {
|
||||
handler << " " << model.c_str() << " |";
|
||||
}
|
||||
handler << std::endl;
|
||||
handler << "|--: | :--- |";
|
||||
for (const auto& model : models) {
|
||||
handler << " :---: |";
|
||||
}
|
||||
handler << std::endl;
|
||||
}
|
||||
void BestResultsMd::results_body(const std::vector<std::string>& datasets, json& table)
|
||||
{
|
||||
int i = 0;
|
||||
for (auto const& dataset : datasets) {
|
||||
// Find out max value for this dataset
|
||||
double max_value = 0;
|
||||
// Find out the max value for this dataset
|
||||
for (const auto& model : models) {
|
||||
double value;
|
||||
try {
|
||||
value = table[model].at(dataset).at(0).get<double>();
|
||||
}
|
||||
catch (nlohmann::json_abi_v3_11_3::detail::out_of_range err) {
|
||||
value = -1.0;
|
||||
}
|
||||
if (value > max_value) {
|
||||
max_value = value;
|
||||
}
|
||||
}
|
||||
handler << "| " << ++i << " | " << dataset.c_str() << " | ";
|
||||
for (const auto& model : models) {
|
||||
double value = table[model].at(dataset).at(0).get<double>();
|
||||
double std_value = table[model].at(dataset).at(3).get<double>();
|
||||
const char* bold = value == max_value ? "**" : "";
|
||||
handler << bold << std::setprecision(4) << std::fixed << value << "±" << std::setprecision(3) << std_value << bold << " | ";
|
||||
}
|
||||
handler << std::endl;
|
||||
}
|
||||
}
|
||||
void BestResultsMd::results_footer(const std::map<std::string, std::vector<double>>& totals, const std::string& best_model)
|
||||
{
|
||||
handler << "| | **Average Score** | ";
|
||||
int nDatasets = totals.begin()->second.size();
|
||||
for (const auto& model : models) {
|
||||
double value = std::reduce(totals.at(model).begin(), totals.at(model).end()) / nDatasets;
|
||||
double std_value = compute_std(totals.at(model), value);
|
||||
const char* bold = model == best_model ? "**" : "";
|
||||
handler << bold << std::setprecision(4) << std::fixed << value << "±" << std::setprecision(3) << std::fixed << std_value << bold << " | ";
|
||||
}
|
||||
|
||||
handler.close();
|
||||
}
|
||||
void BestResultsMd::postHoc_test(std::vector<PostHocLine>& postHocResults, const std::string& kind, const std::string& date)
|
||||
{
|
||||
auto file_name = Paths::tex() + Paths::md_post_hoc();
|
||||
openMdFile(file_name);
|
||||
handler << "<!-- This file has been generated by the platform program" << std::endl;
|
||||
handler << " Date: " << date.c_str() << std::endl;
|
||||
handler << std::endl;
|
||||
handler << " Post-hoc handler test" << std::endl;
|
||||
handler << "-->" << std::endl;
|
||||
handler << "Post-hoc " << kind << " test: H<sub>0</sub>: There is no significant differences between the control model and the other models." << std::endl << std::endl;
|
||||
handler << "| classifier | pvalue | rank | win | tie | loss | H<sub>0</sub> |" << std::endl;
|
||||
handler << "| :-- | --: | --: | --:| --: | --: | :--: |" << std::endl;
|
||||
bool first = true;
|
||||
for (auto const& line : postHocResults) {
|
||||
auto textStatus = !line.reject ? "**" : " ";
|
||||
if (first) {
|
||||
handler << "| " << line.model << " | - | " << std::fixed << std::setprecision(2) << line.rank << " | - | - | - |" << std::endl;
|
||||
first = false;
|
||||
} else {
|
||||
handler << "| " << line.model << " | " << textStatus << std::scientific << std::setprecision(4) << line.pvalue << textStatus << " |";
|
||||
handler << std::fixed << std::setprecision(2) << line.rank << " | " << line.wtl.win << " | " << line.wtl.tie << " | " << line.wtl.loss << " |";
|
||||
handler << (line.reject ? "rejected" : "**accepted**") << " |" << std::endl;
|
||||
}
|
||||
}
|
||||
handler << std::endl;
|
||||
handler.close();
|
||||
}
|
||||
}
|
24
src/best/BestResultsMd.h
Normal file
@@ -0,0 +1,24 @@
|
||||
#ifndef BEST_RESULTS_MD_H
|
||||
#define BEST_RESULTS_MD_H
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include "common/Paths.h"
|
||||
#include "Statistics.h"
|
||||
namespace platform {
|
||||
using json = nlohmann::ordered_json;
|
||||
class BestResultsMd {
|
||||
public:
|
||||
BestResultsMd() = default;
|
||||
~BestResultsMd() = default;
|
||||
void results_header(const std::vector<std::string>& models, const std::string& date);
|
||||
void results_body(const std::vector<std::string>& datasets, json& table);
|
||||
void results_footer(const std::map<std::string, std::vector<double>>& totals, const std::string& best_model);
|
||||
void postHoc_test(std::vector<PostHocLine>& postHocResults, const std::string& kind, const std::string& date);
|
||||
private:
|
||||
void openMdFile(const std::string& name);
|
||||
std::ofstream handler;
|
||||
std::vector<std::string> models;
|
||||
};
|
||||
}
|
||||
#endif
|
124
src/best/BestResultsTex.cpp
Normal file
@@ -0,0 +1,124 @@
|
||||
#include <iostream>
|
||||
#include "BestResultsTex.h"
|
||||
#include "common/Utils.h" // compute_std
|
||||
|
||||
namespace platform {
|
||||
using json = nlohmann::ordered_json;
|
||||
void BestResultsTex::openTexFile(const std::string& name)
|
||||
{
|
||||
handler.open(name);
|
||||
if (!handler.is_open()) {
|
||||
std::cerr << "Error opening file " << name << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
void BestResultsTex::results_header(const std::vector<std::string>& models, const std::string& date, bool index)
|
||||
{
|
||||
this->models = models;
|
||||
auto file_name = Paths::tex() + Paths::tex_output();
|
||||
openTexFile(file_name);
|
||||
handler << "%% This file has been generated by the platform program" << std::endl;
|
||||
handler << "%% Date: " << date.c_str() << std::endl;
|
||||
handler << "%%" << std::endl;
|
||||
handler << "%% Table of results" << std::endl;
|
||||
handler << "%%" << std::endl;
|
||||
handler << "\\begin{table}[htbp] " << std::endl;
|
||||
handler << "\\centering " << std::endl;
|
||||
handler << "\\tiny " << std::endl;
|
||||
handler << "\\renewcommand{\\arraystretch }{1.2} " << std::endl;
|
||||
handler << "\\renewcommand{\\tabcolsep }{0.07cm} " << std::endl;
|
||||
auto umetric = score;
|
||||
umetric[0] = toupper(umetric[0]);
|
||||
handler << "\\caption{" << umetric << " results(mean $\\pm$ std) for all the algorithms and datasets} " << std::endl;
|
||||
handler << "\\label{tab:results_" << score << "}" << std::endl;
|
||||
std::string header_dataset_name = index ? "r" : "l";
|
||||
handler << "\\begin{tabular} {{" << header_dataset_name << std::string(models.size(), 'c').c_str() << "}}" << std::endl;
|
||||
handler << "\\hline " << std::endl;
|
||||
handler << "" << std::endl;
|
||||
for (const auto& model : models) {
|
||||
handler << "& " << model.c_str();
|
||||
}
|
||||
handler << "\\\\" << std::endl;
|
||||
handler << "\\hline" << std::endl;
|
||||
}
|
||||
void BestResultsTex::results_body(const std::vector<std::string>& datasets, json& table, bool index)
|
||||
{
|
||||
int i = 0;
|
||||
for (auto const& dataset : datasets) {
|
||||
// Find out max value for this dataset
|
||||
double max_value = 0;
|
||||
for (const auto& model : models) {
|
||||
double value;
|
||||
try {
|
||||
value = table[model].at(dataset).at(0).get<double>();
|
||||
}
|
||||
catch (const nlohmann::json::out_of_range&) {
|
||||
value = -1.0;
|
||||
}
|
||||
if (value > max_value) {
|
||||
max_value = value;
|
||||
}
|
||||
}
|
||||
if (index)
|
||||
handler << ++i << " ";
|
||||
else
|
||||
handler << dataset << " ";
|
||||
for (const auto& model : models) {
|
||||
double value = table[model].at(dataset).at(0).get<double>();
|
||||
double std_value = table[model].at(dataset).at(3).get<double>();
|
||||
const char* bold = value == max_value ? "\\bfseries" : "";
|
||||
handler << "& " << bold << std::setprecision(4) << std::fixed << value << "$\\pm$" << std::setprecision(3) << std_value;
|
||||
}
|
||||
handler << "\\\\" << std::endl;
|
||||
}
|
||||
}
|
||||
void BestResultsTex::results_footer(const std::map<std::string, std::vector<double>>& totals, const std::string& best_model)
|
||||
{
|
||||
handler << "\\hline" << std::endl;
|
||||
handler << "Average ";
|
||||
int nDatasets = totals.begin()->second.size();
|
||||
for (const auto& model : models) {
|
||||
double value = std::reduce(totals.at(model).begin(), totals.at(model).end()) / nDatasets;
|
||||
double std_value = compute_std(totals.at(model), value);
|
||||
const char* bold = model == best_model ? "\\bfseries" : "";
|
||||
handler << "& " << bold << std::setprecision(4) << std::fixed << value << "$\\pm$" << std::setprecision(3) << std::fixed << std_value;
|
||||
}
|
||||
handler << "\\\\" << std::endl;
|
||||
handler << "\\hline " << std::endl;
|
||||
handler << "\\end{tabular}" << std::endl;
|
||||
handler << "\\end{table}" << std::endl;
|
||||
handler.close();
|
||||
}
|
||||
void BestResultsTex::postHoc_test(std::vector<PostHocLine>& postHocResults, const std::string& kind, const std::string& date)
|
||||
{
|
||||
auto file_name = Paths::tex() + Paths::tex_post_hoc();
|
||||
openTexFile(file_name);
|
||||
handler << "%% This file has been generated by the platform program" << std::endl;
|
||||
handler << "%% Date: " << date.c_str() << std::endl;
|
||||
handler << "%%" << std::endl;
|
||||
handler << "%% Post-hoc " << kind << " test" << std::endl;
|
||||
handler << "%%" << std::endl;
|
||||
handler << "\\begin{table}[htbp]" << std::endl;
|
||||
handler << "\\centering" << std::endl;
|
||||
handler << "\\caption{Results of the post-hoc " << kind << " test for the mean " << score << " of the algorithms.}\\label{ tab:tests }" << std::endl;
|
||||
handler << "\\begin{tabular}{lrrrrr}" << std::endl;
|
||||
handler << "\\hline" << std::endl;
|
||||
handler << "classifier & pvalue & rank & win & tie & loss\\\\" << std::endl;
|
||||
handler << "\\hline" << std::endl;
|
||||
bool first = true;
|
||||
for (auto const& line : postHocResults) {
|
||||
auto textStatus = !line.reject ? "\\bf " : " ";
|
||||
if (first) {
|
||||
handler << line.model << " & - & " << std::fixed << std::setprecision(2) << line.rank << " & - & - & - \\\\" << std::endl;
|
||||
first = false;
|
||||
} else {
|
||||
handler << line.model << " & " << textStatus << std::scientific << std::setprecision(4) << line.pvalue << " & ";
|
||||
handler << std::fixed << std::setprecision(2) << line.rank << " & " << line.wtl.win << " & " << line.wtl.tie << " & " << line.wtl.loss << "\\\\" << std::endl;
|
||||
}
|
||||
}
|
||||
handler << "\\hline " << std::endl;
|
||||
handler << "\\end{tabular}" << std::endl;
|
||||
handler << "\\end{table}" << std::endl;
|
||||
handler.close();
|
||||
}
|
||||
}
|
26
src/best/BestResultsTex.h
Normal file
@@ -0,0 +1,26 @@
|
||||
#ifndef BEST_RESULTS_TEX_H
|
||||
#define BEST_RESULTS_TEX_H
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include "common/Paths.h"
|
||||
#include "Statistics.h"
|
||||
namespace platform {
|
||||
using json = nlohmann::ordered_json;
|
||||
class BestResultsTex {
|
||||
public:
|
||||
BestResultsTex(const std::string score, bool dataset_name = true) : score{ score }, dataset_name{ dataset_name } {};
|
||||
~BestResultsTex() = default;
|
||||
void results_header(const std::vector<std::string>& models, const std::string& date, bool index);
|
||||
void results_body(const std::vector<std::string>& datasets, json& table, bool index);
|
||||
void results_footer(const std::map<std::string, std::vector<double>>& totals, const std::string& best_model);
|
||||
void postHoc_test(std::vector<PostHocLine>& postHocResults, const std::string& kind, const std::string& date);
|
||||
private:
|
||||
std::string score;
|
||||
bool dataset_name;
|
||||
void openTexFile(const std::string& name);
|
||||
std::ofstream handler;
|
||||
std::vector<std::string> models;
|
||||
};
|
||||
}
|
||||
#endif
|
@@ -3,7 +3,7 @@
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <utility>
|
||||
#include "DotEnv.h"
|
||||
#include "common/DotEnv.h"
|
||||
namespace platform {
|
||||
class BestScore {
|
||||
public:
|
||||
@@ -24,5 +24,4 @@ namespace platform {
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,22 +1,31 @@
|
||||
#include <sstream>
|
||||
#include "Statistics.h"
|
||||
#include "Colors.h"
|
||||
#include "Symbols.h"
|
||||
#include <boost/math/distributions/chi_squared.hpp>
|
||||
#include <boost/math/distributions/normal.hpp>
|
||||
#include "CLocale.h"
|
||||
#include "common/Colors.h"
|
||||
#include "common/Symbols.h"
|
||||
#include "common/CLocale.h"
|
||||
#include "BestResultsTex.h"
|
||||
#include "BestResultsMd.h"
|
||||
#include "Statistics.h"
|
||||
#include "WilcoxonTest.hpp"
|
||||
|
||||
|
||||
namespace platform {
|
||||
|
||||
Statistics::Statistics(const std::vector<std::string>& models, const std::vector<std::string>& datasets, const json& data, double significance, bool output) :
|
||||
models(models), datasets(datasets), data(data), significance(significance), output(output)
|
||||
Statistics::Statistics(const std::string& score, const std::vector<std::string>& models, const std::vector<std::string>& datasets, const json& data, double significance, bool output) :
|
||||
score(score), models(models), datasets(datasets), data(data), significance(significance), output(output)
|
||||
{
|
||||
if (score == "accuracy") {
|
||||
postHocType = "Holm";
|
||||
hlen = 85;
|
||||
} else {
|
||||
postHocType = "Wilcoxon";
|
||||
hlen = 88;
|
||||
}
|
||||
nModels = models.size();
|
||||
nDatasets = datasets.size();
|
||||
auto temp = ConfigLocale();
|
||||
}
|
||||
|
||||
void Statistics::fit()
|
||||
{
|
||||
if (nModels < 3 || nDatasets < 3) {
|
||||
@@ -25,9 +34,11 @@ namespace platform {
|
||||
throw std::runtime_error("Can't make the Friedman test with less than 3 models and/or less than 3 datasets.");
|
||||
}
|
||||
ranksModels.clear();
|
||||
computeRanks();
|
||||
computeRanks(); // compute greaterAverage and ranks
|
||||
// Set the control model as the one with the lowest average rank
|
||||
controlIdx = distance(ranks.begin(), min_element(ranks.begin(), ranks.end(), [](const auto& l, const auto& r) { return l.second < r.second; }));
|
||||
controlIdx = score == "accuracy" ?
|
||||
distance(ranks.begin(), min_element(ranks.begin(), ranks.end(), [](const auto& l, const auto& r) { return l.second < r.second; }))
|
||||
: greaterAverage; // The model with the greater average score
|
||||
computeWTL();
|
||||
maxModelName = (*std::max_element(models.begin(), models.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
|
||||
maxDatasetName = (*std::max_element(datasets.begin(), datasets.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size();
|
||||
@@ -64,11 +75,16 @@ namespace platform {
|
||||
void Statistics::computeRanks()
|
||||
{
|
||||
std::map<std::string, float> ranksLine;
|
||||
std::map<std::string, float> averages;
|
||||
for (const auto& model : models) {
|
||||
averages[model] = 0;
|
||||
}
|
||||
for (const auto& dataset : datasets) {
|
||||
std::vector<std::pair<std::string, double>> ranksOrder;
|
||||
for (const auto& model : models) {
|
||||
double value = data[model].at(dataset).at(0).get<double>();
|
||||
ranksOrder.push_back({ model, value });
|
||||
averages[model] += value;
|
||||
}
|
||||
// Assign the ranks
|
||||
ranksLine = assignRanks(ranksOrder);
|
||||
@@ -86,10 +102,17 @@ namespace platform {
|
||||
for (const auto& rank : ranks) {
|
||||
ranks[rank.first] /= nDatasets;
|
||||
}
|
||||
// Average the scores
|
||||
for (const auto& average : averages) {
|
||||
averages[average.first] /= nDatasets;
|
||||
}
|
||||
// Get the model with the greater average score
|
||||
greaterAverage = distance(averages.begin(), max_element(averages.begin(), averages.end(), [](const auto& l, const auto& r) { return l.second < r.second; }));
|
||||
}
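Summing up what computeRanks leaves behind: for N datasets, with r_{j,d} the Friedman rank and s_{j,d} the score of model j on dataset d,

\bar{R}_j = \frac{1}{N}\sum_{d=1}^{N} r_{j,d}, \qquad \bar{s}_j = \frac{1}{N}\sum_{d=1}^{N} s_{j,d},

and greaterAverage holds \arg\max_j \bar{s}_j, the model with the highest average score.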
|
||||
void Statistics::computeWTL()
|
||||
{
|
||||
// Compute the WTL matrix
|
||||
const double practical_threshold = 0.0005;
|
||||
// Compute the WTL matrix (Win Tie Loss)
|
||||
for (int i = 0; i < nModels; ++i) {
|
||||
wtl[i] = { 0, 0, 0 };
|
||||
}
|
||||
@@ -102,23 +125,85 @@ namespace platform {
|
||||
continue;
|
||||
}
|
||||
double value = data[models[i]].at(item.key()).at(0).get<double>();
|
||||
if (value < controlValue) {
|
||||
wtl[i].win++;
|
||||
} else if (value == controlValue) {
|
||||
double diff = controlValue - value; // control − comparison
|
||||
if (std::fabs(diff) <= practical_threshold) {
|
||||
wtl[i].tie++;
|
||||
} else if (diff < 0) {
|
||||
wtl[i].win++;
|
||||
} else {
|
||||
wtl[i].loss++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
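A quick worked example of the tie rule above (scores hypothetical): with practical_threshold = 0.0005 and a control score of 0.9143, a comparison score of 0.9140 gives diff = 0.0003 and counts as a tie; 0.9150 gives diff = -0.0007 < 0, a win for the comparison model; 0.9135 gives diff = 0.0008, a loss.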
|
||||
|
||||
void Statistics::postHocHolmTest(bool friedmanResult)
|
||||
int Statistics::getControlIdx()
|
||||
{
|
||||
if (!fitted) {
|
||||
fit();
|
||||
}
|
||||
return controlIdx;
|
||||
}
|
||||
void Statistics::postHocTest()
|
||||
{
|
||||
if (score == "accuracy") {
|
||||
postHocHolmTest();
|
||||
} else {
|
||||
postHocWilcoxonTest();
|
||||
}
|
||||
}
|
||||
void Statistics::postHocWilcoxonTest()
|
||||
{
|
||||
if (!fitted) {
|
||||
fit();
|
||||
}
|
||||
// Reference: Wilcoxon, F. (1945). “Individual Comparisons by Ranking Methods”. Biometrics Bulletin, 1(6), 80-83.
|
||||
auto wilcoxon = WilcoxonTest(models, datasets, data, significance);
|
||||
controlIdx = wilcoxon.getControlIdx();
|
||||
postHocResults = wilcoxon.getPostHocResults();
|
||||
setResultsOrder();
|
||||
// Fill the ranks info
|
||||
for (const auto& item : postHocResults) {
|
||||
ranks[item.model] = item.rank;
|
||||
}
|
||||
Holm_Bonferroni();
|
||||
restoreResultsOrder();
|
||||
}
|
||||
void Statistics::Holm_Bonferroni()
|
||||
{
|
||||
// The algorithm needs the p-values sorted from lowest to highest
|
||||
// Sort the models by p-value
|
||||
std::sort(postHocResults.begin(), postHocResults.end(), [](const PostHocLine& a, const PostHocLine& b) {
|
||||
return a.pvalue < b.pvalue;
|
||||
});
|
||||
// Holm adjustment
|
||||
for (int i = 0; i < postHocResults.size(); ++i) {
|
||||
auto item = postHocResults.at(i);
|
||||
double before = i == 0 ? 0.0 : postHocResults.at(i - 1).pvalue;
|
||||
double p_value = std::min((long double)1.0, item.pvalue * (nModels - i));
|
||||
p_value = std::max(before, p_value);
|
||||
postHocResults[i].pvalue = p_value;
|
||||
}
|
||||
}
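The loop above is the standard Holm step-down adjustment. With the entries sorted ascending by raw p-value (0-based index i over the m = nModels lines), it computes

\tilde{p}_{(i)} = \max\left(\tilde{p}_{(i-1)},\ \min\left(1,\ (m - i)\,p_{(i)}\right)\right), \qquad \tilde{p}_{(-1)} := 0,

so the adjusted p-values are monotone non-decreasing and capped at 1.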
|
||||
void Statistics::setResultsOrder()
|
||||
{
|
||||
int c = 0;
|
||||
for (auto& item : postHocResults) {
|
||||
item.idx = c++;
|
||||
}
|
||||
|
||||
}
|
||||
void Statistics::restoreResultsOrder()
|
||||
{
|
||||
// Restore the order of the results
|
||||
std::sort(postHocResults.begin(), postHocResults.end(), [](const PostHocLine& a, const PostHocLine& b) {
|
||||
return a.idx < b.idx;
|
||||
});
|
||||
}
|
||||
void Statistics::postHocHolmTest()
|
||||
{
|
||||
if (!fitted) {
|
||||
fit();
|
||||
}
|
||||
std::stringstream oss;
|
||||
// Reference https://link.springer.com/article/10.1007/s44196-022-00083-8
|
||||
// Post-hoc Holm test
|
||||
// Calculate the p-value for the models paired with the control model
|
||||
@@ -126,75 +211,67 @@ namespace platform {
|
||||
boost::math::normal dist(0.0, 1.0);
|
||||
double diff = sqrt(nModels * (nModels + 1) / (6.0 * nDatasets));
|
||||
for (int i = 0; i < nModels; i++) {
|
||||
PostHocLine line;
|
||||
line.model = models[i];
|
||||
line.rank = ranks.at(models[i]);
|
||||
line.wtl = wtl.at(i);
|
||||
line.reject = false;
|
||||
if (i == controlIdx) {
|
||||
stats[i] = 0.0;
|
||||
postHocResults.push_back(line);
|
||||
continue;
|
||||
}
|
||||
double z = abs(ranks.at(models[controlIdx]) - ranks.at(models[i])) / diff;
|
||||
double p_value = (long double)2 * (1 - cdf(dist, z));
|
||||
stats[i] = p_value;
|
||||
double z = std::abs(ranks.at(models[controlIdx]) - ranks.at(models[i])) / diff;
|
||||
line.pvalue = (long double)2 * (1 - cdf(dist, z));
|
||||
line.reject = (line.pvalue < significance);
|
||||
postHocResults.push_back(line);
|
||||
}
|
||||
// Sort the models by p-value
|
||||
std::vector<std::pair<int, double>> statsOrder;
|
||||
for (const auto& stat : stats) {
|
||||
statsOrder.push_back({ stat.first, stat.second });
|
||||
}
|
||||
std::sort(statsOrder.begin(), statsOrder.end(), [](const std::pair<int, double>& a, const std::pair<int, double>& b) {
|
||||
return a.second < b.second;
|
||||
std::sort(postHocResults.begin(), postHocResults.end(), [](const PostHocLine& a, const PostHocLine& b) {
|
||||
return a.rank < b.rank;
|
||||
});
|
||||
setResultsOrder();
|
||||
Holm_Bonferroni();
|
||||
restoreResultsOrder();
|
||||
}
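For k = nModels and N = nDatasets, the statistic computed in the loop above is the usual rank comparison against the control model:

z_i = \frac{\left|\bar{R}_0 - \bar{R}_i\right|}{\sqrt{\dfrac{k(k+1)}{6N}}}, \qquad p_i = 2\bigl(1 - \Phi(z_i)\bigr),

where \bar{R}_0 is the control model's average rank and \Phi is the standard normal CDF; the resulting p_i are then passed through Holm_Bonferroni().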
|
||||
|
||||
// Holm adjustment
|
||||
for (int i = 0; i < statsOrder.size(); ++i) {
|
||||
auto item = statsOrder.at(i);
|
||||
double before = i == 0 ? 0.0 : statsOrder.at(i - 1).second;
|
||||
double p_value = std::min((double)1.0, item.second * (nModels - i));
|
||||
p_value = std::max(before, p_value);
|
||||
statsOrder[i] = { item.first, p_value };
|
||||
}
|
||||
holmResult.model = models.at(controlIdx);
|
||||
void Statistics::postHocTestReport(bool friedmanResult, bool tex)
|
||||
{
|
||||
|
||||
std::stringstream oss;
|
||||
auto color = friedmanResult ? Colors::CYAN() : Colors::YELLOW();
|
||||
oss << color;
|
||||
oss << " *************************************************************************************************************" << std::endl;
|
||||
oss << " Post-hoc Holm test: H0: 'There is no significant differences between the control model and the other models.'" << std::endl;
|
||||
oss << " " << std::string(hlen + 25, '*') << std::endl;
|
||||
oss << " Post-hoc " << postHocType << " test: H0: 'There is no significant differences between the control model and the other models.'" << std::endl;
|
||||
oss << " Control model: " << models.at(controlIdx) << std::endl;
|
||||
oss << " " << std::left << std::setw(maxModelName) << std::string("Model") << " p-value rank win tie loss Status" << std::endl;
|
||||
oss << " " << std::string(maxModelName, '=') << " ============ ========= === === ==== =============" << std::endl;
|
||||
// sort ranks from lowest to highest
|
||||
std::vector<std::pair<std::string, float>> ranksOrder;
|
||||
for (const auto& rank : ranks) {
|
||||
ranksOrder.push_back({ rank.first, rank.second });
|
||||
}
|
||||
std::sort(ranksOrder.begin(), ranksOrder.end(), [](const std::pair<std::string, float>& a, const std::pair<std::string, float>& b) {
|
||||
return a.second < b.second;
|
||||
});
|
||||
// Show the control model info.
|
||||
oss << " " << Colors::BLUE() << std::left << std::setw(maxModelName) << ranksOrder.at(0).first << " ";
|
||||
oss << std::setw(12) << " " << std::setprecision(7) << std::fixed << " " << ranksOrder.at(0).second << std::endl;
|
||||
for (const auto& item : ranksOrder) {
|
||||
auto idx = distance(models.begin(), find(models.begin(), models.end(), item.first));
|
||||
double pvalue = 0.0;
|
||||
for (const auto& stat : statsOrder) {
|
||||
if (stat.first == idx) {
|
||||
pvalue = stat.second;
|
||||
}
|
||||
}
|
||||
holmResult.holmLines.push_back({ item.first, pvalue, item.second, wtl.at(idx), pvalue < significance });
|
||||
if (item.first == models.at(controlIdx)) {
|
||||
bool first = true;
|
||||
for (const auto& item : postHocResults) {
|
||||
if (first) {
|
||||
oss << " " << Colors::BLUE() << std::left << std::setw(maxModelName) << item.model << " ";
|
||||
oss << std::setw(12) << " " << std::setprecision(7) << std::fixed << " " << item.rank << std::endl;
|
||||
first = false;
|
||||
continue;
|
||||
}
|
||||
auto pvalue = item.pvalue;
|
||||
auto colorStatus = pvalue > significance ? Colors::GREEN() : Colors::MAGENTA();
|
||||
auto status = pvalue > significance ? Symbols::check_mark : Symbols::cross;
|
||||
auto textStatus = pvalue > significance ? " accepted H0" : " rejected H0";
|
||||
oss << " " << colorStatus << std::left << std::setw(maxModelName) << item.first << " ";
|
||||
oss << std::setprecision(6) << std::scientific << pvalue << std::setprecision(7) << std::fixed << " " << item.second;
|
||||
oss << " " << std::right << std::setw(3) << wtl.at(idx).win << " " << std::setw(3) << wtl.at(idx).tie << " " << std::setw(4) << wtl.at(idx).loss;
|
||||
oss << " " << colorStatus << std::left << std::setw(maxModelName) << item.model << " ";
|
||||
oss << std::setprecision(6) << std::scientific << pvalue << std::setprecision(7) << std::fixed << " " << item.rank;
|
||||
oss << " " << std::right << std::setw(3) << item.wtl.win << " " << std::setw(3) << item.wtl.tie << " " << std::setw(4) << item.wtl.loss;
|
||||
oss << " " << status << textStatus << std::endl;
|
||||
}
|
||||
oss << color << " *************************************************************************************************************" << std::endl;
|
||||
oss << color << " " << std::string(hlen + 25, '*') << std::endl;
|
||||
oss << Colors::RESET();
|
||||
if (output) {
|
||||
std::cout << oss.str();
|
||||
}
|
||||
if (tex) {
|
||||
BestResultsTex bestResultsTex(score);
|
||||
BestResultsMd bestResultsMd;
|
||||
bestResultsTex.postHoc_test(postHocResults, postHocType, get_date() + " " + get_time());
|
||||
bestResultsMd.postHoc_test(postHocResults, postHocType, get_date() + " " + get_time());
|
||||
}
|
||||
}
|
||||
bool Statistics::friedmanTest()
|
||||
{
|
||||
@@ -205,7 +282,7 @@ namespace platform {
|
||||
// Friedman test
|
||||
// Calculate the Friedman statistic
|
||||
oss << Colors::BLUE() << std::endl;
|
||||
oss << "***************************************************************************************************************" << std::endl;
|
||||
oss << std::string(hlen, '*') << std::endl;
|
||||
oss << Colors::GREEN() << "Friedman test: H0: 'There is no significant differences between all the classifiers.'" << Colors::BLUE() << std::endl;
|
||||
double degreesOfFreedom = nModels - 1.0;
|
||||
double sumSquared = 0;
|
||||
@@ -230,23 +307,11 @@ namespace platform {
|
||||
oss << Colors::YELLOW() << "The null hypothesis H0 is accepted. Computed p-values will not be significant." << std::endl;
|
||||
result = false;
|
||||
}
|
||||
oss << Colors::BLUE() << "***************************************************************************************************************" << Colors::RESET() << std::endl;
|
||||
oss << Colors::BLUE() << std::string(hlen, '*') << Colors::RESET() << std::endl;
|
||||
if (output) {
|
||||
std::cout << oss.str();
|
||||
}
|
||||
friedmanResult = { friedmanQ, criticalValue, p_value, result };
|
||||
return result;
|
||||
}
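The elided middle of friedmanTest computes the Friedman statistic; in its common form, with k models, N datasets and average ranks \bar{R}_j,

Q = \frac{12N}{k(k+1)} \sum_{j=1}^{k} \bar{R}_j^{\,2} \; - \; 3N(k+1),

which is compared against a \chi^2 distribution with k - 1 degrees of freedom (matching degreesOfFreedom = nModels - 1 above); H0 is rejected when the corresponding p-value falls below the significance level. The exact expression hidden by the hunk may use the algebraically equivalent rank-sum form.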
|
||||
FriedmanResult& Statistics::getFriedmanResult()
|
||||
{
|
||||
return friedmanResult;
|
||||
}
|
||||
HolmResult& Statistics::getHolmResult()
|
||||
{
|
||||
return holmResult;
|
||||
}
|
||||
std::map<std::string, std::map<std::string, float>>& Statistics::getRanks()
|
||||
{
|
||||
return ranksModels;
|
||||
}
|
||||
} // namespace platform
|
@@ -5,13 +5,13 @@
|
||||
#include <map>
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
namespace platform {
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
struct WTL {
|
||||
int win;
|
||||
int tie;
|
||||
int loss;
|
||||
uint win;
|
||||
uint tie;
|
||||
uint loss;
|
||||
};
|
||||
struct FriedmanResult {
|
||||
double statistic;
|
||||
@@ -19,29 +19,36 @@ namespace platform {
|
||||
long double pvalue;
|
||||
bool reject;
|
||||
};
|
||||
struct HolmLine {
|
||||
struct PostHocLine {
|
||||
uint idx; // index in the original order of the results
|
||||
std::string model;
|
||||
long double pvalue;
|
||||
double rank;
|
||||
WTL wtl;
|
||||
bool reject;
|
||||
};
|
||||
struct HolmResult {
|
||||
std::string model;
|
||||
std::vector<HolmLine> holmLines;
|
||||
};
|
||||
|
||||
class Statistics {
|
||||
public:
|
||||
Statistics(const std::vector<std::string>& models, const std::vector<std::string>& datasets, const json& data, double significance = 0.05, bool output = true);
|
||||
Statistics(const std::string& score, const std::vector<std::string>& models, const std::vector<std::string>& datasets, const json& data, double significance = 0.05, bool output = true);
|
||||
bool friedmanTest();
|
||||
void postHocHolmTest(bool friedmanResult);
|
||||
FriedmanResult& getFriedmanResult();
|
||||
HolmResult& getHolmResult();
|
||||
std::map<std::string, std::map<std::string, float>>& getRanks();
|
||||
void postHocTest();
|
||||
void postHocTestReport(bool friedmanResult, bool tex);
|
||||
int getControlIdx();
|
||||
FriedmanResult& getFriedmanResult() { return friedmanResult; }
|
||||
std::vector<PostHocLine>& getPostHocResults() { return postHocResults; }
|
||||
std::map<std::string, std::map<std::string, float>>& getRanks() { return ranksModels; } // ranks of the models per dataset
|
||||
private:
|
||||
void fit();
|
||||
void postHocHolmTest();
|
||||
void postHocWilcoxonTest();
|
||||
void computeRanks();
|
||||
void computeWTL();
|
||||
void Holm_Bonferroni();
|
||||
void setResultsOrder(); // Set the order of the results based on the statistic analysis needed
|
||||
void restoreResultsOrder(); // Restore the order of the results after the Holm-Bonferroni adjustment
|
||||
const std::string& score;
|
||||
std::string postHocType;
|
||||
const std::vector<std::string>& models;
|
||||
const std::vector<std::string>& datasets;
|
||||
const json& data;
|
||||
@@ -51,13 +58,15 @@ namespace platform {
|
||||
int nModels = 0;
|
||||
int nDatasets = 0;
|
||||
int controlIdx = 0;
|
||||
int greaterAverage = -1; // The model with the greater average score
|
||||
std::map<int, WTL> wtl;
|
||||
std::map<std::string, float> ranks;
|
||||
int maxModelName = 0;
|
||||
int maxDatasetName = 0;
|
||||
int hlen; // length of the line
|
||||
FriedmanResult friedmanResult;
|
||||
HolmResult holmResult;
|
||||
std::vector<PostHocLine> postHocResults;
|
||||
std::map<std::string, std::map<std::string, float>> ranksModels;
|
||||
};
|
||||
}
|
||||
#endif // !STATISTICS_H
|
||||
#endif
|
245
src/best/WilcoxonTest.hpp
Normal file
@@ -0,0 +1,245 @@
|
||||
#ifndef BEST_WILCOXON_TEST_HPP
|
||||
#define BEST_WILCOXON_TEST_HPP
|
||||
// WilcoxonTest.hpp
|
||||
// Stand‑alone class for paired Wilcoxon signed‑rank post‑hoc analysis
|
||||
// ------------------------------------------------------------------
|
||||
// * Constructor takes the *already‑loaded* nlohmann::json object plus the
|
||||
// vectors of model and dataset names.
|
||||
// * Internally selects a control model (highest average AUC) and builds all
|
||||
// statistics (ranks, W/T/L counts, Wilcoxon p‑values).
|
||||
// * Public API:
|
||||
// int getControlIdx() const;
|
||||
// const std::vector<PostHocLine>& getPostHocResults() const;
|
||||
//
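A minimal usage sketch of the class, under the assumption that the JSON keeps the score at data[model][dataset][0] as the header says; the file name and the model/dataset names below are made up for illustration:

#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include "WilcoxonTest.hpp"

int main() {
    // Hypothetical inputs: the JSON is expected to expose data[model][dataset][0] as the score.
    std::ifstream f("results.json");
    auto data = nlohmann::ordered_json::parse(f);
    std::vector<std::string> models = { "ModelA", "ModelB", "ModelC" };
    std::vector<std::string> datasets = { "iris", "wine", "glass" };

    platform::WilcoxonTest test(models, datasets, data, 0.05);
    std::cout << "Control model: " << models[test.getControlIdx()] << std::endl;
    for (const auto& line : test.getPostHocResults()) {
        std::cout << line.model << "  p=" << line.pvalue
                  << (line.reject ? "  rejected H0" : "  accepted H0") << std::endl;
    }
    return 0;
}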
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
#include <limits>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include "Statistics.h"
|
||||
|
||||
namespace platform {
|
||||
class WilcoxonTest {
|
||||
public:
|
||||
WilcoxonTest(const std::vector<std::string>& models, const std::vector<std::string>& datasets,
|
||||
const json& data, double alpha = 0.05) : models_(models), datasets_(datasets), data_(data), alpha_(alpha)
|
||||
{
|
||||
buildAUCTable(); // extracts all AUCs into a dense matrix
|
||||
computeAverageAUCs(); // per‑model mean (→ control selection)
|
||||
computeAverageRanks(); // Friedman‑style ranks per model
|
||||
selectControlModel(); // sets control_idx_
|
||||
buildPostHocResult(); // fills postHocResult_
|
||||
}
|
||||
|
||||
int getControlIdx() const noexcept { return control_idx_; }
|
||||
const std::vector<PostHocLine>& getPostHocResults() const noexcept { return postHocResults_; }
|
||||
|
||||
private:
|
||||
//-------------------------------------------------- helper structs ----
|
||||
// When a value is missing we keep NaN so that ordinary arithmetic still
|
||||
// works (NaN simply propagates and we can test with std::isnan).
|
||||
using Matrix = std::vector<std::vector<double>>; // [model][dataset]
|
||||
|
||||
//------------------------------------------------- implementation ----
|
||||
void buildAUCTable()
|
||||
{
|
||||
const std::size_t M = models_.size();
|
||||
const std::size_t D = datasets_.size();
|
||||
auc_.assign(M, std::vector<double>(D, std::numeric_limits<double>::quiet_NaN()));
|
||||
|
||||
for (std::size_t i = 0; i < M; ++i) {
|
||||
const auto& model = models_[i];
|
||||
for (std::size_t j = 0; j < D; ++j) {
|
||||
const auto& ds = datasets_[j];
|
||||
try {
|
||||
auc_[i][j] = data_.at(model).at(ds).at(0).get<double>();
|
||||
}
|
||||
catch (...) {
|
||||
// leave as NaN when value missing
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void computeAverageAUCs()
|
||||
{
|
||||
const std::size_t M = models_.size();
|
||||
avg_auc_.resize(M, std::numeric_limits<double>::quiet_NaN());
|
||||
|
||||
for (std::size_t i = 0; i < M; ++i) {
|
||||
double sum = 0.0;
|
||||
std::size_t cnt = 0;
|
||||
for (double v : auc_[i]) {
|
||||
if (!std::isnan(v)) { sum += v; ++cnt; }
|
||||
}
|
||||
avg_auc_[i] = cnt ? sum / cnt : std::numeric_limits<double>::quiet_NaN();
|
||||
}
|
||||
}
|
||||
|
||||
// Average rank across datasets (1 = best).
|
||||
void computeAverageRanks()
|
||||
{
|
||||
const std::size_t M = models_.size();
|
||||
const std::size_t D = datasets_.size();
|
||||
rank_sum_.assign(M, 0.0);
|
||||
rank_cnt_.assign(M, 0);
|
||||
|
||||
const double EPS = 1e-10;
|
||||
|
||||
for (std::size_t j = 0; j < D; ++j) {
|
||||
// Collect present values for this dataset
|
||||
std::vector<std::pair<double, std::size_t>> vals; // (auc, model_idx)
|
||||
vals.reserve(M);
|
||||
for (std::size_t i = 0; i < M; ++i) {
|
||||
if (!std::isnan(auc_[i][j]))
|
||||
vals.emplace_back(auc_[i][j], i);
|
||||
}
|
||||
if (vals.empty()) continue; // no info for this dataset
|
||||
|
||||
// Sort descending (higher AUC better)
|
||||
std::sort(vals.begin(), vals.end(), [](auto a, auto b) {
|
||||
return a.first > b.first;
|
||||
});
|
||||
|
||||
// Assign ranks with average for ties
|
||||
std::size_t k = 0;
|
||||
while (k < vals.size()) {
|
||||
std::size_t l = k + 1;
|
||||
while (l < vals.size() && std::fabs(vals[l].first - vals[k].first) < EPS) ++l;
|
||||
const double avg_rank = (k + 1 + l) * 0.5; // average of ranks (1‑based)
|
||||
for (std::size_t m = k; m < l; ++m) {
|
||||
const auto idx = vals[m].second;
|
||||
rank_sum_[idx] += avg_rank;
|
||||
++rank_cnt_[idx];
|
||||
}
|
||||
k = l;
|
||||
}
|
||||
}
|
||||
|
||||
// Final average
|
||||
avg_rank_.resize(M, std::numeric_limits<double>::quiet_NaN());
|
||||
for (std::size_t i = 0; i < M; ++i) {
|
||||
avg_rank_[i] = rank_cnt_[i] ? rank_sum_[i] / rank_cnt_[i]
|
||||
: std::numeric_limits<double>::quiet_NaN();
|
||||
}
|
||||
}
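As a worked example of the tie handling above (scores hypothetical): if three models score 0.91, 0.89 and 0.89 on one dataset, the sort puts 0.91 first with rank 1, and the tied pair occupies positions 2 and 3, so each gets the average rank (2 + 3) / 2 = 2.5 — exactly avg_rank = (k + 1 + l) * 0.5 with k = 1 and l = 3 in the loop's 0-based indices.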
|
||||
|
||||
void selectControlModel()
|
||||
{
|
||||
// pick model with highest average AUC (ties → first)
|
||||
control_idx_ = 0;
|
||||
for (std::size_t i = 1; i < avg_auc_.size(); ++i) {
|
||||
if (avg_auc_[i] > avg_auc_[control_idx_]) control_idx_ = static_cast<int>(i);
|
||||
}
|
||||
}
|
||||
|
||||
void buildPostHocResult()
|
||||
{
|
||||
const std::size_t M = models_.size();
|
||||
const std::size_t D = datasets_.size();
|
||||
const std::string& control_name = models_[control_idx_];
|
||||
|
||||
const double practical_threshold = 0.0005; // same heuristic as original code
|
||||
|
||||
for (std::size_t i = 0; i < M; ++i) {
|
||||
PostHocLine line;
|
||||
line.model = models_[i];
|
||||
line.rank = avg_auc_[i];
|
||||
|
||||
WTL wtl = { 0, 0, 0 }; // win, tie, loss
|
||||
std::vector<double> differences;
|
||||
differences.reserve(D);
|
||||
|
||||
for (std::size_t j = 0; j < D; ++j) {
|
||||
double auc_control = auc_[control_idx_][j];
|
||||
double auc_other = auc_[i][j];
|
||||
if (std::isnan(auc_control) || std::isnan(auc_other)) continue;
|
||||
|
||||
double diff = auc_control - auc_other; // control − comparison
|
||||
if (std::fabs(diff) <= practical_threshold) {
|
||||
++wtl.tie;
|
||||
} else if (diff < 0) {
|
||||
++wtl.win; // comparison wins
|
||||
} else {
|
||||
++wtl.loss; // control wins
|
||||
}
|
||||
differences.push_back(diff);
|
||||
}
|
||||
|
||||
line.wtl = wtl;
|
||||
line.pvalue = differences.empty() ? 1.0L : static_cast<long double>(wilcoxonSignedRankTest(differences));
|
||||
line.reject = (line.pvalue < alpha_);
|
||||
|
||||
postHocResults_.push_back(std::move(line));
|
||||
}
|
||||
// Sort results by rank (descending)
|
||||
std::sort(postHocResults_.begin(), postHocResults_.end(), [](const PostHocLine& a, const PostHocLine& b) {
|
||||
return a.rank > b.rank;
|
||||
});
|
||||
}
|
||||
|
||||
// ------------------------------------------------ Wilcoxon (private) --
|
||||
static double wilcoxonSignedRankTest(const std::vector<double>& diffs)
|
||||
{
|
||||
if (diffs.empty()) return 1.0;
|
||||
|
||||
// Build |diff| + sign vector (exclude zeros)
|
||||
struct Node { double absval; int sign; };
|
||||
std::vector<Node> v;
|
||||
v.reserve(diffs.size());
|
||||
for (double d : diffs) {
|
||||
if (d != 0.0) v.push_back({ std::fabs(d), d > 0 ? 1 : -1 });
|
||||
}
|
||||
if (v.empty()) return 1.0;
|
||||
|
||||
// Sort by absolute value
|
||||
std::sort(v.begin(), v.end(), [](const Node& a, const Node& b) { return a.absval < b.absval; });
|
||||
|
||||
const double EPS = 1e-10;
|
||||
const std::size_t n = v.size();
|
||||
std::vector<double> ranks(n, 0.0);
|
||||
|
||||
std::size_t i = 0;
|
||||
while (i < n) {
|
||||
std::size_t j = i + 1;
|
||||
while (j < n && std::fabs(v[j].absval - v[i].absval) < EPS) ++j;
|
||||
double avg_rank = (i + 1 + j) * 0.5; // 1‑based ranks
|
||||
for (std::size_t k = i; k < j; ++k) ranks[k] = avg_rank;
|
||||
i = j;
|
||||
}
|
||||
|
||||
double w_plus = 0.0, w_minus = 0.0;
|
||||
for (std::size_t k = 0; k < n; ++k) {
|
||||
if (v[k].sign > 0) w_plus += ranks[k];
|
||||
else w_minus += ranks[k];
|
||||
}
|
||||
double w = std::min(w_plus, w_minus);
|
||||
double mean_w = n * (n + 1) / 4.0;
|
||||
double sd_w = std::sqrt(n * (n + 1) * (2 * n + 1) / 24.0);
|
||||
if (sd_w == 0.0) return 1.0; // degenerate (all diffs identical)
|
||||
|
||||
double z = (w - mean_w) / sd_w;
|
||||
double p_two = std::erfc(std::fabs(z) / std::sqrt(2.0)); // 2‑sided tail
|
||||
return p_two;
|
||||
}
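Written out, the normal approximation used above is

W = \min(W^{+}, W^{-}), \qquad \mu_W = \frac{n(n+1)}{4}, \qquad \sigma_W = \sqrt{\frac{n(n+1)(2n+1)}{24}},

z = \frac{W - \mu_W}{\sigma_W}, \qquad p \approx \operatorname{erfc}\!\left(\frac{|z|}{\sqrt{2}}\right) = 2\bigl(1 - \Phi(|z|)\bigr),

with n the number of non-zero differences; ties receive average ranks and no continuity or tie-variance correction is applied, so p-values for very small n are only approximate.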
|
||||
|
||||
//-------------------------------------------------------- data ----
|
||||
std::vector<std::string> models_;
|
||||
std::vector<std::string> datasets_;
|
||||
json data_;
|
||||
double alpha_;
|
||||
|
||||
Matrix auc_; // [model][dataset]
|
||||
std::vector<double> avg_auc_; // mean AUC per model
|
||||
std::vector<double> avg_rank_; // mean rank per model
|
||||
std::vector<double> rank_sum_; // helper for ranks
|
||||
std::vector<int> rank_cnt_; // datasets counted per model
|
||||
|
||||
int control_idx_ = -1;
|
||||
std::vector<PostHocLine> postHocResults_;
|
||||
};
|
||||
|
||||
} // namespace platform
|
||||
#endif // BEST_WILCOXON_TEST_HPP
|
@@ -1,16 +1,25 @@
|
||||
#include <iostream>
|
||||
#include <argparse/argparse.hpp>
|
||||
#include "Paths.h"
|
||||
#include "BestResults.h"
|
||||
#include "Colors.h"
|
||||
#include "config.h"
|
||||
#include "main/Models.h"
|
||||
#include "main/modelRegister.h"
|
||||
#include "common/Paths.h"
|
||||
#include "common/Colors.h"
|
||||
#include "common/Utils.h"
|
||||
#include "best/BestResults.h"
|
||||
#include "common/DotEnv.h"
|
||||
#include "config_platform.h"
|
||||
|
||||
void manageArguments(argparse::ArgumentParser& program)
|
||||
{
|
||||
program.add_argument("-m", "--model").default_value("").help("Filter results of the selected model) (any for all models)");
|
||||
program.add_argument("-s", "--score").default_value("accuracy").help("Filter results of the score name supplied");
|
||||
auto env = platform::DotEnv();
|
||||
program.add_argument("-m", "--model").help("Model to use or any").default_value("any");
|
||||
program.add_argument("--folder").help("Results folder to use").default_value(platform::Paths::results());
|
||||
program.add_argument("-d", "--dataset").default_value("any").help("Filter results of the selected model) (any for all datasets)");
|
||||
program.add_argument("-s", "--score").default_value(env.get("score")).help("Filter results of the score name supplied");
|
||||
program.add_argument("--friedman").help("Friedman test").default_value(false).implicit_value(true);
|
||||
program.add_argument("--excel").help("Output to excel").default_value(false).implicit_value(true);
|
||||
program.add_argument("--tex").help("Output results to TeX & Markdown files").default_value(false).implicit_value(true);
|
||||
program.add_argument("--index").help("In tex output show the index of the dataset instead of the name to save space").default_value(false).implicit_value(true);
|
||||
program.add_argument("--level").help("significance level").default_value(0.05).scan<'g', double>().action([](const std::string& value) {
|
||||
try {
|
||||
auto k = std::stod(value);
|
||||
@@ -29,23 +38,30 @@ void manageArguments(argparse::ArgumentParser& program)
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
argparse::ArgumentParser program("b_best", { project_version.begin(), project_version.end() });
|
||||
argparse::ArgumentParser program("b_best", { platform_project_version.begin(), platform_project_version.end() });
|
||||
manageArguments(program);
|
||||
std::string model, score;
|
||||
bool build, report, friedman, excel;
|
||||
std::string model, dataset, score, folder;
|
||||
bool build, report, friedman, excel, tex, index;
|
||||
double level;
|
||||
try {
|
||||
program.parse_args(argc, argv);
|
||||
model = program.get<std::string>("model");
|
||||
folder = program.get<std::string>("folder");
|
||||
if (folder.back() != '/') {
|
||||
folder += '/';
|
||||
}
|
||||
dataset = program.get<std::string>("dataset");
|
||||
score = program.get<std::string>("score");
|
||||
friedman = program.get<bool>("friedman");
|
||||
excel = program.get<bool>("excel");
|
||||
tex = program.get<bool>("tex");
|
||||
index = program.get<bool>("index");
|
||||
level = program.get<double>("level");
|
||||
if (model == "" || score == "") {
|
||||
throw std::runtime_error("Model and score name must be supplied");
|
||||
}
|
||||
if (friedman && model != "any") {
|
||||
std::cerr << "Friedman test can only be used with all models" << std::endl;
|
||||
if (friedman && (model != "any" || dataset != "any")) {
|
||||
std::cerr << "Friedman test can only be used with all models and all the datasets" << std::endl;
|
||||
std::cerr << program;
|
||||
exit(1);
|
||||
}
|
||||
@@ -56,15 +72,20 @@ int main(int argc, char** argv)
|
||||
exit(1);
|
||||
}
|
||||
// Generate report
|
||||
auto results = platform::BestResults(platform::Paths::results(), score, model, friedman, level);
|
||||
auto results = platform::BestResults(folder, score, model, dataset, friedman, level);
|
||||
if (model == "any") {
|
||||
results.buildAll();
|
||||
results.reportAll(excel);
|
||||
results.reportAll(excel, tex, index);
|
||||
} else {
|
||||
std::string fileName = results.build();
|
||||
std::cout << Colors::GREEN() << fileName << " created!" << Colors::RESET() << std::endl;
|
||||
results.reportSingle(excel);
|
||||
}
|
||||
if (excel) {
|
||||
auto fileName = results.getExcelFileName();
|
||||
std::cout << "Opening " << fileName << std::endl;
|
||||
platform::openFile(fileName);
|
||||
}
|
||||
std::cout << Colors::RESET();
|
||||
return 0;
|
||||
}
|
@@ -1,45 +1,50 @@
|
||||
#include <iostream>
|
||||
#include <argparse/argparse.hpp>
|
||||
#include <map>
|
||||
#include <tuple>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <mpi.h>
|
||||
#include "DotEnv.h"
|
||||
#include "Models.h"
|
||||
#include "modelRegister.h"
|
||||
#include "GridSearch.h"
|
||||
#include "Paths.h"
|
||||
#include "Timer.h"
|
||||
#include "Colors.h"
|
||||
#include "config.h"
|
||||
#include "main/Models.h"
|
||||
#include "main/ArgumentsExperiment.h"
|
||||
#include "common/Paths.h"
|
||||
#include "common/Timer.hpp"
|
||||
#include "common/Colors.h"
|
||||
#include "common/DotEnv.h"
|
||||
#include "grid/GridSearch.h"
|
||||
#include "grid/GridExperiment.h"
|
||||
#include "config_platform.h"
|
||||
|
||||
using json = nlohmann::json;
|
||||
using json = nlohmann::ordered_json;
|
||||
const int MAXL = 133;
|
||||
|
||||
void assignModel(argparse::ArgumentParser& parser)
|
||||
{
|
||||
auto models = platform::Models::instance();
|
||||
parser.add_argument("-m", "--model")
|
||||
.help("Model to use " + models->tostring())
|
||||
.help("Model to use " + models->toString())
|
||||
.required()
|
||||
.action([models](const std::string& value) {
|
||||
static const std::vector<std::string> choices = models->getNames();
|
||||
if (find(choices.begin(), choices.end(), value) != choices.end()) {
|
||||
return value;
|
||||
}
|
||||
throw std::runtime_error("Model must be one of " + models->tostring());
|
||||
throw std::runtime_error("Model must be one of " + models->toString());
|
||||
}
|
||||
);
|
||||
);
|
||||
}
|
||||
void add_compute_args(argparse::ArgumentParser& program)
|
||||
void add_search_args(argparse::ArgumentParser& program)
|
||||
{
|
||||
auto env = platform::DotEnv();
|
||||
program.add_argument("--discretize").help("Discretize input datasets").default_value((bool)stoi(env.get("discretize"))).implicit_value(true);
|
||||
program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value((bool)stoi(env.get("stratified"))).implicit_value(true);
|
||||
program.add_argument("--quiet").help("Don't display detailed progress").default_value(false).implicit_value(true);
|
||||
program.add_argument("--continue").help("Continue computing from that dataset").default_value(platform::GridSearch::NO_CONTINUE());
|
||||
program.add_argument("--only").help("Used with continue to compute that dataset only").default_value(false).implicit_value(true);
|
||||
program.add_argument("--only").help("Used with continue to search with that dataset only").default_value(false).implicit_value(true);
|
||||
program.add_argument("--exclude").default_value("[]").help("Datasets to exclude in json format, e.g. [\"dataset1\", \"dataset2\"]");
|
||||
auto valid_choices = env.valid_tokens("smooth_strat");
|
||||
auto& smooth_arg = program.add_argument("--smooth-strat").help("Smooth strategy used in Bayes Network node initialization. Valid values: " + env.valid_values("smooth_strat")).default_value(env.get("smooth_strat"));
|
||||
for (auto choice : valid_choices) {
|
||||
smooth_arg.choices(choice);
|
||||
}
|
||||
program.add_argument("--nested").help("Set the double/nested cross validation number of folds").default_value(5).scan<'i', int>().action([](const std::string& value) {
|
||||
try {
|
||||
auto k = stoi(value);
|
||||
@@ -54,23 +59,23 @@ void add_compute_args(argparse::ArgumentParser& program)
|
||||
catch (...) {
|
||||
throw std::runtime_error("Number of nested folds must be an integer");
|
||||
}});
|
||||
program.add_argument("--score").help("Score used in gridsearch").default_value("accuracy");
|
||||
program.add_argument("-f", "--folds").help("Number of folds").default_value(stoi(env.get("n_folds"))).scan<'i', int>().action([](const std::string& value) {
|
||||
try {
|
||||
auto k = stoi(value);
|
||||
if (k < 2) {
|
||||
throw std::runtime_error("Number of folds must be greater than 1");
|
||||
program.add_argument("--score").help("Score used in gridsearch").default_value("accuracy");
|
||||
program.add_argument("-f", "--folds").help("Number of folds").default_value(stoi(env.get("n_folds"))).scan<'i', int>().action([](const std::string& value) {
|
||||
try {
|
||||
auto k = stoi(value);
|
||||
if (k < 2) {
|
||||
throw std::runtime_error("Number of folds must be greater than 1");
|
||||
}
|
||||
return k;
|
||||
}
|
||||
return k;
|
||||
}
|
||||
catch (const runtime_error& err) {
|
||||
throw std::runtime_error(err.what());
|
||||
}
|
||||
catch (...) {
|
||||
throw std::runtime_error("Number of folds must be an integer");
|
||||
}});
|
||||
auto seed_values = env.getSeeds();
|
||||
program.add_argument("-s", "--seeds").nargs(1, 10).help("Random seeds. Set to -1 to have pseudo random").scan<'i', int>().default_value(seed_values);
|
||||
catch (const runtime_error& err) {
|
||||
throw std::runtime_error(err.what());
|
||||
}
|
||||
catch (...) {
|
||||
throw std::runtime_error("Number of folds must be an integer");
|
||||
}});
|
||||
auto seed_values = env.getSeeds();
|
||||
program.add_argument("-s", "--seeds").nargs(1, 10).help("Random seeds. Set to -1 to have pseudo random").scan<'i', int>().default_value(seed_values);
|
||||
}
|
||||
std::string headerLine(const std::string& text, int utf = 0)
|
||||
{
|
||||
@@ -93,21 +98,27 @@ void list_dump(std::string& model)
|
||||
if (item.first.size() > max_dataset) {
|
||||
max_dataset = item.first.size();
|
||||
}
|
||||
if (item.second.dump().size() > max_hyper) {
|
||||
max_hyper = item.second.dump().size();
|
||||
for (auto const& [key, value] : item.second.items()) {
|
||||
if (value.dump().size() > max_hyper) {
|
||||
max_hyper = value.dump().size();
|
||||
}
|
||||
}
|
||||
}
|
||||
std::cout << Colors::GREEN() << left << " # " << left << setw(max_dataset) << "Dataset" << " #Com. "
|
||||
<< setw(max_hyper) << "Hyperparameters" << std::endl;
|
||||
std::cout << "=== " << string(max_dataset, '=') << " ===== " << string(max_hyper, '=') << std::endl;
|
||||
bool odd = true;
|
||||
int i = 0;
|
||||
for (auto const& item : combinations) {
|
||||
auto color = odd ? Colors::CYAN() : Colors::BLUE();
|
||||
auto color = (i++ % 2) ? Colors::CYAN() : Colors::BLUE();
|
||||
std::cout << color;
|
||||
auto num_combinations = data.getNumCombinations(item.first);
|
||||
std::cout << setw(3) << fixed << right << ++index << left << " " << setw(max_dataset) << item.first
|
||||
<< " " << setw(5) << right << num_combinations << " " << setw(max_hyper) << left << item.second.dump() << std::endl;
|
||||
odd = !odd;
|
||||
<< " " << setw(5) << right << num_combinations << " ";
|
||||
std::string prefix = "";
|
||||
for (auto const& [key, value] : item.second.items()) {
|
||||
std::cout << prefix << setw(max_hyper) << std::left << value.dump() << std::endl;
|
||||
prefix = string(11 + max_dataset, ' ');
|
||||
}
|
||||
}
|
||||
std::cout << Colors::RESET() << std::endl;
|
||||
}
|
||||
@@ -127,7 +138,8 @@ void list_results(json& results, std::string& model)
|
||||
std::cout << std::string(MAXL, '*') << std::endl;
|
||||
int spaces = 7;
|
||||
int hyperparameters_spaces = 15;
|
||||
for (const auto& item : results["results"].items()) {
|
||||
nlohmann::json temp = results["results"]; // To show in alphabetical order of the dataset
|
||||
for (const auto& item : temp.items()) {
|
||||
auto key = item.key();
|
||||
auto value = item.value();
|
||||
if (key.size() > spaces) {
|
||||
@@ -141,17 +153,15 @@ void list_results(json& results, std::string& model)
|
||||
<< "Duration " << setw(8) << "Score" << " " << "Hyperparameters" << std::endl;
|
||||
std::cout << "=== " << string(spaces, '=') << " " << string(19, '=') << " " << string(8, '=') << " "
|
||||
<< string(8, '=') << " " << string(hyperparameters_spaces, '=') << std::endl;
|
||||
bool odd = true;
|
||||
int index = 0;
|
||||
for (const auto& item : results["results"].items()) {
|
||||
auto color = odd ? Colors::CYAN() : Colors::BLUE();
|
||||
for (const auto& item : temp.items()) {
|
||||
auto color = (index % 2) ? Colors::CYAN() : Colors::BLUE();
|
||||
auto value = item.value();
|
||||
std::cout << color;
|
||||
std::cout << std::setw(3) << std::right << index++ << " ";
|
||||
std::cout << left << setw(spaces) << item.key() << " " << value["date"].get<string>()
|
||||
<< " " << setw(8) << right << value["duration"].get<string>() << " " << setw(8) << setprecision(6)
|
||||
<< fixed << right << value["score"].get<double>() << " " << value["hyperparameters"].dump() << std::endl;
|
||||
odd = !odd;
|
||||
}
|
||||
std::cout << Colors::RESET() << std::endl;
|
||||
}
|
||||
@@ -177,13 +187,14 @@ void report(argparse::ArgumentParser& program)
|
||||
list_results(results, config.model);
|
||||
}
|
||||
}
|
||||
void compute(argparse::ArgumentParser& program)
|
||||
void search(argparse::ArgumentParser& program)
|
||||
{
|
||||
struct platform::ConfigGrid config;
|
||||
config.model = program.get<std::string>("model");
|
||||
config.score = program.get<std::string>("score");
|
||||
config.discretize = program.get<bool>("discretize");
|
||||
config.stratified = program.get<bool>("stratified");
|
||||
config.smooth_strategy = program.get<std::string>("smooth-strat");
|
||||
config.n_folds = program.get<int>("folds");
|
||||
config.quiet = program.get<bool>("quiet");
|
||||
config.only = program.get<bool>("only");
|
||||
@@ -195,9 +206,6 @@ void compute(argparse::ArgumentParser& program)
|
||||
}
|
||||
auto excluded = program.get<std::string>("exclude");
|
||||
config.excluded = json::parse(excluded);
|
||||
|
||||
auto env = platform::DotEnv();
|
||||
config.platform = env.get("platform");
|
||||
platform::Paths::createPath(platform::Paths::grid());
|
||||
auto grid_search = platform::GridSearch(config);
|
||||
platform::Timer timer;
|
||||
@@ -208,22 +216,54 @@ void compute(argparse::ArgumentParser& program)
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_config.rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &mpi_config.n_procs);
|
||||
if (mpi_config.n_procs < 2) {
|
||||
throw std::runtime_error("Cannot use --compute with less than 2 mpi processes, try mpirun -np 2 ...");
|
||||
throw std::runtime_error("Cannot use --search with less than 2 mpi processes, try mpirun -np 2 ...");
|
||||
}
|
||||
grid_search.go(mpi_config);
|
||||
if (mpi_config.rank == mpi_config.manager) {
|
||||
auto results = grid_search.loadResults();
|
||||
std::cout << Colors::RESET() << "* Report of the computed hyperparameters" << std::endl;
|
||||
list_results(results, config.model);
|
||||
std::cout << "Process took " << timer.getDurationString() << std::endl;
|
||||
}
|
||||
MPI_Finalize();
|
||||
}
|
||||
void experiment(argparse::ArgumentParser& program)
|
||||
{
|
||||
struct platform::ConfigGrid config;
|
||||
auto arguments = platform::ArgumentsExperiment(program, platform::experiment_t::GRID);
|
||||
arguments.parse();
|
||||
auto path_results = arguments.getPathResults();
|
||||
auto grid_experiment = platform::GridExperiment(arguments, config);
|
||||
platform::Timer timer;
|
||||
timer.start();
|
||||
struct platform::ConfigMPI mpi_config;
|
||||
mpi_config.manager = 0; // which process is the manager
|
||||
MPI_Init(nullptr, nullptr);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_config.rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &mpi_config.n_procs);
|
||||
if (mpi_config.n_procs < 2) {
|
||||
throw std::runtime_error("Cannot use --experiment with less than 2 mpi processes, try mpirun -np 2 ...");
|
||||
}
|
||||
grid_experiment.go(mpi_config);
|
||||
if (mpi_config.rank == mpi_config.manager) {
|
||||
auto experiment = grid_experiment.getExperiment();
|
||||
std::cout << "* Report of the computed hyperparameters" << std::endl;
|
||||
auto duration = timer.getDuration();
|
||||
experiment.setDuration(duration);
|
||||
if (grid_experiment.haveToSaveResults()) {
|
||||
experiment.saveResult(path_results);
|
||||
}
|
||||
experiment.report();
|
||||
std::cout << "Process took " << duration << std::endl;
|
||||
}
|
||||
MPI_Finalize();
|
||||
}
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
//
|
||||
// Manage arguments
|
||||
//
|
||||
argparse::ArgumentParser program("b_grid", { project_version.begin(), project_version.end() });
|
||||
argparse::ArgumentParser program("b_grid", { platform_project_version.begin(), platform_project_version.end() });
|
||||
// grid dump subparser
|
||||
argparse::ArgumentParser dump_command("dump");
|
||||
dump_command.add_description("Dump the combinations of hyperparameters of a model.");
|
||||
@@ -234,15 +274,21 @@ int main(int argc, char** argv)
|
||||
assignModel(report_command);
|
||||
report_command.add_description("Report the computed hyperparameters of a model.");
|
||||
|
||||
// grid compute subparser
|
||||
argparse::ArgumentParser compute_command("compute");
|
||||
compute_command.add_description("Compute using mpi the hyperparameters of a model.");
|
||||
assignModel(compute_command);
|
||||
add_compute_args(compute_command);
|
||||
// grid search subparser
|
||||
argparse::ArgumentParser search_command("search");
|
||||
search_command.add_description("Search using mpi the hyperparameters of a model.");
|
||||
assignModel(search_command);
|
||||
add_search_args(search_command);
|
||||
|
||||
// grid experiment subparser
|
||||
argparse::ArgumentParser experiment_command("experiment");
|
||||
experiment_command.add_description("Experiment like b_main using mpi.");
|
||||
auto arguments = platform::ArgumentsExperiment(experiment_command, platform::experiment_t::GRID);
|
||||
arguments.add_arguments();
|
||||
program.add_subparser(dump_command);
|
||||
program.add_subparser(report_command);
|
||||
program.add_subparser(compute_command);
|
||||
program.add_subparser(search_command);
|
||||
program.add_subparser(experiment_command);
|
||||
|
||||
//
|
||||
// Process options
|
||||
@@ -250,7 +296,7 @@ int main(int argc, char** argv)
|
||||
try {
|
||||
program.parse_args(argc, argv);
|
||||
bool found = false;
|
||||
map<std::string, void(*)(argparse::ArgumentParser&)> commands = { {"dump", &dump}, {"report", &report}, {"compute", &compute} };
|
||||
map<std::string, void(*)(argparse::ArgumentParser&)> commands = { {"dump", &dump}, {"report", &report}, {"search", &search}, { "experiment",&experiment } };
|
||||
for (const auto& command : commands) {
|
||||
if (program.is_subcommand_used(command.first)) {
|
||||
std::invoke(command.second, program.at<argparse::ArgumentParser>(command.first));
|
||||
@@ -259,7 +305,7 @@ int main(int argc, char** argv)
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
throw std::runtime_error("You must specify one of the following commands: dump, report, compute, export\n");
|
||||
throw std::runtime_error("You must specify one of the following commands: dump, experiment, report, search \n");
|
||||
}
|
||||
}
|
||||
catch (const exception& err) {
|
119
src/commands/b_list.cpp
Normal file
@@ -0,0 +1,119 @@
|
||||
#include <iostream>
|
||||
#include <locale>
|
||||
#include <map>
|
||||
#include <argparse/argparse.hpp>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include "main/Models.h"
|
||||
#include "main/modelRegister.h"
|
||||
#include "common/Paths.h"
|
||||
#include "common/Colors.h"
|
||||
#include "common/Datasets.h"
|
||||
#include "common/Utils.h"
|
||||
#include "reports/DatasetsExcel.h"
|
||||
#include "reports/DatasetsConsole.h"
|
||||
#include "results/ResultsDatasetConsole.h"
|
||||
#include "results/ResultsDataset.h"
|
||||
#include "results/ResultsDatasetExcel.h"
|
||||
#include "config_platform.h"
|
||||
|
||||
|
||||
void list_datasets(argparse::ArgumentParser& program)
|
||||
{
|
||||
auto excel = program.get<bool>("excel");
|
||||
auto report = platform::DatasetsConsole();
|
||||
report.report();
|
||||
std::cout << report.getOutput();
|
||||
if (excel) {
|
||||
auto data = report.getData();
|
||||
auto ereport = new platform::DatasetsExcel();
|
||||
ereport->report(data);
|
||||
std::cout << std::endl << Colors::GREEN() << "Output saved in " << ereport->getFileName() << std::endl;
|
||||
auto fileName = ereport->getExcelFileName();
|
||||
delete ereport;
|
||||
std::cout << "Opening " << fileName << std::endl;
|
||||
platform::openFile(fileName);
|
||||
}
|
||||
}
|
||||
|
||||
void list_results(argparse::ArgumentParser& program)
|
||||
{
|
||||
auto dataset = program.get<string>("dataset");
|
||||
auto score = program.get<string>("score");
|
||||
auto model = program.get<string>("model");
|
||||
auto excel = program.get<bool>("excel");
|
||||
auto report = platform::ResultsDatasetsConsole();
|
||||
if (!report.report(dataset, score, model))
|
||||
return;
|
||||
std::cout << report.getOutput();
|
||||
if (excel) {
|
||||
auto data = report.getData();
|
||||
auto ereport = new platform::ResultsDatasetExcel();
|
||||
ereport->report(data);
|
||||
std::cout << std::endl << Colors::GREEN() << "Output saved in " << ereport->getFileName() << std::endl;
|
||||
auto fileName = ereport->getExcelFileName();
|
||||
delete ereport;
|
||||
std::cout << "Opening " << fileName << std::endl;
|
||||
platform::openFile(fileName);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
argparse::ArgumentParser program("b_list", { platform_project_version.begin(), platform_project_version.end() });
|
||||
//
|
||||
// datasets subparser
|
||||
//
|
||||
argparse::ArgumentParser datasets_command("datasets");
|
||||
datasets_command.add_description("List datasets available in the platform.");
|
||||
datasets_command.add_argument("--excel").help("Output in Excel format").default_value(false).implicit_value(true);
|
||||
//
|
||||
// results subparser
|
||||
//
|
||||
argparse::ArgumentParser results_command("results");
|
||||
results_command.add_description("List the results of a given dataset.");
|
||||
auto datasets = platform::Datasets(false, platform::Paths::datasets());
|
||||
results_command.add_argument("-d", "--dataset")
|
||||
.help("Dataset to use " + datasets.toString())
|
||||
.required()
|
||||
.action([](const std::string& value) {
|
||||
auto datasets = platform::Datasets(false, platform::Paths::datasets());
|
||||
static const std::vector<std::string> choices = datasets.getNames();
|
||||
if (find(choices.begin(), choices.end(), value) != choices.end()) {
|
||||
return value;
|
||||
}
|
||||
throw std::runtime_error("Dataset must be one of " + datasets.toString());
|
||||
}
|
||||
);
|
||||
results_command.add_argument("-m", "--model")
|
||||
.help("Model to use or any")
|
||||
.default_value("any");
|
||||
results_command.add_argument("--excel").help("Output in Excel format").default_value(false).implicit_value(true);
|
||||
results_command.add_argument("-s", "--score").default_value("accuracy").help("Filter results of the score name supplied");
|
||||
|
||||
// Add subparsers
|
||||
program.add_subparser(datasets_command);
|
||||
program.add_subparser(results_command);
|
||||
// Parse command line and execute
|
||||
try {
|
||||
program.parse_args(argc, argv);
|
||||
bool found = false;
|
||||
map<std::string, void(*)(argparse::ArgumentParser&)> commands = { {"datasets", &list_datasets}, {"results", &list_results} };
|
||||
for (const auto& command : commands) {
|
||||
if (program.is_subcommand_used(command.first)) {
|
||||
std::invoke(command.second, program.at<argparse::ArgumentParser>(command.first));
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
throw std::runtime_error("You must specify one of the following commands: {datasets, results}\n");
|
||||
}
|
||||
}
|
||||
catch (const exception& err) {
|
||||
cerr << err.what() << std::endl;
|
||||
cerr << program;
|
||||
exit(1);
|
||||
}
|
||||
std::cout << Colors::RESET() << std::endl;
|
||||
return 0;
|
||||
}
|
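Adding another subcommand to b_list would follow the same pattern as above; a hypothetical sketch (the "models" parser and the list_models handler are illustrative, not part of the codebase):

// Hypothetical extension: a third subcommand wired into the same dispatch map.
argparse::ArgumentParser models_command("models");
models_command.add_description("List the models registered in the platform.");
program.add_subparser(models_command);
map<std::string, void(*)(argparse::ArgumentParser&)> commands = {
    {"datasets", &list_datasets}, {"results", &list_results}, {"models", &list_models} };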
37
src/commands/b_main.cpp
Normal file
@@ -0,0 +1,37 @@
|
||||
#include <argparse/argparse.hpp>
|
||||
#include "main/Experiment.h"
|
||||
#include "main/ArgumentsExperiment.h"
|
||||
#include "config_platform.h"
|
||||
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
argparse::ArgumentParser program("b_main", { platform_project_version.begin(), platform_project_version.end() });
|
||||
auto arguments = platform::ArgumentsExperiment(program, platform::experiment_t::NORMAL);
|
||||
arguments.add_arguments();
|
||||
arguments.parse_args(argc, argv);
|
||||
/*
|
||||
* Begin Processing
|
||||
*/
|
||||
// Initialize the experiment class with the command line arguments
|
||||
auto experiment = arguments.initializedExperiment();
|
||||
auto path_results = arguments.getPathResults();
|
||||
platform::Timer timer;
|
||||
timer.start();
|
||||
experiment.go();
|
||||
experiment.setDuration(timer.getDuration());
|
||||
if (!arguments.isQuiet()) {
|
||||
// Classification report if only one dataset is tested
|
||||
experiment.report();
|
||||
}
|
||||
if (arguments.haveToSaveResults()) {
|
||||
experiment.saveResult(path_results);
|
||||
}
|
||||
if (arguments.doGraph()) {
|
||||
experiment.saveGraph();
|
||||
}
|
||||
return 0;
|
||||
}
|
85
src/commands/b_manage.cpp
Normal file
@@ -0,0 +1,85 @@
|
||||
|
||||
#include <utility>
|
||||
#include <iostream>
|
||||
#include <sys/ioctl.h>
|
||||
#include "common/Paths.h"
|
||||
#include <argparse/argparse.hpp>
|
||||
#include "manage/ManageScreen.h"
|
||||
#include <signal.h>
|
||||
#include "config_platform.h"
|
||||
|
||||
platform::ManageScreen* manager = nullptr;
|
||||
|
||||
void manageArguments(argparse::ArgumentParser& program, int argc, char** argv)
|
||||
{
|
||||
program.add_argument("-m", "--model").default_value("any").help("Filter results of the selected model)");
|
||||
program.add_argument("-s", "--score").default_value("any").help("Filter results of the score name supplied");
|
||||
program.add_argument("--folder").help("Results folder to use").default_value(platform::Paths::results());
|
||||
program.add_argument("--platform").default_value("any").help("Filter results of the selected platform");
|
||||
program.add_argument("--complete").help("Show only results with all datasets").default_value(false).implicit_value(true);
|
||||
program.add_argument("--partial").help("Show only partial results").default_value(false).implicit_value(true);
|
||||
program.add_argument("--compare").help("Compare with best results").default_value(false).implicit_value(true);
|
||||
try {
|
||||
program.parse_args(argc, argv);
|
||||
auto platform = program.get<std::string>("platform");
|
||||
auto model = program.get<std::string>("model");
|
||||
auto score = program.get<std::string>("score");
|
||||
auto complete = program.get<bool>("complete");
|
||||
auto partial = program.get<bool>("partial");
|
||||
auto compare = program.get<bool>("compare");
|
||||
}
|
||||
catch (const std::exception& err) {
|
||||
std::cerr << err.what() << std::endl;
|
||||
std::cerr << program;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
std::pair<int, int> numRowsCols()
|
||||
{
|
||||
#ifdef TIOCGSIZE
|
||||
struct ttysize ts;
|
||||
ioctl(STDIN_FILENO, TIOCGSIZE, &ts);
|
||||
return { ts.ts_lines, ts.ts_cols };
|
||||
#elif defined(TIOCGWINSZ)
|
||||
struct winsize ts;
|
||||
ioctl(STDIN_FILENO, TIOCGWINSZ, &ts);
|
||||
return { ts.ws_row, ts.ws_col };
|
||||
#endif /* TIOCGSIZE */
|
||||
}
|
||||
void handleResize(int sig)
|
||||
{
|
||||
auto [rows, cols] = numRowsCols();
|
||||
manager->updateSize(rows, cols);
|
||||
}
|
||||
|
||||
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
auto program = argparse::ArgumentParser("b_manage", { platform_project_version.begin(), platform_project_version.end() });
|
||||
manageArguments(program, argc, argv);
|
||||
std::string model = program.get<std::string>("model");
|
||||
std::string path = program.get<std::string>("folder");
|
||||
if (path.back() != '/') {
|
||||
path += '/';
|
||||
}
|
||||
std::string score = program.get<std::string>("score");
|
||||
std::string platform = program.get<std::string>("platform");
|
||||
bool complete = program.get<bool>("complete");
|
||||
bool partial = program.get<bool>("partial");
|
||||
bool compare = program.get<bool>("compare");
|
||||
if (complete)
|
||||
partial = false;
|
||||
signal(SIGWINCH, handleResize);
|
||||
auto [rows, cols] = numRowsCols();
|
||||
manager = new platform::ManageScreen(path, rows, cols, model, score, platform, complete, partial, compare);
|
||||
manager->doMenu();
|
||||
auto fileName = manager->getExcelFileName();
|
||||
delete manager;
|
||||
if (!fileName.empty()) {
|
||||
std::cout << "Opening " << fileName << std::endl;
|
||||
platform::openFile(fileName);
|
||||
}
|
||||
return 0;
|
||||
}
|
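Note that numRowsCols() above has no return path when neither TIOCGSIZE nor TIOCGWINSZ is defined; a defensive variant could add an arbitrary fallback (24x80 is chosen here purely for illustration, and the same includes as above are assumed):

// Variant of numRowsCols() with a fallback when the ioctl is unavailable.
std::pair<int, int> numRowsColsSafe()
{
#if defined(TIOCGWINSZ)
    struct winsize ts;
    ioctl(STDIN_FILENO, TIOCGWINSZ, &ts);
    return { ts.ws_row, ts.ws_col };
#else
    return { 24, 80 }; // arbitrary terminal size when no window-size ioctl exists
#endif
}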
102
src/commands/b_results.cpp
Normal file
@@ -0,0 +1,102 @@
|
||||
#include <iostream>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
#include "nlohmann/json.hpp"
|
||||
#include "argparse/argparse.hpp"
|
||||
#include "common/Paths.h"
|
||||
#include "results/JsonValidator.h"
|
||||
#include "results/SchemaV1_0.h"
|
||||
#include "config_platform.h"
|
||||
|
||||
using json = nlohmann::json;
|
||||
namespace fs = std::filesystem;
|
||||
void header(const std::string& message, int length, const std::string& symbol)
|
||||
{
|
||||
std::cout << std::string(length + 11, symbol[0]) << std::endl;
|
||||
std::cout << symbol << " " << std::setw(length + 7) << std::left << message << " " << symbol << std::endl;
|
||||
std::cout << std::string(length + 11, symbol[0]) << std::endl;
|
||||
}
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
argparse::ArgumentParser program("b_results", { platform_project_version.begin(), platform_project_version.end() });
|
||||
program.add_description("Check the results files and optionally fixes them.");
|
||||
program.add_argument("--fix").help("Fix any errors in results").default_value(false).implicit_value(true);
|
||||
program.add_argument("--file").help("check only this results file").default_value("");
|
||||
std::string nameSuffix = "results_";
|
||||
std::string schemaVersion = "1.0";
|
||||
bool fix_it = false;
|
||||
std::string selected_file;
|
||||
try {
|
||||
program.parse_args(argc, argv);
|
||||
fix_it = program.get<bool>("fix");
|
||||
selected_file = program.get<std::string>("file");
|
||||
}
|
||||
catch (const std::exception& err) {
|
||||
std::cerr << err.what() << std::endl;
|
||||
std::cerr << program;
|
||||
exit(1);
|
||||
}
|
||||
//
|
||||
// Determine the files to process
|
||||
//
|
||||
std::vector<std::string> result_files;
|
||||
int max_length = 0;
|
||||
if (selected_file != "") {
|
||||
if (!selected_file.starts_with(platform::Paths::results())) {
|
||||
selected_file = platform::Paths::results() + selected_file;
|
||||
}
|
||||
// Only check the selected file
|
||||
result_files.push_back(selected_file);
|
||||
max_length = selected_file.length();
|
||||
} else {
|
||||
// Load the result files and find the longest file name
|
||||
for (const auto& entry : fs::directory_iterator(platform::Paths::results())) {
|
||||
if (entry.is_regular_file() && entry.path().filename().string().starts_with(nameSuffix) && entry.path().filename().string().ends_with(".json")) {
|
||||
std::string fileName = entry.path().string();
|
||||
if (fileName.length() > max_length) {
|
||||
max_length = fileName.length();
|
||||
}
|
||||
result_files.push_back(fileName);
|
||||
}
|
||||
}
|
||||
}
|
||||
//
|
||||
// Process the results files
|
||||
//
|
||||
if (result_files.empty()) {
|
||||
std::cerr << "Error: No result files found." << std::endl;
|
||||
return 1;
|
||||
}
|
||||
std::string header_message = "Processing " + std::to_string(result_files.size()) + " result files.";
|
||||
header(header_message, max_length, "*");
|
||||
platform::JsonValidator validator(platform::SchemaV1_0::schema);
|
||||
int n_errors = 0;
|
||||
std::vector<std::string> files_with_errors;
|
||||
for (const auto& file_name : result_files) {
|
||||
std::vector<std::string> errors = validator.validate_file(file_name);
|
||||
if (!errors.empty()) {
|
||||
n_errors++;
|
||||
std::cout << std::setw(max_length) << std::left << file_name << ": " << errors.size() << " Errors:" << std::endl;
|
||||
for (const auto& error : errors) {
|
||||
std::cout << " - " << error << std::endl;
|
||||
}
|
||||
if (fix_it) {
|
||||
validator.fix_it(file_name);
|
||||
std::cout << " -> File fixed." << std::endl;
|
||||
}
|
||||
files_with_errors.push_back(file_name);
|
||||
}
|
||||
}
|
||||
if (n_errors == 0) {
|
||||
header("All files are valid.", max_length, "*");
|
||||
} else {
|
||||
std::string verb = (fix_it) ? "had" : "have";
|
||||
std::string msg = std::to_string(n_errors) + " files " + verb + " errors.";
|
||||
header(msg, max_length, "*");
|
||||
for (const auto& file_name : files_with_errors) {
|
||||
std::cout << "- " << file_name << std::endl;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
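In isolation, the validation workflow above reduces to this minimal sketch (the results path is illustrative; JsonValidator and SchemaV1_0 come from the includes at the top of the file):

platform::JsonValidator validator(platform::SchemaV1_0::schema);
auto errors = validator.validate_file("results/results_example.json"); // illustrative file name
if (!errors.empty()) {
    validator.fix_it("results/results_example.json"); // rewrite the file so it conforms to schema v1.0
}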
@@ -1,5 +1,5 @@
|
||||
#ifndef LOCALE_H
|
||||
#define LOCALE_H
|
||||
#ifndef CLOCALE_H
|
||||
#define CLOCALE_H
|
||||
#include <locale>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
@@ -19,4 +19,4 @@ namespace platform {
|
||||
}
|
||||
};
|
||||
}
|
||||
#endif
|
||||
#endif
|
@@ -1,15 +1,30 @@
|
||||
#ifndef COLORS_H
|
||||
#define COLORS_H
|
||||
#include <string>
|
||||
class Colors {
|
||||
public:
|
||||
static std::string MAGENTA() { return "\033[1;35m"; }
|
||||
static std::string BLACK() { return "\033[1;30m"; }
|
||||
static std::string IBLACK() { return "\033[0;90m"; }
|
||||
static std::string BLUE() { return "\033[1;34m"; }
|
||||
static std::string CYAN() { return "\033[1;36m"; }
|
||||
static std::string GREEN() { return "\033[1;32m"; }
|
||||
static std::string YELLOW() { return "\033[1;33m"; }
|
||||
static std::string RED() { return "\033[1;31m"; }
|
||||
static std::string WHITE() { return "\033[1;37m"; }
|
||||
static std::string IBLUE() { return "\033[0;94m"; }
|
||||
static std::string CYAN() { return "\033[1;36m"; }
|
||||
static std::string ICYAN() { return "\033[0;96m"; }
|
||||
static std::string GREEN() { return "\033[1;32m"; }
|
||||
static std::string IGREEN() { return "\033[0;92m"; }
|
||||
static std::string MAGENTA() { return "\033[1;35m"; }
|
||||
static std::string IMAGENTA() { return "\033[0;95m"; }
|
||||
static std::string RED() { return "\033[1;31m"; }
|
||||
static std::string IRED() { return "\033[0;91m"; }
|
||||
static std::string YELLOW() { return "\033[1;33m"; }
|
||||
static std::string IYELLOW() { return "\033[0;93m"; }
|
||||
static std::string WHITE() { return "\033[1;37m"; }
|
||||
static std::string IWHITE() { return "\033[0;97m"; }
|
||||
static std::string RESET() { return "\033[0m"; }
|
||||
static std::string BOLD() { return "\033[1m"; }
|
||||
static std::string UNDERLINE() { return "\033[4m"; }
|
||||
static std::string BLINK() { return "\033[5m"; }
|
||||
static std::string REVERSE() { return "\033[7m"; }
|
||||
static std::string CONCEALED() { return "\033[8m"; }
|
||||
static std::string CLRSCR() { return "\033[2J\033[1;1H"; }
|
||||
};
|
||||
#endif // COLORS_H
|
||||
#endif
|
@@ -1,215 +0,0 @@
|
||||
#include "Dataset.h"
|
||||
#include "ArffFiles.h"
|
||||
#include <fstream>
|
||||
namespace platform {
|
||||
Dataset::Dataset(const Dataset& dataset) : path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples), n_features(dataset.n_features), features(dataset.features), states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv), fileType(dataset.fileType)
|
||||
{
|
||||
}
|
||||
std::string Dataset::getName() const
|
||||
{
|
||||
return name;
|
||||
}
|
||||
std::string Dataset::getClassName() const
|
||||
{
|
||||
return className;
|
||||
}
|
||||
std::vector<std::string> Dataset::getFeatures() const
|
||||
{
|
||||
if (loaded) {
|
||||
return features;
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
int Dataset::getNFeatures() const
|
||||
{
|
||||
if (loaded) {
|
||||
return n_features;
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
int Dataset::getNSamples() const
|
||||
{
|
||||
if (loaded) {
|
||||
return n_samples;
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
std::map<std::string, std::vector<int>> Dataset::getStates() const
|
||||
{
|
||||
if (loaded) {
|
||||
return states;
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
pair<std::vector<std::vector<float>>&, std::vector<int>&> Dataset::getVectors()
|
||||
{
|
||||
if (loaded) {
|
||||
return { Xv, yv };
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
pair<std::vector<std::vector<int>>&, std::vector<int>&> Dataset::getVectorsDiscretized()
|
||||
{
|
||||
if (loaded) {
|
||||
return { Xd, yv };
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
|
||||
{
|
||||
if (loaded) {
|
||||
buildTensors();
|
||||
return { X, y };
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
void Dataset::load_csv()
|
||||
{
|
||||
ifstream file(path + "/" + name + ".csv");
|
||||
if (file.is_open()) {
|
||||
std::string line;
|
||||
getline(file, line);
|
||||
std::vector<std::string> tokens = split(line, ',');
|
||||
features = std::vector<std::string>(tokens.begin(), tokens.end() - 1);
|
||||
if (className == "-1") {
|
||||
className = tokens.back();
|
||||
}
|
||||
for (auto i = 0; i < features.size(); ++i) {
|
||||
Xv.push_back(std::vector<float>());
|
||||
}
|
||||
while (getline(file, line)) {
|
||||
tokens = split(line, ',');
|
||||
for (auto i = 0; i < features.size(); ++i) {
|
||||
Xv[i].push_back(stof(tokens[i]));
|
||||
}
|
||||
yv.push_back(stoi(tokens.back()));
|
||||
}
|
||||
file.close();
|
||||
} else {
|
||||
throw std::invalid_argument("Unable to open dataset file.");
|
||||
}
|
||||
}
|
||||
void Dataset::computeStates()
|
||||
{
|
||||
for (int i = 0; i < features.size(); ++i) {
|
||||
states[features[i]] = std::vector<int>(*max_element(Xd[i].begin(), Xd[i].end()) + 1);
|
||||
auto item = states.at(features[i]);
|
||||
iota(begin(item), end(item), 0);
|
||||
}
|
||||
states[className] = std::vector<int>(*max_element(yv.begin(), yv.end()) + 1);
|
||||
iota(begin(states.at(className)), end(states.at(className)), 0);
|
||||
}
|
||||
void Dataset::load_arff()
|
||||
{
|
||||
auto arff = ArffFiles();
|
||||
arff.load(path + "/" + name + ".arff", className);
|
||||
// Get Dataset X, y
|
||||
Xv = arff.getX();
|
||||
yv = arff.getY();
|
||||
// Get className & Features
|
||||
className = arff.getClassName();
|
||||
auto attributes = arff.getAttributes();
|
||||
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& attribute) { return attribute.first; });
|
||||
}
|
||||
std::vector<std::string> tokenize(std::string line)
|
||||
{
|
||||
std::vector<std::string> tokens;
|
||||
for (auto i = 0; i < line.size(); ++i) {
|
||||
if (line[i] == ' ' || line[i] == '\t' || line[i] == '\n') {
|
||||
std::string token = line.substr(0, i);
|
||||
tokens.push_back(token);
|
||||
line.erase(line.begin(), line.begin() + i + 1);
|
||||
i = 0;
|
||||
while (line[i] == ' ' || line[i] == '\t' || line[i] == '\n')
|
||||
line.erase(line.begin(), line.begin() + i + 1);
|
||||
}
|
||||
}
|
||||
if (line.size() > 0) {
|
||||
tokens.push_back(line);
|
||||
}
|
||||
return tokens;
|
||||
}
|
||||
void Dataset::load_rdata()
|
||||
{
|
||||
ifstream file(path + "/" + name + "_R.dat");
|
||||
if (file.is_open()) {
|
||||
std::string line;
|
||||
getline(file, line);
|
||||
line = ArffFiles::trim(line);
|
||||
std::vector<std::string> tokens = tokenize(line);
|
||||
transform(tokens.begin(), tokens.end() - 1, back_inserter(features), [](const auto& attribute) { return ArffFiles::trim(attribute); });
|
||||
if (className == "-1") {
|
||||
className = ArffFiles::trim(tokens.back());
|
||||
}
|
||||
for (auto i = 0; i < features.size(); ++i) {
|
||||
Xv.push_back(std::vector<float>());
|
||||
}
|
||||
while (getline(file, line)) {
|
||||
tokens = tokenize(line);
|
||||
// We have to skip the first token, which is the instance number.
|
||||
for (auto i = 1; i < features.size() + 1; ++i) {
|
||||
const float value = stof(tokens[i]);
|
||||
Xv[i - 1].push_back(value);
|
||||
}
|
||||
yv.push_back(stoi(tokens.back()));
|
||||
}
|
||||
file.close();
|
||||
} else {
|
||||
throw std::invalid_argument("Unable to open dataset file.");
|
||||
}
|
||||
}
|
||||
void Dataset::load()
|
||||
{
|
||||
if (loaded) {
|
||||
return;
|
||||
}
|
||||
if (fileType == CSV) {
|
||||
load_csv();
|
||||
} else if (fileType == ARFF) {
|
||||
load_arff();
|
||||
} else if (fileType == RDATA) {
|
||||
load_rdata();
|
||||
}
|
||||
if (discretize) {
|
||||
Xd = discretizeDataset(Xv, yv);
|
||||
computeStates();
|
||||
}
|
||||
n_samples = Xv[0].size();
|
||||
n_features = Xv.size();
|
||||
loaded = true;
|
||||
}
|
||||
void Dataset::buildTensors()
|
||||
{
|
||||
if (discretize) {
|
||||
X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kInt32);
|
||||
} else {
|
||||
X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kFloat32);
|
||||
}
|
||||
for (int i = 0; i < features.size(); ++i) {
|
||||
if (discretize) {
|
||||
X.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
|
||||
} else {
|
||||
X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
|
||||
}
|
||||
}
|
||||
y = torch::tensor(yv, torch::kInt32);
|
||||
}
|
||||
std::vector<mdlp::labels_t> Dataset::discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y)
|
||||
{
|
||||
std::vector<mdlp::labels_t> Xd;
|
||||
auto fimdlp = mdlp::CPPFImdlp();
|
||||
for (int i = 0; i < X.size(); i++) {
|
||||
fimdlp.fit(X[i], y);
|
||||
mdlp::labels_t& xd = fimdlp.transform(X[i]);
|
||||
Xd.push_back(xd);
|
||||
}
|
||||
return Xd;
|
||||
}
|
||||
}
|
278
src/common/Dataset.cpp
Normal file
@@ -0,0 +1,278 @@
|
||||
#include <ArffFiles.hpp>
|
||||
#include <fstream>
|
||||
#include "Dataset.h"
|
||||
namespace platform {
|
||||
const std::string message_dataset_not_loaded = "Dataset not loaded.";
|
||||
Dataset::Dataset(const Dataset& dataset) :
|
||||
path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples),
|
||||
n_features(dataset.n_features), numericFeatures(dataset.numericFeatures), features(dataset.features),
|
||||
states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y),
|
||||
X_train(dataset.X_train), X_test(dataset.X_test), Xv(dataset.Xv), yv(dataset.yv),
|
||||
fileType(dataset.fileType)
|
||||
{
|
||||
}
|
||||
std::string Dataset::getName() const
|
||||
{
|
||||
return name;
|
||||
}
|
||||
std::vector<std::string> Dataset::getFeatures() const
|
||||
{
|
||||
if (loaded) {
|
||||
return features;
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
int Dataset::getNFeatures() const
|
||||
{
|
||||
if (loaded) {
|
||||
return n_features;
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
int Dataset::getNSamples() const
|
||||
{
|
||||
if (loaded) {
|
||||
return n_samples;
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
std::string Dataset::getClassName() const
|
||||
{
|
||||
return className;
|
||||
}
|
||||
int Dataset::getNClasses() const
|
||||
{
|
||||
if (loaded) {
|
||||
return *std::max_element(yv.begin(), yv.end()) + 1;
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
std::vector<std::string> Dataset::getLabels() const
|
||||
{
|
||||
// Return the labels factorization result
|
||||
if (loaded) {
|
||||
return labels;
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
std::vector<int> Dataset::getClassesCounts() const
|
||||
{
|
||||
if (loaded) {
|
||||
std::vector<int> counts(*std::max_element(yv.begin(), yv.end()) + 1);
|
||||
for (auto y : yv) {
|
||||
counts[y]++;
|
||||
}
|
||||
return counts;
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
std::map<std::string, std::vector<int>> Dataset::getStates() const
|
||||
{
|
||||
if (loaded) {
|
||||
return states;
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
pair<std::vector<std::vector<float>>&, std::vector<int>&> Dataset::getVectors()
|
||||
{
|
||||
if (loaded) {
|
||||
return { Xv, yv };
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
|
||||
{
|
||||
if (loaded) {
|
||||
return { X, y };
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
void Dataset::load_csv()
|
||||
{
|
||||
ifstream file(path + "/" + name + ".csv");
|
||||
if (!file.is_open()) {
|
||||
throw std::invalid_argument("Unable to open dataset file.");
|
||||
}
|
||||
labels.clear();
|
||||
std::string line;
|
||||
getline(file, line);
|
||||
std::vector<std::string> tokens = split(line, ',');
|
||||
features = std::vector<std::string>(tokens.begin(), tokens.end() - 1);
|
||||
if (className == "-1") {
|
||||
className = tokens.back();
|
||||
}
|
||||
for (auto i = 0; i < features.size(); ++i) {
|
||||
Xv.push_back(std::vector<float>());
|
||||
}
|
||||
while (getline(file, line)) {
|
||||
tokens = split(line, ',');
|
||||
for (auto i = 0; i < features.size(); ++i) {
|
||||
Xv[i].push_back(stof(tokens[i]));
|
||||
}
|
||||
auto label = trim(tokens.back());
|
||||
if (find(labels.begin(), labels.end(), label) == labels.end()) {
|
||||
labels.push_back(label);
|
||||
}
|
||||
yv.push_back(stoi(label));
|
||||
}
|
||||
file.close();
|
||||
}
|
||||
void Dataset::computeStates()
|
||||
{
|
||||
for (int i = 0; i < features.size(); ++i) {
|
||||
auto [max_value, idx] = torch::max(X_train.index({ i, "..." }), 0);
|
||||
states[features[i]] = std::vector<int>(max_value.item<int>() + 1);
|
||||
iota(begin(states.at(features[i])), end(states.at(features[i])), 0);
|
||||
}
|
||||
auto [max_value, idx] = torch::max(y_train, 0);
|
||||
states[className] = std::vector<int>(max_value.item<int>() + 1);
|
||||
iota(begin(states.at(className)), end(states.at(className)), 0);
|
||||
}
|
||||
void Dataset::load_arff()
|
||||
{
|
||||
auto arff = ArffFiles();
|
||||
arff.load(path + "/" + name + ".arff", className);
|
||||
// Get Dataset X, y
|
||||
Xv = arff.getX();
|
||||
yv = arff.getY();
|
||||
// Get className & Features
|
||||
className = arff.getClassName();
|
||||
auto attributes = arff.getAttributes();
|
||||
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& attribute) { return attribute.first; });
|
||||
labels = arff.getLabels();
|
||||
}
|
||||
std::vector<std::string> tokenize(std::string line)
|
||||
{
|
||||
std::vector<std::string> tokens;
|
||||
for (auto i = 0; i < line.size(); ++i) {
|
||||
if (line[i] == ' ' || line[i] == '\t' || line[i] == '\n') {
|
||||
std::string token = line.substr(0, i);
|
||||
tokens.push_back(token);
|
||||
line.erase(line.begin(), line.begin() + i + 1);
|
||||
i = 0;
|
||||
while (line[i] == ' ' || line[i] == '\t' || line[i] == '\n')
|
||||
line.erase(line.begin(), line.begin() + i + 1);
|
||||
}
|
||||
}
|
||||
if (line.size() > 0) {
|
||||
tokens.push_back(line);
|
||||
}
|
||||
return tokens;
|
||||
}
|
||||
void Dataset::load_rdata()
|
||||
{
|
||||
ifstream file(path + "/" + name + "_R.dat");
|
||||
if (!file.is_open()) {
|
||||
throw std::invalid_argument("Unable to open dataset file.");
|
||||
}
|
||||
std::string line;
|
||||
labels.clear();
|
||||
getline(file, line);
|
||||
line = ArffFiles::trim(line);
|
||||
std::vector<std::string> tokens = tokenize(line);
|
||||
transform(tokens.begin(), tokens.end() - 1, back_inserter(features), [](const auto& attribute) { return ArffFiles::trim(attribute); });
|
||||
if (className == "-1") {
|
||||
className = ArffFiles::trim(tokens.back());
|
||||
}
|
||||
for (auto i = 0; i < features.size(); ++i) {
|
||||
Xv.push_back(std::vector<float>());
|
||||
}
|
||||
while (getline(file, line)) {
|
||||
tokens = tokenize(line);
|
||||
// We have to skip the first token, which is the instance number.
|
||||
for (auto i = 1; i < features.size() + 1; ++i) {
|
||||
const float value = stof(tokens[i]);
|
||||
Xv[i - 1].push_back(value);
|
||||
}
|
||||
auto label = trim(tokens.back());
|
||||
if (find(labels.begin(), labels.end(), label) == labels.end()) {
|
||||
labels.push_back(label);
|
||||
}
|
||||
yv.push_back(stoi(label));
|
||||
}
|
||||
file.close();
|
||||
}
|
||||
void Dataset::load()
|
||||
{
|
||||
if (loaded) {
|
||||
return;
|
||||
}
|
||||
if (fileType == CSV) {
|
||||
load_csv();
|
||||
} else if (fileType == ARFF) {
|
||||
load_arff();
|
||||
} else if (fileType == RDATA) {
|
||||
load_rdata();
|
||||
}
|
||||
n_samples = Xv[0].size();
|
||||
n_features = Xv.size();
|
||||
if (numericFeaturesIdx.size() == 0) {
|
||||
numericFeatures = std::vector<bool>(n_features, false);
|
||||
} else {
|
||||
if (numericFeaturesIdx.at(0) == -1) {
|
||||
numericFeatures = std::vector<bool>(n_features, true);
|
||||
} else {
|
||||
numericFeatures = std::vector<bool>(n_features, false);
|
||||
for (auto i : numericFeaturesIdx) {
|
||||
numericFeatures[i] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Build Tensors
|
||||
X = torch::zeros({ n_features, n_samples }, torch::kFloat32);
|
||||
for (int i = 0; i < features.size(); ++i) {
|
||||
X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
|
||||
}
|
||||
y = torch::tensor(yv, torch::kInt32);
|
||||
loaded = true;
|
||||
}
|
||||
std::tuple<torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&> Dataset::getTrainTestTensors(std::vector<int>& train, std::vector<int>& test)
|
||||
{
|
||||
if (!loaded) {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
auto train_t = torch::tensor(train);
|
||||
int samples_train = train.size();
|
||||
int samples_test = test.size();
|
||||
auto test_t = torch::tensor(test);
|
||||
X_train = X.index({ "...", train_t });
|
||||
y_train = y.index({ train_t });
|
||||
X_test = X.index({ "...", test_t });
|
||||
y_test = y.index({ test_t });
|
||||
if (discretize) {
|
||||
auto discretizer = Discretization::instance()->create(discretizer_algorithm);
|
||||
auto X_train_d = torch::zeros({ n_features, samples_train }, torch::kInt32);
|
||||
auto X_test_d = torch::zeros({ n_features, samples_test }, torch::kInt32);
|
||||
for (auto feature = 0; feature < n_features; ++feature) {
|
||||
if (numericFeatures[feature]) {
|
||||
auto feature_train = X_train.index({ feature, "..." });
|
||||
auto feature_test = X_test.index({ feature, "..." });
|
||||
auto feature_train_disc = discretizer->fit_transform_t(feature_train, y_train);
|
||||
auto feature_test_disc = discretizer->transform_t(feature_test);
|
||||
X_train_d.index_put_({ feature, "..." }, feature_train_disc);
|
||||
X_test_d.index_put_({ feature, "..." }, feature_test_disc);
|
||||
} else {
|
||||
X_train_d.index_put_({ feature, "..." }, X_train.index({ feature, "..." }).to(torch::kInt32));
|
||||
X_test_d.index_put_({ feature, "..." }, X_test.index({ feature, "..." }).to(torch::kInt32));
|
||||
}
|
||||
}
|
||||
X_train = X_train_d;
|
||||
X_test = X_test_d;
|
||||
assert(X_train.dtype() == torch::kInt32);
|
||||
assert(X_test.dtype() == torch::kInt32);
|
||||
computeStates();
|
||||
}
|
||||
assert(y_train.dtype() == torch::kInt32);
|
||||
assert(y_test.dtype() == torch::kInt32);
|
||||
return { X_train, X_test, y_train, y_test };
|
||||
}
|
||||
}
|
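Putting the new Dataset API together, typical use looks like the sketch below (dataset name, class name and fold indices are illustrative; the constructor signature matches the header that follows):

platform::Dataset ds("datasets/", "iris", "class", true, platform::ARFF, { -1 }, "mdlp");
ds.load();                                // builds the float X tensor and the int y tensor
std::vector<int> train = { 0, 1, 2, 3 };  // fold indices would normally come from a cross-validation splitter
std::vector<int> test = { 4, 5 };
auto [X_train, X_test, y_train, y_test] = ds.getTrainTestTensors(train, test);
// With discretize == true, X_train/X_test come back as kInt32 and the states map is recomputed per fold.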
@@ -4,75 +4,57 @@
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "CPPFImdlp.h"
|
||||
#include <tuple>
|
||||
#include <common/DiscretizationRegister.h>
|
||||
#include "Utils.h"
|
||||
#include "SourceData.h"
|
||||
namespace platform {
|
||||
enum fileType_t { CSV, ARFF, RDATA };
|
||||
class SourceData {
|
||||
public:
|
||||
SourceData(std::string source)
|
||||
{
|
||||
if (source == "Surcov") {
|
||||
path = "datasets/";
|
||||
fileType = CSV;
|
||||
} else if (source == "Arff") {
|
||||
path = "datasets/";
|
||||
fileType = ARFF;
|
||||
} else if (source == "Tanveer") {
|
||||
path = "data/";
|
||||
fileType = RDATA;
|
||||
} else {
|
||||
throw std::invalid_argument("Unknown source.");
|
||||
}
|
||||
}
|
||||
std::string getPath()
|
||||
{
|
||||
return path;
|
||||
}
|
||||
fileType_t getFileType()
|
||||
{
|
||||
return fileType;
|
||||
}
|
||||
private:
|
||||
std::string path;
|
||||
fileType_t fileType;
|
||||
};
|
||||
class Dataset {
|
||||
public:
|
||||
Dataset(const std::string& path, const std::string& name, const std::string& className, bool discretize, fileType_t fileType, std::vector<int> numericFeaturesIdx, std::string discretizer_algo = "none") :
|
||||
path(path), name(name), className(className), discretize(discretize),
|
||||
loaded(false), fileType(fileType), numericFeaturesIdx(numericFeaturesIdx), discretizer_algorithm(discretizer_algo)
|
||||
{
|
||||
};
|
||||
explicit Dataset(const Dataset&);
|
||||
std::string getName() const;
|
||||
std::string getClassName() const;
|
||||
int getNClasses() const;
|
||||
std::vector<std::string> getLabels() const; // return the labels factorization result
|
||||
std::vector<int> getClassesCounts() const;
|
||||
std::vector<string> getFeatures() const;
|
||||
std::map<std::string, std::vector<int>> getStates() const;
|
||||
std::pair<vector<std::vector<float>>&, std::vector<int>&> getVectors();
|
||||
std::pair<torch::Tensor&, torch::Tensor&> getTensors();
|
||||
std::tuple<torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&> getTrainTestTensors(std::vector<int>& train, std::vector<int>& test);
|
||||
int getNFeatures() const;
|
||||
int getNSamples() const;
|
||||
std::vector<bool>& getNumericFeatures() { return numericFeatures; }
|
||||
void load();
|
||||
const bool inline isLoaded() const { return loaded; };
|
||||
private:
|
||||
std::string path;
|
||||
std::string name;
|
||||
fileType_t fileType;
|
||||
std::string className;
|
||||
int n_samples{ 0 }, n_features{ 0 };
|
||||
std::vector<int> numericFeaturesIdx;
|
||||
std::string discretizer_algorithm;
|
||||
std::vector<bool> numericFeatures; // true if feature is numeric
|
||||
std::vector<std::string> features;
|
||||
std::vector<std::string> labels;
|
||||
std::map<std::string, std::vector<int>> states;
|
||||
bool loaded;
|
||||
bool discretize;
|
||||
torch::Tensor X, y;
|
||||
torch::Tensor X_train, X_test, y_train, y_test;
|
||||
std::vector<std::vector<float>> Xv;
|
||||
std::vector<std::vector<int>> Xd;
|
||||
std::vector<int> yv;
|
||||
void buildTensors();
|
||||
void load_csv();
|
||||
void load_arff();
|
||||
void load_rdata();
|
||||
void computeStates();
|
||||
std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y);
|
||||
public:
|
||||
Dataset(const std::string& path, const std::string& name, const std::string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {};
|
||||
explicit Dataset(const Dataset&);
|
||||
std::string getName() const;
|
||||
std::string getClassName() const;
|
||||
std::vector<string> getFeatures() const;
|
||||
std::map<std::string, std::vector<int>> getStates() const;
|
||||
std::pair<vector<std::vector<float>>&, std::vector<int>&> getVectors();
|
||||
std::pair<vector<std::vector<int>>&, std::vector<int>&> getVectorsDiscretized();
|
||||
std::pair<torch::Tensor&, torch::Tensor&> getTensors();
|
||||
int getNFeatures() const;
|
||||
int getNSamples() const;
|
||||
void load();
|
||||
const bool inline isLoaded() const { return loaded; };
|
||||
};
|
||||
};
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
@@ -1,129 +0,0 @@
|
||||
#include "Datasets.h"
|
||||
#include <fstream>
|
||||
namespace platform {
|
||||
void Datasets::load()
|
||||
{
|
||||
auto sd = SourceData(sfileType);
|
||||
fileType = sd.getFileType();
|
||||
path = sd.getPath();
|
||||
ifstream catalog(path + "all.txt");
|
||||
if (catalog.is_open()) {
|
||||
std::string line;
|
||||
while (getline(catalog, line)) {
|
||||
if (line.empty() || line[0] == '#') {
|
||||
continue;
|
||||
}
|
||||
std::vector<std::string> tokens = split(line, ',');
|
||||
std::string name = tokens[0];
|
||||
std::string className;
|
||||
if (tokens.size() == 1) {
|
||||
className = "-1";
|
||||
} else {
|
||||
className = tokens[1];
|
||||
}
|
||||
datasets[name] = make_unique<Dataset>(path, name, className, discretize, fileType);
|
||||
}
|
||||
catalog.close();
|
||||
} else {
|
||||
throw std::invalid_argument("Unable to open catalog file. [" + path + "all.txt" + "]");
|
||||
}
|
||||
}
|
||||
std::vector<std::string> Datasets::getNames()
|
||||
{
|
||||
std::vector<std::string> result;
|
||||
transform(datasets.begin(), datasets.end(), back_inserter(result), [](const auto& d) { return d.first; });
|
||||
return result;
|
||||
}
|
||||
std::vector<std::string> Datasets::getFeatures(const std::string& name) const
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
return datasets.at(name)->getFeatures();
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
map<std::string, std::vector<int>> Datasets::getStates(const std::string& name) const
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
return datasets.at(name)->getStates();
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
void Datasets::loadDataset(const std::string& name) const
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
return;
|
||||
} else {
|
||||
datasets.at(name)->load();
|
||||
}
|
||||
}
|
||||
std::string Datasets::getClassName(const std::string& name) const
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
return datasets.at(name)->getClassName();
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
int Datasets::getNSamples(const std::string& name) const
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
return datasets.at(name)->getNSamples();
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
int Datasets::getNClasses(const std::string& name)
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
auto className = datasets.at(name)->getClassName();
|
||||
if (discretize) {
|
||||
auto states = getStates(name);
|
||||
return states.at(className).size();
|
||||
}
|
||||
auto [Xv, yv] = getVectors(name);
|
||||
return *std::max_element(yv.begin(), yv.end()) + 1;
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
std::vector<int> Datasets::getClassesCounts(const std::string& name) const
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
auto [Xv, yv] = datasets.at(name)->getVectors();
|
||||
std::vector<int> counts(*std::max_element(yv.begin(), yv.end()) + 1);
|
||||
for (auto y : yv) {
|
||||
counts[y]++;
|
||||
}
|
||||
return counts;
|
||||
} else {
|
||||
throw std::invalid_argument("Dataset not loaded.");
|
||||
}
|
||||
}
|
||||
pair<std::vector<std::vector<float>>&, std::vector<int>&> Datasets::getVectors(const std::string& name)
|
||||
{
|
||||
if (!datasets[name]->isLoaded()) {
|
||||
datasets[name]->load();
|
||||
}
|
||||
return datasets[name]->getVectors();
|
||||
}
|
||||
pair<std::vector<std::vector<int>>&, std::vector<int>&> Datasets::getVectorsDiscretized(const std::string& name)
|
||||
{
|
||||
if (!datasets[name]->isLoaded()) {
|
||||
datasets[name]->load();
|
||||
}
|
||||
return datasets[name]->getVectorsDiscretized();
|
||||
}
|
||||
pair<torch::Tensor&, torch::Tensor&> Datasets::getTensors(const std::string& name)
|
||||
{
|
||||
if (!datasets[name]->isLoaded()) {
|
||||
datasets[name]->load();
|
||||
}
|
||||
return datasets[name]->getTensors();
|
||||
}
|
||||
bool Datasets::isDataset(const std::string& name) const
|
||||
{
|
||||
return datasets.find(name) != datasets.end();
|
||||
}
|
||||
}
|
105
src/common/Datasets.cpp
Normal file
@@ -0,0 +1,105 @@
|
||||
#include <fstream>
|
||||
#include<algorithm>
|
||||
#include "Datasets.h"
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
namespace platform {
|
||||
using json = nlohmann::ordered_json;
|
||||
const std::string message_dataset_not_loaded = "dataset not loaded.";
|
||||
Datasets::Datasets(bool discretize, std::string sfileType, std::string discretizer_algorithm) :
|
||||
discretize(discretize), sfileType(sfileType), discretizer_algorithm(discretizer_algorithm)
|
||||
{
|
||||
if ((discretizer_algorithm == "none" || discretizer_algorithm == "") && discretize) {
|
||||
throw std::runtime_error("Can't discretize without discretization algorithm");
|
||||
}
|
||||
load();
|
||||
}
|
||||
void Datasets::load()
|
||||
{
|
||||
auto sd = SourceData(sfileType);
|
||||
fileType = sd.getFileType();
|
||||
path = sd.getPath();
|
||||
ifstream catalog(path + "all.txt");
|
||||
std::vector<int> numericFeaturesIdx;
|
||||
if (!catalog.is_open()) {
|
||||
throw std::invalid_argument("Unable to open catalog file. [" + path + "all.txt" + "]");
|
||||
}
|
||||
std::string line;
|
||||
std::vector<std::string> sorted_lines;
|
||||
while (getline(catalog, line)) {
|
||||
if (line.empty() || line[0] == '#') {
|
||||
continue;
|
||||
}
|
||||
sorted_lines.push_back(line);
|
||||
}
|
||||
sort(sorted_lines.begin(), sorted_lines.end(), [](const auto& lhs, const auto& rhs) {
|
||||
const auto result = mismatch(lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend(), [](const auto& lhs, const auto& rhs) {return tolower(lhs) == tolower(rhs);});
|
||||
|
||||
return result.second != rhs.cend() && (result.first == lhs.cend() || tolower(*result.first) < tolower(*result.second));
|
||||
});
|
||||
|
||||
for (const auto& line : sorted_lines) {
|
||||
std::vector<std::string> tokens = split(line, ';');
|
||||
std::string name = tokens[0];
|
||||
std::string className;
|
||||
numericFeaturesIdx.clear();
|
||||
int size = tokens.size();
|
||||
switch (size) {
|
||||
case 1:
|
||||
className = "-1";
|
||||
numericFeaturesIdx.push_back(-1);
|
||||
break;
|
||||
case 2:
|
||||
className = tokens[1];
|
||||
numericFeaturesIdx.push_back(-1);
|
||||
break;
|
||||
case 3:
|
||||
{
|
||||
className = tokens[1];
|
||||
auto numericFeatures = tokens[2];
|
||||
if (numericFeatures == "all") {
|
||||
numericFeaturesIdx.push_back(-1);
|
||||
} else {
|
||||
if (numericFeatures != "none") {
|
||||
auto features = json::parse(numericFeatures);
|
||||
for (auto& f : features) {
|
||||
numericFeaturesIdx.push_back(f);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw std::invalid_argument("Invalid catalog file format.");
|
||||
|
||||
}
|
||||
datasets[name] = make_unique<Dataset>(path, name, className, discretize, fileType, numericFeaturesIdx, discretizer_algorithm);
|
||||
}
|
||||
catalog.close();
|
||||
}
|
||||
std::vector<std::string> Datasets::getNames()
|
||||
{
|
||||
std::vector<std::string> result;
|
||||
transform(datasets.begin(), datasets.end(), back_inserter(result), [](const auto& d) { return d.first; });
|
||||
sort(result.begin(), result.end(), [](const auto& lhs, const auto& rhs) {
|
||||
const auto result = mismatch(lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend(), [](const auto& lhs, const auto& rhs) {return tolower(lhs) == tolower(rhs);});
|
||||
|
||||
return result.second != rhs.cend() && (result.first == lhs.cend() || tolower(*result.first) < tolower(*result.second));
|
||||
});
|
||||
return result;
|
||||
}
|
||||
bool Datasets::isDataset(const std::string& name) const
|
||||
{
|
||||
return datasets.find(name) != datasets.end();
|
||||
}
|
||||
std::string Datasets::toString() const
|
||||
{
|
||||
std::string result;
|
||||
std::string sep = "";
|
||||
for (const auto& d : datasets) {
|
||||
result += sep + d.first;
|
||||
sep = ", ";
|
||||
}
|
||||
return "{" + result + "}";
|
||||
}
|
||||
}
|
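The loader above implies a semicolon-separated catalog format for all.txt: a dataset name, an optional class name, and an optional numeric-feature spec that is either "all", "none" or a JSON list of column indices. A plausible file, with illustrative dataset names, would be:

# name;class_name;numeric_features
iris;class;all
glass2;Type;[0,1,2]
mushroom;poisonous;none
wine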
@@ -3,28 +3,20 @@
|
||||
#include "Dataset.h"
|
||||
namespace platform {
|
||||
class Datasets {
|
||||
public:
|
||||
explicit Datasets(bool discretize, std::string sfileType, std::string discretizer_algorithm = "none");
|
||||
std::vector<std::string> getNames();
|
||||
bool isDataset(const std::string& name) const;
|
||||
Dataset& getDataset(const std::string& name) const { return *datasets.at(name); }
|
||||
std::string toString() const;
|
||||
private:
|
||||
std::string path;
|
||||
fileType_t fileType;
|
||||
std::string sfileType;
|
||||
std::string discretizer_algorithm;
|
||||
std::map<std::string, std::unique_ptr<Dataset>> datasets;
|
||||
bool discretize;
|
||||
void load(); // Loads the list of datasets
|
||||
public:
|
||||
explicit Datasets(bool discretize, std::string sfileType) : discretize(discretize), sfileType(sfileType) { load(); };
|
||||
std::vector<string> getNames();
|
||||
std::vector<string> getFeatures(const std::string& name) const;
|
||||
int getNSamples(const std::string& name) const;
|
||||
std::string getClassName(const std::string& name) const;
|
||||
int getNClasses(const std::string& name);
|
||||
std::vector<int> getClassesCounts(const std::string& name) const;
|
||||
std::map<std::string, std::vector<int>> getStates(const std::string& name) const;
|
||||
std::pair<std::vector<std::vector<float>>&, std::vector<int>&> getVectors(const std::string& name);
|
||||
std::pair<std::vector<std::vector<int>>&, std::vector<int>&> getVectorsDiscretized(const std::string& name);
|
||||
std::pair<torch::Tensor&, torch::Tensor&> getTensors(const std::string& name);
|
||||
bool isDataset(const std::string& name) const;
|
||||
void loadDataset(const std::string& name) const;
|
||||
};
|
||||
};
|
||||
|
||||
#endif
|
55
src/common/Discretization.cpp
Normal file
@@ -0,0 +1,55 @@
|
||||
#include "Discretization.h"
|
||||
|
||||
namespace platform {
|
||||
// Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory
|
||||
Discretization* Discretization::factory = nullptr;
|
||||
Discretization* Discretization::instance()
|
||||
{
|
||||
//manages singleton
|
||||
if (factory == nullptr)
|
||||
factory = new Discretization();
|
||||
return factory;
|
||||
}
|
||||
void Discretization::registerFactoryFunction(const std::string& name,
|
||||
function<mdlp::Discretizer* (void)> classFactoryFunction)
|
||||
{
|
||||
// register the class factory function
|
||||
functionRegistry[name] = classFactoryFunction;
|
||||
}
|
||||
std::shared_ptr<mdlp::Discretizer> Discretization::create(const std::string& name)
|
||||
{
|
||||
mdlp::Discretizer* instance = nullptr;
|
||||
|
||||
// find name in the registry and call factory method.
|
||||
auto it = functionRegistry.find(name);
|
||||
if (it != functionRegistry.end())
|
||||
instance = it->second();
|
||||
// wrap instance in a shared ptr and return
|
||||
if (instance != nullptr)
|
||||
return std::unique_ptr<mdlp::Discretizer>(instance);
|
||||
else
|
||||
throw std::runtime_error("Discretizer not found: " + name);
|
||||
}
|
||||
std::vector<std::string> Discretization::getNames()
|
||||
{
|
||||
std::vector<std::string> names;
|
||||
transform(functionRegistry.begin(), functionRegistry.end(), back_inserter(names),
|
||||
[](const pair<std::string, function<mdlp::Discretizer* (void)>>& pair) { return pair.first; });
|
||||
return names;
|
||||
}
|
||||
std::string Discretization::toString()
|
||||
{
|
||||
std::string result = "";
|
||||
std::string sep = "";
|
||||
for (const auto& pair : functionRegistry) {
|
||||
result += sep + pair.first;
|
||||
sep = ", ";
|
||||
}
|
||||
return "{" + result + "}";
|
||||
}
|
||||
RegistrarDiscretization::RegistrarDiscretization(const std::string& name, function<mdlp::Discretizer* (void)> classFactoryFunction)
|
||||
{
|
||||
// register the class factory function
|
||||
Discretization::instance()->registerFactoryFunction(name, classFactoryFunction);
|
||||
}
|
||||
}
|
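As a usage sketch, the factory is driven the same way Dataset::getTrainTestTensors does above; feature_train, feature_test and y_train stand in for per-feature tensors:

// Look up a registered discretizer by name; create() throws if the key is unknown.
std::shared_ptr<mdlp::Discretizer> disc = platform::Discretization::instance()->create("mdlp");
auto train_disc = disc->fit_transform_t(feature_train, y_train); // fit cut points on the training column
auto test_disc  = disc->transform_t(feature_test);               // apply the same cut points to the test column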
33
src/common/Discretization.h
Normal file
@@ -0,0 +1,33 @@
|
||||
#ifndef DISCRETIZATION_H
|
||||
#define DISCRETIZATION_H
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <functional>
|
||||
#include <vector>
|
||||
#include <fimdlp/Discretizer.h>
|
||||
#include <fimdlp/BinDisc.h>
|
||||
#include <fimdlp/CPPFImdlp.h>
|
||||
namespace platform {
|
||||
class Discretization {
|
||||
public:
|
||||
Discretization(Discretization&) = delete;
|
||||
void operator=(const Discretization&) = delete;
|
||||
// Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory
|
||||
static Discretization* instance();
|
||||
std::shared_ptr<mdlp::Discretizer> create(const std::string& name);
|
||||
void registerFactoryFunction(const std::string& name,
|
||||
function<mdlp::Discretizer* (void)> classFactoryFunction);
|
||||
std::vector<string> getNames();
|
||||
std::string toString();
|
||||
private:
|
||||
map<std::string, function<mdlp::Discretizer* (void)>> functionRegistry;
|
||||
static Discretization* factory; //singleton
|
||||
Discretization() {};
|
||||
};
|
||||
class RegistrarDiscretization {
|
||||
public:
|
||||
RegistrarDiscretization(const std::string& className, function<mdlp::Discretizer* (void)> classFactoryFunction);
|
||||
};
|
||||
}
|
||||
#endif
|
38
src/common/DiscretizationRegister.h
Normal file
@@ -0,0 +1,38 @@
|
||||
#ifndef DISCRETIZATIONREGISTER_H
|
||||
#define DISCRETIZATIONREGISTER_H
|
||||
#include <common/Discretization.h>
|
||||
static platform::RegistrarDiscretization registrarM("mdlp",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::CPPFImdlp();});
|
||||
static platform::RegistrarDiscretization registrarBU3("bin3u",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(3, mdlp::strategy_t::UNIFORM);});
|
||||
static platform::RegistrarDiscretization registrarBQ3("bin3q",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(3, mdlp::strategy_t::QUANTILE);});
|
||||
static platform::RegistrarDiscretization registrarBU4("bin4u",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(4, mdlp::strategy_t::UNIFORM);});
|
||||
static platform::RegistrarDiscretization registrarBQ4("bin4q",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(4, mdlp::strategy_t::QUANTILE);});
|
||||
static platform::RegistrarDiscretization registrarBU5("bin5u",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(5, mdlp::strategy_t::UNIFORM);});
|
||||
static platform::RegistrarDiscretization registrarBQ5("bin5q",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(5, mdlp::strategy_t::QUANTILE);});
|
||||
static platform::RegistrarDiscretization registrarBU6("bin6u",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(6, mdlp::strategy_t::UNIFORM);});
|
||||
static platform::RegistrarDiscretization registrarBQ6("bin6q",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(6, mdlp::strategy_t::QUANTILE);});
|
||||
static platform::RegistrarDiscretization registrarBU7("bin7u",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(7, mdlp::strategy_t::UNIFORM);});
|
||||
static platform::RegistrarDiscretization registrarBQ7("bin7q",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(7, mdlp::strategy_t::QUANTILE);});
|
||||
static platform::RegistrarDiscretization registrarBU8("bin8u",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(8, mdlp::strategy_t::UNIFORM);});
|
||||
static platform::RegistrarDiscretization registrarBQ8("bin8q",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(8, mdlp::strategy_t::QUANTILE);});
|
||||
static platform::RegistrarDiscretization registrarBU9("bin9u",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(9, mdlp::strategy_t::UNIFORM);});
|
||||
static platform::RegistrarDiscretization registrarBQ9("bin9q",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(9, mdlp::strategy_t::QUANTILE);});
|
||||
static platform::RegistrarDiscretization registrarBU10("bin10u",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(10, mdlp::strategy_t::UNIFORM);});
|
||||
static platform::RegistrarDiscretization registrarBQ10("bin10q",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(10, mdlp::strategy_t::QUANTILE);});
|
||||
#endif
|
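Adding a further strategy only takes another registrar in this header; a purely hypothetical 15-bin uniform variant, for example:

// Hypothetical extra discretizer; "bin15u" is not registered in the codebase above.
static platform::RegistrarDiscretization registrarBU15("bin15u",
    [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(15, mdlp::strategy_t::UNIFORM);});

Any new key would also need to be added to the discretize_algo whitelist in DotEnv below, or .env files selecting it would be rejected.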
@@ -13,9 +13,55 @@ namespace platform {
|
||||
class DotEnv {
|
||||
private:
|
||||
std::map<std::string, std::string> env;
|
||||
std::map<std::string, std::vector<std::string>> valid;
|
||||
public:
|
||||
DotEnv()
|
||||
DotEnv(bool create = false)
|
||||
{
|
||||
valid =
|
||||
{
|
||||
{"depth", {"any"}},
|
||||
{"discretize", {"0", "1"}},
|
||||
{"discretize_algo", {"mdlp", "bin3u", "bin3q", "bin4u", "bin4q", "bin5q", "bin5u", "bin6q", "bin6u", "bin7q", "bin7u", "bin8q", "bin8u", "bin9q", "bin9u", "bin10q", "bin10u"}},
|
||||
{"experiment", {"discretiz", "odte", "covid", "Test"}},
|
||||
{"fit_features", {"0", "1"}},
|
||||
{"framework", {"bulma", "bootstrap"}},
|
||||
{"ignore_nan", {"0", "1"}},
|
||||
{"leaves", {"any"}},
|
||||
{"margin", {"0.1", "0.2", "0.3"}},
|
||||
{"model", {"any"}},
|
||||
{"n_folds", {"5", "10"}},
|
||||
{"nodes", {"any"}},
|
||||
{"platform", {"any"}},
|
||||
{"stratified", {"0", "1"}},
|
||||
{"score", {"accuracy", "roc-auc-ovr"}},
|
||||
{"seeds", {"any"}},
|
||||
{"smooth_strat", {"ORIGINAL", "LAPLACE", "CESTNIK"}},
|
||||
{"source_data", {"Arff", "Tanveer", "Surcov", "Test"}},
|
||||
};
|
||||
if (create) {
|
||||
// For testing purposes
|
||||
std::ofstream file(".env");
|
||||
file << "experiment=Test" << std::endl;
|
||||
file << "source_data=Test" << std::endl;
|
||||
file << "margin=0.1" << std::endl;
|
||||
file << "score=accuracy" << std::endl;
|
||||
file << "platform=um790Linux" << std::endl;
|
||||
file << "n_folds=5" << std::endl;
|
||||
file << "discretize_algo=mdlp" << std::endl;
|
||||
file << "smooth_strat=ORIGINAL" << std::endl;
|
||||
file << "stratified=0" << std::endl;
|
||||
file << "model=TAN" << std::endl;
|
||||
file << "seeds=[271]" << std::endl;
|
||||
file << "discretize=0" << std::endl;
|
||||
file << "ignore_nan=0" << std::endl;
|
||||
file << "nodes=Nodes" << std::endl;
|
||||
file << "leaves=Edges" << std::endl;
|
||||
file << "depth=States" << std::endl;
|
||||
file << "fit_features=0" << std::endl;
|
||||
file << "framework=bulma" << std::endl;
|
||||
file << "margin=0.1" << std::endl;
|
||||
file.close();
|
||||
}
|
||||
std::ifstream file(".env");
|
||||
if (!file.is_open()) {
|
||||
std::cerr << "File .env not found" << std::endl;
|
||||
@@ -30,12 +76,62 @@ namespace platform {
|
||||
std::istringstream iss(line);
|
||||
std::string key, value;
|
||||
if (std::getline(iss, key, '=') && std::getline(iss, value)) {
|
||||
key = trim(key);
|
||||
value = trim(value);
|
||||
parse(key, value);
|
||||
env[key] = value;
|
||||
}
|
||||
}
|
||||
parseEnv();
|
||||
}
|
||||
void parse(const std::string& key, const std::string& value)
|
||||
{
|
||||
if (valid.find(key) == valid.end()) {
|
||||
std::cerr << "Invalid key in .env: " << key << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
if (valid[key].front() == "any") {
|
||||
return;
|
||||
}
|
||||
if (std::find(valid[key].begin(), valid[key].end(), value) == valid[key].end()) {
|
||||
std::cerr << "Invalid value in .env: " << key << " = " << value << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
std::vector<std::string> valid_tokens(const std::string& key)
|
||||
{
|
||||
if (valid.find(key) == valid.end()) {
|
||||
return {};
|
||||
}
|
||||
return valid.at(key);
|
||||
}
|
||||
std::string valid_values(const std::string& key)
|
||||
{
|
||||
std::string valid_values = "{", sep = "";
|
||||
if (valid.find(key) == valid.end()) {
|
||||
return "{}";
|
||||
}
|
||||
for (const auto& value : valid.at(key)) {
|
||||
valid_values += sep + value;
|
||||
sep = ", ";
|
||||
}
|
||||
return valid_values + "}";
|
||||
}
|
||||
void parseEnv()
|
||||
{
|
||||
for (auto& [key, values] : valid) {
|
||||
if (env.find(key) == env.end()) {
|
||||
std::cerr << "Key not found in .env: " << key << ", valid values: " << valid_values(key) << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
std::string get(const std::string& key)
|
||||
{
|
||||
if (env.find(key) == env.end()) {
|
||||
std::cerr << "Key not found in .env: " << key << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
return env.at(key);
|
||||
}
|
||||
std::vector<int> getSeeds()
|
||||
|
@@ -6,15 +6,30 @@
|
||||
namespace platform {
|
||||
class Paths {
|
||||
public:
|
||||
static std::string results() { return "results/"; }
|
||||
static std::string hiddenResults() { return "hidden_results/"; }
|
||||
static std::string excel() { return "excel/"; }
|
||||
static std::string grid() { return "grid/"; }
|
||||
static std::string createIfNotExists(const std::string& folder)
|
||||
{
|
||||
if (!std::filesystem::exists(folder)) {
|
||||
std::filesystem::create_directory(folder);
|
||||
}
|
||||
return folder;
|
||||
}
|
||||
static std::string results() { return createIfNotExists("results/"); }
|
||||
static std::string hiddenResults() { return createIfNotExists("hidden_results/"); }
|
||||
static std::string excel() { return createIfNotExists("excel/"); }
|
||||
static std::string grid() { return createIfNotExists("grid/"); }
|
||||
static std::string graphs() { return createIfNotExists("graphs/"); }
|
||||
static std::string tex() { return createIfNotExists("tex/"); }
|
||||
static std::string datasets()
|
||||
{
|
||||
auto env = platform::DotEnv();
|
||||
return env.get("source_data");
|
||||
}
|
||||
static std::string experiment_file(const std::string& fileName, bool discretize, bool stratified, int seed, int nfold)
|
||||
{
|
||||
std::string disc = discretize ? "_disc_" : "_ndisc_";
|
||||
std::string strat = stratified ? "strat_" : "nstrat_";
|
||||
return "datasets_experiment/" + fileName + disc + strat + std::to_string(seed) + "_" + std::to_string(nfold) + ".json";
|
||||
}
|
||||
static void createPath(const std::string& path)
|
||||
{
|
||||
// Create directory if it does not exist
|
||||
@@ -25,7 +40,16 @@ namespace platform {
|
||||
throw std::runtime_error("Could not create directory " + path);
|
||||
}
|
||||
}
|
||||
static std::string bestResultsFile(const std::string& score, const std::string& model)
|
||||
{
|
||||
return "best_results_" + score + "_" + model + ".json";
|
||||
}
|
||||
static std::string bestResultsExcel(const std::string& score)
|
||||
{
|
||||
return "BestResults_" + score + ".xlsx";
|
||||
}
|
||||
static std::string excelResults() { return "some_results.xlsx"; }
|
||||
static std::string excelDatasets() { return "datasets.xlsx"; }
|
||||
static std::string grid_input(const std::string& model)
|
||||
{
|
||||
return grid() + "grid_" + model + "_input.json";
|
||||
@@ -34,6 +58,23 @@ namespace platform {
|
||||
{
|
||||
return grid() + "grid_" + model + "_output.json";
|
||||
}
|
||||
static std::string tex_output()
|
||||
{
|
||||
return "results.tex";
|
||||
}
|
||||
static std::string md_output()
|
||||
{
|
||||
return "results.md";
|
||||
}
|
||||
static std::string tex_post_hoc()
|
||||
{
|
||||
return "post_hoc.tex";
|
||||
}
|
||||
static std::string md_post_hoc()
|
||||
{
|
||||
return "post_hoc.md";
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
#endif
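The helpers above compose into concrete output paths; a minimal usage sketch (hypothetical caller code, the model and score names are illustrative):

    auto grid_in = platform::Paths::grid_input("TAN");   // "grid/grid_TAN_input.json" (grid/ is created on demand)
    auto excel = platform::Paths::excel() + platform::Paths::bestResultsExcel("accuracy");  // "excel/BestResults_accuracy.xlsx"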
|
38
src/common/SourceData.h.in
Normal file
@@ -0,0 +1,38 @@
#ifndef SOURCEDATA_H
#define SOURCEDATA_H
namespace platform {
enum fileType_t { CSV, ARFF, RDATA };
class SourceData {
public:
SourceData(std::string source)
{
if (source == "Surcov") {
path = "datasets/";
fileType = CSV;
} else if (source == "Arff") {
path = "datasets/";
fileType = ARFF;
} else if (source == "Tanveer") {
path = "data/";
fileType = RDATA;
} else if (source == "Test") {
path = "@TEST_DATA_PATH@/";
fileType = ARFF;
} else {
throw std::invalid_argument("Unknown source.");
}
}
std::string getPath()
{
return path;
}
fileType_t getFileType()
{
return fileType;
}
private:
std::string path;
fileType_t fileType;
};
}
#endif
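SourceData maps a named source to a data directory and file format; the @TEST_DATA_PATH@ placeholder is filled in when CMake configures this .h.in template. A minimal usage sketch (hypothetical caller code):

    auto source = platform::SourceData("Arff");
    std::string dir = source.getPath();               // "datasets/"
    platform::fileType_t type = source.getFileType(); // ARFF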
|
@@ -9,10 +9,13 @@ namespace platform {
inline static const std::string black_star{ "\u2605" };
inline static const std::string cross{ "\u2717" };
inline static const std::string upward_arrow{ "\u27B6" };
inline static const std::string down_arrow{ "\u27B4" };
inline static const std::string downward_arrow{ "\u27B4" };
inline static const std::string up_arrow{ "\u2B06" };
inline static const std::string down_arrow{ "\u2B07" };
inline static const std::string ellipsis{ "\u2026" };
inline static const std::string equal_best{ check_mark };
inline static const std::string better_best{ black_star };
inline static const std::string notebook{ "\U0001F5C8" };
};
}
#endif // !SYMBOLS_H
#endif
|
106
src/common/TensorUtils.hpp
Normal file
@@ -0,0 +1,106 @@
|
||||
#ifndef TENSORUTILS_HPP
|
||||
#define TENSORUTILS_HPP
|
||||
#include <torch/torch.h>
|
||||
#include <vector>
|
||||
namespace platform {
|
||||
class TensorUtils {
|
||||
public:
|
||||
template <typename T>
|
||||
static std::vector<T> tensorToVector(const torch::Tensor& tensor)
|
||||
{
|
||||
torch::Tensor contig_tensor = tensor.contiguous();
|
||||
auto num_elements = contig_tensor.numel();
|
||||
const T* tensor_data = contig_tensor.data_ptr<T>();
|
||||
std::vector<T> result(tensor_data, tensor_data + num_elements);
|
||||
return result;
|
||||
}
|
||||
static std::vector<std::vector<int>> to_matrix(const torch::Tensor& X)
|
||||
{
|
||||
// Ensure tensor is contiguous in memory
|
||||
auto X_contig = X.contiguous();
|
||||
|
||||
// Access tensor data pointer directly
|
||||
auto data_ptr = X_contig.data_ptr<int>();
|
||||
|
||||
// If you are using int64_t as the data type, use the following lines instead:
|
||||
//auto data_ptr = X_contig.data_ptr<int64_t>();
|
||||
//std::vector<std::vector<int64_t>> data(X.size(0), std::vector<int64_t>(X.size(1)));
|
||||
|
||||
// Prepare output container
|
||||
std::vector<std::vector<int>> data(X.size(0), std::vector<int>(X.size(1)));
|
||||
|
||||
// Fill the 2D vector in a single loop using pointer arithmetic
|
||||
int rows = X.size(0);
|
||||
int cols = X.size(1);
|
||||
for (int i = 0; i < rows; ++i) {
|
||||
std::copy(data_ptr + i * cols, data_ptr + (i + 1) * cols, data[i].begin());
|
||||
}
|
||||
return data;
|
||||
}
|
||||
template <typename T>
|
||||
static std::vector<T> to_vector(const torch::Tensor& y)
|
||||
{
|
||||
// Ensure the tensor is contiguous in memory
|
||||
auto y_contig = y.contiguous();
|
||||
|
||||
// Access data pointer
|
||||
auto data_ptr = y_contig.data_ptr<T>();
|
||||
|
||||
// Prepare output container
|
||||
std::vector<T> data(y.size(0));
|
||||
|
||||
// Copy data efficiently
|
||||
std::copy(data_ptr, data_ptr + y.size(0), data.begin());
|
||||
|
||||
return data;
|
||||
}
|
||||
static torch::Tensor to_matrix(const std::vector<std::vector<int>>& data)
|
||||
{
|
||||
if (data.empty()) return torch::empty({ 0, 0 }, torch::kInt64);
|
||||
size_t rows = data.size();
|
||||
size_t cols = data[0].size();
|
||||
torch::Tensor tensor = torch::empty({ static_cast<long>(rows), static_cast<long>(cols) }, torch::kInt64);
|
||||
for (size_t i = 0; i < rows; ++i) {
|
||||
for (size_t j = 0; j < cols; ++j) {
|
||||
tensor.index_put_({static_cast<int64_t>(i), static_cast<int64_t>(j)}, torch::scalar_tensor(data[i][j]));
|
||||
}
|
||||
}
|
||||
return tensor;
|
||||
}
|
||||
};
|
||||
static void dumpVector(const std::vector<std::vector<int>>& vec, const std::string& name)
|
||||
{
|
||||
std::cout << name << ": " << std::endl;
|
||||
for (const auto& row : vec) {
|
||||
std::cout << "[";
|
||||
for (const auto& val : row) {
|
||||
std::cout << val << " ";
|
||||
}
|
||||
std::cout << "]" << std::endl;
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
static void dumpTensor(const torch::Tensor& tensor, const std::string& name)
|
||||
{
|
||||
std::cout << name << ": " << std::endl;
|
||||
for (auto i = 0; i < tensor.size(0); i++) {
|
||||
std::cout << "[";
|
||||
for (auto j = 0; j < tensor.size(1); j++) {
|
||||
std::cout << tensor[i][j].item<int>() << " ";
|
||||
}
|
||||
std::cout << "]" << std::endl;
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
static void dumpTensorV(const torch::Tensor& tensor, const std::string& name)
|
||||
{
|
||||
std::cout << name << ": " << std::endl;
|
||||
std::cout << "[";
|
||||
for (int i = 0; i < tensor.size(0); i++) {
|
||||
std::cout << tensor[i].item<int>() << " ";
|
||||
}
|
||||
std::cout << "]" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
#endif // TENSORUTILS_HPP
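A minimal sketch of how these helpers round-trip between torch tensors and STL containers (hypothetical caller code; to_matrix(const torch::Tensor&) reads through data_ptr<int>(), so the input tensor is assumed to hold 32-bit ints):

    torch::Tensor X = torch::tensor({ {1, 2}, {3, 4} }, torch::kInt32);
    auto rows = platform::TensorUtils::to_matrix(X);             // {{1, 2}, {3, 4}}
    auto flat = platform::TensorUtils::tensorToVector<int>(X);   // {1, 2, 3, 4}
    torch::Tensor back = platform::TensorUtils::to_matrix(rows); // 2x2 kInt64 tensor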
|
@@ -40,4 +40,4 @@ namespace platform {
}
};
} /* namespace platform */
#endif /* TIMER_H */
#endif
|
@@ -1,20 +1,20 @@
|
||||
#ifndef UTILS_H
|
||||
#define UTILS_H
|
||||
|
||||
#include <unistd.h>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <cmath>
|
||||
#include <ctime>
|
||||
#include <iomanip>
|
||||
#include <string.h>
|
||||
|
||||
extern char** environ;
|
||||
|
||||
namespace platform {
|
||||
//static std::vector<std::string> split(const std::string& text, char delimiter);
|
||||
static std::vector<std::string> split(const std::string& text, char delimiter)
|
||||
{
|
||||
std::vector<std::string> result;
|
||||
std::stringstream ss(text);
|
||||
std::string token;
|
||||
while (std::getline(ss, token, delimiter)) {
|
||||
result.push_back(token);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
static std::string trim(const std::string& str)
|
||||
{
|
||||
std::string result = str;
|
||||
@@ -26,5 +26,104 @@ namespace platform {
|
||||
}).base(), result.end());
|
||||
return result;
|
||||
}
|
||||
static std::vector<std::string> split(const std::string& text, char delimiter)
|
||||
{
|
||||
std::vector<std::string> result;
|
||||
std::stringstream ss(text);
|
||||
std::string token;
|
||||
while (std::getline(ss, token, delimiter)) {
|
||||
result.push_back(trim(token));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
inline double compute_std(std::vector<double> values, double mean)
|
||||
{
|
||||
// Compute the standard deviation of the values
|
||||
double sum = 0.0;
|
||||
for (const auto& value : values) {
|
||||
sum += std::pow(value - mean, 2);
|
||||
}
|
||||
double variance = sum / values.size();
|
||||
return std::sqrt(variance);
|
||||
}
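// Reference sketch: compute_std returns the population standard deviation,
// sqrt( (1/N) * sum_i (values[i] - mean)^2 ), i.e. with no Bessel (N-1) correction.
// Example: compute_std({ 2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0 }, 5.0) == 2.0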
|
||||
inline std::string get_date()
|
||||
{
|
||||
time_t rawtime;
|
||||
tm* timeinfo;
|
||||
time(&rawtime);
|
||||
timeinfo = std::localtime(&rawtime);
|
||||
std::ostringstream oss;
|
||||
oss << std::put_time(timeinfo, "%Y-%m-%d");
|
||||
return oss.str();
|
||||
}
|
||||
inline std::string get_time()
|
||||
{
|
||||
time_t rawtime;
|
||||
tm* timeinfo;
|
||||
time(&rawtime);
|
||||
timeinfo = std::localtime(&rawtime);
|
||||
std::ostringstream oss;
|
||||
oss << std::put_time(timeinfo, "%H:%M:%S");
|
||||
return oss.str();
|
||||
}
|
||||
static void openFile(const std::string& fileName)
|
||||
{
|
||||
// #ifdef __APPLE__
|
||||
// // macOS uses the "open" command
|
||||
// std::string command = "open";
|
||||
// #elif defined(__linux__)
|
||||
// // Linux typically uses "xdg-open"
|
||||
// std::string command = "xdg-open";
|
||||
// #else
|
||||
// // For other OSes, do nothing or handle differently
|
||||
// std::cerr << "Unsupported platform." << std::endl;
|
||||
// return;
|
||||
// #endif
|
||||
// execlp(command.c_str(), command.c_str(), fileName.c_str(), NULL);
|
||||
#ifdef __APPLE__
|
||||
const char* tool = "/usr/bin/open";
|
||||
#elif defined(__linux__)
|
||||
const char* tool = "/usr/bin/xdg-open";
|
||||
#else
|
||||
std::cerr << "Unsupported platform." << std::endl;
|
||||
return;
|
||||
#endif
|
||||
|
||||
// We'll build an argv array for execve:
|
||||
std::vector<char*> argv;
|
||||
argv.push_back(const_cast<char*>(tool)); // argv[0]
|
||||
argv.push_back(const_cast<char*>(fileName.c_str())); // argv[1]
|
||||
argv.push_back(nullptr);
|
||||
|
||||
// Make a new environment array, skipping BASH_FUNC_ variables
|
||||
std::vector<std::string> filteredEnv;
|
||||
for (char** env = environ; *env != nullptr; ++env) {
|
||||
// *env is a string like "NAME=VALUE"
|
||||
// We want to skip those starting with "BASH_FUNC_"
|
||||
if (strncmp(*env, "BASH_FUNC_", 10) == 0) {
|
||||
// skip it
|
||||
continue;
|
||||
}
|
||||
filteredEnv.push_back(*env);
|
||||
}
|
||||
|
||||
// Convert filteredEnv into a char* array
|
||||
std::vector<char*> envp;
|
||||
for (auto& var : filteredEnv) {
|
||||
envp.push_back(const_cast<char*>(var.c_str()));
|
||||
}
|
||||
envp.push_back(nullptr);
|
||||
|
||||
// Now call execve with the cleaned environment
|
||||
// NOTE: absolute paths to the tool are hard-coded above; adjust them if "open" or "xdg-open" is installed elsewhere.
|
||||
execve(tool, argv.data(), envp.data());
|
||||
|
||||
// If we reach here, execve failed
|
||||
perror("execve failed");
|
||||
// This would terminate your current process if it's not in a child
|
||||
// Usually you'd do something like:
|
||||
_exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
#endif
|
492
src/experimental_clfs/AdaBoost.cpp
Normal file
@@ -0,0 +1,492 @@
|
||||
// ***************************************************************
|
||||
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
|
||||
// SPDX-FileType: SOURCE
|
||||
// SPDX-License-Identifier: MIT
|
||||
// ***************************************************************
|
||||
|
||||
#include "AdaBoost.h"
|
||||
#include "DecisionTree.h"
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
#include <sstream>
|
||||
#include <iomanip>
|
||||
#include "common/TensorUtils.hpp"
|
||||
|
||||
// Conditional debug macro for performance-critical sections
|
||||
#define DEBUG_LOG(condition, ...) \
|
||||
do { \
|
||||
if (__builtin_expect((condition), 0)) { \
|
||||
std::cout << __VA_ARGS__ << std::endl; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
namespace bayesnet {
|
||||
|
||||
AdaBoost::AdaBoost(int n_estimators, int max_depth)
|
||||
: Ensemble(true), n_estimators(n_estimators), base_max_depth(max_depth), n(0), n_classes(0)
|
||||
{
|
||||
validHyperparameters = { "n_estimators", "base_max_depth" };
|
||||
}
|
||||
|
||||
// Optimized version of buildModel - replaces the previous implementation in AdaBoost.cpp:
|
||||
|
||||
void AdaBoost::buildModel(const torch::Tensor& weights)
|
||||
{
|
||||
// Initialize variables
|
||||
models.clear();
|
||||
alphas.clear();
|
||||
training_errors.clear();
|
||||
|
||||
// Initialize n (number of features) and n_classes
|
||||
n = dataset.size(0) - 1; // Exclude the label row
|
||||
n_classes = states[className].size();
|
||||
|
||||
// Initialize sample weights uniformly
|
||||
int n_samples = dataset.size(1);
|
||||
sample_weights = torch::ones({ n_samples }) / n_samples;
|
||||
|
||||
// If initial weights are provided, incorporate them
|
||||
if (weights.defined() && weights.numel() > 0) {
|
||||
if (weights.size(0) != n_samples) {
|
||||
throw std::runtime_error("weights must have the same length as number of samples");
|
||||
}
|
||||
sample_weights = weights.clone();
|
||||
normalizeWeights();
|
||||
}
|
||||
|
||||
// Conditional debug information (only when debug is enabled)
|
||||
DEBUG_LOG(debug, "Starting AdaBoost training with " << n_estimators << " estimators\n"
|
||||
<< "Number of classes: " << n_classes << "\n"
|
||||
<< "Number of features: " << n << "\n"
|
||||
<< "Number of samples: " << n_samples);
|
||||
|
||||
// Pre-compute random guess error threshold
|
||||
const double random_guess_error = 1.0 - (1.0 / static_cast<double>(n_classes));
|
||||
|
||||
// Main AdaBoost training loop (SAMME algorithm)
|
||||
for (int iter = 0; iter < n_estimators; ++iter) {
|
||||
// Train base estimator with current sample weights
|
||||
auto estimator = trainBaseEstimator(sample_weights);
|
||||
|
||||
// Calculate weighted error
|
||||
double weighted_error = calculateWeightedError(estimator.get(), sample_weights);
|
||||
training_errors.push_back(weighted_error);
|
||||
|
||||
// According to SAMME, we need error < random_guess_error
|
||||
if (weighted_error >= random_guess_error) {
|
||||
DEBUG_LOG(debug, "Error >= random guess (" << random_guess_error << "), stopping");
|
||||
// If only one estimator and it's worse than random, keep it with zero weight
|
||||
if (models.empty()) {
|
||||
models.push_back(std::move(estimator));
|
||||
alphas.push_back(0.0);
|
||||
}
|
||||
break; // Stop boosting
|
||||
}
|
||||
|
||||
// Check for perfect classification BEFORE calculating alpha
|
||||
if (weighted_error <= 1e-10) {
|
||||
DEBUG_LOG(debug, "Perfect classification achieved (error=" << weighted_error << ")");
|
||||
|
||||
// For perfect classification, use a large but finite alpha
|
||||
double alpha = 10.0 + std::log(static_cast<double>(n_classes - 1));
|
||||
|
||||
// Store the estimator and its weight
|
||||
models.push_back(std::move(estimator));
|
||||
alphas.push_back(alpha);
|
||||
|
||||
DEBUG_LOG(debug, "Iteration " << iter << ":\n"
|
||||
<< " Weighted error: " << weighted_error << "\n"
|
||||
<< " Alpha (finite): " << alpha << "\n"
|
||||
<< " Random guess error: " << random_guess_error);
|
||||
|
||||
break; // Stop training as we have a perfect classifier
|
||||
}
|
||||
|
||||
// Calculate alpha (estimator weight) using SAMME formula
|
||||
// alpha = log((1 - err) / err) + log(K - 1)
|
||||
// Clamp weighted_error to avoid division by zero and infinite alpha
|
||||
double clamped_error = std::max(1e-15, std::min(1.0 - 1e-15, weighted_error));
|
||||
double alpha = std::log((1.0 - clamped_error) / clamped_error) +
|
||||
std::log(static_cast<double>(n_classes - 1));
|
||||
|
||||
// Clamp alpha to reasonable bounds to avoid numerical issues
|
||||
alpha = std::max(-10.0, std::min(10.0, alpha));
|
||||
|
||||
// Store the estimator and its weight
|
||||
models.push_back(std::move(estimator));
|
||||
alphas.push_back(alpha);
|
||||
|
||||
// Update sample weights (only if this is not the last iteration)
|
||||
if (iter < n_estimators - 1) {
|
||||
updateSampleWeights(models.back().get(), alpha);
|
||||
normalizeWeights();
|
||||
}
|
||||
|
||||
DEBUG_LOG(debug, "Iteration " << iter << ":\n"
|
||||
<< " Weighted error: " << weighted_error << "\n"
|
||||
<< " Alpha: " << alpha << "\n"
|
||||
<< " Random guess error: " << random_guess_error);
|
||||
}
|
||||
|
||||
// Set the number of models actually trained
|
||||
n_models = models.size();
|
||||
DEBUG_LOG(debug, "AdaBoost training completed with " << n_models << " models");
|
||||
}
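// Reference sketch of the per-iteration quantities used above (SAMME formulation, K = n_classes,
// eps_m = weighted error of estimator m as computed by calculateWeightedError):
//   eps_m   = sum_i w_i * 1[h_m(x_i) != y_i]
//   alpha_m = ln((1 - eps_m) / eps_m) + ln(K - 1)
//   w_i    <- w_i * exp(alpha_m * 1[h_m(x_i) != y_i]),  then renormalized so sum_i w_i = 1
// Boosting only helps while eps_m < 1 - 1/K, the random-guess threshold checked above.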
|
||||
|
||||
void AdaBoost::trainModel(const torch::Tensor& weights, const Smoothing_t smoothing)
|
||||
{
|
||||
// Call buildModel which does the actual training
|
||||
buildModel(weights);
|
||||
fitted = true;
|
||||
}
|
||||
|
||||
std::unique_ptr<Classifier> AdaBoost::trainBaseEstimator(const torch::Tensor& weights)
|
||||
{
|
||||
// Create a decision tree with specified max depth
|
||||
auto tree = std::make_unique<DecisionTree>(base_max_depth);
|
||||
|
||||
// Ensure weights are properly normalized
|
||||
auto normalized_weights = weights / weights.sum();
|
||||
|
||||
// Fit the tree with the current sample weights
|
||||
tree->fit(dataset, features, className, states, normalized_weights, Smoothing_t::NONE);
|
||||
|
||||
return tree;
|
||||
}
|
||||
|
||||
double AdaBoost::calculateWeightedError(Classifier* estimator, const torch::Tensor& weights)
|
||||
{
|
||||
// Get features and labels from dataset (avoid repeated indexing)
|
||||
auto X = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), torch::indexing::Slice() });
|
||||
auto y_true = dataset.index({ -1, torch::indexing::Slice() });
|
||||
|
||||
// Get predictions from the estimator
|
||||
auto y_pred = estimator->predict(X);
|
||||
|
||||
// Vectorized error calculation using PyTorch operations
|
||||
auto incorrect = (y_pred != y_true).to(torch::kDouble);
|
||||
|
||||
// Direct dot product for weighted error (more efficient than sum)
|
||||
double weighted_error = torch::dot(incorrect, weights).item<double>();
|
||||
|
||||
// Clamp to valid range in one operation
|
||||
return std::clamp(weighted_error, 1e-15, 1.0 - 1e-15);
|
||||
}
|
||||
|
||||
void AdaBoost::updateSampleWeights(Classifier* estimator, double alpha)
|
||||
{
|
||||
// Get predictions from the estimator (reuse from calculateWeightedError if possible)
|
||||
auto X = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), torch::indexing::Slice() });
|
||||
auto y_true = dataset.index({ -1, torch::indexing::Slice() });
|
||||
auto y_pred = estimator->predict(X);
|
||||
|
||||
// Vectorized weight update using PyTorch operations
|
||||
auto incorrect = (y_pred != y_true).to(torch::kDouble);
|
||||
|
||||
// Single vectorized operation instead of element-wise multiplication
|
||||
sample_weights *= torch::exp(alpha * incorrect);
|
||||
|
||||
// Vectorized clamping for numerical stability
|
||||
sample_weights = torch::clamp(sample_weights, 1e-15, 1e15);
|
||||
}
|
||||
|
||||
void AdaBoost::normalizeWeights()
|
||||
{
|
||||
// Single-pass normalization using PyTorch operations
|
||||
double sum_weights = torch::sum(sample_weights).item<double>();
|
||||
|
||||
if (__builtin_expect(sum_weights <= 0, 0)) {
|
||||
// Reset to uniform if all weights are zero/negative (rare case)
|
||||
sample_weights = torch::ones_like(sample_weights) / sample_weights.size(0);
|
||||
} else {
|
||||
// Vectorized normalization
|
||||
sample_weights /= sum_weights;
|
||||
|
||||
// Vectorized minimum weight enforcement
|
||||
sample_weights = torch::clamp_min(sample_weights, 1e-15);
|
||||
|
||||
// Renormalize after clamping (if any weights were clamped)
|
||||
double new_sum = torch::sum(sample_weights).item<double>();
|
||||
if (new_sum != 1.0) {
|
||||
sample_weights /= new_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> AdaBoost::graph(const std::string& title) const
|
||||
{
|
||||
// Create a graph representation of the AdaBoost ensemble
|
||||
std::vector<std::string> graph_lines;
|
||||
|
||||
// Header
|
||||
graph_lines.push_back("digraph AdaBoost {");
|
||||
graph_lines.push_back(" rankdir=TB;");
|
||||
graph_lines.push_back(" node [shape=box];");
|
||||
|
||||
if (!title.empty()) {
|
||||
graph_lines.push_back(" label=\"" + title + "\";");
|
||||
graph_lines.push_back(" labelloc=t;");
|
||||
}
|
||||
|
||||
// Add input node
|
||||
graph_lines.push_back(" Input [shape=ellipse, label=\"Input Features\"];");
|
||||
|
||||
// Add base estimators
|
||||
for (size_t i = 0; i < models.size(); ++i) {
|
||||
std::stringstream ss;
|
||||
ss << " Estimator" << i << " [label=\"Base Estimator " << i + 1
|
||||
<< "\\nα = " << std::fixed << std::setprecision(3) << alphas[i] << "\"];";
|
||||
graph_lines.push_back(ss.str());
|
||||
|
||||
// Connect input to estimator
|
||||
ss.str("");
|
||||
ss << " Input -> Estimator" << i << ";";
|
||||
graph_lines.push_back(ss.str());
|
||||
}
|
||||
|
||||
// Add combination node
|
||||
graph_lines.push_back(" Combination [shape=diamond, label=\"Weighted Vote\"];");
|
||||
|
||||
// Connect estimators to combination
|
||||
for (size_t i = 0; i < models.size(); ++i) {
|
||||
std::stringstream ss;
|
||||
ss << " Estimator" << i << " -> Combination;";
|
||||
graph_lines.push_back(ss.str());
|
||||
}
|
||||
|
||||
// Add output node
|
||||
graph_lines.push_back(" Output [shape=ellipse, label=\"Final Prediction\"];");
|
||||
graph_lines.push_back(" Combination -> Output;");
|
||||
|
||||
// Close graph
|
||||
graph_lines.push_back("}");
|
||||
|
||||
return graph_lines;
|
||||
}
|
||||
|
||||
void AdaBoost::checkValues() const
|
||||
{
|
||||
if (n_estimators <= 0) {
|
||||
throw std::invalid_argument("n_estimators must be positive");
|
||||
}
|
||||
if (base_max_depth <= 0) {
|
||||
throw std::invalid_argument("base_max_depth must be positive");
|
||||
}
|
||||
}
|
||||
|
||||
void AdaBoost::setHyperparameters(const nlohmann::json& hyperparameters_)
|
||||
{
|
||||
auto hyperparameters = hyperparameters_;
|
||||
// Set hyperparameters from JSON
|
||||
auto it = hyperparameters.find("n_estimators");
|
||||
if (it != hyperparameters.end()) {
|
||||
n_estimators = it->get<int>();
|
||||
hyperparameters.erase("n_estimators");
|
||||
}
|
||||
|
||||
it = hyperparameters.find("base_max_depth");
|
||||
if (it != hyperparameters.end()) {
|
||||
base_max_depth = it->get<int>();
|
||||
hyperparameters.erase("base_max_depth");
|
||||
}
|
||||
checkValues();
|
||||
Ensemble::setHyperparameters(hyperparameters);
|
||||
}
|
||||
|
||||
int AdaBoost::predictSample(const torch::Tensor& x) const
|
||||
{
|
||||
// Early validation (keep essential checks only)
|
||||
if (!fitted || models.empty()) {
|
||||
throw std::runtime_error(CLASSIFIER_NOT_FITTED);
|
||||
}
|
||||
|
||||
// Pre-allocate and reuse memory
|
||||
static thread_local std::vector<double> class_votes_cache;
|
||||
if (class_votes_cache.size() != static_cast<size_t>(n_classes)) {
|
||||
class_votes_cache.resize(n_classes);
|
||||
}
|
||||
std::fill(class_votes_cache.begin(), class_votes_cache.end(), 0.0);
|
||||
|
||||
// Optimized voting loop - avoid exception handling in hot path
|
||||
for (size_t i = 0; i < models.size(); ++i) {
|
||||
double alpha = alphas[i];
|
||||
if (alpha <= 0 || !std::isfinite(alpha)) continue;
|
||||
|
||||
// Direct cast and call - avoid virtual dispatch overhead
|
||||
int predicted_class = static_cast<DecisionTree*>(models[i].get())->predictSample(x);
|
||||
|
||||
// Bounds check with branch prediction hint
|
||||
if (__builtin_expect(predicted_class >= 0 && predicted_class < n_classes, 1)) {
|
||||
class_votes_cache[predicted_class] += alpha;
|
||||
}
|
||||
}
|
||||
|
||||
// Fast argmax using iterators
|
||||
return std::distance(class_votes_cache.begin(),
|
||||
std::max_element(class_votes_cache.begin(), class_votes_cache.end()));
|
||||
}
|
||||
|
||||
torch::Tensor AdaBoost::predictProbaSample(const torch::Tensor& x) const
|
||||
{
|
||||
// Early validation
|
||||
if (!fitted || models.empty()) {
|
||||
throw std::runtime_error(CLASSIFIER_NOT_FITTED);
|
||||
}
|
||||
|
||||
// Use stack allocation for small arrays (typical case: n_classes <= 32)
|
||||
constexpr int STACK_THRESHOLD = 32;
|
||||
double stack_votes[STACK_THRESHOLD];
|
||||
std::vector<double> heap_votes;
|
||||
double* class_votes;
|
||||
|
||||
if (n_classes <= STACK_THRESHOLD) {
|
||||
class_votes = stack_votes;
|
||||
std::fill_n(class_votes, n_classes, 0.0);
|
||||
} else {
|
||||
heap_votes.resize(n_classes, 0.0);
|
||||
class_votes = heap_votes.data();
|
||||
}
|
||||
|
||||
double total_votes = 0.0;
|
||||
|
||||
// Optimized voting loop
|
||||
for (size_t i = 0; i < models.size(); ++i) {
|
||||
double alpha = alphas[i];
|
||||
if (alpha <= 0 || !std::isfinite(alpha)) continue;
|
||||
|
||||
int predicted_class = static_cast<DecisionTree*>(models[i].get())->predictSample(x);
|
||||
|
||||
if (__builtin_expect(predicted_class >= 0 && predicted_class < n_classes, 1)) {
|
||||
class_votes[predicted_class] += alpha;
|
||||
total_votes += alpha;
|
||||
}
|
||||
}
|
||||
|
||||
// Direct tensor creation with pre-computed size
|
||||
torch::Tensor class_probs = torch::empty({ n_classes }, torch::TensorOptions().dtype(torch::kFloat32));
|
||||
auto probs_accessor = class_probs.accessor<float, 1>();
|
||||
|
||||
if (__builtin_expect(total_votes > 0.0, 1)) {
|
||||
// Vectorized probability calculation
|
||||
const double inv_total = 1.0 / total_votes;
|
||||
for (int j = 0; j < n_classes; ++j) {
|
||||
probs_accessor[j] = static_cast<float>(class_votes[j] * inv_total);
|
||||
}
|
||||
} else {
|
||||
// Uniform distribution fallback
|
||||
const float uniform_prob = 1.0f / n_classes;
|
||||
for (int j = 0; j < n_classes; ++j) {
|
||||
probs_accessor[j] = uniform_prob;
|
||||
}
|
||||
}
|
||||
|
||||
return class_probs;
|
||||
}
|
||||
|
||||
torch::Tensor AdaBoost::predict_proba(torch::Tensor& X)
|
||||
{
|
||||
if (!fitted || models.empty()) {
|
||||
throw std::runtime_error(CLASSIFIER_NOT_FITTED);
|
||||
}
|
||||
|
||||
// Input validation
|
||||
if (X.size(0) != n) {
|
||||
throw std::runtime_error("Input has wrong number of features. Expected " +
|
||||
std::to_string(n) + " but got " + std::to_string(X.size(0)));
|
||||
}
|
||||
|
||||
const int n_samples = X.size(1);
|
||||
|
||||
// Pre-allocate output tensor with correct layout
|
||||
torch::Tensor probabilities = torch::empty({ n_samples, n_classes },
|
||||
torch::TensorOptions().dtype(torch::kFloat32));
|
||||
|
||||
// Convert to contiguous memory if needed (optimization for memory access)
|
||||
if (!X.is_contiguous()) {
|
||||
X = X.contiguous();
|
||||
}
|
||||
|
||||
// Batch processing with memory-efficient sample extraction
|
||||
for (int i = 0; i < n_samples; ++i) {
|
||||
// Extract sample without unnecessary copies
|
||||
auto sample = X.select(1, i);
|
||||
|
||||
// Direct assignment to pre-allocated tensor
|
||||
probabilities[i] = predictProbaSample(sample);
|
||||
}
|
||||
|
||||
return probabilities;
|
||||
}
|
||||
|
||||
std::vector<std::vector<double>> AdaBoost::predict_proba(std::vector<std::vector<int>>& X)
|
||||
{
|
||||
const size_t n_samples = X[0].size();
|
||||
|
||||
// Pre-allocate result with exact size
|
||||
std::vector<std::vector<double>> result;
|
||||
result.reserve(n_samples);
|
||||
|
||||
// Avoid repeated allocations
|
||||
for (size_t i = 0; i < n_samples; ++i) {
|
||||
result.emplace_back(n_classes, 0.0);
|
||||
}
|
||||
|
||||
// Convert to tensor only once (batch conversion is more efficient)
|
||||
torch::Tensor X_tensor = platform::TensorUtils::to_matrix(X);
|
||||
torch::Tensor proba_tensor = predict_proba(X_tensor);
|
||||
|
||||
// Optimized tensor-to-vector conversion
|
||||
auto proba_accessor = proba_tensor.accessor<float, 2>();
|
||||
for (size_t i = 0; i < n_samples; ++i) {
|
||||
for (int j = 0; j < n_classes; ++j) {
|
||||
result[i][j] = static_cast<double>(proba_accessor[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
torch::Tensor AdaBoost::predict(torch::Tensor& X)
|
||||
{
|
||||
if (!fitted || models.empty()) {
|
||||
throw std::runtime_error(CLASSIFIER_NOT_FITTED);
|
||||
}
|
||||
|
||||
if (X.size(0) != n) {
|
||||
throw std::runtime_error("Input has wrong number of features. Expected " +
|
||||
std::to_string(n) + " but got " + std::to_string(X.size(0)));
|
||||
}
|
||||
|
||||
const int n_samples = X.size(1);
|
||||
|
||||
// Pre-allocate with correct dtype
|
||||
torch::Tensor predictions = torch::empty({ n_samples }, torch::TensorOptions().dtype(torch::kInt32));
|
||||
auto pred_accessor = predictions.accessor<int32_t, 1>();
|
||||
|
||||
// Ensure contiguous memory layout
|
||||
if (!X.is_contiguous()) {
|
||||
X = X.contiguous();
|
||||
}
|
||||
|
||||
// Optimized prediction loop
|
||||
for (int i = 0; i < n_samples; ++i) {
|
||||
auto sample = X.select(1, i);
|
||||
pred_accessor[i] = predictSample(sample);
|
||||
}
|
||||
|
||||
return predictions;
|
||||
}
|
||||
|
||||
std::vector<int> AdaBoost::predict(std::vector<std::vector<int>>& X)
|
||||
{
|
||||
// Single tensor conversion for batch processing
|
||||
torch::Tensor X_tensor = platform::TensorUtils::to_matrix(X);
|
||||
torch::Tensor predictions_tensor = predict(X_tensor);
|
||||
|
||||
// Optimized tensor-to-vector conversion
|
||||
std::vector<int> result = platform::TensorUtils::to_vector<int>(predictions_tensor);
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace bayesnet
|
81
src/experimental_clfs/AdaBoost.h
Normal file
@@ -0,0 +1,81 @@
|
||||
// ***************************************************************
|
||||
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
|
||||
// SPDX-FileType: SOURCE
|
||||
// SPDX-License-Identifier: MIT
|
||||
// ***************************************************************
|
||||
|
||||
#ifndef ADABOOST_H
|
||||
#define ADABOOST_H
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "bayesnet/ensembles/Ensemble.h"
|
||||
|
||||
namespace bayesnet {
|
||||
class AdaBoost : public Ensemble {
|
||||
public:
|
||||
explicit AdaBoost(int n_estimators = 100, int max_depth = 1);
|
||||
virtual ~AdaBoost() = default;
|
||||
|
||||
// Override base class methods
|
||||
std::vector<std::string> graph(const std::string& title = "") const override;
|
||||
|
||||
// AdaBoost specific methods
|
||||
void setNEstimators(int n_estimators) { this->n_estimators = n_estimators; checkValues(); }
|
||||
int getNEstimators() const { return n_estimators; }
|
||||
void setBaseMaxDepth(int depth) { this->base_max_depth = depth; checkValues(); }
|
||||
int getBaseMaxDepth() const { return base_max_depth; }
|
||||
|
||||
// Get the weight of each base estimator
|
||||
std::vector<double> getEstimatorWeights() const { return alphas; }
|
||||
|
||||
// Get training errors for each iteration
|
||||
std::vector<double> getTrainingErrors() const { return training_errors; }
|
||||
|
||||
// Override setHyperparameters from BaseClassifier
|
||||
void setHyperparameters(const nlohmann::json& hyperparameters) override;
|
||||
|
||||
torch::Tensor predict(torch::Tensor& X) override;
|
||||
std::vector<int> predict(std::vector<std::vector<int>>& X) override;
|
||||
torch::Tensor predict_proba(torch::Tensor& X) override;
|
||||
std::vector<std::vector<double>> predict_proba(std::vector<std::vector<int>>& X) override;
|
||||
void setDebug(bool debug) { this->debug = debug; }
|
||||
|
||||
protected:
|
||||
void buildModel(const torch::Tensor& weights) override;
|
||||
void trainModel(const torch::Tensor& weights, const Smoothing_t smoothing) override;
|
||||
|
||||
private:
|
||||
int n_estimators;
|
||||
int base_max_depth; // Max depth for base decision trees
|
||||
std::vector<double> alphas; // Weight of each base estimator
|
||||
std::vector<double> training_errors; // Training error at each iteration
|
||||
torch::Tensor sample_weights; // Current sample weights
|
||||
int n_classes; // Number of classes in the target variable
|
||||
int n; // Number of features
|
||||
|
||||
// Train a single base estimator
|
||||
std::unique_ptr<Classifier> trainBaseEstimator(const torch::Tensor& weights);
|
||||
|
||||
// Calculate weighted error
|
||||
double calculateWeightedError(Classifier* estimator, const torch::Tensor& weights);
|
||||
|
||||
// Update sample weights based on predictions
|
||||
void updateSampleWeights(Classifier* estimator, double alpha);
|
||||
|
||||
// Normalize weights to sum to 1
|
||||
void normalizeWeights();
|
||||
|
||||
// Check if hyperparameters values are valid
|
||||
void checkValues() const;
|
||||
|
||||
// Make predictions for a single sample
|
||||
int predictSample(const torch::Tensor& x) const;
|
||||
|
||||
// Make probabilistic predictions for a single sample
|
||||
torch::Tensor predictProbaSample(const torch::Tensor& x) const;
|
||||
bool debug = false; // Enable debug mode for debug output
|
||||
};
|
||||
}
|
||||
|
||||
#endif // ADABOOST_H
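A minimal usage sketch, mirroring the fit call used for DecisionTree elsewhere in this diff (dataset, features, className, states, weights and X_test are hypothetical caller data):

    bayesnet::AdaBoost clf(100, /*max_depth=*/1);
    clf.setHyperparameters(nlohmann::json{ { "n_estimators", 50 }, { "base_max_depth", 2 } });
    clf.fit(dataset, features, className, states, weights, bayesnet::Smoothing_t::NONE);
    torch::Tensor y_pred = clf.predict(X_test);        // one class index per sample
    torch::Tensor y_prob = clf.predict_proba(X_test);  // (n_samples, n_classes) probabilities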
|
53
src/experimental_clfs/CountingSemaphore.hpp
Normal file
@@ -0,0 +1,53 @@
|
||||
#ifndef COUNTING_SEMAPHORE_H
|
||||
#define COUNTING_SEMAPHORE_H
|
||||
#include <mutex>
|
||||
#include <condition_variable>
|
||||
#include <algorithm>
|
||||
#include <thread>
|
||||
#include <mutex>
|
||||
#include <condition_variable>
|
||||
|
||||
class CountingSemaphore {
|
||||
public:
|
||||
static CountingSemaphore& getInstance()
|
||||
{
|
||||
static CountingSemaphore instance;
|
||||
return instance;
|
||||
}
|
||||
// Delete copy constructor and assignment operator
|
||||
CountingSemaphore(const CountingSemaphore&) = delete;
|
||||
CountingSemaphore& operator=(const CountingSemaphore&) = delete;
|
||||
void acquire()
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(mtx_);
|
||||
cv_.wait(lock, [this]() { return count_ > 0; });
|
||||
--count_;
|
||||
}
|
||||
void release()
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mtx_);
|
||||
++count_;
|
||||
if (count_ <= max_count_) {
|
||||
cv_.notify_one();
|
||||
}
|
||||
}
|
||||
uint getCount() const
|
||||
{
|
||||
return count_;
|
||||
}
|
||||
uint getMaxCount() const
|
||||
{
|
||||
return max_count_;
|
||||
}
|
||||
private:
|
||||
CountingSemaphore()
|
||||
: max_count_(std::max(1u, static_cast<uint>(0.95 * std::thread::hardware_concurrency()))),
|
||||
count_(max_count_)
|
||||
{
|
||||
}
|
||||
std::mutex mtx_;
|
||||
std::condition_variable cv_;
|
||||
const uint max_count_;
|
||||
uint count_;
|
||||
};
|
||||
#endif
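A minimal sketch of the intended usage pattern, mirroring how ExpClf uses it further down in this diff (do_work and n_chunks are hypothetical placeholders):

    auto& sem = CountingSemaphore::getInstance();
    std::vector<std::thread> threads;
    for (int chunk = 0; chunk < n_chunks; ++chunk) {
        sem.acquire();                          // blocks while every slot is taken
        threads.emplace_back([&sem, chunk]() {
            do_work(chunk);                     // hypothetical work function
            sem.release();                      // free the slot for the next chunk
        });
    }
    for (auto& t : threads) { t.join(); }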
|
495
src/experimental_clfs/DecisionTree.cpp
Normal file
@@ -0,0 +1,495 @@
|
||||
// ***************************************************************
|
||||
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
|
||||
// SPDX-FileType: SOURCE
|
||||
// SPDX-License-Identifier: MIT
|
||||
// ***************************************************************
|
||||
|
||||
#include "DecisionTree.h"
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
#include <sstream>
|
||||
#include <iomanip>
|
||||
#include <limits>
|
||||
#include "common/TensorUtils.hpp"
|
||||
|
||||
namespace bayesnet {
|
||||
|
||||
DecisionTree::DecisionTree(int max_depth, int min_samples_split, int min_samples_leaf)
|
||||
: Classifier(Network()), max_depth(max_depth),
|
||||
min_samples_split(min_samples_split), min_samples_leaf(min_samples_leaf)
|
||||
{
|
||||
validHyperparameters = { "max_depth", "min_samples_split", "min_samples_leaf" };
|
||||
}
|
||||
|
||||
void DecisionTree::setHyperparameters(const nlohmann::json& hyperparameters_)
|
||||
{
|
||||
auto hyperparameters = hyperparameters_;
|
||||
// Set hyperparameters from JSON
|
||||
auto it = hyperparameters.find("max_depth");
|
||||
if (it != hyperparameters.end()) {
|
||||
max_depth = it->get<int>();
|
||||
hyperparameters.erase("max_depth"); // Remove 'max_depth' if present
|
||||
}
|
||||
|
||||
it = hyperparameters.find("min_samples_split");
|
||||
if (it != hyperparameters.end()) {
|
||||
min_samples_split = it->get<int>();
|
||||
hyperparameters.erase("min_samples_split"); // Remove 'min_samples_split' if present
|
||||
}
|
||||
|
||||
it = hyperparameters.find("min_samples_leaf");
|
||||
if (it != hyperparameters.end()) {
|
||||
min_samples_leaf = it->get<int>();
|
||||
hyperparameters.erase("min_samples_leaf"); // Remove 'min_samples_leaf' if present
|
||||
}
|
||||
Classifier::setHyperparameters(hyperparameters);
|
||||
checkValues();
|
||||
}
|
||||
void DecisionTree::checkValues()
|
||||
{
|
||||
if (max_depth <= 0) {
|
||||
throw std::invalid_argument("max_depth must be positive");
|
||||
}
|
||||
if (min_samples_leaf <= 0) {
|
||||
throw std::invalid_argument("min_samples_leaf must be positive");
|
||||
}
|
||||
if (min_samples_split <= 0) {
|
||||
throw std::invalid_argument("min_samples_split must be positive");
|
||||
}
|
||||
}
|
||||
void DecisionTree::buildModel(const torch::Tensor& weights)
|
||||
{
|
||||
// Extract features (X) and labels (y) from dataset
|
||||
auto X = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), torch::indexing::Slice() }).t();
|
||||
auto y = dataset.index({ -1, torch::indexing::Slice() });
|
||||
|
||||
if (X.size(0) != y.size(0)) {
|
||||
throw std::runtime_error("X and y must have the same number of samples");
|
||||
}
|
||||
|
||||
n_classes = states[className].size();
|
||||
|
||||
// Use provided weights or uniform weights
|
||||
torch::Tensor sample_weights;
|
||||
if (weights.defined() && weights.numel() > 0) {
|
||||
if (weights.size(0) != X.size(0)) {
|
||||
throw std::runtime_error("weights must have the same length as number of samples");
|
||||
}
|
||||
sample_weights = weights;
|
||||
} else {
|
||||
sample_weights = torch::ones({ X.size(0) }) / X.size(0);
|
||||
}
|
||||
|
||||
// Normalize weights
|
||||
sample_weights = sample_weights / sample_weights.sum();
|
||||
|
||||
// Build the tree
|
||||
root = buildTree(X, y, sample_weights, 0);
|
||||
|
||||
// Mark as fitted
|
||||
fitted = true;
|
||||
}
|
||||
bool DecisionTree::validateTensors(const torch::Tensor& X, const torch::Tensor& y,
|
||||
const torch::Tensor& sample_weights) const
|
||||
{
|
||||
if (X.size(0) != y.size(0) || X.size(0) != sample_weights.size(0)) {
|
||||
return false;
|
||||
}
|
||||
if (X.size(0) == 0) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
std::unique_ptr<TreeNode> DecisionTree::buildTree(
|
||||
const torch::Tensor& X,
|
||||
const torch::Tensor& y,
|
||||
const torch::Tensor& sample_weights,
|
||||
int current_depth)
|
||||
{
|
||||
auto node = std::make_unique<TreeNode>();
|
||||
int n_samples = y.size(0);
|
||||
|
||||
// Check stopping criteria
|
||||
auto unique = at::_unique(y);
|
||||
bool should_stop = (current_depth >= max_depth) ||
|
||||
(n_samples < min_samples_split) ||
|
||||
(std::get<0>(unique).size(0) == 1); // All samples same class
|
||||
|
||||
if (should_stop || n_samples <= min_samples_leaf) {
|
||||
// Create leaf node
|
||||
node->is_leaf = true;
|
||||
|
||||
// Calculate class probabilities
|
||||
node->class_probabilities = torch::zeros({ n_classes });
|
||||
|
||||
for (int i = 0; i < n_samples; i++) {
|
||||
int class_idx = y[i].item<int>();
|
||||
node->class_probabilities[class_idx] += sample_weights[i].item<float>();
|
||||
}
|
||||
|
||||
// Normalize probabilities
|
||||
node->class_probabilities /= node->class_probabilities.sum();
|
||||
|
||||
// Set predicted class as the one with highest probability
|
||||
node->predicted_class = torch::argmax(node->class_probabilities).item<int>();
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
// Find best split
|
||||
SplitInfo best_split = findBestSplit(X, y, sample_weights);
|
||||
|
||||
// If no valid split found, create leaf
|
||||
if (best_split.feature_index == -1 || best_split.impurity_decrease <= 0) {
|
||||
node->is_leaf = true;
|
||||
|
||||
// Calculate class probabilities
|
||||
node->class_probabilities = torch::zeros({ n_classes });
|
||||
|
||||
for (int i = 0; i < n_samples; i++) {
|
||||
int class_idx = y[i].item<int>();
|
||||
node->class_probabilities[class_idx] += sample_weights[i].item<float>();
|
||||
}
|
||||
|
||||
node->class_probabilities /= node->class_probabilities.sum();
|
||||
node->predicted_class = torch::argmax(node->class_probabilities).item<int>();
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
// Create internal node
|
||||
node->is_leaf = false;
|
||||
node->split_feature = best_split.feature_index;
|
||||
node->split_value = best_split.split_value;
|
||||
|
||||
// Split data
|
||||
auto left_X = X.index({ best_split.left_mask });
|
||||
auto left_y = y.index({ best_split.left_mask });
|
||||
auto left_weights = sample_weights.index({ best_split.left_mask });
|
||||
|
||||
auto right_X = X.index({ best_split.right_mask });
|
||||
auto right_y = y.index({ best_split.right_mask });
|
||||
auto right_weights = sample_weights.index({ best_split.right_mask });
|
||||
|
||||
// Recursively build subtrees
|
||||
if (left_X.size(0) >= min_samples_leaf) {
|
||||
node->left = buildTree(left_X, left_y, left_weights, current_depth + 1);
|
||||
} else {
|
||||
// Force leaf if not enough samples
|
||||
node->left = std::make_unique<TreeNode>();
|
||||
node->left->is_leaf = true;
|
||||
auto mode = std::get<0>(torch::mode(left_y));
|
||||
node->left->predicted_class = mode.item<int>();
|
||||
node->left->class_probabilities = torch::zeros({ n_classes });
|
||||
node->left->class_probabilities[node->left->predicted_class] = 1.0;
|
||||
}
|
||||
|
||||
if (right_X.size(0) >= min_samples_leaf) {
|
||||
node->right = buildTree(right_X, right_y, right_weights, current_depth + 1);
|
||||
} else {
|
||||
// Force leaf if not enough samples
|
||||
node->right = std::make_unique<TreeNode>();
|
||||
node->right->is_leaf = true;
|
||||
auto mode = std::get<0>(torch::mode(right_y));
|
||||
node->right->predicted_class = mode.item<int>();
|
||||
node->right->class_probabilities = torch::zeros({ n_classes });
|
||||
node->right->class_probabilities[node->right->predicted_class] = 1.0;
|
||||
}
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
DecisionTree::SplitInfo DecisionTree::findBestSplit(
|
||||
const torch::Tensor& X,
|
||||
const torch::Tensor& y,
|
||||
const torch::Tensor& sample_weights)
|
||||
{
|
||||
|
||||
SplitInfo best_split;
|
||||
best_split.feature_index = -1;
|
||||
best_split.split_value = -1;
|
||||
best_split.impurity_decrease = -std::numeric_limits<double>::infinity();
|
||||
|
||||
int n_features = X.size(1);
|
||||
int n_samples = X.size(0);
|
||||
|
||||
// Calculate impurity of current node
|
||||
double current_impurity = calculateGiniImpurity(y, sample_weights);
|
||||
double total_weight = sample_weights.sum().item<double>();
|
||||
|
||||
// Try each feature
|
||||
for (int feat_idx = 0; feat_idx < n_features; feat_idx++) {
|
||||
auto feature_values = X.index({ torch::indexing::Slice(), feat_idx });
|
||||
auto unique_values = std::get<0>(torch::unique_consecutive(std::get<0>(torch::sort(feature_values))));
|
||||
|
||||
// Try each unique value as split point
|
||||
for (int i = 0; i < unique_values.size(0); i++) {
|
||||
int split_val = unique_values[i].item<int>();
|
||||
|
||||
// Create masks for left and right splits
|
||||
auto left_mask = feature_values == split_val;
|
||||
auto right_mask = ~left_mask;
|
||||
|
||||
int left_count = left_mask.sum().item<int>();
|
||||
int right_count = right_mask.sum().item<int>();
|
||||
|
||||
// Skip if split doesn't satisfy minimum samples requirement
|
||||
if (left_count < min_samples_leaf || right_count < min_samples_leaf) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Calculate weighted impurities
|
||||
auto left_y = y.index({ left_mask });
|
||||
auto left_weights = sample_weights.index({ left_mask });
|
||||
double left_weight = left_weights.sum().item<double>();
|
||||
double left_impurity = calculateGiniImpurity(left_y, left_weights);
|
||||
|
||||
auto right_y = y.index({ right_mask });
|
||||
auto right_weights = sample_weights.index({ right_mask });
|
||||
double right_weight = right_weights.sum().item<double>();
|
||||
double right_impurity = calculateGiniImpurity(right_y, right_weights);
|
||||
|
||||
// Calculate impurity decrease
|
||||
double impurity_decrease = current_impurity -
|
||||
(left_weight / total_weight * left_impurity +
|
||||
right_weight / total_weight * right_impurity);
|
||||
|
||||
// Update best split if this is better
|
||||
if (impurity_decrease > best_split.impurity_decrease) {
|
||||
best_split.feature_index = feat_idx;
|
||||
best_split.split_value = split_val;
|
||||
best_split.impurity_decrease = impurity_decrease;
|
||||
best_split.left_mask = left_mask;
|
||||
best_split.right_mask = right_mask;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return best_split;
|
||||
}
|
||||
|
||||
double DecisionTree::calculateGiniImpurity(
|
||||
const torch::Tensor& y,
|
||||
const torch::Tensor& sample_weights)
|
||||
{
|
||||
if (y.size(0) == 0 || sample_weights.size(0) == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
if (y.size(0) != sample_weights.size(0)) {
|
||||
throw std::runtime_error("y and sample_weights must have same size");
|
||||
}
|
||||
|
||||
torch::Tensor class_weights = torch::zeros({ n_classes });
|
||||
|
||||
// Calculate weighted class counts
|
||||
for (int i = 0; i < y.size(0); i++) {
|
||||
int class_idx = y[i].item<int>();
|
||||
|
||||
if (class_idx < 0 || class_idx >= n_classes) {
|
||||
throw std::runtime_error("Invalid class index: " + std::to_string(class_idx));
|
||||
}
|
||||
|
||||
class_weights[class_idx] += sample_weights[i].item<float>();
|
||||
}
|
||||
|
||||
// Normalize
|
||||
double total_weight = class_weights.sum().item<double>();
|
||||
if (total_weight == 0) return 0.0;
|
||||
|
||||
class_weights /= total_weight;
|
||||
|
||||
// Calculate Gini impurity: 1 - sum(p_i^2)
|
||||
double gini = 1.0;
|
||||
for (int i = 0; i < n_classes; i++) {
|
||||
double p = class_weights[i].item<double>();
|
||||
gini -= p * p;
|
||||
}
|
||||
|
||||
return gini;
|
||||
}
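// Reference sketch of the quantity returned above: with per-class weight shares
//   p_k = (sum of sample_weights over samples of class k) / (total weight),
// the weighted Gini impurity is  Gini = 1 - sum_k p_k^2, which is 0 for a pure node and
// approaches 1 - 1/K for a uniform mix over K classes.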
|
||||
|
||||
|
||||
torch::Tensor DecisionTree::predict(torch::Tensor& X)
|
||||
{
|
||||
if (!fitted) {
|
||||
throw std::runtime_error(CLASSIFIER_NOT_FITTED);
|
||||
}
|
||||
|
||||
int n_samples = X.size(1);
|
||||
torch::Tensor predictions = torch::zeros({ n_samples }, torch::kInt32);
|
||||
|
||||
for (int i = 0; i < n_samples; i++) {
|
||||
auto sample = X.index({ torch::indexing::Slice(), i }).ravel();
|
||||
predictions[i] = predictSample(sample);
|
||||
}
|
||||
|
||||
return predictions;
|
||||
}
|
||||
|
||||
std::vector<int> DecisionTree::predict(std::vector<std::vector<int>>& X)
|
||||
{
|
||||
// Convert to tensor
|
||||
long n = X.size();
|
||||
long m = X.at(0).size();
|
||||
torch::Tensor X_tensor = platform::TensorUtils::to_matrix(X);
|
||||
auto predictions = predict(X_tensor);
|
||||
std::vector<int> result = platform::TensorUtils::to_vector<int>(predictions);
|
||||
return result;
|
||||
}
|
||||
|
||||
torch::Tensor DecisionTree::predict_proba(torch::Tensor& X)
|
||||
{
|
||||
if (!fitted) {
|
||||
throw std::runtime_error(CLASSIFIER_NOT_FITTED);
|
||||
}
|
||||
|
||||
int n_samples = X.size(1);
|
||||
torch::Tensor probabilities = torch::zeros({ n_samples, n_classes });
|
||||
|
||||
for (int i = 0; i < n_samples; i++) {
|
||||
auto sample = X.index({ torch::indexing::Slice(), i }).ravel();
|
||||
probabilities[i] = predictProbaSample(sample);
|
||||
}
|
||||
|
||||
return probabilities;
|
||||
}
|
||||
|
||||
std::vector<std::vector<double>> DecisionTree::predict_proba(std::vector<std::vector<int>>& X)
|
||||
{
|
||||
auto n_samples = X.at(0).size();
|
||||
// Convert to tensor
|
||||
torch::Tensor X_tensor = platform::TensorUtils::to_matrix(X);
|
||||
auto proba_tensor = predict_proba(X_tensor);
|
||||
std::vector<std::vector<double>> result(n_samples, std::vector<double>(n_classes, 0.0));
|
||||
|
||||
for (int i = 0; i < n_samples; i++) {
|
||||
for (int j = 0; j < n_classes; j++) {
|
||||
result[i][j] = proba_tensor[i][j].item<double>();
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
int DecisionTree::predictSample(const torch::Tensor& x) const
|
||||
{
|
||||
if (!fitted) {
|
||||
throw std::runtime_error(CLASSIFIER_NOT_FITTED);
|
||||
}
|
||||
|
||||
if (x.size(0) != n) { // n should be the number of features
|
||||
throw std::runtime_error("Input sample has wrong number of features");
|
||||
}
|
||||
|
||||
const TreeNode* leaf = traverseTree(x, root.get());
|
||||
return leaf->predicted_class;
|
||||
}
|
||||
torch::Tensor DecisionTree::predictProbaSample(const torch::Tensor& x) const
|
||||
{
|
||||
const TreeNode* leaf = traverseTree(x, root.get());
|
||||
return leaf->class_probabilities.clone();
|
||||
}
|
||||
|
||||
|
||||
const TreeNode* DecisionTree::traverseTree(const torch::Tensor& x, const TreeNode* node) const
|
||||
{
|
||||
if (!node) {
|
||||
throw std::runtime_error("Null node encountered during tree traversal");
|
||||
}
|
||||
|
||||
if (node->is_leaf) {
|
||||
return node;
|
||||
}
|
||||
|
||||
if (node->split_feature < 0 || node->split_feature >= x.size(0)) {
|
||||
throw std::runtime_error("Invalid split_feature index: " + std::to_string(node->split_feature));
|
||||
}
|
||||
|
||||
int feature_value = x[node->split_feature].item<int>();
|
||||
|
||||
if (feature_value == node->split_value) {
|
||||
if (!node->left) {
|
||||
throw std::runtime_error("Missing left child in tree");
|
||||
}
|
||||
return traverseTree(x, node->left.get());
|
||||
} else {
|
||||
if (!node->right) {
|
||||
throw std::runtime_error("Missing right child in tree");
|
||||
}
|
||||
return traverseTree(x, node->right.get());
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> DecisionTree::graph(const std::string& title) const
|
||||
{
|
||||
std::vector<std::string> lines;
|
||||
lines.push_back("digraph DecisionTree {");
|
||||
lines.push_back(" rankdir=TB;");
|
||||
lines.push_back(" node [shape=box, style=\"filled, rounded\", fontname=\"helvetica\"];");
|
||||
lines.push_back(" edge [fontname=\"helvetica\"];");
|
||||
|
||||
if (!title.empty()) {
|
||||
lines.push_back(" label=\"" + title + "\";");
|
||||
lines.push_back(" labelloc=t;");
|
||||
}
|
||||
|
||||
if (root) {
|
||||
int node_id = 0;
|
||||
treeToGraph(root.get(), lines, node_id);
|
||||
}
|
||||
|
||||
lines.push_back("}");
|
||||
return lines;
|
||||
}
|
||||
|
||||
void DecisionTree::treeToGraph(
|
||||
const TreeNode* node,
|
||||
std::vector<std::string>& lines,
|
||||
int& node_id,
|
||||
int parent_id,
|
||||
const std::string& edge_label) const
|
||||
{
|
||||
|
||||
int current_id = node_id++;
|
||||
std::stringstream ss;
|
||||
|
||||
if (node->is_leaf) {
|
||||
// Leaf node
|
||||
ss << " node" << current_id << " [label=\"Class: " << node->predicted_class;
|
||||
ss << "\\nProb: " << std::fixed << std::setprecision(3)
|
||||
<< node->class_probabilities[node->predicted_class].item<float>();
|
||||
ss << "\", fillcolor=\"lightblue\"];";
|
||||
lines.push_back(ss.str());
|
||||
} else {
|
||||
// Internal node
|
||||
ss << " node" << current_id << " [label=\"" << features[node->split_feature];
|
||||
ss << " = " << node->split_value << "?\", fillcolor=\"lightgreen\"];";
|
||||
lines.push_back(ss.str());
|
||||
}
|
||||
|
||||
// Add edge from parent
|
||||
if (parent_id >= 0) {
|
||||
ss.str("");
|
||||
ss << " node" << parent_id << " -> node" << current_id;
|
||||
if (!edge_label.empty()) {
|
||||
ss << " [label=\"" << edge_label << "\"];";
|
||||
} else {
|
||||
ss << ";";
|
||||
}
|
||||
lines.push_back(ss.str());
|
||||
}
|
||||
|
||||
// Recurse on children
|
||||
if (!node->is_leaf) {
|
||||
if (node->left) {
|
||||
treeToGraph(node->left.get(), lines, node_id, current_id, "Yes");
|
||||
}
|
||||
if (node->right) {
|
||||
treeToGraph(node->right.get(), lines, node_id, current_id, "No");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace bayesnet
|
134
src/experimental_clfs/DecisionTree.h
Normal file
@@ -0,0 +1,134 @@
|
||||
// ***************************************************************
|
||||
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
|
||||
// SPDX-FileType: SOURCE
|
||||
// SPDX-License-Identifier: MIT
|
||||
// ***************************************************************
|
||||
|
||||
#ifndef DECISION_TREE_H
|
||||
#define DECISION_TREE_H
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <torch/torch.h>
|
||||
#include "bayesnet/classifiers/Classifier.h"
|
||||
|
||||
namespace bayesnet {
|
||||
|
||||
// Forward declaration
|
||||
struct TreeNode;
|
||||
|
||||
class DecisionTree : public Classifier {
|
||||
public:
|
||||
explicit DecisionTree(int max_depth = 3, int min_samples_split = 2, int min_samples_leaf = 1);
|
||||
virtual ~DecisionTree() = default;
|
||||
|
||||
// Override graph method to show tree structure
|
||||
std::vector<std::string> graph(const std::string& title = "") const override;
|
||||
|
||||
// Setters for hyperparameters
|
||||
void setMaxDepth(int depth) { max_depth = depth; checkValues(); }
|
||||
void setMinSamplesSplit(int samples) { min_samples_split = samples; checkValues(); }
|
||||
void setMinSamplesLeaf(int samples) { min_samples_leaf = samples; checkValues(); }
|
||||
int getMaxDepth() const { return max_depth; }
|
||||
int getMinSamplesSplit() const { return min_samples_split; }
|
||||
int getMinSamplesLeaf() const { return min_samples_leaf; }
|
||||
|
||||
// Override setHyperparameters
|
||||
void setHyperparameters(const nlohmann::json& hyperparameters) override;
|
||||
|
||||
torch::Tensor predict(torch::Tensor& X) override;
|
||||
std::vector<int> predict(std::vector<std::vector<int>>& X) override;
|
||||
torch::Tensor predict_proba(torch::Tensor& X) override;
|
||||
std::vector<std::vector<double>> predict_proba(std::vector<std::vector<int>>& X) override;
|
||||
|
||||
// Make predictions for a single sample
|
||||
int predictSample(const torch::Tensor& x) const;
|
||||
|
||||
// Make probabilistic predictions for a single sample
|
||||
torch::Tensor predictProbaSample(const torch::Tensor& x) const;
|
||||
|
||||
protected:
|
||||
void buildModel(const torch::Tensor& weights) override;
|
||||
void trainModel(const torch::Tensor& weights, const Smoothing_t smoothing) override
|
||||
{
|
||||
// Decision trees do not require training in the traditional sense
|
||||
// as they are built from the data directly.
|
||||
// This method can be used to set weights or other parameters if needed.
|
||||
}
|
||||
private:
|
||||
void checkValues();
|
||||
bool validateTensors(const torch::Tensor& X, const torch::Tensor& y, const torch::Tensor& sample_weights) const;
|
||||
// Tree hyperparameters
|
||||
int max_depth;
|
||||
int min_samples_split;
|
||||
int min_samples_leaf;
|
||||
int n_classes; // Number of classes in the target variable
|
||||
|
||||
// Root of the decision tree
|
||||
std::unique_ptr<TreeNode> root;
|
||||
|
||||
// Build tree recursively
|
||||
std::unique_ptr<TreeNode> buildTree(
|
||||
const torch::Tensor& X,
|
||||
const torch::Tensor& y,
|
||||
const torch::Tensor& sample_weights,
|
||||
int current_depth
|
||||
);
|
||||
|
||||
// Find best split for a node
|
||||
struct SplitInfo {
|
||||
int feature_index;
|
||||
int split_value;
|
||||
double impurity_decrease;
|
||||
torch::Tensor left_mask;
|
||||
torch::Tensor right_mask;
|
||||
};
|
||||
|
||||
SplitInfo findBestSplit(
|
||||
const torch::Tensor& X,
|
||||
const torch::Tensor& y,
|
||||
const torch::Tensor& sample_weights
|
||||
);
|
||||
|
||||
// Calculate weighted Gini impurity for multi-class
|
||||
double calculateGiniImpurity(
|
||||
const torch::Tensor& y,
|
||||
const torch::Tensor& sample_weights
|
||||
);
|
||||
|
||||
|
||||
|
||||
// Traverse tree to find leaf node
|
||||
const TreeNode* traverseTree(const torch::Tensor& x, const TreeNode* node) const;
|
||||
|
||||
// Convert tree to graph representation
|
||||
void treeToGraph(
|
||||
const TreeNode* node,
|
||||
std::vector<std::string>& lines,
|
||||
int& node_id,
|
||||
int parent_id = -1,
|
||||
const std::string& edge_label = ""
|
||||
) const;
|
||||
};
|
||||
|
||||
// Tree node structure
|
||||
struct TreeNode {
|
||||
bool is_leaf;
|
||||
|
||||
// For internal nodes
|
||||
int split_feature;
|
||||
int split_value;
|
||||
std::unique_ptr<TreeNode> left;
|
||||
std::unique_ptr<TreeNode> right;
|
||||
|
||||
// For leaf nodes
|
||||
int predicted_class;
|
||||
torch::Tensor class_probabilities; // Probability for each class
|
||||
|
||||
TreeNode() : is_leaf(false), split_feature(-1), split_value(-1), predicted_class(-1) {}
|
||||
};
|
||||
|
||||
} // namespace bayesnet
|
||||
|
||||
#endif // DECISION_TREE_H
|
182
src/experimental_clfs/ExpClf.cpp
Normal file
@@ -0,0 +1,182 @@
|
||||
// ***************************************************************
|
||||
// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez
|
||||
// SPDX-FileType: SOURCE
|
||||
// SPDX-License-Identifier: MIT
|
||||
// ***************************************************************
|
||||
|
||||
#include "ExpClf.h"
|
||||
#include "common/TensorUtils.hpp"
|
||||
|
||||
namespace platform {
|
||||
ExpClf::ExpClf() : Boost(false), semaphore_{ CountingSemaphore::getInstance() }
|
||||
{
|
||||
validHyperparameters = {};
|
||||
}
|
||||
//
|
||||
// Parents
|
||||
//
|
||||
void ExpClf::add_active_parents(const std::vector<int>& active_parents)
|
||||
{
|
||||
for (const auto& parent : active_parents)
|
||||
aode_.add_active_parent(parent);
|
||||
}
|
||||
void ExpClf::add_active_parent(int parent)
|
||||
{
|
||||
aode_.add_active_parent(parent);
|
||||
}
|
||||
void ExpClf::remove_last_parent()
|
||||
{
|
||||
aode_.remove_last_parent();
|
||||
}
|
||||
//
|
||||
// Predict
|
||||
//
|
||||
std::vector<int> ExpClf::predict_spode(std::vector<std::vector<int>>& test_data, int parent)
|
||||
{
|
||||
int test_size = test_data[0].size();
|
||||
int sample_size = test_data.size();
|
||||
auto predictions = std::vector<int>(test_size);
|
||||
|
||||
int chunk_size = std::min(150, int(test_size / semaphore_.getMaxCount()) + 1);
|
||||
std::vector<std::thread> threads;
|
||||
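// Worker: rebuild one instance at a time from the feature-major test matrix (samples[feature][sample]) and predict it with the SPODE rooted at the given parent.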
auto worker = [&](const std::vector<std::vector<int>>& samples, int begin, int chunk, int sample_size, std::vector<int>& predictions) {
|
||||
std::string threadName = "(V)PWorker-" + std::to_string(begin) + "-" + std::to_string(chunk);
|
||||
#if defined(__linux__)
|
||||
pthread_setname_np(pthread_self(), threadName.c_str());
|
||||
#else
|
||||
pthread_setname_np(threadName.c_str());
|
||||
#endif
|
||||
std::vector<int> instance(sample_size);
|
||||
for (int sample = begin; sample < begin + chunk; ++sample) {
|
||||
for (int feature = 0; feature < sample_size; ++feature) {
|
||||
instance[feature] = samples[feature][sample];
|
||||
}
|
||||
predictions[sample] = aode_.predict_spode(instance, parent);
|
||||
}
|
||||
semaphore_.release();
|
||||
};
|
||||
for (int begin = 0; begin < test_size; begin += chunk_size) {
|
||||
int chunk = std::min(chunk_size, test_size - begin);
|
||||
semaphore_.acquire();
|
||||
threads.emplace_back(worker, test_data, begin, chunk, sample_size, std::ref(predictions));
|
||||
}
|
||||
for (auto& thread : threads) {
|
||||
thread.join();
|
||||
}
|
||||
return predictions;
|
||||
}
|
||||
torch::Tensor ExpClf::predict(torch::Tensor& X)
|
||||
{
|
||||
auto X_ = TensorUtils::to_matrix(X);
|
||||
torch::Tensor y = torch::tensor(predict(X_));
|
||||
return y;
|
||||
}
|
||||
torch::Tensor ExpClf::predict_proba(torch::Tensor& X)
|
||||
{
|
||||
auto X_ = TensorUtils::to_matrix(X);
|
||||
auto probabilities = predict_proba(X_);
|
||||
auto n_samples = X.size(1);
|
||||
int n_classes = probabilities[0].size();
|
||||
auto y = torch::zeros({ n_samples, n_classes });
|
||||
for (int i = 0; i < n_samples; i++) {
|
||||
for (int j = 0; j < n_classes; j++) {
|
||||
y[i][j] = probabilities[i][j];
|
||||
}
|
||||
}
|
||||
return y;
|
||||
}
|
||||
float ExpClf::score(torch::Tensor& X, torch::Tensor& y)
|
||||
{
|
||||
auto X_ = TensorUtils::to_matrix(X);
|
||||
auto y_ = TensorUtils::to_vector<int>(y);
|
||||
return score(X_, y_);
|
||||
}
|
||||
std::vector<std::vector<double>> ExpClf::predict_proba(const std::vector<std::vector<int>>& test_data)
|
||||
{
|
||||
int test_size = test_data[0].size();
|
||||
int sample_size = test_data.size();
|
||||
auto probabilities = std::vector<std::vector<double>>(test_size, std::vector<double>(aode_.statesClass()));
|
||||
|
||||
int chunk_size = std::min(150, int(test_size / semaphore_.getMaxCount()) + 1);
|
||||
std::vector<std::thread> threads;
|
||||
auto worker = [&](const std::vector<std::vector<int>>& samples, int begin, int chunk, int sample_size, std::vector<std::vector<double>>& predictions) {
|
||||
std::string threadName = "(V)PWorker-" + std::to_string(begin) + "-" + std::to_string(chunk);
|
||||
#if defined(__linux__)
|
||||
pthread_setname_np(pthread_self(), threadName.c_str());
|
||||
#else
|
||||
pthread_setname_np(threadName.c_str());
|
||||
#endif
|
||||
|
||||
std::vector<int> instance(sample_size);
|
||||
for (int sample = begin; sample < begin + chunk; ++sample) {
|
||||
for (int feature = 0; feature < sample_size; ++feature) {
|
||||
instance[feature] = samples[feature][sample];
|
||||
}
|
||||
predictions[sample] = aode_.predict_proba(instance);
|
||||
}
|
||||
semaphore_.release();
|
||||
};
|
||||
for (int begin = 0; begin < test_size; begin += chunk_size) {
|
||||
int chunk = std::min(chunk_size, test_size - begin);
|
||||
semaphore_.acquire();
|
||||
threads.emplace_back(worker, test_data, begin, chunk, sample_size, std::ref(probabilities));
|
||||
}
|
||||
for (auto& thread : threads) {
|
||||
thread.join();
|
||||
}
|
||||
return probabilities;
|
||||
}
|
||||
std::vector<int> ExpClf::predict(std::vector<std::vector<int>>& test_data)
|
||||
{
|
||||
if (!fitted) {
|
||||
throw std::logic_error(CLASSIFIER_NOT_FITTED);
|
||||
}
|
||||
auto probabilities = predict_proba(test_data);
|
||||
std::vector<int> predictions(probabilities.size(), 0);
|
||||
|
||||
for (size_t i = 0; i < probabilities.size(); i++) {
|
||||
predictions[i] = std::distance(probabilities[i].begin(), std::max_element(probabilities[i].begin(), probabilities[i].end()));
|
||||
}
|
||||
|
||||
return predictions;
|
||||
}
|
||||
float ExpClf::score(std::vector<std::vector<int>>& test_data, std::vector<int>& labels)
|
||||
{
|
||||
Timer timer;
|
||||
timer.start();
|
||||
std::vector<int> predictions = predict(test_data);
|
||||
int correct = 0;
|
||||
|
||||
for (size_t i = 0; i < predictions.size(); i++) {
|
||||
if (predictions[i] == labels[i]) {
|
||||
correct++;
|
||||
}
|
||||
}
|
||||
if (debug) {
|
||||
std::cout << "* Time to predict: " << timer.getDurationString() << std::endl;
|
||||
}
|
||||
return static_cast<float>(correct) / predictions.size();
|
||||
}
|
||||
|
||||
//
|
||||
// statistics
|
||||
//
|
||||
int ExpClf::getNumberOfNodes() const
|
||||
{
|
||||
return aode_.getNumberOfNodes();
|
||||
}
|
||||
int ExpClf::getNumberOfEdges() const
|
||||
{
|
||||
return aode_.getNumberOfEdges();
|
||||
}
|
||||
int ExpClf::getNumberOfStates() const
|
||||
{
|
||||
return aode_.getNumberOfStates();
|
||||
}
|
||||
int ExpClf::getClassNumStates() const
|
||||
{
|
||||
return aode_.statesClass();
|
||||
}
|
||||
|
||||
|
||||
}
|
67
src/experimental_clfs/ExpClf.h
Normal file
@@ -0,0 +1,67 @@
|
||||
// ***************************************************************
|
||||
// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez
|
||||
// SPDX-FileType: SOURCE
|
||||
// SPDX-License-Identifier: MIT
|
||||
// ***************************************************************
|
||||
|
||||
#ifndef EXPCLF_H
|
||||
#define EXPCLF_H
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <bayesnet/ensembles/Boost.h>
|
||||
#include <bayesnet/network/Smoothing.h>
|
||||
#include "common/Timer.hpp"
|
||||
#include "CountingSemaphore.hpp"
|
||||
#include "Xaode.hpp"
|
||||
|
||||
namespace platform {
|
||||
class ExpClf : public bayesnet::Boost {
|
||||
public:
|
||||
ExpClf();
|
||||
virtual ~ExpClf() = default;
|
||||
std::vector<int> predict(std::vector<std::vector<int>>& X) override;
|
||||
torch::Tensor predict(torch::Tensor& X) override;
|
||||
torch::Tensor predict_proba(torch::Tensor& X) override;
|
||||
std::vector<int> predict_spode(std::vector<std::vector<int>>& test_data, int parent);
|
||||
std::vector<std::vector<double>> predict_proba(const std::vector<std::vector<int>>& X);
|
||||
float score(std::vector<std::vector<int>>& X, std::vector<int>& y) override;
|
||||
float score(torch::Tensor& X, torch::Tensor& y) override;
|
||||
int getNumberOfNodes() const override;
|
||||
int getNumberOfEdges() const override;
|
||||
int getNumberOfStates() const override;
|
||||
int getClassNumStates() const override;
|
||||
std::vector<std::string> show() const override { return {}; }
|
||||
std::vector<std::string> topological_order() override { return {}; }
|
||||
std::string dump_cpt() const override { return ""; }
|
||||
void setDebug(bool debug) { this->debug = debug; }
|
||||
bayesnet::status_t getStatus() const override { return status; }
|
||||
std::vector<std::string> getNotes() const override { return notes; }
|
||||
std::vector<std::string> graph(const std::string& title = "") const override { return {}; }
|
||||
void add_active_parents(const std::vector<int>& active_parents);
|
||||
void add_active_parent(int parent);
|
||||
void remove_last_parent();
|
||||
void setHyperparameters(const nlohmann::json& hyperparameters_) override {}
|
||||
protected:
|
||||
bool debug = false;
|
||||
Xaode aode_;
|
||||
torch::Tensor weights_;
|
||||
const std::string CLASSIFIER_NOT_FITTED = "Classifier has not been fitted";
|
||||
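// Rescale the sample weights so they sum to num_instances; if the current sum is zero, fall back to uniform weights of 1.0.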
inline void normalize_weights(int num_instances)
|
||||
{
|
||||
double sum = weights_.sum().item<double>();
|
||||
if (sum == 0) {
|
||||
weights_ = torch::full({ num_instances }, 1.0);
|
||||
} else {
|
||||
for (int i = 0; i < weights_.size(0); ++i) {
|
||||
weights_[i] = weights_[i].item<double>() * num_instances / sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
private:
|
||||
CountingSemaphore& semaphore_;
|
||||
};
|
||||
}
|
||||
#endif // EXPCLF_H
|
158
src/experimental_clfs/ExpEnsemble.cpp
Normal file
@@ -0,0 +1,158 @@
|
||||
// ***************************************************************
|
||||
// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez
|
||||
// SPDX-FileType: SOURCE
|
||||
// SPDX-License-Identifier: MIT
|
||||
// ***************************************************************
|
||||
|
||||
#include "ExpEnsemble.h"
|
||||
#include "common/TensorUtils.hpp"
|
||||
|
||||
namespace platform {
|
||||
ExpEnsemble::ExpEnsemble() : Boost(false), semaphore_{ CountingSemaphore::getInstance() }
|
||||
{
|
||||
validHyperparameters = {};
|
||||
}
|
||||
//
|
||||
// Parents
|
||||
//
|
||||
void ExpEnsemble::add_model(std::unique_ptr<XSpode> model)
|
||||
{
|
||||
models.push_back(std::move(model));
|
||||
n_models++;
|
||||
}
|
||||
void ExpEnsemble::remove_last_model()
|
||||
{
|
||||
models.pop_back();
|
||||
n_models--;
|
||||
}
|
||||
//
|
||||
// Predict
|
||||
//
|
||||
torch::Tensor ExpEnsemble::predict(torch::Tensor& X)
|
||||
{
|
||||
auto X_ = TensorUtils::to_matrix(X);
|
||||
torch::Tensor y = torch::tensor(predict(X_));
|
||||
return y;
|
||||
}
|
||||
torch::Tensor ExpEnsemble::predict_proba(torch::Tensor& X)
|
||||
{
|
||||
auto X_ = TensorUtils::to_matrix(X);
|
||||
auto probabilities = predict_proba(X_);
|
||||
auto n_samples = X.size(1);
|
||||
int n_classes = probabilities[0].size();
|
||||
auto y = torch::zeros({ n_samples, n_classes });
|
||||
for (int i = 0; i < n_samples; i++) {
|
||||
for (int j = 0; j < n_classes; j++) {
|
||||
y[i][j] = probabilities[i][j];
|
||||
}
|
||||
}
|
||||
return y;
|
||||
}
|
||||
float ExpEnsemble::score(torch::Tensor& X, torch::Tensor& y)
|
||||
{
|
||||
auto X_ = TensorUtils::to_matrix(X);
|
||||
auto y_ = TensorUtils::to_vector<int>(y);
|
||||
return score(X_, y_);
|
||||
}
|
||||
std::vector<std::vector<double>> ExpEnsemble::predict_proba(const std::vector<std::vector<int>>& test_data)
|
||||
{
|
||||
int test_size = test_data[0].size();
|
||||
int sample_size = test_data.size();
|
||||
auto probabilities = std::vector<std::vector<double>>(test_size, std::vector<double>(getClassNumStates()));
|
||||
int chunk_size = std::min(150, int(test_size / semaphore_.getMaxCount()) + 1);
|
||||
std::vector<std::thread> threads;
|
||||
auto worker = [&](const std::vector<std::vector<int>>& samples, int begin, int chunk, int sample_size, std::vector<std::vector<double>>& predictions) {
|
||||
std::string threadName = "(V)PWorker-" + std::to_string(begin) + "-" + std::to_string(chunk);
|
||||
#if defined(__linux__)
|
||||
pthread_setname_np(pthread_self(), threadName.c_str());
|
||||
#else
|
||||
pthread_setname_np(threadName.c_str());
|
||||
#endif
|
||||
|
||||
std::vector<int> instance(sample_size);
|
||||
for (int sample = begin; sample < begin + chunk; ++sample) {
|
||||
for (int feature = 0; feature < sample_size; ++feature) {
|
||||
instance[feature] = samples[feature][sample];
|
||||
}
|
||||
// predictions[sample] = aode_.predict_proba(instance);
|
||||
}
|
||||
semaphore_.release();
|
||||
};
|
||||
for (int begin = 0; begin < test_size; begin += chunk_size) {
|
||||
int chunk = std::min(chunk_size, test_size - begin);
|
||||
semaphore_.acquire();
|
||||
threads.emplace_back(worker, test_data, begin, chunk, sample_size, std::ref(probabilities));
|
||||
}
|
||||
for (auto& thread : threads) {
|
||||
thread.join();
|
||||
}
|
||||
return probabilities;
|
||||
}
|
||||
std::vector<int> ExpEnsemble::predict(std::vector<std::vector<int>>& test_data)
|
||||
{
|
||||
if (!fitted) {
|
||||
throw std::logic_error(CLASSIFIER_NOT_FITTED);
|
||||
}
|
||||
auto probabilities = predict_proba(test_data);
|
||||
std::vector<int> predictions(probabilities.size(), 0);
|
||||
|
||||
for (size_t i = 0; i < probabilities.size(); i++) {
|
||||
predictions[i] = std::distance(probabilities[i].begin(), std::max_element(probabilities[i].begin(), probabilities[i].end()));
|
||||
}
|
||||
|
||||
return predictions;
|
||||
}
|
||||
float ExpEnsemble::score(std::vector<std::vector<int>>& test_data, std::vector<int>& labels)
|
||||
{
|
||||
Timer timer;
|
||||
timer.start();
|
||||
std::vector<int> predictions = predict(test_data);
|
||||
int correct = 0;
|
||||
|
||||
for (size_t i = 0; i < predictions.size(); i++) {
|
||||
if (predictions[i] == labels[i]) {
|
||||
correct++;
|
||||
}
|
||||
}
|
||||
if (debug) {
|
||||
std::cout << "* Time to predict: " << timer.getDurationString() << std::endl;
|
||||
}
|
||||
return static_cast<float>(correct) / predictions.size();
|
||||
}
|
||||
|
||||
//
|
||||
// statistics
|
||||
//
|
||||
int ExpEnsemble::getNumberOfNodes() const
|
||||
{
|
||||
if (models_.empty()) {
|
||||
return 0;
|
||||
}
|
||||
return n_models * (models_.at(0)->getNFeatures() + 1);
|
||||
}
|
||||
int ExpEnsemble::getNumberOfEdges() const
|
||||
{
|
||||
if (models_.empty()) {
|
||||
return 0;
|
||||
}
|
||||
return n_models * (2 * models_.at(0)->getNFeatures() - 1);
|
||||
}
|
||||
int ExpEnsemble::getNumberOfStates() const
|
||||
{
|
||||
if (models_.empty()) {
|
||||
return 0;
|
||||
}
|
||||
auto states = models_.at(0)->getStates();
|
||||
int nFeatures = models_.at(0)->getNFeatures();
|
||||
return std::accumulate(states.begin(), states.end(), 0) * nFeatures * n_models;
|
||||
}
|
||||
int ExpEnsemble::getClassNumStates() const
|
||||
{
|
||||
if (models_.empty()) {
|
||||
return 0;
|
||||
}
|
||||
return models_.at(0)->statesClass();
|
||||
}
|
||||
|
||||
|
||||
}
|
66
src/experimental_clfs/ExpEnsemble.h
Normal file
@@ -0,0 +1,66 @@
|
||||
// ***************************************************************
|
||||
// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez
|
||||
// SPDX-FileType: SOURCE
|
||||
// SPDX-License-Identifier: MIT
|
||||
// ***************************************************************
|
||||
|
||||
#ifndef EXPENSEMBLE_H
|
||||
#define EXPENSEMBLE_H
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <bayesnet/ensembles/Boost.h>
|
||||
#include <bayesnet/network/Smoothing.h>
|
||||
#include "common/Timer.hpp"
|
||||
#include "CountingSemaphore.hpp"
|
||||
#include "XSpode.hpp"
|
||||
|
||||
namespace platform {
|
||||
class ExpEnsemble : public bayesnet::Boost {
|
||||
public:
|
||||
ExpEnsemble();
|
||||
virtual ~ExpEnsemble() = default;
|
||||
std::vector<int> predict(std::vector<std::vector<int>>& X) override;
|
||||
torch::Tensor predict(torch::Tensor& X) override;
|
||||
torch::Tensor predict_proba(torch::Tensor& X) override;
|
||||
std::vector<int> predict_spode(std::vector<std::vector<int>>& test_data, int parent);
|
||||
std::vector<std::vector<double>> predict_proba(const std::vector<std::vector<int>>& X);
|
||||
float score(std::vector<std::vector<int>>& X, std::vector<int>& y) override;
|
||||
float score(torch::Tensor& X, torch::Tensor& y) override;
|
||||
int getNumberOfNodes() const override;
|
||||
int getNumberOfEdges() const override;
|
||||
int getNumberOfStates() const override;
|
||||
int getClassNumStates() const override;
|
||||
std::vector<std::string> show() const override { return {}; }
|
||||
std::vector<std::string> topological_order() override { return {}; }
|
||||
std::string dump_cpt() const override { return ""; }
|
||||
void setDebug(bool debug) { this->debug = debug; }
|
||||
bayesnet::status_t getStatus() const override { return status; }
|
||||
std::vector<std::string> getNotes() const override { return notes; }
|
||||
std::vector<std::string> graph(const std::string& title = "") const override { return {}; }
|
||||
protected:
|
||||
void add_model(std::unique_ptr<XSpode> model);
|
||||
void remove_last_model();
|
||||
bool debug = false;
|
||||
std::vector <std::unique_ptr<XSpode>> models_;
|
||||
torch::Tensor weights_;
|
||||
std::vector<double> significanceModels_;
|
||||
const std::string CLASSIFIER_NOT_FITTED = "Classifier has not been fitted";
|
||||
inline void normalize_weights(int num_instances)
|
||||
{
|
||||
double sum = weights_.sum().item<double>();
|
||||
if (sum == 0) {
|
||||
weights_ = torch::full({ num_instances }, 1.0);
|
||||
} else {
|
||||
for (int i = 0; i < weights_.size(0); ++i) {
|
||||
weights_[i] = weights_[i].item<double>() * num_instances / sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
private:
|
||||
CountingSemaphore& semaphore_;
|
||||
};
|
||||
}
|
||||
#endif // EXPENSEMBLE_H
|
142
src/experimental_clfs/README.md
Normal file
@@ -0,0 +1,142 @@
|
||||
# AdaBoost and DecisionTree Classifier Implementation
|
||||
|
||||
This implementation provides both a Decision Tree classifier and a multi-class AdaBoost classifier based on the SAMME (Stagewise Additive Modeling using a Multi-class Exponential loss) algorithm described in the paper "Multi-class AdaBoost" by Zhu et al. It is implemented in C++ with the help of <https://claude.ai>.
|
||||
|
||||
## Components
|
||||
|
||||
### 1. DecisionTree Classifier
|
||||
|
||||
A classic decision tree implementation that:
|
||||
|
||||
- Supports multi-class classification
|
||||
- Handles weighted samples (essential for boosting)
|
||||
- Uses Gini impurity as the splitting criterion
|
||||
- Works with discrete/categorical features
|
||||
- Provides both class predictions and probability estimates
|
||||
|
||||
#### Key Features
|
||||
|
||||
- **Max Depth Control**: Limit tree depth to create weak learners
|
||||
- **Minimum Samples**: Control minimum samples for splitting and leaf nodes
|
||||
- **Weighted Training**: Properly handles sample weights for boosting
|
||||
- **Visualization**: Generates DOT format graphs of the tree structure
|
||||
|
||||
#### Hyperparameters
|
||||
|
||||
- `max_depth`: Maximum depth of the tree (default: 3)
|
||||
- `min_samples_split`: Minimum samples required to split a node (default: 2)
|
||||
- `min_samples_leaf`: Minimum samples required in a leaf node (default: 1)
|
||||
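
A minimal sketch of configuring the values listed above through the framework's JSON hyperparameter interface (the default constructor shown here is an assumption; `setHyperparameters` follows the interface used elsewhere in the framework):

```cpp
#include <nlohmann/json.hpp>

// Hypothetical weak-learner setup: a decision stump with the defaults above.
DecisionTree tree;  // assumed default constructor
tree.setHyperparameters(nlohmann::json{
    { "max_depth", 1 },          // stump, as used by AdaBoost
    { "min_samples_split", 2 },
    { "min_samples_leaf", 1 }
});
```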
|
||||
### 2. AdaBoost Classifier
|
||||
|
||||
A multi-class AdaBoost implementation using DecisionTree as base estimators:
|
||||
|
||||
- **SAMME Algorithm**: Implements the multi-class extension of AdaBoost
|
||||
- **Automatic Stumps**: Uses decision stumps (max_depth=1) by default
|
||||
- **Early Stopping**: Stops if base classifier performs worse than random
|
||||
- **Ensemble Visualization**: Shows the weighted combination of base estimators
|
||||
|
||||
#### Key Features
|
||||
|
||||
- **Multi-class Support**: Natural extension to K classes
|
||||
- **Base Estimator Control**: Configure depth of base decision trees
|
||||
- **Training Monitoring**: Track training errors and estimator weights
|
||||
- **Probability Estimates**: Provides class probability predictions
|
||||
|
||||
#### Hyperparameters
|
||||
|
||||
- `n_estimators`: Number of base estimators to train (default: 50)
|
||||
- `base_max_depth`: Maximum depth for base decision trees (default: 1)
|
||||
|
||||
## Algorithm Details
|
||||
|
||||
The SAMME algorithm differs from binary AdaBoost in the calculation of the estimator weight (alpha):
|
||||
|
||||
```
|
||||
α = log((1 - err) / err) + log(K - 1)
|
||||
```
|
||||
|
||||
where `K` is the number of classes. This formula ensures that:
|
||||
|
||||
- When K = 2, it reduces to standard AdaBoost
|
||||
- For K > 2, base classifiers only need to be better than random guessing (1/K) rather than 50%
|
||||
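
As a sketch (not taken verbatim from the implementation), the estimator weight and the better-than-random check follow directly from the formula above:

```cpp
#include <algorithm>
#include <cmath>
#include <stdexcept>

// SAMME estimator weight: alpha = log((1 - err) / err) + log(K - 1),
// where err is the weighted training error and K the number of classes.
double samme_alpha(double err, int K)
{
    // A base classifier no better than random guessing (err >= 1 - 1/K) stops boosting.
    if (err >= 1.0 - 1.0 / K) {
        throw std::runtime_error("base classifier is no better than random guessing");
    }
    // Guard against a perfect classifier to avoid log of infinity.
    err = std::max(err, 1e-12);
    return std::log((1.0 - err) / err) + std::log(static_cast<double>(K - 1));
}
```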
|
||||
## Usage Example
|
||||
|
||||
```cpp
|
||||
// Create AdaBoost with decision stumps
|
||||
AdaBoost ada(100, 1); // 100 estimators, max_depth=1
|
||||
|
||||
// Train
|
||||
ada.fit(X_train, y_train, features, className, states, Smoothing_t::NONE);
|
||||
|
||||
// Predict
|
||||
auto predictions = ada.predict(X_test);
|
||||
auto probabilities = ada.predict_proba(X_test);
|
||||
|
||||
// Evaluate
|
||||
float accuracy = ada.score(X_test, y_test);
|
||||
|
||||
// Get ensemble information
|
||||
auto weights = ada.getEstimatorWeights();
|
||||
auto errors = ada.getTrainingErrors();
|
||||
```
|
||||
|
||||
## Implementation Structure
|
||||
|
||||
```
|
||||
AdaBoost (inherits from Ensemble)
|
||||
└── Uses multiple DecisionTree instances as base estimators
|
||||
└── DecisionTree (inherits from Classifier)
|
||||
└── Implements weighted Gini impurity splitting
|
||||
```
|
||||
|
||||
## Visualization
|
||||
|
||||
Both classifiers support graph visualization:
|
||||
|
||||
- **DecisionTree**: Shows the tree structure with split conditions
|
||||
- **AdaBoost**: Shows the ensemble of weighted base estimators
|
||||
|
||||
Generate visualizations using:
|
||||
|
||||
```cpp
|
||||
auto graph = classifier.graph("Title");
|
||||
```
|
||||
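
The returned vector of strings holds the DOT lines, so they can be written to disk and rendered with Graphviz (a small sketch continuing the snippet above; the file name is arbitrary):

```cpp
#include <fstream>

// Dump the DOT representation so it can be rendered with
// e.g. `dot -Tpng tree.dot -o tree.png`.
std::ofstream dot_file("tree.dot");
for (const auto& line : classifier.graph("Title")) {
    dot_file << line << '\n';
}
```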
|
||||
## Data Format
|
||||
|
||||
Both classifiers expect discrete/categorical data:
|
||||
|
||||
- **Features**: Integer values representing categories (stored in `torch::Tensor` or `std::vector<std::vector<int>>`)
|
||||
- **Labels**: Integer values representing class indices (0, 1, ..., K-1)
|
||||
- **States**: Map defining possible values for each feature and the class variable
|
||||
- **Sample Weights**: Optional weights for each training sample (important for boosting)
|
||||
|
||||
Example data setup:
|
||||
|
||||
```cpp
|
||||
// Features matrix (n_features x n_samples)
|
||||
torch::Tensor X = torch::tensor({{0, 1, 2}, {1, 0, 1}}); // 2 features, 3 samples
|
||||
|
||||
// Labels vector
|
||||
torch::Tensor y = torch::tensor({0, 1, 0}); // 3 samples
|
||||
|
||||
// States definition
|
||||
std::map<std::string, std::vector<int>> states;
|
||||
states["feature1"] = {0, 1, 2}; // Feature 1 can take values 0, 1, or 2
|
||||
states["feature2"] = {0, 1}; // Feature 2 can take values 0 or 1
|
||||
states["class"] = {0, 1}; // Binary classification
|
||||
```
|
||||
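
Sample weights, when supplied, follow the same per-instance layout; a uniform initialization (matching the 1/m scheme used by the boosting code) could look like:

```cpp
// Uniform weights for the 3 samples above: each instance weighs 1/m.
torch::Tensor sample_weights = torch::full({ 3 }, 1.0 / 3.0, torch::kFloat64);
```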
|
||||
## Notes
|
||||
|
||||
- The implementation handles discrete/categorical features as indicated by the int-based data structures
|
||||
- Sample weights are properly propagated through the tree building process
|
||||
- The DecisionTree implementation uses equality testing for splits (suitable for categorical data); see the routing sketch after this list
|
||||
- Both classifiers support the standard fit/predict interface from the base framework
|
||||
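
The equality test mentioned above means an internal node routes a sample by comparing one categorical feature against the node's split value, roughly as follows (the left/right convention shown is an assumption):

```cpp
// Categorical routing at an internal node: matching values go left, the rest go right.
const TreeNode* next = (x[node->split_feature].item<int>() == node->split_value)
    ? node->left.get()
    : node->right.get();
```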
|
||||
## References
|
||||
|
||||
- Zhu, J., Zou, H., Rosset, S., & Hastie, T. (2009). Multi-class AdaBoost. Statistics and its interface, 2(3), 349-360.
|
||||
- Breiman, L., Friedman, J., Olshen, R., & Stone, C. (1984). Classification and Regression Trees. Wadsworth, Belmont, CA.
|
20
src/experimental_clfs/XA1DE.cpp
Normal file
@@ -0,0 +1,20 @@
|
||||
// ***************************************************************
|
||||
// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez
|
||||
// SPDX-FileType: SOURCE
|
||||
// SPDX-License-Identifier: MIT
|
||||
// ***************************************************************
|
||||
|
||||
#include "XA1DE.h"
|
||||
#include "common/TensorUtils.hpp"
|
||||
|
||||
namespace platform {
|
||||
void XA1DE::trainModel(const torch::Tensor& weights, const bayesnet::Smoothing_t smoothing)
|
||||
{
|
||||
auto X = TensorUtils::to_matrix(dataset.slice(0, 0, dataset.size(0) - 1));
|
||||
auto y = TensorUtils::to_vector<int>(dataset.index({ -1, "..." }));
|
||||
int num_instances = X[0].size();
|
||||
weights_ = torch::full({ num_instances }, 1.0);
|
||||
//normalize_weights(num_instances);
|
||||
aode_.fit(X, y, features, className, states, weights_, true, smoothing);
|
||||
}
|
||||
}
|
26
src/experimental_clfs/XA1DE.h
Normal file
@@ -0,0 +1,26 @@
|
||||
// ***************************************************************
|
||||
// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez
|
||||
// SPDX-FileType: SOURCE
|
||||
// SPDX-License-Identifier: MIT
|
||||
// ***************************************************************
|
||||
|
||||
#ifndef XA1DE_H
|
||||
#define XA1DE_H
|
||||
#include "Xaode.hpp"
|
||||
#include "ExpClf.h"
|
||||
#include <bayesnet/network/Smoothing.h>
|
||||
|
||||
namespace platform {
|
||||
class XA1DE : public ExpClf {
|
||||
public:
|
||||
XA1DE() = default;
|
||||
virtual ~XA1DE() override = default;
|
||||
std::string getVersion() override { return version; };
|
||||
protected:
|
||||
void buildModel(const torch::Tensor& weights) override {};
|
||||
void trainModel(const torch::Tensor& weights, const bayesnet::Smoothing_t smoothing) override;
|
||||
private:
|
||||
std::string version = "1.0.0";
|
||||
};
|
||||
}
|
||||
#endif // XA1DE_H
|
183
src/experimental_clfs/XBAODE.cpp
Normal file
@@ -0,0 +1,183 @@
|
||||
// ***************************************************************
|
||||
// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez
|
||||
// SPDX-FileType: SOURCE
|
||||
// SPDX-License-Identifier: MIT
|
||||
// ***************************************************************
|
||||
#include <random>
|
||||
#include <set>
|
||||
#include <functional>
|
||||
#include <limits.h>
|
||||
#include <tuple>
|
||||
#include "XBAODE.h"
|
||||
#include "XSpode.hpp"
|
||||
#include "common/TensorUtils.hpp"
|
||||
#include <loguru.hpp>
|
||||
|
||||
namespace platform {
|
||||
XBAODE::XBAODE()
|
||||
{
|
||||
validHyperparameters = { "alpha_block", "order", "convergence", "convergence_best", "bisection", "threshold", "maxTolerance",
|
||||
"predict_voting", "select_features" };
|
||||
}
|
||||
void XBAODE::add_model(std::unique_ptr<XSpode> model)
|
||||
{
|
||||
models.push_back(std::move(model));
|
||||
n_models++;
|
||||
}
|
||||
void XBAODE::remove_last_model()
|
||||
{
|
||||
models.pop_back();
|
||||
n_models--;
|
||||
}
|
||||
void XBAODE::trainModel(const torch::Tensor& weights, const bayesnet::Smoothing_t smoothing)
|
||||
{
|
||||
fitted = true;
|
||||
X_train_ = TensorUtils::to_matrix(X_train);
|
||||
y_train_ = TensorUtils::to_vector<int>(y_train);
|
||||
X_test_ = TensorUtils::to_matrix(X_test);
|
||||
y_test_ = TensorUtils::to_vector<int>(y_test);
|
||||
maxTolerance = 3;
|
||||
//
|
||||
// Logging setup
|
||||
//
|
||||
// loguru::set_thread_name("XBAODE");
|
||||
// loguru::g_stderr_verbosity = loguru::Verbosity_OFF;
|
||||
// loguru::add_file("XBAODE.log", loguru::Truncate, loguru::Verbosity_MAX);
|
||||
|
||||
// Algorithm based on the adaboost algorithm for classification
|
||||
// as explained in Ensemble methods (Zhi-Hua Zhou, 2012)
|
||||
double alpha_t = 0;
|
||||
weights_ = torch::full({ m }, 1.0 / static_cast<double>(m), torch::kFloat64); // m initialized in Classifier.cc
|
||||
significanceModels.resize(n, 0.0); // n initialized in Classifier.cc
|
||||
bool finished = false;
|
||||
std::vector<int> featuresUsed;
|
||||
n_models = 0;
|
||||
std::unique_ptr<XSpode> model;
|
||||
if (selectFeatures) {
|
||||
featuresUsed = featureSelection(weights_);
|
||||
for (const auto& parent : featuresUsed) {
|
||||
model = std::unique_ptr<XSpode>(new XSpode(parent));
|
||||
model->fit(X_train_, y_train_, weights_, smoothing);
|
||||
std::cout << model->getNFeatures() << std::endl;
|
||||
add_model(std::move(model));
|
||||
}
|
||||
notes.push_back("Used features in initialization: " + std::to_string(featuresUsed.size()) + " of " + std::to_string(features.size()) + " with " + select_features_algorithm);
|
||||
auto ypred = ExpEnsemble::predict(X_train);
|
||||
std::tie(weights_, alpha_t, finished) = update_weights(y_train, ypred, weights_);
|
||||
// Update significance of the models
|
||||
for (const auto& parent : featuresUsed) {
|
||||
significanceModels_[parent] = alpha_t;
|
||||
}
|
||||
n_models = featuresUsed.size();
|
||||
// VLOG_SCOPE_F(1, "SelectFeatures. alpha_t: %f n_models: %d", alpha_t, n_models);
|
||||
if (finished) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
int numItemsPack = 0; // The counter of the models inserted in the current pack
|
||||
// Variables to control the accuracy finish condition
|
||||
double priorAccuracy = 0.0;
|
||||
double improvement = 1.0;
|
||||
double convergence_threshold = 1e-4;
|
||||
int tolerance = 0; // number of times the accuracy is lower than the convergence_threshold
|
||||
// Step 0: Set the finish condition
|
||||
// epsilon sub t > 0.5 => inverse the weights policy
|
||||
// validation error is not decreasing
|
||||
// run out of features
|
||||
bool ascending = order_algorithm == bayesnet::Orders.ASC;
|
||||
std::mt19937 g{ 173 };
|
||||
while (!finished) {
|
||||
// Step 1: Build ranking with mutual information
|
||||
auto featureSelection = metrics.SelectKBestWeighted(weights_, ascending, n); // Get all the features sorted
|
||||
if (order_algorithm == bayesnet::Orders.RAND) {
|
||||
std::shuffle(featureSelection.begin(), featureSelection.end(), g);
|
||||
}
|
||||
// Remove used features
|
||||
featureSelection.erase(remove_if(featureSelection.begin(), featureSelection.end(), [&](auto x)
|
||||
{ return std::find(featuresUsed.begin(), featuresUsed.end(), x) != featuresUsed.end();}),
|
||||
featureSelection.end()
|
||||
);
|
||||
int k = bisection ? pow(2, tolerance) : 1;
|
||||
int counter = 0; // The model counter of the current pack
|
||||
// VLOG_SCOPE_F(1, "counter=%d k=%d featureSelection.size: %zu", counter, k, featureSelection.size());
|
||||
while (counter++ < k && featureSelection.size() > 0) {
|
||||
auto feature = featureSelection[0];
|
||||
featureSelection.erase(featureSelection.begin());
|
||||
model = std::unique_ptr<XSpode>(new XSpode(feature));
|
||||
model->fit(X_train_, y_train_, weights_, smoothing);
|
||||
std::vector<int> ypred;
|
||||
if (alpha_block) {
|
||||
//
|
||||
// Compute the prediction with the current ensemble + model
|
||||
//
|
||||
// Add the model to the ensemble
|
||||
significanceModels[feature] = 1.0;
|
||||
add_model(std::move(model));
|
||||
// Compute the prediction
|
||||
ypred = ExpEnsemble::predict(X_train_);
|
||||
// Remove the model from the ensemble
|
||||
significanceModels[feature] = 0.0;
|
||||
model = std::move(models_.back());
|
||||
remove_last_model();
|
||||
} else {
|
||||
ypred = model->predict(X_train_);
|
||||
}
|
||||
// Step 3.1: Compute the classifier's amount of say
|
||||
auto ypred_t = torch::tensor(ypred);
|
||||
std::tie(weights_, alpha_t, finished) = update_weights(y_train, ypred_t, weights_);
|
||||
// Step 3.4: Store classifier and its accuracy to weigh its future vote
|
||||
numItemsPack++;
|
||||
featuresUsed.push_back(feature);
|
||||
add_model(std::move(model));
|
||||
significanceModels[feature] = alpha_t;
|
||||
// VLOG_SCOPE_F(2, "finished: %d numItemsPack: %d n_models: %d featuresUsed: %zu", finished, numItemsPack, n_models, featuresUsed.size());
|
||||
} // End of the pack
|
||||
if (convergence && !finished) {
|
||||
auto y_val_predict = ExpEnsemble::predict(X_test);
|
||||
double accuracy = (y_val_predict == y_test).sum().item<double>() / (double)y_test.size(0);
|
||||
if (priorAccuracy == 0) {
|
||||
priorAccuracy = accuracy;
|
||||
} else {
|
||||
improvement = accuracy - priorAccuracy;
|
||||
}
|
||||
if (improvement < convergence_threshold) {
|
||||
// VLOG_SCOPE_F(3, " (improvement<threshold) tolerance: %d numItemsPack: %d improvement: %f prior: %f current: %f", tolerance, numItemsPack, improvement, priorAccuracy, accuracy);
|
||||
tolerance++;
|
||||
} else {
|
||||
// VLOG_SCOPE_F(3, "* (improvement>=threshold) Reset. tolerance: %d numItemsPack: %d improvement: %f prior: %f current: %f", tolerance, numItemsPack, improvement, priorAccuracy, accuracy);
|
||||
tolerance = 0; // Reset the counter if the model performs better
|
||||
numItemsPack = 0;
|
||||
}
|
||||
if (convergence_best) {
|
||||
// Keep the best accuracy until now as the prior accuracy
|
||||
priorAccuracy = std::max(accuracy, priorAccuracy);
|
||||
} else {
|
||||
// Keep the last accuracy obtained as the prior accuracy
|
||||
priorAccuracy = accuracy;
|
||||
}
|
||||
}
|
||||
// VLOG_SCOPE_F(1, "tolerance: %d featuresUsed.size: %zu features.size: %zu", tolerance, featuresUsed.size(), features.size());
|
||||
finished = finished || tolerance > maxTolerance || featuresUsed.size() == features.size();
|
||||
}
|
||||
if (tolerance > maxTolerance) {
|
||||
if (numItemsPack < n_models) {
|
||||
notes.push_back("Convergence threshold reached & " + std::to_string(numItemsPack) + " models eliminated");
|
||||
// VLOG_SCOPE_F(4, "Convergence threshold reached & %d models eliminated of %d", numItemsPack, n_models);
|
||||
for (int i = featuresUsed.size() - 1; i >= featuresUsed.size() - numItemsPack; --i) {
|
||||
remove_last_model();
|
||||
significanceModels[featuresUsed[i]] = 0.0;
|
||||
}
|
||||
// VLOG_SCOPE_F(4, "*Convergence threshold %d models left & %d features used.", n_models, featuresUsed.size());
|
||||
} else {
|
||||
notes.push_back("Convergence threshold reached & 0 models eliminated");
|
||||
// VLOG_SCOPE_F(4, "Convergence threshold reached & 0 models eliminated n_models=%d numItemsPack=%d", n_models, numItemsPack);
|
||||
}
|
||||
}
|
||||
if (featuresUsed.size() != features.size()) {
|
||||
notes.push_back("Used features in train: " + std::to_string(featuresUsed.size()) + " of " + std::to_string(features.size()));
|
||||
status = bayesnet::WARNING;
|
||||
}
|
||||
notes.push_back("Number of models: " + std::to_string(n_models));
|
||||
return;
|
||||
}
|
||||
}
|
35
src/experimental_clfs/XBAODE.h
Normal file
@@ -0,0 +1,35 @@
|
||||
// ***************************************************************
|
||||
// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez
|
||||
// SPDX-FileType: SOURCE
|
||||
// SPDX-License-Identifier: MIT
|
||||
// ***************************************************************
|
||||
|
||||
#ifndef XBAODE_H
|
||||
#define XBAODE_H
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include "common/Timer.hpp"
|
||||
#include "ExpEnsemble.h"
|
||||
|
||||
namespace platform {
|
||||
class XBAODE : public Boost {
|
||||
|
||||
// A vector of trained models has to be built, with an ensemble predict run over all of them
|
||||
// Try XA1DE with original and Laplace smoothing and check the differences when the weights are set to 1 or to 1/m
|
||||
public:
|
||||
XBAODE();
|
||||
std::string getVersion() override { return version; };
|
||||
protected:
|
||||
void trainModel(const torch::Tensor& weights, const bayesnet::Smoothing_t smoothing) override;
|
||||
private:
|
||||
void add_model(std::unique_ptr<XSpode> model);
|
||||
void remove_last_model();
|
||||
std::vector<std::vector<int>> X_train_, X_test_;
|
||||
std::vector<int> y_train_, y_test_;
|
||||
std::string version = "0.9.7";
|
||||
};
|
||||
}
|
||||
#endif // XBAODE_H
|
436
src/experimental_clfs/XSpode.hpp
Normal file
@@ -0,0 +1,436 @@
|
||||
#ifndef XSPODE_H
|
||||
#define XSPODE_H
|
||||
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <stdexcept>
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
#include <torch/torch.h>
|
||||
#include <bayesnet/network/Smoothing.h>
|
||||
#include <bayesnet/classifiers/Classifier.h>
|
||||
#include "CountingSemaphore.hpp"
|
||||
|
||||
|
||||
namespace platform {
|
||||
|
||||
class XSpode : public bayesnet::Classifier {
|
||||
public:
|
||||
// --------------------------------------
|
||||
// Constructor
|
||||
//
|
||||
// Supply which feature index is the single super-parent (“spIndex”).
|
||||
// --------------------------------------
|
||||
explicit XSpode(int spIndex)
|
||||
: bayesnet::Classifier(bayesnet::Network()), superParent_{ spIndex },
|
||||
nFeatures_{ 0 },
|
||||
statesClass_{ 0 },
|
||||
fitted_{ false },
|
||||
alpha_{ 1.0 },
|
||||
initializer_{ 1.0 },
|
||||
semaphore_{ CountingSemaphore::getInstance() }
|
||||
{
|
||||
}
|
||||
|
||||
// --------------------------------------
|
||||
// fit
|
||||
// --------------------------------------
|
||||
//
|
||||
// Trains the SPODE given data:
|
||||
// X: X[f][n] is the f-th feature value for instance n
|
||||
// y: y[n] is the class value for instance n
|
||||
// states: a map or array that tells how many distinct states each feature and the class can take
|
||||
//
|
||||
// For example, states_.back() is the number of class states,
|
||||
// and states_[f] is the number of distinct values for feature f.
|
||||
//
|
||||
// We only store conditional probabilities for:
|
||||
// p(x_sp| c) (the super-parent feature)
|
||||
// p(x_child| c, x_sp) for all child ≠ sp
|
||||
//
|
||||
// The “weights” can be a vector of per-instance weights; if not used, pass them as 1.0.
|
||||
// --------------------------------------
|
||||
void fit(const std::vector<std::vector<int>>& X,
|
||||
const std::vector<int>& y,
|
||||
const torch::Tensor& weights, const bayesnet::Smoothing_t smoothing)
|
||||
{
|
||||
int numInstances = static_cast<int>(y.size());
|
||||
nFeatures_ = static_cast<int>(X.size());
|
||||
|
||||
// Derive the number of states for each feature and for the class.
|
||||
// (This is just one approach; adapt to match your environment.)
|
||||
// Here, we assume the user also gave us the total #states per feature in e.g. statesMap.
|
||||
// We'll simply reconstruct the integer states_ array. The last entry is statesClass_.
|
||||
states_.resize(nFeatures_);
|
||||
for (int f = 0; f < nFeatures_; f++) {
|
||||
// Suppose you look up in “statesMap” by the feature name, or read directly from X.
|
||||
// We'll assume states_[f] = max value in X[f] + 1.
|
||||
auto maxIt = std::max_element(X[f].begin(), X[f].end());
|
||||
states_[f] = (*maxIt) + 1;
|
||||
}
|
||||
// For the class: states_.back() = max(y)+1
|
||||
statesClass_ = (*std::max_element(y.begin(), y.end())) + 1;
|
||||
|
||||
// Initialize counts
|
||||
classCounts_.resize(statesClass_, 0.0);
|
||||
// p(x_sp = spVal | c)
|
||||
// We'll store these counts in spFeatureCounts_[spVal * statesClass_ + c].
|
||||
spFeatureCounts_.resize(states_[superParent_] * statesClass_, 0.0);
|
||||
|
||||
// For each child ≠ sp, we store p(childVal| c, spVal) in a separate block of childCounts_.
|
||||
// childCounts_ will be sized as sum_{child≠sp} (states_[child] * statesClass_ * states_[sp]).
|
||||
// We also need an offset for each child to index into childCounts_.
|
||||
childOffsets_.resize(nFeatures_, -1);
|
||||
int totalSize = 0;
|
||||
for (int f = 0; f < nFeatures_; f++) {
|
||||
if (f == superParent_) continue; // skip sp
|
||||
childOffsets_[f] = totalSize;
|
||||
// block size for this child's counts: states_[f] * statesClass_ * states_[superParent_]
|
||||
totalSize += (states_[f] * statesClass_ * states_[superParent_]);
|
||||
}
|
||||
childCounts_.resize(totalSize, 0.0);
|
||||
|
||||
// Accumulate raw counts
|
||||
for (int n = 0; n < numInstances; n++) {
|
||||
std::vector<int> instance(nFeatures_ + 1);
|
||||
for (int f = 0; f < nFeatures_; f++) {
|
||||
instance[f] = X[f][n];
|
||||
}
|
||||
instance[nFeatures_] = y[n];
|
||||
addSample(instance, weights[n].item<double>());
|
||||
}
|
||||
|
||||
switch (smoothing) {
|
||||
case bayesnet::Smoothing_t::ORIGINAL:
|
||||
alpha_ = 1.0 / numInstances;
|
||||
break;
|
||||
case bayesnet::Smoothing_t::LAPLACE:
|
||||
alpha_ = 1.0;
|
||||
break;
|
||||
default:
|
||||
alpha_ = 0.0; // No smoothing
|
||||
}
|
||||
initializer_ = std::numeric_limits<double>::max() / (nFeatures_ * nFeatures_);
|
||||
// Convert raw counts to probabilities
|
||||
computeProbabilities();
|
||||
fitted_ = true;
|
||||
}
|
||||
|
||||
// --------------------------------------
|
||||
// addSample (only valid in COUNTS mode)
|
||||
// --------------------------------------
|
||||
//
|
||||
// instance has size nFeatures_ + 1, with the class at the end.
|
||||
// We add 1 to the appropriate counters for each (c, superParentVal, childVal).
|
||||
//
|
||||
void addSample(const std::vector<int>& instance, double weight)
|
||||
{
|
||||
if (weight <= 0.0) return;
|
||||
|
||||
int c = instance.back();
|
||||
// (A) increment classCounts
|
||||
classCounts_[c] += weight;
|
||||
|
||||
// (B) increment super-parent counts => p(x_sp | c)
|
||||
int spVal = instance[superParent_];
|
||||
spFeatureCounts_[spVal * statesClass_ + c] += weight;
|
||||
|
||||
// (C) increment child counts => p(childVal | c, x_sp)
|
||||
for (int f = 0; f < nFeatures_; f++) {
|
||||
if (f == superParent_) continue;
|
||||
int childVal = instance[f];
|
||||
int offset = childOffsets_[f];
|
||||
// Compute index in childCounts_.
|
||||
// Layout: [ offset + (spVal * states_[f] + childVal) * statesClass_ + c ]
|
||||
int blockSize = states_[f] * statesClass_;
|
||||
int idx = offset + spVal * blockSize + childVal * statesClass_ + c;
|
||||
childCounts_[idx] += weight;
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------
|
||||
// computeProbabilities
|
||||
// --------------------------------------
|
||||
//
|
||||
// Once all samples are added in COUNTS mode, call this to:
|
||||
// p(c)
|
||||
// p(x_sp = spVal | c)
|
||||
// p(x_child = v | c, x_sp = s_sp)
|
||||
//
|
||||
// We store them in the corresponding *Probs_ arrays for inference.
|
||||
// --------------------------------------
|
||||
void computeProbabilities()
|
||||
{
|
||||
double totalCount = std::accumulate(classCounts_.begin(), classCounts_.end(), 0.0);
|
||||
|
||||
// p(c) => classPriors_
|
||||
classPriors_.resize(statesClass_, 0.0);
|
||||
if (totalCount <= 0.0) {
|
||||
// fallback => uniform
|
||||
double unif = 1.0 / static_cast<double>(statesClass_);
|
||||
for (int c = 0; c < statesClass_; c++) {
|
||||
classPriors_[c] = unif;
|
||||
}
|
||||
} else {
|
||||
for (int c = 0; c < statesClass_; c++) {
|
||||
classPriors_[c] = (classCounts_[c] + alpha_)
|
||||
/ (totalCount + alpha_ * statesClass_);
|
||||
}
|
||||
}
|
||||
|
||||
// p(x_sp | c)
|
||||
spFeatureProbs_.resize(spFeatureCounts_.size());
|
||||
// denominator for spVal * statesClass_ + c is just classCounts_[c] + alpha_ * (#states of sp)
|
||||
int spCard = states_[superParent_];
|
||||
for (int spVal = 0; spVal < spCard; spVal++) {
|
||||
for (int c = 0; c < statesClass_; c++) {
|
||||
double denom = classCounts_[c] + alpha_ * spCard;
|
||||
double num = spFeatureCounts_[spVal * statesClass_ + c] + alpha_;
|
||||
spFeatureProbs_[spVal * statesClass_ + c] = (denom <= 0.0 ? 0.0 : num / denom);
|
||||
}
|
||||
}
|
||||
|
||||
// p(x_child | c, x_sp)
|
||||
childProbs_.resize(childCounts_.size());
|
||||
for (int f = 0; f < nFeatures_; f++) {
|
||||
if (f == superParent_) continue;
|
||||
int offset = childOffsets_[f];
|
||||
int childCard = states_[f];
|
||||
|
||||
// For each spVal, c, childVal in childCounts_:
|
||||
for (int spVal = 0; spVal < spCard; spVal++) {
|
||||
for (int childVal = 0; childVal < childCard; childVal++) {
|
||||
for (int c = 0; c < statesClass_; c++) {
|
||||
int idx = offset + spVal * (childCard * statesClass_)
|
||||
+ childVal * statesClass_
|
||||
+ c;
|
||||
|
||||
double num = childCounts_[idx] + alpha_;
|
||||
// denominator = spFeatureCounts_[spVal * statesClass_ + c] + alpha_ * (#states of child)
|
||||
double denom = spFeatureCounts_[spVal * statesClass_ + c]
|
||||
+ alpha_ * childCard;
|
||||
childProbs_[idx] = (denom <= 0.0 ? 0.0 : num / denom);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// --------------------------------------
|
||||
// predict_proba
|
||||
// --------------------------------------
|
||||
//
|
||||
// For a single instance x of dimension nFeatures_:
|
||||
// P(c | x) ∝ p(c) × p(x_sp | c) × ∏(child ≠ sp) p(x_child | c, x_sp).
|
||||
//
|
||||
// Then we normalize the result.
|
||||
// --------------------------------------
|
||||
std::vector<double> predict_proba(const std::vector<int>& instance) const
|
||||
{
|
||||
std::vector<double> probs(statesClass_, 0.0);
|
||||
|
||||
// Multiply p(c) × p(x_sp | c)
|
||||
int spVal = instance[superParent_];
|
||||
for (int c = 0; c < statesClass_; c++) {
|
||||
double pc = classPriors_[c];
|
||||
double pSpC = spFeatureProbs_[spVal * statesClass_ + c];
|
||||
probs[c] = pc * pSpC * initializer_;
|
||||
}
|
||||
|
||||
// Multiply by each child’s probability p(x_child | c, x_sp)
|
||||
for (int feature = 0; feature < nFeatures_; feature++) {
|
||||
if (feature == superParent_) continue; // skip sp
|
||||
int sf = instance[feature];
|
||||
int offset = childOffsets_[feature];
|
||||
int childCard = states_[feature]; // number of states (cardinality) of this child feature
|
||||
// Index into childProbs_ = offset + spVal*(childCard*statesClass_) + childVal*statesClass_ + c
|
||||
int base = offset + spVal * (childCard * statesClass_) + sf * statesClass_;
|
||||
for (int c = 0; c < statesClass_; c++) {
|
||||
probs[c] *= childProbs_[base + c];
|
||||
}
|
||||
}
|
||||
|
||||
// Normalize
|
||||
normalize(probs);
|
||||
return probs;
|
||||
}
|
||||
std::vector<std::vector<double>> predict_proba(const std::vector<std::vector<int>>& test_data)
|
||||
{
|
||||
int test_size = test_data[0].size();
|
||||
int sample_size = test_data.size();
|
||||
auto probabilities = std::vector<std::vector<double>>(test_size, std::vector<double>(statesClass_));
|
||||
|
||||
int chunk_size = std::min(150, int(test_size / semaphore_.getMaxCount()) + 1);
|
||||
std::vector<std::thread> threads;
|
||||
auto worker = [&](const std::vector<std::vector<int>>& samples, int begin, int chunk, int sample_size, std::vector<std::vector<double>>& predictions) {
|
||||
std::string threadName = "(V)PWorker-" + std::to_string(begin) + "-" + std::to_string(chunk);
|
||||
#if defined(__linux__)
|
||||
pthread_setname_np(pthread_self(), threadName.c_str());
|
||||
#else
|
||||
pthread_setname_np(threadName.c_str());
|
||||
#endif
|
||||
|
||||
std::vector<int> instance(sample_size);
|
||||
for (int sample = begin; sample < begin + chunk; ++sample) {
|
||||
for (int feature = 0; feature < sample_size; ++feature) {
|
||||
instance[feature] = samples[feature][sample];
|
||||
}
|
||||
predictions[sample] = predict_proba(instance);
|
||||
}
|
||||
semaphore_.release();
|
||||
};
|
||||
for (int begin = 0; begin < test_size; begin += chunk_size) {
|
||||
int chunk = std::min(chunk_size, test_size - begin);
|
||||
semaphore_.acquire();
|
||||
threads.emplace_back(worker, test_data, begin, chunk, sample_size, std::ref(probabilities));
|
||||
}
|
||||
for (auto& thread : threads) {
|
||||
thread.join();
|
||||
}
|
||||
return probabilities;
|
||||
}
|
||||
|
||||
// --------------------------------------
|
||||
// predict
|
||||
// --------------------------------------
|
||||
//
|
||||
// Return the class argmax( P(c|x) ).
|
||||
// --------------------------------------
|
||||
int predict(const std::vector<int>& instance) const
|
||||
{
|
||||
auto p = predict_proba(instance);
|
||||
return static_cast<int>(std::distance(p.begin(),
|
||||
std::max_element(p.begin(), p.end())));
|
||||
}
|
||||
std::vector<int> predict(std::vector<std::vector<int>>& test_data)
|
||||
{
|
||||
if (!fitted_) {
|
||||
throw std::logic_error(CLASSIFIER_NOT_FITTED);
|
||||
}
|
||||
auto probabilities = predict_proba(test_data);
|
||||
std::vector<int> predictions(probabilities.size(), 0);
|
||||
|
||||
for (size_t i = 0; i < probabilities.size(); i++) {
|
||||
predictions[i] = std::distance(probabilities[i].begin(), std::max_element(probabilities[i].begin(), probabilities[i].end()));
|
||||
}
|
||||
|
||||
return predictions;
|
||||
}
|
||||
|
||||
// --------------------------------------
|
||||
// Utility: normalize
|
||||
// --------------------------------------
|
||||
void normalize(std::vector<double>& v) const
|
||||
{
|
||||
double sum = 0.0;
|
||||
for (auto val : v) { sum += val; }
|
||||
if (sum <= 0.0) {
|
||||
return;
|
||||
}
|
||||
for (auto& val : v) {
|
||||
val /= sum;
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------
|
||||
// debug printing, if desired
|
||||
// --------------------------------------
|
||||
std::string to_string() const
|
||||
{
|
||||
std::ostringstream oss;
|
||||
oss << "---- SPODE Model ----\n"
|
||||
<< "nFeatures_ = " << nFeatures_ << "\n"
|
||||
<< "superParent_ = " << superParent_ << "\n"
|
||||
<< "statesClass_ = " << statesClass_ << "\n"
|
||||
<< "\n";
|
||||
|
||||
oss << "States: [";
|
||||
for (int s : states_) oss << s << " ";
|
||||
oss << "]\n";
|
||||
|
||||
oss << "classCounts_: [";
|
||||
for (double c : classCounts_) oss << c << " ";
|
||||
oss << "]\n";
|
||||
|
||||
oss << "classPriors_: [";
|
||||
for (double c : classPriors_) oss << c << " ";
|
||||
oss << "]\n";
|
||||
|
||||
oss << "spFeatureCounts_: size = " << spFeatureCounts_.size() << "\n[";
|
||||
for (double c : spFeatureCounts_) oss << c << " ";
|
||||
oss << "]\n";
|
||||
|
||||
oss << "spFeatureProbs_: size = " << spFeatureProbs_.size() << "\n[";
|
||||
for (double c : spFeatureProbs_) oss << c << " ";
|
||||
oss << "]\n";
|
||||
|
||||
oss << "childCounts_: size = " << childCounts_.size() << "\n[";
|
||||
for (double cc : childCounts_) oss << cc << " ";
|
||||
oss << "]\n";
|
||||
|
||||
oss << "childProbs_: size = " << childProbs_.size() << "\n[";
|
||||
for (double cp : childProbs_) oss << cp << " ";
|
||||
oss << "]\n";
|
||||
|
||||
oss << "childOffsets_: [";
|
||||
for (int co : childOffsets_) oss << co << " ";
|
||||
oss << "]\n";
|
||||
|
||||
oss << "---------------------\n";
|
||||
return oss.str();
|
||||
}
|
||||
int statesClass() const { return statesClass_; }
|
||||
int getNFeatures() const { return nFeatures_; }
|
||||
int getNumberOfStates() const
|
||||
{
|
||||
return std::accumulate(states_.begin(), states_.end(), 0) * nFeatures_;
|
||||
}
|
||||
int getNumberOfEdges() const
|
||||
{
|
||||
return nFeatures_ * (2 * nFeatures_ - 1);
|
||||
}
|
||||
std::vector<int>& getStates() { return states_; }
|
||||
|
||||
private:
|
||||
// --------------------------------------
|
||||
// MEMBERS
|
||||
// --------------------------------------
|
||||
|
||||
int superParent_; // which feature is the single super-parent
|
||||
int nFeatures_;
|
||||
int statesClass_;
|
||||
bool fitted_ = false;
|
||||
std::vector<int> states_; // [states_feat0, ..., states_feat(N-1)] (class not included in this array)
|
||||
|
||||
const std::string CLASSIFIER_NOT_FITTED = "Classifier has not been fitted";
|
||||
|
||||
// Class counts
|
||||
std::vector<double> classCounts_; // [c], accumulative
|
||||
std::vector<double> classPriors_; // [c], after normalization
|
||||
|
||||
// For p(x_sp = spVal | c)
|
||||
std::vector<double> spFeatureCounts_; // [spVal * statesClass_ + c]
|
||||
std::vector<double> spFeatureProbs_; // same shape, after normalization
|
||||
|
||||
// For p(x_child = childVal | x_sp = spVal, c)
|
||||
// childCounts_ is big enough to hold all child features except sp:
|
||||
// For each child f, we store childOffsets_[f] as the start index, then
|
||||
// childVal, spVal, c => the data.
|
||||
std::vector<double> childCounts_;
|
||||
std::vector<double> childProbs_;
|
||||
std::vector<int> childOffsets_;
|
||||
|
||||
double alpha_ = 1.0;
|
||||
double initializer_; // for numerical stability
|
||||
CountingSemaphore& semaphore_;
|
||||
};
|
||||
|
||||
} // namespace platform
|
||||
|
||||
#endif // XSPODE_H
|
478
src/experimental_clfs/Xaode.hpp
Normal file
@@ -0,0 +1,478 @@
|
||||
// ***************************************************************
|
||||
// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez
|
||||
// SPDX-FileType: SOURCE
|
||||
// SPDX-License-Identifier: MIT
|
||||
// ***************************************************************
|
||||
// Based on the Geoff. I. Webb A1DE java algorithm
|
||||
// https://weka.sourceforge.io/packageMetaData/AnDE/Latest.html
|
||||
|
||||
#ifndef XAODE_H
|
||||
#define XAODE_H
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <stdexcept>
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <sstream>
|
||||
#include <torch/torch.h>
|
||||
#include <bayesnet/network/Smoothing.h>
|
||||
|
||||
|
||||
namespace platform {
|
||||
class Xaode {
|
||||
public:
|
||||
// -------------------------------------------------------
|
||||
// The Xaode can be EMPTY (just created), in COUNTS mode (accumulating raw counts)
|
||||
// or PROBS mode (storing conditional probabilities).
|
||||
enum class MatrixState {
|
||||
EMPTY,
|
||||
COUNTS,
|
||||
PROBS
|
||||
};
|
||||
std::vector<double> significance_models_;
|
||||
Xaode() : nFeatures_{ 0 }, statesClass_{ 0 }, matrixState_{ MatrixState::EMPTY } {}
|
||||
// -------------------------------------------------------
|
||||
// fit
|
||||
// -------------------------------------------------------
|
||||
//
|
||||
// Classifiers interface
|
||||
// The all_parents parameter decides whether the model is initialized with all the parents active or none of them
|
||||
//
|
||||
// states.size() = nFeatures + 1,
|
||||
// where states.back() = number of class states.
|
||||
//
|
||||
// We'll store:
|
||||
// 1) p(x_i=si | c) in classFeatureProbs_
|
||||
// 2) p(x_j=sj | c, x_i=si) in data_, with i<j => i is "superparent," j is "child."
|
||||
//
|
||||
// Internally, in COUNTS mode, data_ accumulates raw counts, then
|
||||
// computeProbabilities(...) normalizes them into conditionals.
|
||||
void fit(std::vector<std::vector<int>>& X, std::vector<int>& y, const std::vector<std::string>& features, const std::string& className, std::map<std::string, std::vector<int>>& states, const torch::Tensor& weights, const bool all_parents, const bayesnet::Smoothing_t smoothing)
|
||||
{
|
||||
int num_instances = X[0].size();
|
||||
nFeatures_ = X.size();
|
||||
|
||||
significance_models_.resize(nFeatures_, (all_parents ? 1.0 : 0.0));
|
||||
for (int i = 0; i < nFeatures_; i++) {
|
||||
if (all_parents) active_parents.push_back(i);
|
||||
states_.push_back(*max_element(X[i].begin(), X[i].end()) + 1);
|
||||
}
|
||||
states_.push_back(*max_element(y.begin(), y.end()) + 1);
|
||||
//
|
||||
statesClass_ = states_.back();
|
||||
classCounts_.resize(statesClass_, 0.0);
|
||||
classPriors_.resize(statesClass_, 0.0);
|
||||
//
|
||||
// Initialize data structures
|
||||
//
|
||||
active_parents.resize(nFeatures_);
|
||||
int totalStates = std::accumulate(states_.begin(), states_.end(), 0) - statesClass_;
|
||||
|
||||
// For p(x_i=si | c), we store them in a 1D array classFeatureProbs_ after we compute.
|
||||
// We'll need the offsets for each feature i in featureClassOffset_.
|
||||
featureClassOffset_.resize(nFeatures_);
|
||||
// We'll store p(x_child=sj | c, x_sp=si) for each pair (i<j).
|
||||
// So data_(i, si, j, sj, c) indexes into a big 1D array with an offset.
|
||||
pairOffset_.resize(totalStates);
|
||||
int feature_offset = 0;
|
||||
int runningOffset = 0;
|
||||
int feature = 0, index = 0;
|
||||
for (int i = 0; i < nFeatures_; ++i) {
|
||||
featureClassOffset_[i] = feature_offset;
|
||||
feature_offset += states_[i];
|
||||
for (int j = 0; j < states_[i]; ++j) {
|
||||
pairOffset_[feature++] = index;
|
||||
index += runningOffset;
|
||||
}
|
||||
runningOffset += states_[i];
|
||||
}
|
||||
int totalSize = index * statesClass_;
|
||||
data_.resize(totalSize);
|
||||
dataOpp_.resize(totalSize);
|
||||
|
||||
classFeatureCounts_.resize(feature_offset * statesClass_);
|
||||
classFeatureProbs_.resize(feature_offset * statesClass_);
|
||||
|
||||
matrixState_ = MatrixState::COUNTS;
|
||||
//
|
||||
// Add samples
|
||||
//
|
||||
std::vector<int> instance(nFeatures_ + 1);
|
||||
for (int n_instance = 0; n_instance < num_instances; n_instance++) {
|
||||
for (int feature = 0; feature < nFeatures_; feature++) {
|
||||
instance[feature] = X[feature][n_instance];
|
||||
}
|
||||
instance[nFeatures_] = y[n_instance];
|
||||
addSample(instance, weights[n_instance].item<double>());
|
||||
}
|
||||
switch (smoothing) {
|
||||
case bayesnet::Smoothing_t::ORIGINAL:
|
||||
alpha_ = 1.0 / num_instances;
|
||||
break;
|
||||
case bayesnet::Smoothing_t::LAPLACE:
|
||||
alpha_ = 1.0;
|
||||
break;
|
||||
default:
|
||||
alpha_ = 0.0; // No smoothing
|
||||
}
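// Illustrative numbers: with ORIGINAL smoothing and num_instances = 100, alpha_ = 0.01,
// so computeProbabilities() estimates p(c) = (classCounts_[c] + 0.01) / (totalCount + 0.01 * statesClass_).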
|
||||
initializer_ = std::numeric_limits<double>::max() / (nFeatures_ * nFeatures_);
|
||||
computeProbabilities();
|
||||
}
|
||||
std::string to_string() const
|
||||
{
|
||||
std::ostringstream ostream;
|
||||
ostream << "-------- Xaode.status --------" << std::endl
|
||||
<< "- nFeatures = " << nFeatures_ << std::endl
|
||||
<< "- statesClass = " << statesClass_ << std::endl
|
||||
<< "- matrixState = " << (matrixState_ == MatrixState::COUNTS ? "COUNTS" : "PROBS") << std::endl;
|
||||
ostream << "- states: size: " << states_.size() << std::endl;
|
||||
for (int s : states_) ostream << s << " "; ostream << std::endl;
|
||||
ostream << "- classCounts: size: " << classCounts_.size() << std::endl;
|
||||
for (double cc : classCounts_) ostream << cc << " "; ostream << std::endl;
|
||||
ostream << "- classPriors: size: " << classPriors_.size() << std::endl;
|
||||
for (double cp : classPriors_) ostream << cp << " "; ostream << std::endl;
|
||||
ostream << "- classFeatureCounts: size: " << classFeatureCounts_.size() << std::endl;
|
||||
for (double cfc : classFeatureCounts_) ostream << cfc << " "; ostream << std::endl;
|
||||
ostream << "- classFeatureProbs: size: " << classFeatureProbs_.size() << std::endl;
|
||||
for (double cfp : classFeatureProbs_) ostream << cfp << " "; ostream << std::endl;
|
||||
ostream << "- featureClassOffset: size: " << featureClassOffset_.size() << std::endl;
|
||||
for (int f : featureClassOffset_) ostream << f << " "; ostream << std::endl;
|
||||
ostream << "- pairOffset_: size: " << pairOffset_.size() << std::endl;
|
||||
for (int p : pairOffset_) ostream << p << " "; ostream << std::endl;
|
||||
ostream << "- data: size: " << data_.size() << std::endl;
|
||||
for (double d : data_) ostream << d << " "; ostream << std::endl;
|
||||
ostream << "- dataOpp: size: " << dataOpp_.size() << std::endl;
|
||||
for (double d : dataOpp_) ostream << d << " "; ostream << std::endl;
|
||||
ostream << "--------------------------------" << std::endl;
|
||||
std::string output = ostream.str();
|
||||
return output;
|
||||
}
|
||||
// -------------------------------------------------------
|
||||
// addSample (only in COUNTS mode)
|
||||
// -------------------------------------------------------
|
||||
//
|
||||
// instance should have the class at the end.
|
||||
//
|
||||
void addSample(const std::vector<int>& instance, double weight)
|
||||
{
|
||||
//
|
||||
// (A) increment classCounts_
|
||||
// (B) increment feature–class counts => for p(x_i|c)
|
||||
// (C) increment pair (superparent= i, child= j) counts => data_
|
||||
//
|
||||
int c = instance.back();
|
||||
if (weight <= 0.0) {
|
||||
return;
|
||||
}
|
||||
// (A) increment classCounts_
|
||||
classCounts_[c] += weight;
|
||||
|
||||
// (B,C)
|
||||
// We'll store raw counts now and turn them into p(child| c, superparent) later.
|
||||
int idx, fcIndex, sp, sc, i_offset;
|
||||
for (int parent = 0; parent < nFeatures_; ++parent) {
|
||||
sp = instance[parent];
|
||||
// (B) increment feature–class counts => for p(x_i|c)
|
||||
fcIndex = (featureClassOffset_[parent] + sp) * statesClass_ + c;
|
||||
classFeatureCounts_[fcIndex] += weight;
|
||||
// (C) increment pair (superparent= i, child= j) counts => data_
|
||||
i_offset = pairOffset_[featureClassOffset_[parent] + sp];
|
||||
for (int child = 0; child < parent; ++child) {
|
||||
sc = instance[child];
|
||||
idx = (i_offset + featureClassOffset_[child] + sc) * statesClass_ + c;
|
||||
data_[idx] += weight;
|
||||
}
|
||||
}
|
||||
}
|
||||
// -------------------------------------------------------
|
||||
// computeProbabilities
|
||||
// -------------------------------------------------------
|
||||
//
|
||||
// Once all samples are added in COUNTS mode, call this to:
|
||||
// 1) compute p(c) => classPriors_
|
||||
// 2) compute p(x_i=si | c) => classFeatureProbs_
|
||||
// 3) compute p(x_j=sj | c, x_i=si) => data_ (for i<j) dataOpp_ (for i>j)
|
||||
//
|
||||
void computeProbabilities()
|
||||
{
|
||||
if (matrixState_ != MatrixState::COUNTS) {
|
||||
throw std::logic_error("computeProbabilities: must be in COUNTS mode.");
|
||||
}
|
||||
double totalCount = std::accumulate(classCounts_.begin(), classCounts_.end(), 0.0);
|
||||
// (1) p(c)
|
||||
if (totalCount <= 0.0) {
|
||||
// fallback => uniform
|
||||
double unif = 1.0 / statesClass_;
|
||||
for (int c = 0; c < statesClass_; ++c) {
|
||||
classPriors_[c] = unif;
|
||||
}
|
||||
} else {
|
||||
for (int c = 0; c < statesClass_; ++c) {
|
||||
classPriors_[c] = (classCounts_[c] + alpha_) / (totalCount + alpha_ * statesClass_);
|
||||
}
|
||||
}
|
||||
// (2) p(x_i=si | c) => classFeatureProbs_
|
||||
int idx, sf;
|
||||
double denom;
|
||||
for (int feature = 0; feature < nFeatures_; ++feature) {
|
||||
sf = states_[feature];
|
||||
for (int c = 0; c < statesClass_; ++c) {
|
||||
denom = classCounts_[c] + alpha_ * sf;
|
||||
for (int sf_value = 0; sf_value < sf; ++sf_value) {
|
||||
idx = (featureClassOffset_[feature] + sf_value) * statesClass_ + c;
|
||||
classFeatureProbs_[idx] = (classFeatureCounts_[idx] + alpha_) / denom;
|
||||
}
|
||||
}
|
||||
}
|
||||
// getCountFromTable(int classVal, int pIndex, int childIndex)
|
||||
// (3) p(x_c=sc | c, x_p=sp) => data_(parent,sp,child,sc,c)
|
||||
// (3) p(x_p=sp | c, x_c=sc) => dataOpp_(child,sc,parent,sp,c)
|
||||
//                                   C(x_c, x_p, c) + alpha_
//      P(x_p=sp | c, x_c=sc) = --------------------------------------
//                                 C(x_c, c) + alpha_ * |states(x_p)|
|
||||
double pcc_count, pc_count, cc_count;
|
||||
double conditionalProb, oppositeCondProb;
|
||||
int part1, part2, p1, part2_class, p1_class;
|
||||
for (int parent = 1; parent < nFeatures_; ++parent) {
|
||||
for (int sp = 0; sp < states_[parent]; ++sp) {
|
||||
p1 = featureClassOffset_[parent] + sp;
|
||||
part1 = pairOffset_[p1];
|
||||
p1_class = p1 * statesClass_;
|
||||
for (int child = 0; child < parent; ++child) {
|
||||
for (int sc = 0; sc < states_[child]; ++sc) {
|
||||
part2 = featureClassOffset_[child] + sc;
|
||||
part2_class = part2 * statesClass_;
|
||||
for (int c = 0; c < statesClass_; c++) {
|
||||
idx = (part1 + part2) * statesClass_ + c;
|
||||
// Parent, Child, Class Count
|
||||
pcc_count = data_[idx];
|
||||
// Parent, Class count
|
||||
pc_count = classFeatureCounts_[p1_class + c];
|
||||
// Child, Class count
|
||||
cc_count = classFeatureCounts_[part2_class + c];
|
||||
// p(x_c=sc | c, x_p=sp)
|
||||
conditionalProb = (pcc_count + alpha_) / (pc_count + alpha_ * states_[child]);
|
||||
data_[idx] = conditionalProb;
|
||||
// p(x_p=sp | c, x_c=sc)
|
||||
oppositeCondProb = (pcc_count + alpha_) / (cc_count + alpha_ * states_[parent]);
|
||||
dataOpp_[idx] = oppositeCondProb;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
matrixState_ = MatrixState::PROBS;
|
||||
}
|
||||
// -------------------------------------------------------
|
||||
// predict_proba_spode
|
||||
// -------------------------------------------------------
|
||||
//
|
||||
// Single-superparent approach:
|
||||
// P(c | x) ∝ p(c) * p(x_sp| c) * ∏_{i≠sp} p(x_i | c, x_sp)
|
||||
//
|
||||
// 'instance' should have size == nFeatures_ (no class).
|
||||
// sp in [0..nFeatures_).
|
||||
// We multiply p(c) * p(x_sp| c) * p(x_i| c, x_sp).
|
||||
// Then normalize the distribution.
|
||||
//
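// Worked example (illustrative): with parent = 1 and instance = {x0, x1, x2},
// spodeProbs[c] ∝ p(c) * p(x1|c) * p(x0|c,x1) * p(x2|c,x1), normalized at the end;
// if feature 1 is not among active_parents, a vector of zeros is returned instead.
//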
|
||||
std::vector<double> predict_proba_spode(const std::vector<int>& instance, int parent)
|
||||
{
|
||||
// accumulates posterior probabilities for each class
|
||||
auto probs = std::vector<double>(statesClass_);
|
||||
auto spodeProbs = std::vector<double>(statesClass_, 0.0);
|
||||
if (std::find(active_parents.begin(), active_parents.end(), parent) == active_parents.end()) {
|
||||
return spodeProbs;
|
||||
}
|
||||
// Initialize the probabilities with the feature|class probabilities x class priors
|
||||
int localOffset;
|
||||
int sp = instance[parent];
|
||||
localOffset = (featureClassOffset_[parent] + sp) * statesClass_;
|
||||
for (int c = 0; c < statesClass_; ++c) {
|
||||
spodeProbs[c] = classFeatureProbs_[localOffset + c] * classPriors_[c] * initializer_;
|
||||
}
|
||||
int idx, base, sc, parent_offset;
|
||||
for (int child = 0; child < nFeatures_; ++child) {
|
||||
if (child == parent) {
|
||||
continue;
|
||||
}
|
||||
sc = instance[child];
|
||||
if (child > parent) {
|
||||
parent_offset = pairOffset_[featureClassOffset_[child] + sc];
|
||||
base = (parent_offset + featureClassOffset_[parent] + sp) * statesClass_;
|
||||
} else {
|
||||
parent_offset = pairOffset_[featureClassOffset_[parent] + sp];
|
||||
base = (parent_offset + featureClassOffset_[child] + sc) * statesClass_;
|
||||
}
|
||||
for (int c = 0; c < statesClass_; ++c) {
|
||||
/*
|
||||
* The probability P(xc|xp,c) is stored in dataOpp_, and
|
||||
* the probability P(xp|xc,c) is stored in data_
|
||||
*/
|
||||
idx = base + c;
|
||||
double factor = child > parent ? dataOpp_[idx] : data_[idx];
|
||||
// double factor = data_[idx];
|
||||
spodeProbs[c] *= factor;
|
||||
}
|
||||
}
|
||||
// Normalize the probabilities
|
||||
normalize(spodeProbs);
|
||||
return spodeProbs;
|
||||
}
|
||||
int predict_spode(const std::vector<int>& instance, int parent)
|
||||
{
|
||||
auto probs = predict_proba_spode(instance, parent);
|
||||
return (int)std::distance(probs.begin(), std::max_element(probs.begin(), probs.end()));
|
||||
}
|
||||
// -------------------------------------------------------
|
||||
// predict_proba
|
||||
// -------------------------------------------------------
|
||||
//
|
||||
// P(c | x) ∝ p(c) * ∏_{i} p(x_i | c) * ∏_{i<j} p(x_j | c, x_i) * p(x_i | c, x_j)
|
||||
//
|
||||
// 'instance' should have size == nFeatures_ (no class).
|
||||
// We multiply p(c) * p(x_i| c) * p(x_j| c, x_i) for all i, j.
|
||||
// Then normalize the distribution.
|
||||
//
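// In short, this is the AODE average: probs[c] ∝ Σ_i significance_models_[i] * P_spode_i(c | x),
// summed over the active superparents and normalized at the end.
//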
|
||||
std::vector<double> predict_proba(const std::vector<int>& instance)
|
||||
{
|
||||
// accumulates posterior probabilities for each class
|
||||
auto probs = std::vector<double>(statesClass_);
|
||||
auto spodeProbs = std::vector<std::vector<double>>(nFeatures_, std::vector<double>(statesClass_));
|
||||
// Initialize the probabilities with the feature|class probabilities
|
||||
int localOffset;
|
||||
for (int feature = 0; feature < nFeatures_; ++feature) {
|
||||
// if feature is not in the active_parents, skip it
|
||||
if (std::find(active_parents.begin(), active_parents.end(), feature) == active_parents.end()) {
|
||||
continue;
|
||||
}
|
||||
localOffset = (featureClassOffset_[feature] + instance[feature]) * statesClass_;
|
||||
for (int c = 0; c < statesClass_; ++c) {
|
||||
spodeProbs[feature][c] = classFeatureProbs_[localOffset + c] * classPriors_[c] * initializer_;
|
||||
}
|
||||
}
|
||||
int idx, base, sp, sc, parent_offset;
|
||||
for (int parent = 1; parent < nFeatures_; ++parent) {
|
||||
// if parent is not in the active_parents, skip it
|
||||
if (std::find(active_parents.begin(), active_parents.end(), parent) == active_parents.end()) {
|
||||
continue;
|
||||
}
|
||||
sp = instance[parent];
|
||||
parent_offset = pairOffset_[featureClassOffset_[parent] + sp];
|
||||
for (int child = 0; child < parent; ++child) {
|
||||
sc = instance[child];
|
||||
if (child > parent) {
|
||||
parent_offset = pairOffset_[featureClassOffset_[child] + sc];
|
||||
base = (parent_offset + featureClassOffset_[parent] + sp) * statesClass_;
|
||||
} else {
|
||||
parent_offset = pairOffset_[featureClassOffset_[parent] + sp];
|
||||
base = (parent_offset + featureClassOffset_[child] + sc) * statesClass_;
|
||||
}
|
||||
for (int c = 0; c < statesClass_; ++c) {
|
||||
/*
|
||||
* The probability P(xc|xp,c) is stored in dataOpp_, and
|
||||
* the probability P(xp|xc,c) is stored in data_
|
||||
*/
|
||||
idx = base + c;
|
||||
double factor_child = child > parent ? data_[idx] : dataOpp_[idx];
|
||||
double factor_parent = child > parent ? dataOpp_[idx] : data_[idx];
|
||||
spodeProbs[child][c] *= factor_child;
|
||||
spodeProbs[parent][c] *= factor_parent;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* add all the probabilities for each class */
|
||||
for (int c = 0; c < statesClass_; ++c) {
|
||||
for (int i = 0; i < nFeatures_; ++i) {
|
||||
probs[c] += spodeProbs[i][c] * significance_models_[i];
|
||||
}
|
||||
}
|
||||
// Normalize the probabilities
|
||||
normalize(probs);
|
||||
return probs;
|
||||
}
|
||||
void normalize(std::vector<double>& probs) const
|
||||
{
|
||||
double sum = std::accumulate(probs.begin(), probs.end(), 0.0);
|
||||
if (std::isnan(sum)) {
|
||||
throw std::runtime_error("Can't normalize array. Sum is NaN.");
|
||||
}
|
||||
if (sum == 0) {
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < (int)probs.size(); i++) {
|
||||
probs[i] /= sum;
|
||||
}
|
||||
}
|
||||
// Returns current mode: EMPTY, COUNTS or PROBS
|
||||
MatrixState state() const
|
||||
{
|
||||
return matrixState_;
|
||||
}
|
||||
int statesClass() const
|
||||
{
|
||||
return statesClass_;
|
||||
}
|
||||
int nFeatures() const
|
||||
{
|
||||
return nFeatures_;
|
||||
}
|
||||
int getNumberOfStates() const
|
||||
{
|
||||
return std::accumulate(states_.begin(), states_.end(), 0) * nFeatures_;
|
||||
}
|
||||
int getNumberOfEdges() const
|
||||
{
|
||||
return nFeatures_ * (2 * nFeatures_ - 1);
|
||||
}
|
||||
int getNumberOfNodes() const
|
||||
{
|
||||
return (nFeatures_ + 1) * nFeatures_;
|
||||
}
|
||||
void add_active_parent(int active_parent)
|
||||
{
|
||||
active_parents.push_back(active_parent);
|
||||
}
|
||||
void remove_last_parent()
|
||||
{
|
||||
active_parents.pop_back();
|
||||
}
|
||||
|
||||
private:
|
||||
// -----------
|
||||
// MEMBER DATA
|
||||
// -----------
|
||||
std::vector<int> states_; // [states_feat0, ..., states_feat(n-1), statesClass_]
|
||||
int nFeatures_;
|
||||
int statesClass_;
|
||||
|
||||
// data_ means p(child=sj | c, superparent= si) after normalization.
|
||||
// But in COUNTS mode, it accumulates raw counts.
|
||||
std::vector<int> pairOffset_;
|
||||
// data_ stores p(child=sj | c, superparent=si) for each pair (i<j).
|
||||
std::vector<double> data_;
|
||||
// dataOpp_ stores p(superparent=si | c, child=sj) for each pair (i<j).
|
||||
std::vector<double> dataOpp_;
|
||||
|
||||
// classCounts_[c]
|
||||
std::vector<double> classCounts_;
|
||||
std::vector<double> classPriors_; // => p(c)
|
||||
|
||||
// For p(x_i=si| c), we store counts in classFeatureCounts_ => offset by featureClassOffset_[i]
|
||||
std::vector<int> featureClassOffset_;
|
||||
std::vector<double> classFeatureCounts_;
|
||||
std::vector<double> classFeatureProbs_; // => p(x_i=si | c) after normalization
|
||||
|
||||
MatrixState matrixState_;
|
||||
|
||||
double alpha_ = 1.0; // Laplace smoothing
|
||||
double initializer_ = 1.0;
|
||||
std::vector<int> active_parents;
|
||||
};
|
||||
}
|
||||
#endif // XAODE_H
|
314
src/grid/GridBase.cpp
Normal file
@@ -0,0 +1,314 @@
|
||||
#include <random>
|
||||
#include <cstddef>
#include <cstring>
#include <iostream>
#include <iomanip>
|
||||
#include "common/DotEnv.h"
|
||||
#include "common/Paths.h"
|
||||
#include "common/Colors.h"
|
||||
#include "GridBase.h"
|
||||
|
||||
|
||||
namespace platform {
|
||||
|
||||
GridBase::GridBase(struct ConfigGrid& config)
|
||||
{
|
||||
this->config = config;
|
||||
auto env = platform::DotEnv();
|
||||
this->config.platform = env.get("platform");
|
||||
|
||||
}
|
||||
void GridBase::validate_config()
|
||||
{
|
||||
if (config.smooth_strategy == "ORIGINAL")
|
||||
smooth_type = bayesnet::Smoothing_t::ORIGINAL;
|
||||
else if (config.smooth_strategy == "LAPLACE")
|
||||
smooth_type = bayesnet::Smoothing_t::LAPLACE;
|
||||
else if (config.smooth_strategy == "CESTNIK")
|
||||
smooth_type = bayesnet::Smoothing_t::CESTNIK;
|
||||
else {
|
||||
std::cerr << "GridBase: Unknown smoothing strategy: " << config.smooth_strategy << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
std::string GridBase::get_color_rank(int rank)
|
||||
{
|
||||
auto colors = { Colors::WHITE(), Colors::RED(), Colors::GREEN(), Colors::BLUE(), Colors::MAGENTA(), Colors::CYAN(), Colors::YELLOW(), Colors::BLACK() };
|
||||
std::string id = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
|
||||
auto idx = rank % id.size();
|
||||
return *(colors.begin() + rank % colors.size()) + id[idx];
|
||||
}
|
||||
void GridBase::shuffle_and_progress_bar(json& tasks)
|
||||
{
|
||||
// Shuffle the array so heavy datasets are more easily spread across the workers
|
||||
std::mt19937 g{ 271 }; // Use fixed seed to obtain the same shuffle
|
||||
std::shuffle(tasks.begin(), tasks.end(), g);
|
||||
std::cout << "* Number of tasks: " << tasks.size() << std::endl;
|
||||
std::cout << separator << std::flush;
|
||||
for (int i = 0; i < tasks.size(); ++i) {
|
||||
if ((i + 1) % 10 == 0)
|
||||
std::cout << separator;
|
||||
else
|
||||
std::cout << (i + 1) % 10;
|
||||
}
|
||||
std::cout << separator << std::endl << separator << std::flush;
|
||||
}
|
||||
json GridBase::build_tasks(Datasets& datasets)
|
||||
{
|
||||
/*
|
||||
* Each task is a json object with the following structure:
|
||||
* {
|
||||
* "dataset": "dataset_name",
|
||||
* "idx_dataset": idx_dataset, // used to identify the dataset in the results
|
||||
* // this index is relative to the list of datasets used in the actual run, not to the whole datasets list
|
||||
* "seed": # of seed to use,
|
||||
* "fold": # of fold to process
|
||||
* }
|
||||
* This way a task consists in processing all combinations of hyperparameters for a dataset, seed and fold
|
||||
*/
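// e.g. a single (hypothetical) task: { "dataset": "iris", "idx_dataset": 0, "seed": 271, "fold": 3 }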
|
||||
auto tasks = json::array();
|
||||
auto all_datasets = datasets.getNames();
|
||||
auto datasets_names = filterDatasets(datasets);
|
||||
for (int idx_dataset = 0; idx_dataset < datasets_names.size(); ++idx_dataset) {
|
||||
auto dataset = datasets_names[idx_dataset];
|
||||
for (const auto& seed : config.seeds) {
|
||||
for (int n_fold = 0; n_fold < config.n_folds; n_fold++) {
|
||||
json task = {
|
||||
{ "dataset", dataset },
|
||||
{ "idx_dataset", idx_dataset},
|
||||
{ "seed", seed },
|
||||
{ "fold", n_fold},
|
||||
};
|
||||
tasks.push_back(task);
|
||||
}
|
||||
}
|
||||
}
|
||||
shuffle_and_progress_bar(tasks);
|
||||
return tasks;
|
||||
}
|
||||
void GridBase::summary(json& all_results, json& tasks, struct ConfigMPI& config_mpi)
|
||||
{
|
||||
// Report the tasks done by each worker, showing dataset number, seed, fold and time spent
|
||||
// The format I want to show is:
|
||||
// worker, dataset, seed, fold, time
|
||||
// with headers
|
||||
std::cout << Colors::RESET() << "* Summary of tasks done by each worker" << std::endl;
|
||||
json worker_tasks = json::array();
|
||||
for (int i = 0; i < config_mpi.n_procs; ++i) {
|
||||
worker_tasks.push_back(json::array());
|
||||
}
|
||||
int max_dataset = 7;
|
||||
for (const auto& [key, results] : all_results.items()) {
|
||||
auto dataset = key;
|
||||
if (dataset.size() > max_dataset)
|
||||
max_dataset = dataset.size();
|
||||
for (const auto& result : results) {
|
||||
int n_task = result["task"].get<int>();
|
||||
json task = tasks[n_task];
|
||||
auto seed = task["seed"].get<int>();
|
||||
auto fold = task["fold"].get<int>();
|
||||
auto time = result["time"].get<double>();
|
||||
auto worker = result["process"].get<int>();
|
||||
json line = {
|
||||
{ "dataset", dataset },
|
||||
{ "seed", seed },
|
||||
{ "fold", fold },
|
||||
{ "time", time }
|
||||
};
|
||||
worker_tasks[worker].push_back(line);
|
||||
}
|
||||
}
|
||||
std::cout << Colors::MAGENTA() << " W " << setw(max_dataset) << std::left << "Dataset";
|
||||
std::cout << " Seed Fold Time" << std::endl;
|
||||
std::cout << "=== " << std::string(max_dataset, '=') << " ==== ==== " << std::string(15, '=') << std::endl;
|
||||
for (int worker = 0; worker < config_mpi.n_procs; ++worker) {
|
||||
auto color = (worker % 2) ? Colors::CYAN() : Colors::BLUE();
|
||||
std::cout << color << std::right << setw(3) << worker << " ";
|
||||
if (worker == config_mpi.manager) {
|
||||
std::cout << "Manager" << std::endl;
|
||||
continue;
|
||||
}
|
||||
if (worker_tasks[worker].empty()) {
|
||||
std::cout << "No tasks" << std::endl;
|
||||
continue;
|
||||
}
|
||||
bool first = true;
|
||||
double total = 0.0;
|
||||
int num_tasks = 0;
|
||||
for (const auto& task : worker_tasks[worker]) {
|
||||
num_tasks++;
|
||||
if (!first)
|
||||
std::cout << std::string(4, ' ');
|
||||
else
|
||||
first = false;
|
||||
std::cout << std::left << setw(max_dataset) << task["dataset"].get<std::string>();
|
||||
std::cout << " " << setw(4) << std::right << task["seed"].get<int>();
|
||||
std::cout << " " << setw(4) << task["fold"].get<int>();
|
||||
std::cout << " " << setw(15) << std::setprecision(7) << std::fixed << task["time"].get<double>() << std::endl;
|
||||
total += task["time"].get<double>();
|
||||
}
|
||||
if (num_tasks > 1) {
|
||||
std::cout << Colors::MAGENTA() << " ";
|
||||
std::cout << setw(max_dataset) << "Total (" << setw(2) << std::right << num_tasks << ")" << std::string(7, '.');
|
||||
std::cout << " " << setw(15) << std::setprecision(7) << std::fixed << total << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
void GridBase::go(struct ConfigMPI& config_mpi)
|
||||
{
|
||||
/*
|
||||
* Each task is a json object with the data needed by the process
|
||||
*
|
||||
* The overall process consists in these steps:
|
||||
* 0. Validate config, create the MPI result type & tasks
|
||||
* 0.1 Create the MPI result type
|
||||
* 0.2 Manager creates the tasks
|
||||
* 1. Manager will broadcast the tasks to all the processes
|
||||
* 1.1 Broadcast the number of tasks
|
||||
* 1.2 Broadcast the length of the following string
|
||||
* 1.3 Broadcast the tasks as a char* string
|
||||
* 2a. Producer delivers the tasks to the consumers
|
||||
* 2a.1 Producer will loop to send all the tasks to the consumers and receive the results
|
||||
* 2a.2 Producer will send the end message to all the consumers
|
||||
* 2b. Consumers process the tasks and send the results to the producer
|
||||
* 2b.1 Consumers announce to the producer that they are ready to receive a task
|
||||
* 2b.2 Consumers receive the task from the producer and process it
|
||||
* 2b.3 Consumers send the result to the producer
|
||||
* 3. Manager compiles the results for each dataset
|
||||
* 3.1 Loop through all the results obtained from each outer fold (task) and select the best
|
||||
* 3.2 Save the results
|
||||
* 3.3 Summary of jobs done
|
||||
*/
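// Note: the tasks JSON travels as a raw buffer; the manager first broadcasts its length
// (MPI_INT) and then the serialized string itself (MPI_CHAR, length + 1 bytes, including the '\0').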
|
||||
//
|
||||
// 0.1 Create the MPI result type
|
||||
//
|
||||
validate_config();
|
||||
Task_Result result;
|
||||
int tasks_size;
|
||||
MPI_Datatype MPI_Result;
|
||||
MPI_Datatype type[11] = { MPI_UNSIGNED, MPI_UNSIGNED, MPI_INT, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_INT, MPI_INT };
|
||||
int blocklen[11] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
|
||||
MPI_Aint disp[11];
|
||||
disp[0] = offsetof(Task_Result, idx_dataset);
|
||||
disp[1] = offsetof(Task_Result, idx_combination);
|
||||
disp[2] = offsetof(Task_Result, n_fold);
|
||||
disp[3] = offsetof(Task_Result, score);
|
||||
disp[4] = offsetof(Task_Result, time);
|
||||
disp[5] = offsetof(Task_Result, time_train);
|
||||
disp[6] = offsetof(Task_Result, nodes);
|
||||
disp[7] = offsetof(Task_Result, leaves);
|
||||
disp[8] = offsetof(Task_Result, depth);
|
||||
disp[9] = offsetof(Task_Result, process);
|
||||
disp[10] = offsetof(Task_Result, task);
|
||||
MPI_Type_create_struct(11, blocklen, disp, type, &MPI_Result);
|
||||
MPI_Type_commit(&MPI_Result);
|
||||
//
|
||||
// 0.2 Manager creates the tasks
|
||||
//
|
||||
char* msg;
|
||||
json tasks;
|
||||
auto env = platform::DotEnv();
|
||||
auto datasets = Datasets(config.discretize, Paths::datasets(), env.get("discretize_algo"));
|
||||
if (config_mpi.rank == config_mpi.manager) {
|
||||
timer.start();
|
||||
tasks = build_tasks(datasets);
|
||||
auto tasks_str = tasks.dump();
|
||||
tasks_size = tasks_str.size();
|
||||
msg = new char[tasks_size + 1];
|
||||
strcpy(msg, tasks_str.c_str());
|
||||
}
|
||||
//
|
||||
// 1. Manager will broadcast the tasks to all the processes
|
||||
//
|
||||
MPI_Bcast(&tasks_size, 1, MPI_INT, config_mpi.manager, MPI_COMM_WORLD);
|
||||
if (config_mpi.rank != config_mpi.manager) {
|
||||
msg = new char[tasks_size + 1];
|
||||
}
|
||||
MPI_Bcast(msg, tasks_size + 1, MPI_CHAR, config_mpi.manager, MPI_COMM_WORLD);
|
||||
tasks = json::parse(msg);
|
||||
delete[] msg;
|
||||
|
||||
|
||||
if (config_mpi.rank == config_mpi.manager) {
|
||||
//
|
||||
// 2a. Producer delivers the tasks to the consumers
|
||||
//
|
||||
auto datasets_names = filterDatasets(datasets);
|
||||
json all_results = producer(datasets_names, tasks, config_mpi, MPI_Result);
|
||||
std::cout << separator << std::endl;
|
||||
//
|
||||
// 3. Manager compiles the results for each dataset
|
||||
//
|
||||
auto results = initializeResults();
|
||||
compile_results(results, all_results, config.model);
|
||||
//
|
||||
// 3.2 Save the results
|
||||
//
|
||||
save(results);
|
||||
//
|
||||
// 3.3 Summary of jobs done
|
||||
//
|
||||
if (!config.quiet)
|
||||
summary(all_results, tasks, config_mpi);
|
||||
} else {
|
||||
//
|
||||
// 2b. Consumers process the tasks and send the results to the producer
|
||||
//
|
||||
consumer(datasets, tasks, config, config_mpi, MPI_Result);
|
||||
}
|
||||
}
|
||||
json GridBase::producer(std::vector<std::string>& names, json& tasks, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result)
|
||||
{
|
||||
Task_Result result;
|
||||
json results;
|
||||
int num_tasks = tasks.size();
|
||||
//
|
||||
// 2a.1 Producer will loop to send all the tasks to the consumers and receive the results
|
||||
//
|
||||
for (int i = 0; i < num_tasks; ++i) {
|
||||
MPI_Status status;
|
||||
MPI_Recv(&result, 1, MPI_Result, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
|
||||
if (status.MPI_TAG == TAG_RESULT) {
|
||||
//Store result
|
||||
store_result(names, result, results);
|
||||
|
||||
}
|
||||
MPI_Send(&i, 1, MPI_INT, status.MPI_SOURCE, TAG_TASK, MPI_COMM_WORLD);
|
||||
}
|
||||
//
|
||||
// 2a.2 Producer will send the end message to all the consumers
|
||||
//
|
||||
for (int i = 0; i < config_mpi.n_procs - 1; ++i) {
|
||||
MPI_Status status;
|
||||
MPI_Recv(&result, 1, MPI_Result, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
|
||||
if (status.MPI_TAG == TAG_RESULT) {
|
||||
//Store result
|
||||
store_result(names, result, results);
|
||||
}
|
||||
MPI_Send(&i, 1, MPI_INT, status.MPI_SOURCE, TAG_END, MPI_COMM_WORLD);
|
||||
}
|
||||
return results;
|
||||
}
|
||||
void GridBase::consumer(Datasets& datasets, json& tasks, struct ConfigGrid& config, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result)
|
||||
{
|
||||
Task_Result result;
|
||||
//
|
||||
// 2b.1 Consumers announce to the producer that they are ready to receive a task
|
||||
//
|
||||
MPI_Send(&result, 1, MPI_Result, config_mpi.manager, TAG_QUERY, MPI_COMM_WORLD);
|
||||
int task;
|
||||
while (true) {
|
||||
MPI_Status status;
|
||||
//
|
||||
// 2b.2 Consumers receive the task from the producer and process it
|
||||
//
|
||||
MPI_Recv(&task, 1, MPI_INT, config_mpi.manager, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
|
||||
if (status.MPI_TAG == TAG_END) {
|
||||
break;
|
||||
}
|
||||
consumer_go(config, config_mpi, tasks, task, datasets, &result);
|
||||
//
|
||||
// 2b.3 Consumers send the result to the producer
|
||||
//
|
||||
MPI_Send(&result, 1, MPI_Result, config_mpi.manager, TAG_RESULT, MPI_COMM_WORLD);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
39
src/grid/GridBase.h
Normal file
@@ -0,0 +1,39 @@
|
||||
#ifndef GRIDBASE_H
|
||||
#define GRIDBASE_H
|
||||
#include <string>
|
||||
#include <mpi.h>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include "common/Datasets.h"
|
||||
#include "common/Timer.hpp"
|
||||
#include "main/HyperParameters.h"
|
||||
#include "GridConfig.h"
|
||||
|
||||
|
||||
namespace platform {
|
||||
using json = nlohmann::ordered_json;
|
||||
class GridBase {
|
||||
public:
|
||||
explicit GridBase(struct ConfigGrid& config);
|
||||
~GridBase() = default;
|
||||
void go(struct ConfigMPI& config_mpi);
|
||||
void validate_config();
|
||||
protected:
|
||||
json build_tasks(Datasets& datasets);
|
||||
virtual void save(json& results) = 0;
|
||||
virtual std::vector<std::string> filterDatasets(Datasets& datasets) const = 0;
|
||||
virtual json initializeResults() = 0;
|
||||
virtual void compile_results(json& results, json& all_results, std::string& model) = 0;
|
||||
virtual json store_result(std::vector<std::string>& names, Task_Result& result, json& results) = 0;
|
||||
virtual void consumer_go(struct ConfigGrid& config, struct ConfigMPI& config_mpi, json& tasks, int n_task, Datasets& datasets, Task_Result* result) = 0;
|
||||
void shuffle_and_progress_bar(json& tasks);
|
||||
json producer(std::vector<std::string>& names, json& tasks, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result);
|
||||
void consumer(Datasets& datasets, json& tasks, struct ConfigGrid& config, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result);
|
||||
std::string get_color_rank(int rank);
|
||||
void summary(json& all_results, json& tasks, struct ConfigMPI& config_mpi);
|
||||
struct ConfigGrid config;
|
||||
Timer timer; // used to measure the time of the whole process
|
||||
const std::string separator = "|";
|
||||
bayesnet::Smoothing_t smooth_type{ bayesnet::Smoothing_t::NONE };
|
||||
};
|
||||
} /* namespace platform */
|
||||
#endif
|
55
src/grid/GridConfig.h
Normal file
@@ -0,0 +1,55 @@
|
||||
#ifndef GRIDCONFIG_H
|
||||
#define GRIDCONFIG_H
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <mpi.h>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include "common/Datasets.h"
|
||||
#include "common/Timer.hpp"
|
||||
#include "main/HyperParameters.h"
|
||||
#include "GridData.h"
|
||||
#include "GridConfig.h"
|
||||
#include "bayesnet/network/Network.h"
|
||||
|
||||
|
||||
namespace platform {
|
||||
using json = nlohmann::ordered_json;
|
||||
struct ConfigGrid {
|
||||
std::string model;
|
||||
std::string score;
|
||||
std::string continue_from;
|
||||
std::string platform;
|
||||
std::string smooth_strategy;
|
||||
bool quiet;
|
||||
bool only; // used with continue_from to only compute that dataset
|
||||
bool discretize;
|
||||
bool stratified;
|
||||
int nested;
|
||||
int n_folds;
|
||||
json excluded;
|
||||
std::vector<int> seeds;
|
||||
};
|
||||
struct ConfigMPI {
|
||||
int rank;
|
||||
int n_procs;
|
||||
int manager;
|
||||
};
|
||||
typedef struct {
|
||||
unsigned int idx_dataset;
|
||||
unsigned int idx_combination;
|
||||
int n_fold;
|
||||
double score; // Experiment: Score test, no score train in this case
|
||||
double time; // Experiment: Time test
|
||||
double time_train;
|
||||
double nodes; // Experiment specific
|
||||
double leaves; // Experiment specific
|
||||
double depth; // Experiment specific
|
||||
int process;
|
||||
int task;
|
||||
} Task_Result;
|
||||
const int TAG_QUERY = 1;
|
||||
const int TAG_RESULT = 2;
|
||||
const int TAG_TASK = 3;
|
||||
const int TAG_END = 4;
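// Message protocol sketch (as used by GridBase::producer/consumer): a consumer sends a
// Task_Result tagged TAG_QUERY (first contact) or TAG_RESULT; the manager answers every
// message with an int tagged TAG_TASK (the next task index) or TAG_END when no work is left.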
|
||||
} /* namespace platform */
|
||||
#endif
|
@@ -1,5 +1,5 @@
|
||||
#include "GridData.h"
|
||||
#include <fstream>
|
||||
#include "GridData.h"
|
||||
|
||||
namespace platform {
|
||||
GridData::GridData(const std::string& fileName)
|
@@ -6,7 +6,7 @@
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
namespace platform {
|
||||
using json = nlohmann::json;
|
||||
using json = nlohmann::ordered_json;
|
||||
const std::string ALL_DATASETS = "all";
|
||||
class GridData {
|
||||
public:
|
||||
@@ -23,4 +23,4 @@ namespace platform {
|
||||
std::map<std::string, json> grid;
|
||||
};
|
||||
} /* namespace platform */
|
||||
#endif /* GRIDDATA_H */
|
||||
#endif
|
196
src/grid/GridExperiment.cpp
Normal file
@@ -0,0 +1,196 @@
|
||||
#include <iostream>
|
||||
#include <cstddef>
|
||||
#include <torch/torch.h>
|
||||
#include <folding.hpp>
|
||||
#include "main/Models.h"
|
||||
#include "common/Paths.h"
|
||||
#include "common/Utils.h"
|
||||
#include "GridExperiment.h"
|
||||
|
||||
namespace platform {
|
||||
// GridExperiment::GridExperiment(argparse::ArgumentParser& program, struct ConfigGrid& config) : arguments(program), GridBase(config)
|
||||
GridExperiment::GridExperiment(ArgumentsExperiment& program, struct ConfigGrid& config) : arguments(program), GridBase(config)
|
||||
{
|
||||
experiment = arguments.initializedExperiment();
|
||||
filesToTest = arguments.getFilesToTest();
|
||||
saveResults = arguments.haveToSaveResults();
|
||||
this->config.model = experiment.getModel();
|
||||
this->config.score = experiment.getScore();
|
||||
this->config.discretize = experiment.isDiscretized();
|
||||
this->config.stratified = experiment.isStratified();
|
||||
this->config.smooth_strategy = experiment.getSmoothStrategy();
|
||||
this->config.n_folds = experiment.getNFolds();
|
||||
this->config.seeds = experiment.getRandomSeeds();
|
||||
this->config.quiet = experiment.isQuiet();
|
||||
}
|
||||
json GridExperiment::getResults()
|
||||
{
|
||||
return computed_results;
|
||||
}
|
||||
std::vector<std::string> GridExperiment::filterDatasets(Datasets& datasets) const
|
||||
{
|
||||
return filesToTest;
|
||||
}
|
||||
json GridExperiment::initializeResults()
|
||||
{
|
||||
json results;
|
||||
return results;
|
||||
}
|
||||
void GridExperiment::save(json& results)
|
||||
{
|
||||
}
|
||||
void GridExperiment::compile_results(json& results, json& all_results, std::string& model)
|
||||
{
|
||||
auto datasets = Datasets(false, Paths::datasets());
|
||||
nlohmann::json temp = all_results; // To restore the order of the data by dataset name
|
||||
all_results = temp;
|
||||
for (const auto& result_item : all_results.items()) {
|
||||
// each result has the results of all the outer folds, as if each one were a different task
|
||||
auto dataset_name = result_item.key();
|
||||
auto data = result_item.value();
|
||||
auto result = json::object();
|
||||
int data_size = data.size();
|
||||
auto score = torch::zeros({ data_size }, torch::kFloat64);
|
||||
auto score_train = torch::zeros({ data_size }, torch::kFloat64);
|
||||
auto time_test = torch::zeros({ data_size }, torch::kFloat64);
|
||||
auto time_train = torch::zeros({ data_size }, torch::kFloat64);
|
||||
auto nodes = torch::zeros({ data_size }, torch::kFloat64);
|
||||
auto leaves = torch::zeros({ data_size }, torch::kFloat64);
|
||||
auto depth = torch::zeros({ data_size }, torch::kFloat64);
|
||||
auto& dataset = datasets.getDataset(dataset_name);
|
||||
dataset.load();
|
||||
//
|
||||
// Prepare Result
|
||||
//
|
||||
auto partial_result = PartialResult();
|
||||
partial_result.setSamples(dataset.getNSamples()).setFeatures(dataset.getNFeatures()).setClasses(dataset.getNClasses());
|
||||
partial_result.setHyperparameters(experiment.getHyperParameters().get(dataset_name));
|
||||
for (int fold = 0; fold < data_size; ++fold) {
|
||||
partial_result.addScoreTest(data[fold]["score"]);
|
||||
partial_result.addScoreTrain(0.0);
|
||||
partial_result.addTimeTest(data[fold]["time"]);
|
||||
partial_result.addTimeTrain(data[fold]["time_train"]);
|
||||
score[fold] = data[fold]["score"].get<double>();
|
||||
time_test[fold] = data[fold]["time"].get<double>();
|
||||
time_train[fold] = data[fold]["time_train"].get<double>();
|
||||
nodes[fold] = data[fold]["nodes"].get<double>();
|
||||
leaves[fold] = data[fold]["leaves"].get<double>();
|
||||
depth[fold] = data[fold]["depth"].get<double>();
|
||||
}
|
||||
partial_result.setGraph(std::vector<std::string>());
|
||||
partial_result.setScoreTest(torch::mean(score).item<double>()).setScoreTrain(0.0);
|
||||
partial_result.setScoreTestStd(torch::std(score).item<double>()).setScoreTrainStd(0.0);
|
||||
partial_result.setTrainTime(torch::mean(time_train).item<double>()).setTestTime(torch::mean(time_test).item<double>());
|
||||
partial_result.setTrainTimeStd(torch::std(time_train).item<double>()).setTestTimeStd(torch::std(time_test).item<double>());
|
||||
partial_result.setNodes(torch::mean(nodes).item<double>()).setLeaves(torch::mean(leaves).item<double>()).setDepth(torch::mean(depth).item<double>());
|
||||
partial_result.setDataset(dataset_name).setNotes(std::vector<std::string>());
|
||||
partial_result.setConfusionMatrices(json::array());
|
||||
experiment.addResult(partial_result);
|
||||
}
|
||||
auto clf = Models::instance()->create(experiment.getModel());
|
||||
experiment.setModelVersion(clf->getVersion());
|
||||
computed_results = results;
|
||||
}
|
||||
json GridExperiment::store_result(std::vector<std::string>& names, Task_Result& result, json& results)
|
||||
{
|
||||
json json_result = {
|
||||
{ "score", result.score },
|
||||
{ "combination", result.idx_combination },
|
||||
{ "fold", result.n_fold },
|
||||
{ "time", result.time },
|
||||
{ "time_train", result.time_train },
|
||||
{ "dataset", result.idx_dataset },
|
||||
{ "nodes", result.nodes },
|
||||
{ "leaves", result.leaves },
|
||||
{ "depth", result.depth },
|
||||
{ "process", result.process },
|
||||
{ "task", result.task }
|
||||
};
|
||||
auto name = names[result.idx_dataset];
|
||||
if (!results.contains(name)) {
|
||||
results[name] = json::array();
|
||||
}
|
||||
results[name].push_back(json_result);
|
||||
return results;
|
||||
}
|
||||
void GridExperiment::consumer_go(struct ConfigGrid& config, struct ConfigMPI& config_mpi, json& tasks, int n_task, Datasets& datasets, Task_Result* result)
|
||||
{
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
Timer train_timer, test_timer;
|
||||
json task = tasks[n_task];
|
||||
auto model = config.model;
|
||||
auto dataset_name = task["dataset"].get<std::string>();
|
||||
auto idx_dataset = task["idx_dataset"].get<int>();
|
||||
auto seed = task["seed"].get<int>();
|
||||
auto n_fold = task["fold"].get<int>();
|
||||
bool stratified = config.stratified;
|
||||
bayesnet::Smoothing_t smooth = bayesnet::Smoothing_t::NONE; // default when the strategy string is not recognized
|
||||
if (config.smooth_strategy == "ORIGINAL")
|
||||
smooth = bayesnet::Smoothing_t::ORIGINAL;
|
||||
else if (config.smooth_strategy == "LAPLACE")
|
||||
smooth = bayesnet::Smoothing_t::LAPLACE;
|
||||
else if (config.smooth_strategy == "CESTNIK")
|
||||
smooth = bayesnet::Smoothing_t::CESTNIK;
|
||||
//
|
||||
// Load the dataset (tensors, features, class name)
|
||||
//
|
||||
auto& dataset = datasets.getDataset(dataset_name);
|
||||
dataset.load();
|
||||
auto [X, y] = dataset.getTensors();
|
||||
auto features = dataset.getFeatures();
|
||||
auto className = dataset.getClassName();
|
||||
//
|
||||
// Start working on task
|
||||
//
|
||||
folding::Fold* fold;
|
||||
if (stratified)
|
||||
fold = new folding::StratifiedKFold(config.n_folds, y, seed);
|
||||
else
|
||||
fold = new folding::KFold(config.n_folds, y.size(0), seed);
|
||||
train_timer.start();
|
||||
auto [train, test] = fold->getFold(n_fold);
|
||||
auto [X_train, X_test, y_train, y_test] = dataset.getTrainTestTensors(train, test);
|
||||
auto states = dataset.getStates(); // Get the states of the features Once they are discretized
|
||||
|
||||
//
|
||||
// Build Classifier with selected hyperparameters
|
||||
//
|
||||
auto clf = Models::instance()->create(config.model);
|
||||
auto valid = clf->getValidHyperparameters();
|
||||
auto hyperparameters = experiment.getHyperParameters();
|
||||
hyperparameters.check(valid, dataset_name);
|
||||
clf->setHyperparameters(hyperparameters.get(dataset_name));
|
||||
//
|
||||
// Train model
|
||||
//
|
||||
clf->fit(X_train, y_train, features, className, states, smooth);
|
||||
auto train_time = train_timer.getDuration();
|
||||
//
|
||||
// Test model
|
||||
//
|
||||
test_timer.start();
|
||||
double score = clf->score(X_test, y_test);
|
||||
delete fold;
|
||||
auto test_time = test_timer.getDuration();
|
||||
//
|
||||
// Return the result
|
||||
//
|
||||
result->idx_dataset = task["idx_dataset"].get<int>();
|
||||
result->idx_combination = 0;
|
||||
result->score = score;
|
||||
result->n_fold = n_fold;
|
||||
result->time = test_time;
|
||||
result->time_train = train_time;
|
||||
result->nodes = clf->getNumberOfNodes();
|
||||
result->leaves = clf->getNumberOfEdges();
|
||||
result->depth = clf->getNumberOfStates();
|
||||
result->process = config_mpi.rank;
|
||||
result->task = n_task;
|
||||
//
|
||||
// Update progress bar
|
||||
//
|
||||
std::cout << get_color_rank(config_mpi.rank) << std::flush;
|
||||
}
|
||||
} /* namespace platform */
|
38
src/grid/GridExperiment.h
Normal file
@@ -0,0 +1,38 @@
|
||||
#ifndef GRIDEXPERIMENT_H
|
||||
#define GRIDEXPERIMENT_H
|
||||
#include <string>
|
||||
#include <mpi.h>
|
||||
#include <argparse/argparse.hpp>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include "common/Datasets.h"
|
||||
#include "main/Experiment.h"
|
||||
#include "main/HyperParameters.h"
|
||||
#include "main/ArgumentsExperiment.h"
|
||||
#include "GridBase.h"
|
||||
|
||||
|
||||
namespace platform {
|
||||
using json = nlohmann::ordered_json;
|
||||
class GridExperiment : public GridBase {
|
||||
public:
|
||||
explicit GridExperiment(ArgumentsExperiment& program, struct ConfigGrid& config);
|
||||
~GridExperiment() = default;
|
||||
json getResults();
|
||||
Experiment& getExperiment() { return experiment; }
|
||||
size_t numFiles() const { return filesToTest.size(); }
|
||||
bool haveToSaveResults() const { return saveResults; }
|
||||
private:
|
||||
ArgumentsExperiment& arguments;
|
||||
Experiment experiment;
|
||||
json computed_results;
|
||||
bool saveResults = false;
|
||||
std::vector<std::string> filesToTest;
|
||||
void save(json& results);
|
||||
json initializeResults();
|
||||
std::vector<std::string> filterDatasets(Datasets& datasets) const;
|
||||
void compile_results(json& results, json& all_results, std::string& model);
|
||||
json store_result(std::vector<std::string>& names, Task_Result& result, json& results);
|
||||
void consumer_go(struct ConfigGrid& config, struct ConfigMPI& config_mpi, json& tasks, int n_task, Datasets& datasets, Task_Result* result);
|
||||
};
|
||||
} /* namespace platform */
|
||||
#endif
|
@@ -1,441 +0,0 @@
|
||||
#include <iostream>
|
||||
#include <cstddef>
|
||||
#include <torch/torch.h>
|
||||
#include "GridSearch.h"
|
||||
#include "Models.h"
|
||||
#include "Paths.h"
|
||||
#include "folding.hpp"
|
||||
#include "Colors.h"
|
||||
|
||||
namespace platform {
|
||||
std::string get_date()
|
||||
{
|
||||
time_t rawtime;
|
||||
tm* timeinfo;
|
||||
time(&rawtime);
|
||||
timeinfo = std::localtime(&rawtime);
|
||||
std::ostringstream oss;
|
||||
oss << std::put_time(timeinfo, "%Y-%m-%d");
|
||||
return oss.str();
|
||||
}
|
||||
std::string get_time()
|
||||
{
|
||||
time_t rawtime;
|
||||
tm* timeinfo;
|
||||
time(&rawtime);
|
||||
timeinfo = std::localtime(&rawtime);
|
||||
std::ostringstream oss;
|
||||
oss << std::put_time(timeinfo, "%H:%M:%S");
|
||||
return oss.str();
|
||||
}
|
||||
std::string get_color_rank(int rank)
|
||||
{
|
||||
auto colors = { Colors::WHITE(), Colors::RED(), Colors::GREEN(), Colors::BLUE(), Colors::MAGENTA(), Colors::CYAN() };
|
||||
return *(colors.begin() + rank % colors.size());
|
||||
}
|
||||
GridSearch::GridSearch(struct ConfigGrid& config) : config(config)
|
||||
{
|
||||
}
|
||||
json GridSearch::loadResults()
|
||||
{
|
||||
std::ifstream file(Paths::grid_output(config.model));
|
||||
if (file.is_open()) {
|
||||
return json::parse(file);
|
||||
}
|
||||
return json();
|
||||
}
|
||||
std::vector<std::string> GridSearch::filterDatasets(Datasets& datasets) const
|
||||
{
|
||||
// Load datasets
|
||||
auto datasets_names = datasets.getNames();
|
||||
if (config.continue_from != NO_CONTINUE()) {
|
||||
// Continue previous execution:
|
||||
if (std::find(datasets_names.begin(), datasets_names.end(), config.continue_from) == datasets_names.end()) {
|
||||
throw std::invalid_argument("Dataset " + config.continue_from + " not found");
|
||||
}
|
||||
// Remove datasets already processed
|
||||
std::vector<string>::iterator it = datasets_names.begin();
|
||||
while (it != datasets_names.end()) {
|
||||
if (*it != config.continue_from) {
|
||||
it = datasets_names.erase(it);
|
||||
} else {
|
||||
if (config.only)
|
||||
++it;
|
||||
else
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Exclude datasets
|
||||
for (const auto& name : config.excluded) {
|
||||
auto dataset = name.get<std::string>();
|
||||
auto it = std::find(datasets_names.begin(), datasets_names.end(), dataset);
|
||||
if (it == datasets_names.end()) {
|
||||
throw std::invalid_argument("Dataset " + dataset + " already excluded or doesn't exist!");
|
||||
}
|
||||
datasets_names.erase(it);
|
||||
}
|
||||
return datasets_names;
|
||||
}
|
||||
json GridSearch::build_tasks_mpi(int rank)
|
||||
{
|
||||
auto tasks = json::array();
|
||||
auto grid = GridData(Paths::grid_input(config.model));
|
||||
auto datasets = Datasets(false, Paths::datasets());
|
||||
auto all_datasets = datasets.getNames();
|
||||
auto datasets_names = filterDatasets(datasets);
|
||||
for (int idx_dataset = 0; idx_dataset < datasets_names.size(); ++idx_dataset) {
|
||||
auto dataset = datasets_names[idx_dataset];
|
||||
for (const auto& seed : config.seeds) {
|
||||
auto combinations = grid.getGrid(dataset);
|
||||
for (int n_fold = 0; n_fold < config.n_folds; n_fold++) {
|
||||
json task = {
|
||||
{ "dataset", dataset },
|
||||
{ "idx_dataset", idx_dataset},
|
||||
{ "seed", seed },
|
||||
{ "fold", n_fold},
|
||||
};
|
||||
tasks.push_back(task);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Shuffle the array so heavy datasets are spread across the workers
|
||||
std::mt19937 g{ 271 }; // Use fixed seed to obtain the same shuffle
|
||||
std::shuffle(tasks.begin(), tasks.end(), g);
|
||||
std::cout << get_color_rank(rank) << "* Number of tasks: " << tasks.size() << std::endl;
|
||||
std::cout << "|";
|
||||
for (int i = 0; i < tasks.size(); ++i) {
|
||||
std::cout << (i + 1) % 10;
|
||||
}
|
||||
std::cout << "|" << std::endl << "|" << std::flush;
|
||||
return tasks;
|
||||
}
|
||||
void process_task_mpi_consumer(struct ConfigGrid& config, struct ConfigMPI& config_mpi, json& tasks, int n_task, Datasets& datasets, Task_Result* result)
|
||||
{
|
||||
// initialize
|
||||
Timer timer;
|
||||
timer.start();
|
||||
json task = tasks[n_task];
|
||||
auto model = config.model;
|
||||
auto grid = GridData(Paths::grid_input(model));
|
||||
auto dataset = task["dataset"].get<std::string>();
|
||||
auto idx_dataset = task["idx_dataset"].get<int>();
|
||||
auto seed = task["seed"].get<int>();
|
||||
auto n_fold = task["fold"].get<int>();
|
||||
bool stratified = config.stratified;
|
||||
// Generate the hyperparameters combinations
|
||||
auto combinations = grid.getGrid(dataset);
|
||||
auto [X, y] = datasets.getTensors(dataset);
|
||||
auto states = datasets.getStates(dataset);
|
||||
auto features = datasets.getFeatures(dataset);
|
||||
auto className = datasets.getClassName(dataset);
|
||||
//
|
||||
// Start working on task
|
||||
//
|
||||
folding::Fold* fold;
|
||||
if (stratified)
|
||||
fold = new folding::StratifiedKFold(config.n_folds, y, seed);
|
||||
else
|
||||
fold = new folding::KFold(config.n_folds, y.size(0), seed);
|
||||
auto [train, test] = fold->getFold(n_fold);
|
||||
auto train_t = torch::tensor(train);
|
||||
auto test_t = torch::tensor(test);
|
||||
auto X_train = X.index({ "...", train_t });
|
||||
auto y_train = y.index({ train_t });
|
||||
auto X_test = X.index({ "...", test_t });
|
||||
auto y_test = y.index({ test_t });
|
||||
double best_fold_score = 0.0;
|
||||
int best_idx_combination = -1;
|
||||
json best_fold_hyper;
|
||||
for (int idx_combination = 0; idx_combination < combinations.size(); ++idx_combination) {
|
||||
auto hyperparam_line = combinations[idx_combination];
|
||||
auto hyperparameters = platform::HyperParameters(datasets.getNames(), hyperparam_line);
|
||||
folding::Fold* nested_fold;
|
||||
if (config.stratified)
|
||||
nested_fold = new folding::StratifiedKFold(config.nested, y_train, seed);
|
||||
else
|
||||
nested_fold = new folding::KFold(config.nested, y_train.size(0), seed);
|
||||
double score = 0.0;
|
||||
for (int n_nested_fold = 0; n_nested_fold < config.nested; n_nested_fold++) {
|
||||
// Nested level fold
|
||||
auto [train_nested, test_nested] = nested_fold->getFold(n_nested_fold);
|
||||
auto train_nested_t = torch::tensor(train_nested);
|
||||
auto test_nested_t = torch::tensor(test_nested);
|
||||
auto X_nested_train = X_train.index({ "...", train_nested_t });
|
||||
auto y_nested_train = y_train.index({ train_nested_t });
|
||||
auto X_nested_test = X_train.index({ "...", test_nested_t });
|
||||
auto y_nested_test = y_train.index({ test_nested_t });
|
||||
// Build Classifier with selected hyperparameters
|
||||
auto clf = Models::instance()->create(config.model);
|
||||
auto valid = clf->getValidHyperparameters();
|
||||
hyperparameters.check(valid, dataset);
|
||||
clf->setHyperparameters(hyperparameters.get(dataset));
|
||||
// Train model
|
||||
clf->fit(X_nested_train, y_nested_train, features, className, states);
|
||||
// Test model
|
||||
score += clf->score(X_nested_test, y_nested_test);
|
||||
}
|
||||
delete nested_fold;
|
||||
score /= config.nested;
|
||||
if (score > best_fold_score) {
|
||||
best_fold_score = score;
|
||||
best_idx_combination = idx_combination;
|
||||
best_fold_hyper = hyperparam_line;
|
||||
}
|
||||
}
|
||||
delete fold;
|
||||
// Build Classifier with the best hyperparameters to obtain the best score
|
||||
auto hyperparameters = platform::HyperParameters(datasets.getNames(), best_fold_hyper);
|
||||
auto clf = Models::instance()->create(config.model);
|
||||
auto valid = clf->getValidHyperparameters();
|
||||
hyperparameters.check(valid, dataset);
|
||||
clf->setHyperparameters(best_fold_hyper);
|
||||
clf->fit(X_train, y_train, features, className, states);
|
||||
best_fold_score = clf->score(X_test, y_test);
|
||||
// Return the result
|
||||
result->idx_dataset = task["idx_dataset"].get<int>();
|
||||
result->idx_combination = best_idx_combination;
|
||||
result->score = best_fold_score;
|
||||
result->n_fold = n_fold;
|
||||
result->time = timer.getDuration();
|
||||
// Update progress bar
|
||||
std::cout << get_color_rank(config_mpi.rank) << "*" << std::flush;
|
||||
}
|
||||
json store_result(std::vector<std::string>& names, Task_Result& result, json& results)
|
||||
{
|
||||
json json_result = {
|
||||
{ "score", result.score },
|
||||
{ "combination", result.idx_combination },
|
||||
{ "fold", result.n_fold },
|
||||
{ "time", result.time },
|
||||
{ "dataset", result.idx_dataset }
|
||||
};
|
||||
auto name = names[result.idx_dataset];
|
||||
if (!results.contains(name)) {
|
||||
results[name] = json::array();
|
||||
}
|
||||
results[name].push_back(json_result);
|
||||
return results;
|
||||
}
|
||||
json producer(std::vector<std::string>& names, json& tasks, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result)
|
||||
{
|
||||
Task_Result result;
|
||||
json results;
|
||||
int num_tasks = tasks.size();
|
||||
|
||||
//
|
||||
// 2a.1 Producer will loop to send all the tasks to the consumers and receive the results
|
||||
//
|
||||
for (int i = 0; i < num_tasks; ++i) {
|
||||
MPI_Status status;
|
||||
MPI_Recv(&result, 1, MPI_Result, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
|
||||
if (status.MPI_TAG == TAG_RESULT) {
|
||||
//Store result
|
||||
store_result(names, result, results);
|
||||
}
|
||||
MPI_Send(&i, 1, MPI_INT, status.MPI_SOURCE, TAG_TASK, MPI_COMM_WORLD);
|
||||
}
|
||||
//
|
||||
// 2a.2 Producer will send the end message to all the consumers
|
||||
//
|
||||
for (int i = 0; i < config_mpi.n_procs - 1; ++i) {
|
||||
MPI_Status status;
|
||||
MPI_Recv(&result, 1, MPI_Result, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
|
||||
if (status.MPI_TAG == TAG_RESULT) {
|
||||
//Store result
|
||||
store_result(names, result, results);
|
||||
}
|
||||
MPI_Send(&i, 1, MPI_INT, status.MPI_SOURCE, TAG_END, MPI_COMM_WORLD);
|
||||
}
|
||||
return results;
|
||||
}
|
||||
void select_best_results_folds(json& results, json& all_results, std::string& model)
|
||||
{
|
||||
Timer timer;
|
||||
auto grid = GridData(Paths::grid_input(model));
|
||||
//
|
||||
// Select the best result of the computed outer folds
|
||||
//
|
||||
for (const auto& result : all_results.items()) {
|
||||
// each result has the results of all the outer folds as each one were a different task
|
||||
double best_score = 0.0;
|
||||
json best;
|
||||
for (const auto& result_fold : result.value()) {
|
||||
double score = result_fold["score"].get<double>();
|
||||
if (score > best_score) {
|
||||
best_score = score;
|
||||
best = result_fold;
|
||||
}
|
||||
}
|
||||
auto dataset = result.key();
|
||||
auto combinations = grid.getGrid(dataset);
|
||||
json json_best = {
|
||||
{ "score", best_score },
|
||||
{ "hyperparameters", combinations[best["combination"].get<int>()] },
|
||||
{ "date", get_date() + " " + get_time() },
|
||||
{ "grid", grid.getInputGrid(dataset) },
|
||||
{ "duration", timer.translate2String(best["time"].get<double>()) }
|
||||
};
|
||||
results[dataset] = json_best;
|
||||
}
|
||||
}
|
||||
void consumer(Datasets& datasets, json& tasks, struct ConfigGrid& config, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result)
|
||||
{
|
||||
Task_Result result;
|
||||
//
|
||||
// 2b.1 Consumers announce to the producer that they are ready to receive a task
|
||||
//
|
||||
MPI_Send(&result, 1, MPI_Result, config_mpi.manager, TAG_QUERY, MPI_COMM_WORLD);
|
||||
int task;
|
||||
while (true) {
|
||||
MPI_Status status;
|
||||
//
|
||||
// 2b.2 Consumers receive the task from the producer and process it
|
||||
//
|
||||
MPI_Recv(&task, 1, MPI_INT, config_mpi.manager, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
|
||||
if (status.MPI_TAG == TAG_END) {
|
||||
break;
|
||||
}
|
||||
process_task_mpi_consumer(config, config_mpi, tasks, task, datasets, &result);
|
||||
//
|
||||
// 2b.3 Consumers send the result to the producer
|
||||
//
|
||||
MPI_Send(&result, 1, MPI_Result, config_mpi.manager, TAG_RESULT, MPI_COMM_WORLD);
|
||||
}
|
||||
}
|
||||
void GridSearch::go(struct ConfigMPI& config_mpi)
|
||||
{
|
||||
/*
|
||||
* Each task is a json object with the following structure:
|
||||
* {
|
||||
* "dataset": "dataset_name",
|
||||
* "idx_dataset": idx_dataset, // used to identify the dataset in the results
|
||||
* // this index is relative to the datasets used in the actual run, not to the whole datasets list
|
||||
* "seed": # of seed to use,
|
||||
* "Fold": # of fold to process
|
||||
* }
|
||||
*
|
||||
* The overall process consists in these steps:
|
||||
* 0. Create the MPI result type & tasks
|
||||
* 0.1 Create the MPI result type
|
||||
* 0.2 Manager creates the tasks
|
||||
* 1. Manager will broadcast the tasks to all the processes
|
||||
* 1.1 Broadcast the number of tasks
|
||||
* 1.2 Broadcast the length of the following string
|
||||
* 1.3 Broadcast the tasks as a char* string
|
||||
* 2a. Producer delivers the tasks to the consumers
|
||||
* 2a.1 Producer will loop to send all the tasks to the consumers and receive the results
|
||||
* 2a.2 Producer will send the end message to all the consumers
|
||||
* 2b. Consumers process the tasks and send the results to the producer
|
||||
* 2b.1 Consumers announce to the producer that they are ready to receive a task
|
||||
* 2b.2 Consumers receive the task from the producer and process it
|
||||
* 2b.3 Consumers send the result to the producer
|
||||
* 3. Manager selects the best scores for each dataset
|
||||
* 3.1 Loop thru all the results obtained from each outer fold (task) and select the best
|
||||
* 3.2 Save the results
|
||||
*/
|
||||
    //
    // 0.1 Create the MPI result type
    //
    Task_Result result;
    int tasks_size;
    MPI_Datatype MPI_Result;
    MPI_Datatype type[5] = { MPI_UNSIGNED, MPI_UNSIGNED, MPI_INT, MPI_DOUBLE, MPI_DOUBLE };
    int blocklen[5] = { 1, 1, 1, 1, 1 };
    MPI_Aint disp[5];
    disp[0] = offsetof(Task_Result, idx_dataset);
    disp[1] = offsetof(Task_Result, idx_combination);
    disp[2] = offsetof(Task_Result, n_fold);
    disp[3] = offsetof(Task_Result, score);
    disp[4] = offsetof(Task_Result, time);
    MPI_Type_create_struct(5, blocklen, disp, type, &MPI_Result);
    MPI_Type_commit(&MPI_Result);
    //
    // 0.2 Manager creates the tasks
    //
    char* msg;
    json tasks;
    if (config_mpi.rank == config_mpi.manager) {
        timer.start();
        tasks = build_tasks_mpi(config_mpi.rank);
        auto tasks_str = tasks.dump();
        tasks_size = tasks_str.size();
        msg = new char[tasks_size + 1];
        strcpy(msg, tasks_str.c_str());
    }
    //
    // 1. Manager will broadcast the tasks to all the processes
    //
    MPI_Bcast(&tasks_size, 1, MPI_INT, config_mpi.manager, MPI_COMM_WORLD);
    if (config_mpi.rank != config_mpi.manager) {
        msg = new char[tasks_size + 1];
    }
    MPI_Bcast(msg, tasks_size + 1, MPI_CHAR, config_mpi.manager, MPI_COMM_WORLD);
    tasks = json::parse(msg);
    delete[] msg;
    auto datasets = Datasets(config.discretize, Paths::datasets());
    if (config_mpi.rank == config_mpi.manager) {
        //
        // 2a. Producer delivers the tasks to the consumers
        //
        auto datasets_names = filterDatasets(datasets);
        json all_results = producer(datasets_names, tasks, config_mpi, MPI_Result);
        std::cout << get_color_rank(config_mpi.rank) << "|" << std::endl;
        //
        // 3. Manager selects the best scores for each dataset
        //
        auto results = initializeResults();
        select_best_results_folds(results, all_results, config.model);
        //
        // 3.2 Save the results
        //
        save(results);
    } else {
        //
        // 2b. Consumers process the tasks and send the results to the producer
        //
        consumer(datasets, tasks, config, config_mpi, MPI_Result);
    }
}
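The producer/consumer handshake described in steps 2a/2b above is a standard MPI manager/worker pattern. A self-contained sketch of that pattern follows; the tag values and the plain int payload are simplifications for illustration (the real code exchanges a Task_Result via the derived MPI datatype and defines its own TAG_* constants):

    #include <mpi.h>

    // Hypothetical tag values for illustration only.
    constexpr int TAG_QUERY = 1, TAG_RESULT = 2, TAG_TASK = 3, TAG_END = 4;

    void manager_loop(int n_tasks, int n_workers)
    {
        int sent = 0, finished = 0;
        while (finished < n_workers) {
            MPI_Status status;
            int payload;
            // A worker either announces readiness (TAG_QUERY) or returns a result (TAG_RESULT)
            MPI_Recv(&payload, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
            if (sent < n_tasks) {
                MPI_Send(&sent, 1, MPI_INT, status.MPI_SOURCE, TAG_TASK, MPI_COMM_WORLD);
                ++sent;
            } else {
                MPI_Send(&sent, 1, MPI_INT, status.MPI_SOURCE, TAG_END, MPI_COMM_WORLD);
                ++finished;
            }
        }
    }

    void worker_loop(int manager)
    {
        int task = 0;
        MPI_Send(&task, 1, MPI_INT, manager, TAG_QUERY, MPI_COMM_WORLD);   // 2b.1 announce readiness
        while (true) {
            MPI_Status status;
            MPI_Recv(&task, 1, MPI_INT, manager, MPI_ANY_TAG, MPI_COMM_WORLD, &status);   // 2b.2 get a task
            if (status.MPI_TAG == TAG_END)
                break;
            // ... process the task ...
            MPI_Send(&task, 1, MPI_INT, manager, TAG_RESULT, MPI_COMM_WORLD);             // 2b.3 return the result
        }
    }

Every reply from the manager is either the next task or the end message, so sends and receives stay balanced and each worker gets exactly one TAG_END.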
json GridSearch::initializeResults()
{
    // Load previous results if continue is set
    json results;
    if (config.continue_from != NO_CONTINUE()) {
        if (!config.quiet)
            std::cout << "* Loading previous results" << std::endl;
        try {
            std::ifstream file(Paths::grid_output(config.model));
            if (file.is_open()) {
                results = json::parse(file);
                results = results["results"];
            }
        }
        catch (const std::exception& e) {
            std::cerr << "* There were no previous results" << std::endl;
            std::cerr << "* Initializing new results" << std::endl;
            results = json();
        }
    }
    return results;
}
void GridSearch::save(json& results)
{
    std::ofstream file(Paths::grid_output(config.model));
    json output = {
        { "model", config.model },
        { "score", config.score },
        { "discretize", config.discretize },
        { "stratified", config.stratified },
        { "n_folds", config.n_folds },
        { "seeds", config.seeds },
        { "date", get_date() + " " + get_time() },
        { "nested", config.nested },
        { "platform", config.platform },
        { "duration", timer.getDurationString(true) },
        { "results", results }
    };
    file << output.dump(4);
}
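The document that save() writes therefore has one top-level object whose keys are exactly those listed in the initializer above. A sketch with placeholder values (none of these come from a real run; "TAN", "accuracy" and "linux" are hypothetical):

    // Sketch of the saved grid output document; values are placeholders.
    json output_example = {
        { "model", "TAN" },
        { "score", "accuracy" },
        { "discretize", true },
        { "stratified", true },
        { "n_folds", 5 },
        { "seeds", { 271, 314 } },
        { "date", "2024-01-01 10:00:00" },
        { "nested", 5 },
        { "platform", "linux" },
        { "duration", "00:10:00" },
        { "results", json::object() }   // one entry per dataset, as built by the selection step
    };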
} /* namespace platform */
src/grid/GridSearch.cpp  (259 lines, Normal file)
@@ -0,0 +1,259 @@
#include <iostream>
#include <torch/torch.h>
#include <folding.hpp>
#include "main/Models.h"
#include "common/Paths.h"
#include "common/Utils.h"
#include "common/Colors.h"
#include "GridSearch.h"

namespace platform {
GridSearch::GridSearch(struct ConfigGrid& config) : GridBase(config)
{
}
json GridSearch::loadResults()
{
    std::ifstream file(Paths::grid_output(config.model));
    if (file.is_open()) {
        return json::parse(file);
    }
    return json();
}
std::vector<std::string> GridSearch::filterDatasets(Datasets& datasets) const
{
    // Load datasets
    auto datasets_names = datasets.getNames();
    if (config.continue_from != NO_CONTINUE()) {
        // Continue previous execution:
        if (std::find(datasets_names.begin(), datasets_names.end(), config.continue_from) == datasets_names.end()) {
            throw std::invalid_argument("Dataset " + config.continue_from + " not found");
        }
        // Remove datasets already processed
        std::vector<std::string>::iterator it = datasets_names.begin();
        while (it != datasets_names.end()) {
            if (*it != config.continue_from) {
                it = datasets_names.erase(it);
            } else {
                if (config.only)
                    ++it;
                else
                    break;
            }
        }
    }
    // Exclude datasets
    for (const auto& name : config.excluded) {
        auto dataset = name.get<std::string>();
        auto it = std::find(datasets_names.begin(), datasets_names.end(), dataset);
        if (it == datasets_names.end()) {
            throw std::invalid_argument("Dataset " + dataset + " already excluded or doesn't exist!");
        }
        datasets_names.erase(it);
    }
    return datasets_names;
}
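As a worked example with hypothetical dataset names: given datasets_names = { "adult", "balance", "car", "diabetes" } and config.continue_from = "car", the loop above erases "adult" and "balance"; if config.only is set it also erases "diabetes", leaving just { "car" }, otherwise it stops and keeps { "car", "diabetes" }. Any name listed in config.excluded is then removed, and asking to exclude a name that is not (or no longer) in the list throws std::invalid_argument.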
json GridSearch::initializeResults()
{
    // Load previous results if continue is set
    json results;
    if (config.continue_from != NO_CONTINUE()) {
        if (!config.quiet)
            std::cout << Colors::RESET() << "* Loading previous results" << std::endl;
        try {
            std::ifstream file(Paths::grid_output(config.model));
            if (file.is_open()) {
                results = json::parse(file);
                results = results["results"];
            }
        }
        catch (const std::exception& e) {
            std::cerr << "* There were no previous results" << std::endl;
            std::cerr << "* Initializing new results" << std::endl;
            results = json();
        }
    }
    return results;
}
void GridSearch::save(json& results)
{
    std::ofstream file(Paths::grid_output(config.model));
    json output = {
        { "model", config.model },
        { "score", config.score },
        { "discretize", config.discretize },
        { "stratified", config.stratified },
        { "n_folds", config.n_folds },
        { "seeds", config.seeds },
        { "date", get_date() + " " + get_time() },
        { "nested", config.nested },
        { "platform", config.platform },
        { "duration", timer.getDurationString(true) },
        { "results", results }
    };
    file << output.dump(4);
}
void GridSearch::compile_results(json& results, json& all_results, std::string& model)
{
    Timer timer;
    auto grid = GridData(Paths::grid_input(model));
    //
    // Select the best result of the computed outer folds
    //
    for (const auto& result : all_results.items()) {
        // each result holds the results of all the outer folds, as each fold was processed as a separate task
        double best_score = 0.0;
        json best;
        for (const auto& result_fold : result.value()) {
            double score = result_fold["score"].get<double>();
            if (score > best_score) {
                best_score = score;
                best = result_fold;
            }
        }
        auto dataset = result.key();
        auto combinations = grid.getGrid(dataset);
        json json_best = {
            { "score", best_score },
            { "hyperparameters", combinations[best["combination"].get<int>()] },
            { "date", get_date() + " " + get_time() },
            { "grid", grid.getInputGrid(dataset) },
            { "duration", timer.translate2String(best["time"].get<double>()) }
        };
        results[dataset] = json_best;
    }
}
json GridSearch::store_result(std::vector<std::string>& names, Task_Result& result, json& results)
{
    json json_result = {
        { "score", result.score },
        { "combination", result.idx_combination },
        { "fold", result.n_fold },
        { "time", result.time },
        { "dataset", result.idx_dataset },
        { "process", result.process },
        { "task", result.task }
    };
    auto name = names[result.idx_dataset];
    if (!results.contains(name)) {
        results[name] = json::array();
    }
    results[name].push_back(json_result);
    return results;
}
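Each call therefore appends one object like the following to the array kept under the dataset's name; these arrays are what compile_results later scans for the best outer-fold score. The values below are placeholders, not real results:

    // Illustrative only; field names come from json_result above.
    json task_entry_example = {
        { "score", 0.91 },
        { "combination", 3 },   // index into the hyperparameter grid
        { "fold", 2 },          // outer fold number
        { "time", 1.25 },       // duration as reported by the worker's timer
        { "dataset", 0 },       // index of the dataset in this run
        { "process", 1 },       // MPI rank that produced the result
        { "task", 7 }           // task index
    };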
void GridSearch::consumer_go(struct ConfigGrid& config, struct ConfigMPI& config_mpi, json& tasks, int n_task, Datasets& datasets, Task_Result* result)
{
    //
    // initialize
    //
    Timer timer;
    timer.start();
    json task = tasks[n_task];
    auto model = config.model;
    auto grid = GridData(Paths::grid_input(model));
    auto dataset_name = task["dataset"].get<std::string>();
    auto idx_dataset = task["idx_dataset"].get<int>();
    auto seed = task["seed"].get<int>();
    auto n_fold = task["fold"].get<int>();
    bool stratified = config.stratified;
    bayesnet::Smoothing_t smooth;
    if (config.smooth_strategy == "ORIGINAL")
        smooth = bayesnet::Smoothing_t::ORIGINAL;
    else if (config.smooth_strategy == "LAPLACE")
        smooth = bayesnet::Smoothing_t::LAPLACE;
    else if (config.smooth_strategy == "CESTNIK")
        smooth = bayesnet::Smoothing_t::CESTNIK;
    //
    // Generate the hyperparameters combinations
    //
    auto& dataset = datasets.getDataset(dataset_name);
    auto combinations = grid.getGrid(dataset_name);
    dataset.load();
    auto [X, y] = dataset.getTensors();
    auto features = dataset.getFeatures();
    auto className = dataset.getClassName();
    //
    // Start working on task
    //
    folding::Fold* fold;
    if (stratified)
        fold = new folding::StratifiedKFold(config.n_folds, y, seed);
    else
        fold = new folding::KFold(config.n_folds, y.size(0), seed);
    auto [train, test] = fold->getFold(n_fold);
    auto [X_train, X_test, y_train, y_test] = dataset.getTrainTestTensors(train, test);
    auto states = dataset.getStates(); // Get the states of the features once they are discretized
    float best_fold_score = 0.0;
    int best_idx_combination = -1;
    json best_fold_hyper;
    for (int idx_combination = 0; idx_combination < combinations.size(); ++idx_combination) {
        auto hyperparam_line = combinations[idx_combination];
        auto hyperparameters = platform::HyperParameters(datasets.getNames(), hyperparam_line);
        folding::Fold* nested_fold;
        if (config.stratified)
            nested_fold = new folding::StratifiedKFold(config.nested, y_train, seed);
        else
            nested_fold = new folding::KFold(config.nested, y_train.size(0), seed);
        double score = 0.0;
        for (int n_nested_fold = 0; n_nested_fold < config.nested; n_nested_fold++) {
            //
            // Nested level fold
            //
            auto [train_nested, test_nested] = nested_fold->getFold(n_nested_fold);
            auto train_nested_t = torch::tensor(train_nested);
            auto test_nested_t = torch::tensor(test_nested);
            auto X_nested_train = X_train.index({ "...", train_nested_t });
            auto y_nested_train = y_train.index({ train_nested_t });
            auto X_nested_test = X_train.index({ "...", test_nested_t });
            auto y_nested_test = y_train.index({ test_nested_t });
            //
            // Build Classifier with selected hyperparameters
            //
            auto clf = Models::instance()->create(config.model);
            auto valid = clf->getValidHyperparameters();
            hyperparameters.check(valid, dataset_name);
            clf->setHyperparameters(hyperparameters.get(dataset_name));
            //
            // Train model
            //
            clf->fit(X_nested_train, y_nested_train, features, className, states, smooth);
            //
            // Test model
            //
            score += clf->score(X_nested_test, y_nested_test);
        }
        delete nested_fold;
        score /= config.nested;
        if (score > best_fold_score) {
            best_fold_score = score;
            best_idx_combination = idx_combination;
            best_fold_hyper = hyperparam_line;
        }
    }
    delete fold;
    //
    // Build Classifier with the best hyperparameters to obtain the best score
    //
    auto hyperparameters = platform::HyperParameters(datasets.getNames(), best_fold_hyper);
    auto clf = Models::instance()->create(config.model);
    auto valid = clf->getValidHyperparameters();
    hyperparameters.check(valid, dataset_name);
    clf->setHyperparameters(best_fold_hyper);
    clf->fit(X_train, y_train, features, className, states, smooth);
    best_fold_score = clf->score(X_test, y_test);
    //
    // Return the result
    //
    result->idx_dataset = task["idx_dataset"].get<int>();
    result->idx_combination = best_idx_combination;
    result->score = best_fold_score;
    result->n_fold = n_fold;
    result->time = timer.getDuration();
    result->process = config_mpi.rank;
    result->task = n_task;
    //
    // Update progress bar
    //
    std::cout << get_color_rank(config_mpi.rank) << std::flush;
}
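consumer_go() above handles one outer fold of a nested cross-validation: the inner folds pick the best hyperparameter combination, and the winner is then refitted on the whole outer training split and scored on the outer test split. A compact sketch of just the selection logic, stripped of the MPI and dataset plumbing (all names here are illustrative; evaluate stands in for "fit on the inner training split and score on the inner test split"):

    #include <functional>

    int select_best_combination(int n_combinations, int n_inner_folds,
                                const std::function<double(int, int)>& evaluate)
    {
        double best_score = 0.0;
        int best_combination = -1;
        for (int c = 0; c < n_combinations; ++c) {
            double score = 0.0;
            for (int f = 0; f < n_inner_folds; ++f)
                score += evaluate(c, f);   // train/test on inner fold f with combination c
            score /= n_inner_folds;        // average over the inner folds
            if (score > best_score) {
                best_score = score;
                best_combination = c;
            }
        }
        return best_combination;           // refit this combination on the whole outer training split
    }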
} /* namespace platform */
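Finally, a hedged sketch of how this class might be driven from an MPI-enabled main. Only GridSearch(config) and go(config_mpi), plus the rank and manager fields, appear in the code above; the remaining field names and the initialization shown here are assumptions for illustration:

    #include <mpi.h>
    // #include "GridSearch.h"   // project header, as included by the file above

    int main(int argc, char** argv)
    {
        MPI_Init(&argc, &argv);
        platform::ConfigMPI config_mpi;
        MPI_Comm_rank(MPI_COMM_WORLD, &config_mpi.rank);
        config_mpi.manager = 0;                   // rank 0 acts as producer/manager
        platform::ConfigGrid config;              // filled from command-line options in the real program
        config.model = "TAN";                     // hypothetical model name
        platform::GridSearch search(config);
        search.go(config_mpi);                    // every rank calls go(); roles split inside
        MPI_Finalize();
        return 0;
    }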