Compare commits

...

87 Commits

Author SHA1 Message Date
1a534888d6 Fix report format 2023-08-19 23:30:44 +02:00
59ffd179f4 Fix report format 2023-08-19 21:26:48 +02:00
9972738deb Add list datasets and add locale format 2023-08-19 19:05:16 +02:00
bafcb26bb6 Add manage to build target 2023-08-18 13:43:53 +02:00
2d7999d5f2 Add manage to release targets 2023-08-18 13:43:13 +02:00
a6bb22dfb5 Complete first BoostAODE 2023-08-18 11:50:34 +02:00
704dc937be Remove FeatureSel, add SelectKBest to BayesMetrics 2023-08-16 19:05:18 +02:00
a3e665eed6 make weights double 2023-08-16 12:46:09 +02:00
918a7b4180 Remove unneeded output 2023-08-16 12:36:38 +02:00
80b20f35b4 Fix weights mistakes in computation 2023-08-16 12:32:51 +02:00
4d4780c1d5 Add BoostAODE model based on AODE 2023-08-15 16:16:04 +02:00
fa612c531e Complete Adding weights to Models 2023-08-15 15:59:56 +02:00
24b68f9ae2 Add weights as parameter 2023-08-15 15:04:56 +02:00
a062ebf445 Merge pull request 'reports' (#4) from reports into boostAode
Reviewed-on: https://gitea.rmontanana.es:11000/rmontanana/BayesNet/pulls/4
2023-08-14 16:58:48 +00:00
2a3fc9aa45 Add colors and enhance input control 2023-08-14 17:03:06 +02:00
55d21294d5 Add class Paths and enhance input 2023-08-14 00:40:31 +02:00
3691cb4a61 Add totals and filter by scoreName and model 2023-08-13 18:13:00 +02:00
054567c65a Add sorting capacity 2023-08-13 17:10:18 +02:00
2729b92f06 Summary list 2023-08-13 16:19:17 +02:00
f26ea1f0ac Add weights to BayesMetrics 2023-08-13 12:56:06 +02:00
af0419c9da First approx with const 1 weights 2023-08-13 00:59:02 +02:00
90c92e5c56 Merge pull request 'Add states as result in Proposal methods' (#3) from optimize_memory into main
Reviewed-on: https://gitea.rmontanana.es:11000/rmontanana/BayesNet/pulls/3
2023-08-12 14:16:55 +00:00
182b52a887 Add states as result in Proposal methods 2023-08-12 16:16:17 +02:00
6679b90a82 Merge pull request 'optimize_memory' (#2) from optimize_memory into main
Reviewed-on: https://gitea.rmontanana.es:11000/rmontanana/BayesNet/pulls/2
2023-08-12 14:15:03 +00:00
405887f833 Solved Ld poor results 2023-08-12 11:49:18 +02:00
3a85481a5a Redo pass states to Network Fit needed in crossval
fix mistake in headerline (report)
2023-08-12 11:10:53 +02:00
0ad5505c16 SPODELd working with poor accuracy 2023-08-10 02:06:18 +02:00
323444b74a const functions 2023-08-08 01:53:41 +02:00
ef1bffcac3 Fixed normal classifiers 2023-08-07 13:50:11 +02:00
06db8f51ce Refactor library and models to lighten data stored
Refactor Ensemble to inherit from Classifier instead of BaseClassifier
2023-08-07 12:49:37 +02:00
e74565ba01 update clang-tidy 2023-08-07 00:44:12 +02:00
2da0fb5d8f Merge branch 'main' into TANNew 2023-08-06 11:40:10 +02:00
14ea51648a Complete AODELd 2023-08-06 11:31:44 +02:00
9e94f4e140 Rename suffix of proposal classifier to Ld 2023-08-05 23:23:31 +02:00
1d0fd629c9 Add SPODENew to models 2023-08-05 23:11:36 +02:00
506ef34c6f Add report output to main 2023-08-05 20:29:05 +02:00
7f45495837 Refactor New classifiers to extract predict 2023-08-05 18:39:48 +02:00
1a09ccca4c Add KDBNew, fix computeCPT error 2023-08-05 14:40:42 +02:00
a1c6ab18f3 TANNew restructured with poor results 2023-08-04 20:11:22 +02:00
64ac8fb4f2 TANNew as a TAN variant working 2023-08-04 19:42:18 +02:00
c568ba111d Add Proposal class 2023-08-04 13:05:12 +02:00
45c1d052ac Compile TANNew with poor accuracy 2023-08-04 01:35:45 +02:00
eb1cec58a3 Complete nxm 2023-08-03 20:22:33 +02:00
f520b40016 Almost complete proposal in TANNew 2023-08-02 02:21:55 +02:00
cdfb45d2cb Add topological order to Network 2023-08-02 00:56:52 +02:00
f63a9a64f9 Update Makefile to add Release & Debug build 2023-08-01 19:02:37 +02:00
285f0938a6 Update mdlp library 2023-08-01 17:33:01 +02:00
8f8f9773ce Make TANNew same as TAN with local discretization 2023-08-01 13:17:12 +02:00
a9ba21560d Add environment platform to experiment result 2023-08-01 10:55:53 +02:00
a18fbe5594 Begin implementation 2023-07-31 19:53:55 +02:00
adf650d257 Max threading 2023-07-31 18:49:18 +02:00
43bb017d5d Fix problem with tensors way 2023-07-30 19:00:02 +02:00
53697648e7 Merge branch 'aftermath' into main 2023-07-30 01:05:31 +02:00
4ebc9c2013 Complete fixing the linter warnings 2023-07-30 00:16:58 +02:00
b882569169 Fix some more lint warnings 2023-07-30 00:04:18 +02:00
8b2ed26ab7 Fix some lint warnings 2023-07-29 20:37:51 +02:00
5efa3beaee Fix some lint warnings 2023-07-29 20:20:38 +02:00
9a0449c12d Fix some lint warnings 2023-07-29 19:38:42 +02:00
7222119dfb Refactor experiment crossvalidation 2023-07-29 19:00:39 +02:00
cb54f61a69 Refactor Models to be a singleton factory
Add Registrar of models
2023-07-29 18:22:15 +02:00
07d572a98c Add Model factory 2023-07-29 17:27:43 +02:00
c4f3e6f19a Refactor crossvalidation to remove unneeded params 2023-07-29 16:49:06 +02:00
adc0ca238f Refactor cross_validation 2023-07-29 16:44:07 +02:00
b9e76becce Add show experiment 2023-07-29 16:31:36 +02:00
85cb447283 Add Dataset, Models and DotEnv 2023-07-29 16:21:38 +02:00
b03e84044a Fix some mistakes in timer and output format 2023-07-27 18:40:04 +02:00
7f7ddad36a Fix stratified folding mistake in remainders 2023-07-27 16:51:27 +02:00
3d8fea7a37 Complete Experiment 2023-07-27 15:49:58 +02:00
bc214a496c Adding Datasets management 2023-07-27 01:56:06 +02:00
3e954ba841 Complete json output compatible with benchmark 2023-07-26 19:01:39 +02:00
6f7fb290b0 Add json lib and json result generation 2023-07-26 17:49:03 +02:00
49a49a9dcd Fix Experiment 2023-07-26 14:11:49 +02:00
af7a1d2b40 Fix score with tensors and finish sample 2023-07-26 13:29:47 +02:00
4a54bd42a2 Fix some mistakes 2023-07-26 12:53:01 +02:00
099b4bea09 Fix some mistakes in tensors treatment 2023-07-26 01:39:01 +02:00
be06e475f0 Refactor tensor2vector 2023-07-24 13:22:53 +02:00
c10ebca0e0 Add Experiment, Result and Timer classes 2023-07-24 01:15:12 +02:00
0c226371cc Ensemble Experiment, Folding, Classifiers and Network together 2023-07-23 14:10:28 +02:00
644b6c9be0 Begin experiment 2023-07-23 01:47:57 +02:00
9981ad1811 Refactor Library renaming Base classes 2023-07-22 23:07:56 +02:00
41cceece20 Complete Stratified K Fold 2023-07-22 11:23:35 +02:00
f6e154bc6e Begin Stratified KFold 2023-07-21 21:49:02 +02:00
a2622a4fb6 Begin Folding 2023-07-21 16:07:50 +02:00
d8218f9713 refactor sample to use new argparse library 2023-07-21 02:12:47 +02:00
48bfa02e1d Add clang-tidy conf 2023-07-20 23:55:01 +02:00
f519003766 Remove unneeded files 2023-07-20 18:56:10 +02:00
8ddfd58a50 Fix some mistakes to correct tests 2023-07-20 18:55:56 +02:00
89 changed files with 3396 additions and 1287 deletions

16
.clang-tidy Normal file

@@ -0,0 +1,16 @@
---
Checks: '-*,
clang-*,
bugprone-*,
cppcoreguidelines-*,
modernize-*,
performance-*,
-cppcoreguidelines-pro-type-vararg,
-modernize-use-trailing-return-type,
-bugprone-exception-escape'
HeaderFilterRegex: 'src/*'
AnalyzeTemporaryDtors: false
WarningsAsErrors: ''
FormatStyle: file
...

12
.gitmodules vendored Normal file

@@ -0,0 +1,12 @@
[submodule "lib/mdlp"]
path = lib/mdlp
url = https://github.com/rmontanana/mdlp
[submodule "lib/catch2"]
path = lib/catch2
url = https://github.com/catchorg/Catch2.git
[submodule "lib/argparse"]
path = lib/argparse
url = https://github.com/p-ranav/argparse
[submodule "lib/json"]
path = lib/json
url = https://github.com/nlohmann/json.git

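With these submodules declared, a fresh clone needs git submodule update --init --recursive (or git clone --recurse-submodules) before configuring; the AddGitSubmodule.cmake helper added further down runs the same update automatically at configure time whenever a submodule directory is still empty.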
51
.vscode/launch.json vendored

@@ -4,22 +4,55 @@
         {
             "type": "lldb",
             "request": "launch",
-            "name": "bayesnet",
-            "program": "${workspaceFolder}/build/sample/main",
+            "name": "sample",
+            "program": "${workspaceFolder}/build/sample/BayesNetSample",
             "args": [
-                "-f",
-                "iris"
+                "-d",
+                "iris",
+                "-m",
+                "KDB",
+                "-s",
+                "271",
+                "-p",
+                "/Users/rmontanana/Code/discretizbench/datasets/",
             ],
-            "cwd": "${workspaceFolder}",
+            //"cwd": "${workspaceFolder}/build/sample/",
+            "preLaunchTask": "CMake: build"
         },
         {
             "type": "lldb",
             "request": "launch",
-            "name": "aout",
-            "program": "${workspaceFolder}/a.out",
+            "name": "experiment",
+            "program": "${workspaceFolder}/build/src/Platform/main",
+            "args": [
+                "-m",
+                "BoostAODE",
+                "-p",
+                "/Users/rmontanana/Code/discretizbench/datasets",
+                "--discretize",
+                "--stratified",
+                "-d",
+                "iris"
+            ],
+            "cwd": "/Users/rmontanana/Code/discretizbench",
+        },
+        {
+            "type": "lldb",
+            "request": "launch",
+            "name": "manage",
+            "program": "${workspaceFolder}/build/src/Platform/manage",
+            "args": [
+                "-n",
+                "20"
+            ],
+            "cwd": "/Users/rmontanana/Code/discretizbench",
+        },
+        {
+            "type": "lldb",
+            "request": "launch",
+            "name": "list",
+            "program": "${workspaceFolder}/build/src/Platform/list",
             "args": [],
-            "cwd": "${workspaceFolder}"
+            "cwd": "/Users/rmontanana/Code/discretizbench",
         },
         {
             "name": "Build & debug active file",

.vscode/settings.json vendored

@@ -97,7 +97,12 @@
         "future": "cpp",
         "queue": "cpp",
         "typeindex": "cpp",
-        "shared_mutex": "cpp"
+        "shared_mutex": "cpp",
+        "*.ipp": "cpp",
+        "cassert": "cpp",
+        "charconv": "cpp",
+        "source_location": "cpp",
+        "ranges": "cpp"
     },
     "cmake.configureOnOpen": false,
     "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools"

23
.vscode/tasks.json vendored

@@ -32,6 +32,29 @@
             ],
             "group": "build",
             "detail": "Task generated by Debugger."
+        },
+        {
+            "type": "cppbuild",
+            "label": "C/C++: g++ build active file",
+            "command": "/usr/bin/g++",
+            "args": [
+                "-fdiagnostics-color=always",
+                "-g",
+                "${file}",
+                "-o",
+                "${fileDirname}/${fileBasenameNoExtension}"
+            ],
+            "options": {
+                "cwd": "${fileDirname}"
+            },
+            "problemMatcher": [
+                "$gcc"
+            ],
+            "group": {
+                "kind": "build",
+                "isDefault": true
+            },
+            "detail": "Task generated by Debugger."
         }
     ]
 }

CMakeLists.txt

@@ -7,6 +7,10 @@ project(BayesNet
     LANGUAGES CXX
 )
 
+if (CODE_COVERAGE AND NOT ENABLE_TESTING)
+    MESSAGE(FATAL_ERROR "Code coverage requires testing enabled")
+endif (CODE_COVERAGE AND NOT ENABLE_TESTING)
+
 find_package(Torch REQUIRED)
 if (POLICY CMP0135)
@@ -24,21 +28,40 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
 # Options
 # -------
 option(ENABLE_CLANG_TIDY "Enable to add clang tidy." OFF)
-option(ENABLE_TESTING "Unit testing build" ON)
-option(CODE_COVERAGE "Collect coverage from test library" ON)
+option(ENABLE_TESTING "Unit testing build" OFF)
+option(CODE_COVERAGE "Collect coverage from test library" OFF)
+set(CMAKE_BUILD_TYPE "Debug")
 
 # CMakes modules
 # --------------
 set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH})
+include(AddGitSubmodule)
+if (CODE_COVERAGE)
+    enable_testing()
+    include(CodeCoverage)
+    MESSAGE("Code coverage enabled")
+    set(CMAKE_C_FLAGS " ${CMAKE_C_FLAGS} -fprofile-arcs -ftest-coverage")
+    set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage")
+    SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage")
+endif (CODE_COVERAGE)
+if (ENABLE_CLANG_TIDY)
+    include(StaticAnalyzers) # clang-tidy
+endif (ENABLE_CLANG_TIDY)
+
+# External libraries - dependencies of BayesNet
+# ---------------------------------------------
+# include(FetchContent)
+add_git_submodule("lib/mdlp")
+add_git_submodule("lib/argparse")
+add_git_submodule("lib/json")
 
 # Subdirectories
 # --------------
 add_subdirectory(config)
-add_subdirectory(${BayesNet_SOURCE_DIR}/src/BayesNet)
-add_subdirectory(${BayesNet_SOURCE_DIR}/src/Platform)
+add_subdirectory(lib/Files)
+add_subdirectory(src/BayesNet)
+add_subdirectory(src/Platform)
 add_subdirectory(sample)
 
 file(GLOB BayesNet_HEADERS CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/BayesNet/*.h ${BayesNet_SOURCE_DIR}/BayesNet/*.hpp)
@@ -47,18 +70,11 @@ file(GLOB Platform_SOURCES CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/Platform
 # Testing
 # -------
 if (ENABLE_TESTING)
     MESSAGE("Testing enabled")
-    enable_testing()
-    if (CODE_COVERAGE)
-        include(CodeCoverage)
-        MESSAGE("Code coverage enabled")
-        set(CMAKE_C_FLAGS " ${CMAKE_C_FLAGS} -fprofile-arcs -ftest-coverage")
-        set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage")
-        SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage")
-    endif (CODE_COVERAGE)
-    find_package(Catch2 3 REQUIRED)
+    add_git_submodule("lib/catch2")
     include(CTest)
-    include(Catch)
     add_subdirectory(tests)
 endif (ENABLE_TESTING)

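The net effect on the build: coverage now implies testing (enforced by the new FATAL_ERROR guard), clang-tidy stays opt-in behind ENABLE_CLANG_TIDY, and the mdlp, argparse and json dependencies are pulled in as git submodules at configure time. A coverage-enabled configure, matching the Makefile debug target below, is cmake -S . -B build -D CMAKE_BUILD_TYPE=Debug -D ENABLE_TESTING=ON -D CODE_COVERAGE=ON.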
Makefile

@@ -14,14 +14,28 @@ setup: ## Install dependencies for tests and coverage
 dependency: ## Create a dependency graph diagram of the project (build/dependency.png)
 	cd build && cmake .. --graphviz=dependency.dot && dot -Tpng dependency.dot -o dependency.png
 
-build: ## Build the project
-	@echo ">>> Building BayesNet ...";
+build: ## Build the main and BayesNetSample
+	cmake --build build -t main -t BayesNetSample -t manage -t list -j 32
+
+clean: ## Clean the debug info
+	@echo ">>> Cleaning Debug BayesNet ...";
+	find . -name "*.gcda" -print0 | xargs -0 rm
+	@echo ">>> Done";
+
+debug: ## Build a debug version of the project
+	@echo ">>> Building Debug BayesNet ...";
 	@if [ -d ./build ]; then rm -rf ./build; fi
 	@mkdir build;
-	cmake -S . -B build; \
-	cd build; \
-	make; \
+	cmake -S . -B build -D CMAKE_BUILD_TYPE=Debug -D ENABLE_TESTING=ON -D CODE_COVERAGE=ON; \
+	cmake --build build -j 32;
+	@echo ">>> Done";
+
+release: ## Build a Release version of the project
+	@echo ">>> Building Release BayesNet ...";
+	@if [ -d ./build ]; then rm -rf ./build; fi
+	@mkdir build;
+	cmake -S . -B build -D CMAKE_BUILD_TYPE=Release; \
+	cmake --build build -t main -t BayesNetSample -t manage -t list -j 32;
 	@echo ">>> Done";
 
 test: ## Run tests

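A plausible workflow with these targets: run make debug (or make release) once to configure and build from scratch, make build for incremental rebuilds of the four executables, and make clean to drop stale .gcda coverage files between test runs.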
12
TAN_iris.dot Normal file

@@ -0,0 +1,12 @@
digraph BayesNet {
label=<BayesNet >
fontsize=30
fontcolor=blue
labelloc=t
layout=circo
class [shape=circle, fontcolor=red, fillcolor=lightblue, style=filled ]
class -> sepallength class -> sepalwidth class -> petallength class -> petalwidth petallength [shape=circle]
petallength -> sepallength petalwidth [shape=circle]
sepallength [shape=circle]
sepallength -> sepalwidth sepalwidth [shape=circle]
sepalwidth -> petalwidth }

cmake/modules/AddGitSubmodule.cmake Normal file

@@ -0,0 +1,12 @@
function(add_git_submodule dir)
find_package(Git REQUIRED)
if(NOT EXISTS ${dir}/CMakeLists.txt)
message(STATUS "🚨 Adding git submodule => ${dir}")
execute_process(COMMAND ${GIT_EXECUTABLE}
submodule update --init --recursive -- ${dir}
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
endif()
add_subdirectory(${dir})
endfunction(add_git_submodule)

cmake/modules/StaticAnalyzers.cmake Normal file

@@ -0,0 +1,22 @@
if(ENABLE_CLANG_TIDY)
find_program(CLANG_TIDY_COMMAND NAMES clang-tidy)
if(NOT CLANG_TIDY_COMMAND)
message(WARNING "🔴 CMake_RUN_CLANG_TIDY is ON but clang-tidy is not found!")
set(CMAKE_CXX_CLANG_TIDY "" CACHE STRING "" FORCE)
else()
message(STATUS "🟢 CMake_RUN_CLANG_TIDY is ON")
set(CLANGTIDY_EXTRA_ARGS
"-extra-arg=-Wno-unknown-warning-option"
)
set(CMAKE_CXX_CLANG_TIDY "${CLANG_TIDY_COMMAND};-p=${CMAKE_BINARY_DIR};${CLANGTIDY_EXTRA_ARGS}" CACHE STRING "" FORCE)
add_custom_target(clang-tidy
COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} --target ${CMAKE_PROJECT_NAME}
COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} --target clang-tidy
COMMENT "Running clang-tidy..."
)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
endif()
endif(ENABLE_CLANG_TIDY)


@@ -0,0 +1 @@
null

(gcovr configuration file)

@@ -1,5 +1,4 @@
 filter = src/
-exclude = external/
-exclude = tests/
+exclude-directories = build/lib/
 print-summary = yes
 sort-percentage = yes

lib/Files/ArffFiles.cc

@@ -2,6 +2,7 @@
 #include <fstream>
 #include <sstream>
 #include <map>
+#include <iostream>
 
 using namespace std;
@@ -42,7 +43,7 @@ vector<int>& ArffFiles::getY()
     return y;
 }
 
-void ArffFiles::load(const string& fileName, bool classLast)
+void ArffFiles::loadCommon(string fileName)
 {
     ifstream file(fileName);
     if (!file.is_open()) {
@@ -74,24 +75,51 @@ void ArffFiles::load(const string& fileName, bool classLast)
     file.close();
     if (attributes.empty())
         throw invalid_argument("No attributes found");
+}
+
+void ArffFiles::load(const string& fileName, bool classLast)
+{
+    int labelIndex;
+    loadCommon(fileName);
     if (classLast) {
         className = get<0>(attributes.back());
         classType = get<1>(attributes.back());
         attributes.pop_back();
+        labelIndex = static_cast<int>(attributes.size());
     } else {
         className = get<0>(attributes.front());
         classType = get<1>(attributes.front());
         attributes.erase(attributes.begin());
+        labelIndex = 0;
     }
-    generateDataset(classLast);
+    generateDataset(labelIndex);
+}
+
+void ArffFiles::load(const string& fileName, const string& name)
+{
+    int labelIndex;
+    loadCommon(fileName);
+    bool found = false;
+    for (int i = 0; i < attributes.size(); ++i) {
+        if (attributes[i].first == name) {
+            className = get<0>(attributes[i]);
+            classType = get<1>(attributes[i]);
+            attributes.erase(attributes.begin() + i);
+            labelIndex = i;
+            found = true;
+            break;
+        }
+    }
+    if (!found) {
+        throw invalid_argument("Class name not found");
+    }
+    generateDataset(labelIndex);
 }
 
-void ArffFiles::generateDataset(bool classLast)
+void ArffFiles::generateDataset(int labelIndex)
 {
     X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
     auto yy = vector<string>(lines.size(), "");
-    int labelIndex = classLast ? static_cast<int>(attributes.size()) : 0;
+    auto removeLines = vector<int>(); // Lines with missing values
     for (size_t i = 0; i < lines.size(); i++) {
         stringstream ss(lines[i]);
         string value;
@@ -101,10 +129,20 @@ void ArffFiles::generateDataset(bool classLast)
             if (pos++ == labelIndex) {
                 yy[i] = value;
             } else {
+                if (value == "?") {
+                    X[xIndex++][i] = -1;
+                    removeLines.push_back(i);
+                } else
                 X[xIndex++][i] = stof(value);
             }
         }
     }
+    for (auto i : removeLines) {
+        yy.erase(yy.begin() + i);
+        for (auto& x : X) {
+            x.erase(x.begin() + i);
+        }
+    }
     y = factorize(yy);
 }

lib/Files/ArffFiles.h

@@ -14,12 +14,12 @@ private:
     string classType;
     vector<vector<float>> X;
     vector<int> y;
-    void generateDataset(bool);
+    void generateDataset(int);
+    void loadCommon(string);
 public:
     ArffFiles();
     void load(const string&, bool = true);
+    void load(const string&, const string&);
     vector<string> getLines() const;
     unsigned long int getSize() const;
     string getClassName() const;

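A minimal usage sketch of the two load overloads (hedged: the file path and the "class" attribute name are illustrative, not taken from this diff):

#include "ArffFiles.h"
#include <iostream>

int main()
{
    auto handler = ArffFiles();
    // default: the class attribute is the last column
    handler.load("data/iris.arff", true);
    // new overload: pick the class attribute by name, wherever it sits
    // handler.load("data/iris.arff", "class");
    std::cout << handler.getClassName() << ": " << handler.getSize() << " lines" << std::endl;
    return 0;
}

One caveat in generateDataset above: removeLines is traversed in ascending order while erasing, so each erase shifts the indices of rows still pending removal (and a line with several "?" values is pushed more than once); erasing unique indices in descending order would avoid that.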
1
lib/Files/CMakeLists.txt Normal file

@@ -0,0 +1 @@
add_library(ArffFiles ArffFiles.cc)

1
lib/argparse Submodule

Submodule lib/argparse added at b0930ab028

1
lib/catch2 Submodule

Submodule lib/catch2 added at 4acc51828f

1
lib/json Submodule

Submodule lib/json added at 5d2754306d

1
lib/mdlp Submodule

Submodule lib/mdlp added at 5708dc3de9

sample/CMakeLists.txt

@@ -1,4 +1,7 @@
 include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
 include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
-add_executable(sample sample.cc ${BayesNet_SOURCE_DIR}/src/Platform/ArffFiles.cc ${BayesNet_SOURCE_DIR}/src/Platform/CPPFImdlp.cpp ${BayesNet_SOURCE_DIR}/src/Platform/Metrics.cpp ${BayesNet_SOURCE_DIR}/src/Platform/typesFImdlp.h ${BayesNet_HEADERS})
-target_link_libraries(sample BayesNet "${TORCH_LIBRARIES}")
+include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
+include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
+include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include)
+add_executable(BayesNetSample sample.cc ${BayesNet_SOURCE_DIR}/src/Platform/Folding.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
+target_link_libraries(BayesNetSample BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}")

sample/sample.cc

@@ -1,96 +1,19 @@
 #include <iostream>
-#include <string>
 #include <torch/torch.h>
-#include <thread>
-#include <getopt.h>
+#include <string>
+#include <map>
+#include <argparse/argparse.hpp>
 #include "ArffFiles.h"
-#include "Network.h"
-#include "Metrics.hpp"
+#include "BayesMetrics.h"
 #include "CPPFImdlp.h"
-#include "KDB.h"
-#include "SPODE.h"
-#include "AODE.h"
-#include "TAN.h"
+#include "Folding.h"
+#include "Models.h"
+#include "modelRegister.h"
 
 using namespace std;
-const string PATH = "data/";
+const string PATH = "../../data/";
 
-/* print a description of all supported options */
-void usage(const char* path)
-{
-    /* take only the last portion of the path */
-    const char* basename = strrchr(path, '/');
-    basename = basename ? basename + 1 : path;
-    cout << "usage: " << basename << "[OPTION]" << endl;
-    cout << "    -h, --help\t\t Print this help and exit." << endl;
-    cout
-        << "    -f, --file[=FILENAME]\t {diabetes, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}."
-        << endl;
-    cout << "    -p, --path[=FILENAME]\t folder where the data files are located, default " << PATH << endl;
-    cout << "    -m, --model={AODE, KDB, SPODE, TAN}\t " << endl;
-}
-
-tuple<string, string, string> parse_arguments(int argc, char** argv)
-{
-    string file_name;
-    string model_name;
-    string path = PATH;
-    const vector<struct option> long_options = {
-        {"help", no_argument, nullptr, 'h'},
-        {"file", required_argument, nullptr, 'f'},
-        {"path", required_argument, nullptr, 'p'},
-        {"model", required_argument, nullptr, 'm'},
-        {nullptr, no_argument, nullptr, 0}
-    };
-    while (true) {
-        const auto c = getopt_long(argc, argv, "hf:p:m:", long_options.data(), nullptr);
-        if (c == -1)
-            break;
-        switch (c) {
-            case 'h':
-                usage(argv[0]);
-                exit(0);
-            case 'f':
-                file_name = string(optarg);
-                break;
-            case 'm':
-                model_name = string(optarg);
-                break;
-            case 'p':
-                path = optarg;
-                if (path.back() != '/')
-                    path += '/';
-                break;
-            case '?':
-                usage(argv[0]);
-                exit(1);
-            default:
-                abort();
-        }
-    }
-    if (file_name.empty()) {
-        usage(argv[0]);
-        exit(1);
-    }
-    return make_tuple(file_name, path, model_name);
-}
-
-inline constexpr auto hash_conv(const std::string_view sv)
-{
-    unsigned long hash{ 5381 };
-    for (unsigned char c : sv) {
-        hash = ((hash << 5) + hash) ^ c;
-    }
-    return hash;
-}
-
-inline constexpr auto operator"" _sh(const char* str, size_t len)
-{
-    return hash_conv(std::string_view{ str, len });
-}
 
 pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t>& X, mdlp::labels_t& y, vector<string> features)
 {
@@ -116,8 +39,23 @@ bool file_exists(const std::string& name)
         return false;
     }
 }
 
-tuple<string, string, string> get_options(int argc, char** argv)
+pair<vector<vector<int>>, vector<int>> extract_indices(vector<int> indices, vector<vector<int>> X, vector<int> y)
+{
+    vector<vector<int>> Xr; // nxm
+    vector<int> yr;
+    for (int col = 0; col < X.size(); ++col) {
+        Xr.push_back(vector<int>());
+    }
+    for (auto index : indices) {
+        for (int col = 0; col < X.size(); ++col) {
+            Xr[col].push_back(X[col][index]);
+        }
+        yr.push_back(y[index]);
+    }
+    return { Xr, yr };
+}
+
+int main(int argc, char** argv)
 {
     map<string, bool> datasets = {
         {"diabetes", true},
@@ -129,97 +67,185 @@ tuple<string, string, string> get_options(int argc, char** argv)
         {"liver-disorders", true},
         {"mfeat-factors", true},
     };
-    vector <string> models = { "AODE", "KDB", "SPODE", "TAN" };
-    string file_name;
-    string path;
-    string model_name;
-    tie(file_name, path, model_name) = parse_arguments(argc, argv);
-    if (datasets.find(file_name) == datasets.end()) {
-        cout << "Invalid file name: " << file_name << endl;
-        usage(argv[0]);
-        exit(1);
-    }
-    if (!file_exists(path + file_name + ".arff")) {
-        cout << "Data File " << path + file_name + ".arff" << " does not exist" << endl;
-        usage(argv[0]);
-        exit(1);
-    }
-    if (find(models.begin(), models.end(), model_name) == models.end()) {
-        cout << "Invalid model name: " << model_name << endl;
-        usage(argv[0]);
-        exit(1);
-    }
-    return { file_name, path, model_name };
-}
-
-int main(int argc, char** argv)
-{
-    string file_name, path, model_name;
-    tie(file_name, path, model_name) = get_options(argc, argv);
-    auto handler = ArffFiles();
-    handler.load(path + file_name + ".arff");
-    // Get Dataset X, y
-    vector<mdlp::samples_t>& X = handler.getX();
-    mdlp::labels_t& y = handler.getY();
-    // Get className & Features
-    auto className = handler.getClassName();
-    vector<string> features;
-    for (auto feature : handler.getAttributes()) {
-        features.push_back(feature.first);
-    }
-    // Discretize Dataset
-    vector<mdlp::labels_t> Xd;
-    map<string, int> maxes;
-    tie(Xd, maxes) = discretize(X, y, features);
-    maxes[className] = *max_element(y.begin(), y.end()) + 1;
-    map<string, vector<int>> states;
-    for (auto feature : features) {
-        states[feature] = vector<int>(maxes[feature]);
-    }
-    states[className] = vector<int>(maxes[className]);
-    double score;
-    vector<string> lines;
-    vector<string> graph;
-    auto kdb = bayesnet::KDB(2);
-    auto aode = bayesnet::AODE();
-    auto spode = bayesnet::SPODE(2);
-    auto tan = bayesnet::TAN();
-    switch (hash_conv(model_name)) {
-        case "AODE"_sh:
-            aode.fit(Xd, y, features, className, states);
-            lines = aode.show();
-            score = aode.score(Xd, y);
-            graph = aode.graph();
-            break;
-        case "KDB"_sh:
-            kdb.fit(Xd, y, features, className, states);
-            lines = kdb.show();
-            score = kdb.score(Xd, y);
-            graph = kdb.graph();
-            break;
-        case "SPODE"_sh:
-            spode.fit(Xd, y, features, className, states);
-            lines = spode.show();
-            score = spode.score(Xd, y);
-            graph = spode.graph();
-            break;
-        case "TAN"_sh:
-            tan.fit(Xd, y, features, className, states);
-            lines = tan.show();
-            score = tan.score(Xd, y);
-            graph = tan.graph();
-            break;
-    }
-    for (auto line : lines) {
-        cout << line << endl;
-    }
-    cout << "Score: " << score << endl;
-    auto dot_file = model_name + "_" + file_name;
-    ofstream file(dot_file + ".dot");
-    file << graph;
-    file.close();
-    cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << endl;
-    cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl;
-    return 0;
+    auto valid_datasets = vector<string>();
+    transform(datasets.begin(), datasets.end(), back_inserter(valid_datasets),
+        [](const pair<string, bool>& pair) { return pair.first; });
+    argparse::ArgumentParser program("BayesNetSample");
+    program.add_argument("-d", "--dataset")
+        .help("Dataset file name")
+        .action([valid_datasets](const std::string& value) {
+        if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) {
+            return value;
+        }
+        throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}");
+    }
+    );
+    program.add_argument("-p", "--path")
+        .help(" folder where the data files are located, default")
+        .default_value(string{ PATH }
+    );
+    program.add_argument("-m", "--model")
+        .help("Model to use " + platform::Models::instance()->toString())
+        .action([](const std::string& value) {
+        static const vector<string> choices = platform::Models::instance()->getNames();
+        if (find(choices.begin(), choices.end(), value) != choices.end()) {
+            return value;
+        }
+        throw runtime_error("Model must be one of " + platform::Models::instance()->toString());
+    }
+    );
+    program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true);
+    program.add_argument("--dumpcpt").help("Dump CPT Tables").default_value(false).implicit_value(true);
+    program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true);
+    program.add_argument("--tensors").help("Use tensors to store samples").default_value(false).implicit_value(true);
+    program.add_argument("-f", "--folds").help("Number of folds").default_value(5).scan<'i', int>().action([](const string& value) {
+        try {
+            auto k = stoi(value);
+            if (k < 2) {
+                throw runtime_error("Number of folds must be greater than 1");
+            }
+            return k;
+        }
+        catch (const runtime_error& err) {
+            throw runtime_error(err.what());
+        }
+        catch (...) {
+            throw runtime_error("Number of folds must be an integer");
+        }});
+    program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>();
+    bool class_last, stratified, tensors, dump_cpt;
+    string model_name, file_name, path, complete_file_name;
+    int nFolds, seed;
+    try {
+        program.parse_args(argc, argv);
+        file_name = program.get<string>("dataset");
+        path = program.get<string>("path");
+        model_name = program.get<string>("model");
+        complete_file_name = path + file_name + ".arff";
+        stratified = program.get<bool>("stratified");
+        tensors = program.get<bool>("tensors");
+        nFolds = program.get<int>("folds");
+        seed = program.get<int>("seed");
+        dump_cpt = program.get<bool>("dumpcpt");
+        class_last = datasets[file_name];
+        if (!file_exists(complete_file_name)) {
+            throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist");
+        }
+    }
+    catch (const exception& err) {
+        cerr << err.what() << endl;
+        cerr << program;
+        exit(1);
+    }
+    /*
+    * Begin Processing
+    */
+    auto ypred = torch::tensor({ 1,2,3,2,2,3,4,5,2,1 });
+    auto y = torch::tensor({ 0,0,0,0,2,3,4,0,0,0 });
+    auto weights = torch::ones({ 10 }, kDouble);
+    auto mask = ypred == y;
+    cout << "ypred:" << ypred << endl;
+    cout << "y:" << y << endl;
+    cout << "weights:" << weights << endl;
+    cout << "mask:" << mask << endl;
+    double value_to_add = 0.5;
+    weights += mask.to(torch::kDouble) * value_to_add;
+    cout << "New weights:" << weights << endl;
+    auto masked_weights = weights * mask.to(weights.dtype());
+    double sum_of_weights = masked_weights.sum().item<double>();
+    cout << "Sum of weights: " << sum_of_weights << endl;
+    //weights.index_put_({ mask }, weights + 10);
+    // auto handler = ArffFiles();
+    // handler.load(complete_file_name, class_last);
+    // // Get Dataset X, y
+    // vector<mdlp::samples_t>& X = handler.getX();
+    // mdlp::labels_t& y = handler.getY();
+    // // Get className & Features
+    // auto className = handler.getClassName();
+    // vector<string> features;
+    // auto attributes = handler.getAttributes();
+    // transform(attributes.begin(), attributes.end(), back_inserter(features),
+    //     [](const pair<string, string>& item) { return item.first; });
+    // // Discretize Dataset
+    // auto [Xd, maxes] = discretize(X, y, features);
+    // maxes[className] = *max_element(y.begin(), y.end()) + 1;
+    // map<string, vector<int>> states;
+    // for (auto feature : features) {
+    //     states[feature] = vector<int>(maxes[feature]);
+    // }
+    // states[className] = vector<int>(maxes[className]);
+    // auto clf = platform::Models::instance()->create(model_name);
+    // clf->fit(Xd, y, features, className, states);
+    // if (dump_cpt) {
+    //     cout << "--- CPT Tables ---" << endl;
+    //     clf->dump_cpt();
+    // }
+    // auto lines = clf->show();
+    // for (auto line : lines) {
+    //     cout << line << endl;
+    // }
+    // cout << "--- Topological Order ---" << endl;
+    // auto order = clf->topological_order();
+    // for (auto name : order) {
+    //     cout << name << ", ";
+    // }
+    // cout << "end." << endl;
+    // auto score = clf->score(Xd, y);
+    // cout << "Score: " << score << endl;
+    // auto graph = clf->graph();
+    // auto dot_file = model_name + "_" + file_name;
+    // ofstream file(dot_file + ".dot");
+    // file << graph;
+    // file.close();
+    // cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << endl;
+    // cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl;
+    // string stratified_string = stratified ? " Stratified" : "";
+    // cout << nFolds << " Folds" << stratified_string << " Cross validation" << endl;
+    // cout << "==========================================" << endl;
+    // torch::Tensor Xt = torch::zeros({ static_cast<int>(Xd.size()), static_cast<int>(Xd[0].size()) }, torch::kInt32);
+    // torch::Tensor yt = torch::tensor(y, torch::kInt32);
+    // for (int i = 0; i < features.size(); ++i) {
+    //     Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
+    // }
+    // float total_score = 0, total_score_train = 0, score_train, score_test;
+    // Fold* fold;
+    // if (stratified)
+    //     fold = new StratifiedKFold(nFolds, y, seed);
+    // else
+    //     fold = new KFold(nFolds, y.size(), seed);
+    // for (auto i = 0; i < nFolds; ++i) {
+    //     auto [train, test] = fold->getFold(i);
+    //     cout << "Fold: " << i + 1 << endl;
+    //     if (tensors) {
+    //         auto ttrain = torch::tensor(train, torch::kInt64);
+    //         auto ttest = torch::tensor(test, torch::kInt64);
+    //         torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain);
+    //         torch::Tensor ytraint = yt.index({ ttrain });
+    //         torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest);
+    //         torch::Tensor ytestt = yt.index({ ttest });
+    //         clf->fit(Xtraint, ytraint, features, className, states);
+    //         auto temp = clf->predict(Xtraint);
+    //         score_train = clf->score(Xtraint, ytraint);
+    //         score_test = clf->score(Xtestt, ytestt);
+    //     } else {
+    //         auto [Xtrain, ytrain] = extract_indices(train, Xd, y);
+    //         auto [Xtest, ytest] = extract_indices(test, Xd, y);
+    //         clf->fit(Xtrain, ytrain, features, className, states);
+    //         score_train = clf->score(Xtrain, ytrain);
+    //         score_test = clf->score(Xtest, ytest);
+    //     }
+    //     if (dump_cpt) {
+    //         cout << "--- CPT Tables ---" << endl;
+    //         clf->dump_cpt();
+    //     }
+    //     total_score_train += score_train;
+    //     total_score += score_test;
+    //     cout << "Score Train: " << score_train << endl;
+    //     cout << "Score Test : " << score_test << endl;
+    //     cout << "-------------------------------------------------------------------------------" << endl;
+    // }
+    // cout << "**********************************************************************************" << endl;
+    // cout << "Average Score Train: " << total_score_train / nFolds << endl;
+    // cout << "Average Score Test : " << total_score / nFolds << endl;return 0;
 }

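For reference, the launch.json entry earlier in this diff runs the rewritten sample as build/sample/BayesNetSample -d iris -m KDB -s 271 -p <datasets folder>. Everything after the argparse block is currently a scratch test of the weighted-mask tensor arithmetic used by BoostAODE, with the previous discretize/fit/cross-validate driver kept as commented-out code.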
src/BayesNet/AODE.cc

@@ -2,14 +2,16 @@
 namespace bayesnet {
     AODE::AODE() : Ensemble() {}
-    void AODE::train()
+    void AODE::buildModel(const torch::Tensor& weights)
     {
         models.clear();
         for (int i = 0; i < features.size(); ++i) {
             models.push_back(std::make_unique<SPODE>(i));
         }
+        n_models = models.size();
+        significanceModels = vector<double>(n_models, 1.0);
     }
-    vector<string> AODE::graph(string title)
+    vector<string> AODE::graph(const string& title) const
     {
         return Ensemble::graph(title);
     }

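buildModel now instantiates one SPODE per feature and gives every member a significance of 1.0, so an AODE prediction is an unweighted vote over the n SPODEs; BoostAODE (added later in this diff) reuses the same Ensemble machinery but learns those significances from the boosting weights.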
src/BayesNet/AODE.h

@@ -5,10 +5,11 @@
 namespace bayesnet {
     class AODE : public Ensemble {
     protected:
-        void train() override;
+        void buildModel(const torch::Tensor& weights) override;
     public:
         AODE();
-        vector<string> graph(string title = "AODE");
+        virtual ~AODE() {};
+        vector<string> graph(const string& title = "AODE") const override;
     };
 }
 #endif

40
src/BayesNet/AODELd.cc Normal file

@@ -0,0 +1,40 @@
#include "AODELd.h"
#include "Models.h"
namespace bayesnet {
using namespace std;
AODELd::AODELd() : Ensemble(), Proposal(dataset, features, className) {}
AODELd& AODELd::fit(torch::Tensor& X_, torch::Tensor& y_, vector<string>& features_, string className_, map<string, vector<int>>& states_)
{
// This first part should go in a Classifier method called fit_local_discretization or fit_float...
features = features_;
className = className_;
Xf = X_;
y = y_;
// Fills vectors Xv & yv with the data from tensors X_ (discretized) & y
states = fit_local_discretization(y);
// We have discretized the input data
// 1st we need to fit the model to build the normal TAN structure, TAN::fit initializes the base Bayesian network
Ensemble::fit(dataset, features, className, states);
return *this;
}
void AODELd::buildModel(const torch::Tensor& weights)
{
models.clear();
for (int i = 0; i < features.size(); ++i) {
models.push_back(std::make_unique<SPODELd>(i));
}
n_models = models.size();
}
void AODELd::trainModel(const torch::Tensor& weights)
{
for (const auto& model : models) {
model->fit(Xf, y, features, className, states);
}
}
vector<string> AODELd::graph(const string& name) const
{
return Ensemble::graph(name);
}
}

21
src/BayesNet/AODELd.h Normal file

@@ -0,0 +1,21 @@
#ifndef AODELD_H
#define AODELD_H
#include "Ensemble.h"
#include "Proposal.h"
#include "SPODELd.h"
namespace bayesnet {
using namespace std;
class AODELd : public Ensemble, public Proposal {
protected:
void trainModel(const torch::Tensor& weights) override;
void buildModel(const torch::Tensor& weights) override;
public:
AODELd();
AODELd& fit(torch::Tensor& X_, torch::Tensor& y_, vector<string>& features_, string className_, map<string, vector<int>>& states_) override;
virtual ~AODELd() = default;
vector<string> graph(const string& name = "AODE") const override;
static inline string version() { return "0.0.1"; };
};
}
#endif // !AODELD_H

src/BayesNet/BaseClassifier.cc

@@ -1,127 +0,0 @@
#include "BaseClassifier.h"
#include "bayesnetUtils.h"
namespace bayesnet {
using namespace std;
using namespace torch;
BaseClassifier::BaseClassifier(Network model) : model(model), m(0), n(0), metrics(Metrics()), fitted(false) {}
BaseClassifier& BaseClassifier::build(vector<string>& features, string className, map<string, vector<int>>& states)
{
dataset = torch::cat({ X, y.view({y.size(0), 1}) }, 1);
this->features = features;
this->className = className;
this->states = states;
checkFitParameters();
auto n_classes = states[className].size();
metrics = Metrics(dataset, features, className, n_classes);
train();
model.fit(Xv, yv, features, className);
fitted = true;
return *this;
}
BaseClassifier& BaseClassifier::fit(vector<vector<int>>& X, vector<int>& y, vector<string>& features, string className, map<string, vector<int>>& states)
{
this->X = torch::zeros({ static_cast<int64_t>(X[0].size()), static_cast<int64_t>(X.size()) }, kInt64);
Xv = X;
for (int i = 0; i < X.size(); ++i) {
this->X.index_put_({ "...", i }, torch::tensor(X[i], kInt64));
}
this->y = torch::tensor(y, kInt64);
yv = y;
return build(features, className, states);
}
void BaseClassifier::checkFitParameters()
{
auto sizes = X.sizes();
m = sizes[0];
n = sizes[1];
if (m != y.size(0)) {
throw invalid_argument("X and y must have the same number of samples");
}
if (n != features.size()) {
throw invalid_argument("X and features must have the same number of features");
}
if (states.find(className) == states.end()) {
throw invalid_argument("className not found in states");
}
for (auto feature : features) {
if (states.find(feature) == states.end()) {
throw invalid_argument("feature [" + feature + "] not found in states");
}
}
}
Tensor BaseClassifier::predict(Tensor& X)
{
if (!fitted) {
throw logic_error("Classifier has not been fitted");
}
auto m_ = X.size(0);
auto n_ = X.size(1);
vector<vector<int>> Xd(n_, vector<int>(m_, 0));
for (auto i = 0; i < n_; i++) {
auto temp = X.index({ "...", i });
Xd[i] = vector<int>(temp.data_ptr<int>(), temp.data_ptr<int>() + m_);
}
auto yp = model.predict(Xd);
auto ypred = torch::tensor(yp, torch::kInt64);
return ypred;
}
vector<int> BaseClassifier::predict(vector<vector<int>>& X)
{
if (!fitted) {
throw logic_error("Classifier has not been fitted");
}
auto m_ = X[0].size();
auto n_ = X.size();
vector<vector<int>> Xd(n_, vector<int>(m_, 0));
for (auto i = 0; i < n_; i++) {
Xd[i] = vector<int>(X[i].begin(), X[i].end());
}
auto yp = model.predict(Xd);
return yp;
}
float BaseClassifier::score(Tensor& X, Tensor& y)
{
if (!fitted) {
throw logic_error("Classifier has not been fitted");
}
Tensor y_pred = predict(X);
return (y_pred == y).sum().item<float>() / y.size(0);
}
float BaseClassifier::score(vector<vector<int>>& X, vector<int>& y)
{
if (!fitted) {
throw logic_error("Classifier has not been fitted");
}
auto m_ = X[0].size();
auto n_ = X.size();
vector<vector<int>> Xd(n_, vector<int>(m_, 0));
for (auto i = 0; i < n_; i++) {
Xd[i] = vector<int>(X[i].begin(), X[i].end());
}
return model.score(Xd, y);
}
vector<string> BaseClassifier::show()
{
return model.show();
}
void BaseClassifier::addNodes()
{
// Add all nodes to the network
for (auto feature : features) {
model.addNode(feature, states[feature].size());
}
model.addNode(className, states[className].size());
}
int BaseClassifier::getNumberOfNodes()
{
// Features does not include class
return fitted ? model.getFeatures().size() + 1 : 0;
}
int BaseClassifier::getNumberOfEdges()
{
return fitted ? model.getEdges().size() : 0;
}
}

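The concrete fit/predict/score logic removed here is not lost: it moves, extended with sample weights, into the new src/BayesNet/Classifier.cc at the end of this diff, while BaseClassifier.h (next file) is reduced to a pure virtual interface.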
src/BayesNet/BaseClassifier.h

@@ -1,48 +1,32 @@
-#ifndef CLASSIFIERS_H
-#define CLASSIFIERS_H
+#ifndef BASE_H
+#define BASE_H
 #include <torch/torch.h>
-#include "Network.h"
-#include "Metrics.hpp"
-using namespace std;
-using namespace torch;
+#include <vector>
 namespace bayesnet {
+    using namespace std;
     class BaseClassifier {
-    private:
-        bool fitted;
-        BaseClassifier& build(vector<string>& features, string className, map<string, vector<int>>& states);
     protected:
-        Network model;
-        int m, n; // m: number of samples, n: number of features
-        Tensor X;
-        vector<vector<int>> Xv;
-        Tensor y;
-        vector<int> yv;
-        Tensor dataset;
-        Metrics metrics;
-        vector<string> features;
-        string className;
-        map<string, vector<int>> states;
-        void checkFitParameters();
-        virtual void train() = 0;
+        virtual void trainModel(const torch::Tensor& weights) = 0;
     public:
-        BaseClassifier(Network model);
+        // X is nxm vector, y is nx1 vector
+        virtual BaseClassifier& fit(vector<vector<int>>& X, vector<int>& y, vector<string>& features, string className, map<string, vector<int>>& states) = 0;
+        // X is nxm tensor, y is nx1 tensor
+        virtual BaseClassifier& fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states) = 0;
+        virtual BaseClassifier& fit(torch::Tensor& dataset, vector<string>& features, string className, map<string, vector<int>>& states) = 0;
+        virtual BaseClassifier& fit(torch::Tensor& dataset, vector<string>& features, string className, map<string, vector<int>>& states, const torch::Tensor& weights) = 0;
         virtual ~BaseClassifier() = default;
-        BaseClassifier& fit(vector<vector<int>>& X, vector<int>& y, vector<string>& features, string className, map<string, vector<int>>& states);
-        void addNodes();
-        int getNumberOfNodes();
-        int getNumberOfEdges();
-        Tensor predict(Tensor& X);
-        vector<int> predict(vector<vector<int>>& X);
-        float score(Tensor& X, Tensor& y);
-        float score(vector<vector<int>>& X, vector<int>& y);
-        vector<string> show();
-        virtual vector<string> graph(string title) = 0;
+        torch::Tensor virtual predict(torch::Tensor& X) = 0;
+        vector<int> virtual predict(vector<vector<int>>& X) = 0;
+        float virtual score(vector<vector<int>>& X, vector<int>& y) = 0;
+        float virtual score(torch::Tensor& X, torch::Tensor& y) = 0;
+        int virtual getNumberOfNodes() const = 0;
+        int virtual getNumberOfEdges() const = 0;
+        int virtual getNumberOfStates() const = 0;
+        vector<string> virtual show() const = 0;
+        vector<string> virtual graph(const string& title = "") const = 0;
+        const string inline getVersion() const { return "0.1.0"; };
+        vector<string> virtual topological_order() = 0;
+        void virtual dump_cpt() const = 0;
     };
 }
 #endif

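A hedged sketch of how the slimmed-down interface is consumed polymorphically (assumes the library headers from this diff; the data arguments are placeholders the caller must fill):

#include <iostream>
#include <map>
#include <memory>
#include "BaseClassifier.h"
#include "AODE.h"
using namespace std;

void evaluate(vector<vector<int>>& Xd, vector<int>& y, vector<string>& features,
    string className, map<string, vector<int>>& states)
{
    // any model in the hierarchy can sit behind the interface
    unique_ptr<bayesnet::BaseClassifier> clf = make_unique<bayesnet::AODE>();
    clf->fit(Xd, y, features, className, states);
    cout << "Nodes: " << clf->getNumberOfNodes() << ", edges: " << clf->getNumberOfEdges() << endl;
    cout << "Score: " << clf->score(Xd, y) << endl;
}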
src/BayesNet/BayesMetrics.cc

@@ -1,24 +1,50 @@
-#include "Metrics.hpp"
+#include "BayesMetrics.h"
 #include "Mst.h"
-using namespace std;
 namespace bayesnet {
-    Metrics::Metrics(torch::Tensor& samples, vector<string>& features, string& className, int classNumStates)
+    //samples is nxm tensor used to fit the model
+    Metrics::Metrics(const torch::Tensor& samples, const vector<string>& features, const string& className, const int classNumStates)
         : samples(samples)
         , features(features)
         , className(className)
         , classNumStates(classNumStates)
     {
     }
+    //samples is nxm vector used to fit the model
     Metrics::Metrics(const vector<vector<int>>& vsamples, const vector<int>& labels, const vector<string>& features, const string& className, const int classNumStates)
         : features(features)
         , className(className)
         , classNumStates(classNumStates)
+        , samples(torch::zeros({ static_cast<int>(vsamples[0].size()), static_cast<int>(vsamples.size() + 1) }, torch::kInt32))
     {
-        samples = torch::zeros({ static_cast<int64_t>(vsamples[0].size()), static_cast<int64_t>(vsamples.size() + 1) }, torch::kInt64);
         for (int i = 0; i < vsamples.size(); ++i) {
-            samples.index_put_({ "...", i }, torch::tensor(vsamples[i], torch::kInt64));
+            samples.index_put_({ i, "..." }, torch::tensor(vsamples[i], torch::kInt32));
         }
-        samples.index_put_({ "...", -1 }, torch::tensor(labels, torch::kInt64));
+        samples.index_put_({ -1, "..." }, torch::tensor(labels, torch::kInt32));
+    }
+    vector<int> Metrics::SelectKBestWeighted(const torch::Tensor& weights, unsigned k)
+    {
+        auto n = samples.size(0) - 1;
+        if (k == 0) {
+            k = n;
+        }
+        // compute scores
+        scoresKBest.reserve(n);
+        auto label = samples.index({ -1, "..." });
+        for (int i = 0; i < n; ++i) {
+            scoresKBest.push_back(mutualInformation(label, samples.index({ i, "..." }), weights));
+            featuresKBest.push_back(i);
+        }
+        // sort & reduce scores and features
+        sort(featuresKBest.begin(), featuresKBest.end(), [&](int i, int j)
+            { return scoresKBest[i] > scoresKBest[j]; });
+        sort(scoresKBest.begin(), scoresKBest.end(), std::greater<double>());
+        featuresKBest.resize(k);
+        scoresKBest.resize(k);
+        return featuresKBest;
+    }
+    vector<double> Metrics::getScoresKBest() const
+    {
+        return scoresKBest;
     }
     vector<pair<string, string>> Metrics::doCombinations(const vector<string>& source)
     {
@@ -31,28 +57,30 @@ namespace bayesnet {
         }
         return result;
     }
-    torch::Tensor Metrics::conditionalEdge()
+    torch::Tensor Metrics::conditionalEdge(const torch::Tensor& weights)
     {
         auto result = vector<double>();
         auto source = vector<string>(features);
         source.push_back(className);
         auto combinations = doCombinations(source);
+        double totalWeight = weights.sum().item<double>();
         // Compute class prior
-        auto margin = torch::zeros({ classNumStates });
+        auto margin = torch::zeros({ classNumStates }, torch::kFloat);
         for (int value = 0; value < classNumStates; ++value) {
-            auto mask = samples.index({ "...", -1 }) == value;
-            margin[value] = mask.sum().item<float>() / samples.sizes()[0];
+            auto mask = samples.index({ -1, "..." }) == value;
+            margin[value] = mask.sum().item<double>() / samples.size(1);
         }
         for (auto [first, second] : combinations) {
-            int64_t index_first = find(features.begin(), features.end(), first) - features.begin();
-            int64_t index_second = find(features.begin(), features.end(), second) - features.begin();
+            int index_first = find(features.begin(), features.end(), first) - features.begin();
+            int index_second = find(features.begin(), features.end(), second) - features.begin();
             double accumulated = 0;
             for (int value = 0; value < classNumStates; ++value) {
-                auto mask = samples.index({ "...", -1 }) == value;
-                auto first_dataset = samples.index({ mask, index_first });
-                auto second_dataset = samples.index({ mask, index_second });
-                auto mi = mutualInformation(first_dataset, second_dataset);
-                auto pb = margin[value].item<float>();
+                auto mask = samples.index({ -1, "..." }) == value;
+                auto first_dataset = samples.index({ index_first, mask });
+                auto second_dataset = samples.index({ index_second, mask });
+                auto weights_dataset = weights.index({ mask });
+                auto mi = mutualInformation(first_dataset, second_dataset, weights_dataset);
+                auto pb = margin[value].item<double>();
                 accumulated += pb * mi;
             }
             result.push_back(accumulated);
@@ -68,34 +96,36 @@ namespace bayesnet {
         }
         return matrix;
     }
-    vector<float> Metrics::conditionalEdgeWeights()
+    // To use in Python
+    vector<float> Metrics::conditionalEdgeWeights(vector<float>& weights_)
     {
-        auto matrix = conditionalEdge();
+        const torch::Tensor weights = torch::tensor(weights_);
+        auto matrix = conditionalEdge(weights);
         std::vector<float> v(matrix.data_ptr<float>(), matrix.data_ptr<float>() + matrix.numel());
         return v;
     }
-    double Metrics::entropy(torch::Tensor& feature)
+    double Metrics::entropy(const torch::Tensor& feature, const torch::Tensor& weights)
    {
-        torch::Tensor counts = feature.bincount();
-        int totalWeight = counts.sum().item<int>();
+        torch::Tensor counts = feature.bincount(weights);
+        double totalWeight = counts.sum().item<double>();
         torch::Tensor probs = counts.to(torch::kFloat) / totalWeight;
         torch::Tensor logProbs = torch::log(probs);
         torch::Tensor entropy = -probs * logProbs;
         return entropy.nansum().item<double>();
     }
     // H(Y|X) = sum_{x in X} p(x) H(Y|X=x)
-    double Metrics::conditionalEntropy(torch::Tensor& firstFeature, torch::Tensor& secondFeature)
+    double Metrics::conditionalEntropy(const torch::Tensor& firstFeature, const torch::Tensor& secondFeature, const torch::Tensor& weights)
     {
         int numSamples = firstFeature.sizes()[0];
-        torch::Tensor featureCounts = secondFeature.bincount();
+        torch::Tensor featureCounts = secondFeature.bincount(weights);
         unordered_map<int, unordered_map<int, double>> jointCounts;
         double totalWeight = 0;
         for (auto i = 0; i < numSamples; i++) {
-            jointCounts[secondFeature[i].item<int>()][firstFeature[i].item<int>()] += 1;
-            totalWeight += 1;
+            jointCounts[secondFeature[i].item<int>()][firstFeature[i].item<int>()] += weights[i].item<double>();
+            totalWeight += weights[i].item<float>();
         }
         if (totalWeight == 0)
-            throw invalid_argument("Total weight should not be zero");
+            return 0;
         double entropyValue = 0;
         for (int value = 0; value < featureCounts.sizes()[0]; ++value) {
             double p_f = featureCounts[value].item<double>() / totalWeight;
@@ -113,18 +143,17 @@ namespace bayesnet {
         return entropyValue;
     }
     // I(X;Y) = H(Y) - H(Y|X)
-    double Metrics::mutualInformation(torch::Tensor& firstFeature, torch::Tensor& secondFeature)
+    double Metrics::mutualInformation(const torch::Tensor& firstFeature, const torch::Tensor& secondFeature, const torch::Tensor& weights)
     {
-        return entropy(firstFeature) - conditionalEntropy(firstFeature, secondFeature);
+        return entropy(firstFeature, weights) - conditionalEntropy(firstFeature, secondFeature, weights);
     }
     /*
    Compute the maximum spanning tree considering the weights as distances
    and the indices of the weights as nodes of this square matrix using
    Kruskal algorithm
     */
-    vector<pair<int, int>> Metrics::maximumSpanningTree(vector<string> features, Tensor& weights, int root)
+    vector<pair<int, int>> Metrics::maximumSpanningTree(const vector<string>& features, const Tensor& weights, const int root)
     {
-        auto result = vector<pair<int, int>>();
         auto mst = MST(features, weights, root);
         return mst.maximumSpanningTree();
     }

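In formulas, with $p_w$ denoting frequencies computed from the sample weights rather than raw counts, the three weighted metrics above are

$$H_w(X) = -\sum_x p_w(x)\,\log p_w(x), \qquad H_w(Y\mid X) = \sum_x p_w(x)\,H_w(Y\mid X{=}x), \qquad I_w(X;Y) = H_w(Y) - H_w(Y\mid X),$$

matching the H(Y|X) and I(X;Y) comments in the code; with uniform weights they reduce to the unweighted quantities used before this change.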
src/BayesNet/BayesMetrics.h Normal file

@@ -0,0 +1,32 @@
#ifndef BAYESNET_METRICS_H
#define BAYESNET_METRICS_H
#include <torch/torch.h>
#include <vector>
#include <string>
namespace bayesnet {
using namespace std;
using namespace torch;
class Metrics {
private:
Tensor samples; // nxm tensor used to fit the model
vector<string> features;
string className;
int classNumStates = 0;
vector<double> scoresKBest;
vector<int> featuresKBest; // sorted indices of the features
double entropy(const Tensor& feature, const Tensor& weights);
double conditionalEntropy(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights);
vector<pair<string, string>> doCombinations(const vector<string>&);
public:
Metrics() = default;
Metrics(const torch::Tensor& samples, const vector<string>& features, const string& className, const int classNumStates);
Metrics(const vector<vector<int>>& vsamples, const vector<int>& labels, const vector<string>& features, const string& className, const int classNumStates);
vector<int> SelectKBestWeighted(const torch::Tensor& weights, unsigned k = 0);
vector<double> getScoresKBest() const;
double mutualInformation(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights);
vector<float> conditionalEdgeWeights(vector<float>& weights); // To use in Python
Tensor conditionalEdge(const torch::Tensor& weights);
vector<pair<int, int>> maximumSpanningTree(const vector<string>& features, const Tensor& weights, const int root);
};
}
#endif

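A small usage sketch for the new weighted feature ranking (a hedged example: uniform weights reproduce the plain mutual-information ranking; samples is laid out nxm with the class labels in the last row, as the comments above state):

#include <torch/torch.h>
#include "BayesMetrics.h"
using namespace std;

void rankFeatures(const torch::Tensor& samples, const vector<string>& features,
    const string& className, int classNumStates)
{
    auto metrics = bayesnet::Metrics(samples, features, className, classNumStates);
    auto m = samples.size(1); // number of samples
    auto weights = torch::full({ m }, 1.0 / m, torch::kDouble);
    auto bestFive = metrics.SelectKBestWeighted(weights, 5); // feature indices, best first
    auto scores = metrics.getScoresKBest();                  // matching MI scores
}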
82
src/BayesNet/BoostAODE.cc Normal file

@@ -0,0 +1,82 @@
#include "BoostAODE.h"
#include "BayesMetrics.h"
namespace bayesnet {
BoostAODE::BoostAODE() : Ensemble() {}
void BoostAODE::buildModel(const torch::Tensor& weights)
{
// Models shall be built in trainModel
}
void BoostAODE::trainModel(const torch::Tensor& weights)
{
models.clear();
n_models = 0;
int max_models = .1 * n > 10 ? .1 * n : n;
Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64);
auto X_ = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." });
auto y_ = dataset.index({ -1, "..." });
bool exitCondition = false;
bool repeatSparent = false;
vector<int> featuresUsed;
// Step 0: Set the finish condition
// if not repeatSparent, a finish condition is to run out of features
// n_models == max_models
int numClasses = states[className].size();
while (!exitCondition) {
// Step 1: Build ranking with mutual information
auto featureSelection = metrics.SelectKBestWeighted(weights_, n); // Get all the features sorted
auto feature = featureSelection[0];
unique_ptr<Classifier> model;
if (!repeatSparent) {
if (n_models == 0) {
models.resize(n); // Resize for n==nfeatures SPODEs
significanceModels.resize(n);
}
bool found = false;
for (int i = 0; i < featureSelection.size(); ++i) {
if (find(featuresUsed.begin(), featuresUsed.end(), i) != featuresUsed.end()) {
continue;
}
found = true;
feature = i;
featuresUsed.push_back(feature);
n_models++;
break;
}
if (!found) {
exitCondition = true;
continue;
}
}
model = std::make_unique<SPODE>(feature);
model->fit(dataset, features, className, states, weights_);
auto ypred = model->predict(X_);
// Step 3.1: Compute the classifier amount of say
auto mask_wrong = ypred != y_;
auto masked_weights = weights_ * mask_wrong.to(weights_.dtype());
double wrongWeights = masked_weights.sum().item<double>();
double significance = wrongWeights == 0 ? 1 : 0.5 * log((1 - wrongWeights) / wrongWeights);
// Step 3.2: Update weights for next classifier
// Step 3.2.1: Update weights of wrong samples
weights_ += mask_wrong.to(weights_.dtype()) * exp(significance) * weights_;
// Step 3.3: Normalise the weights
double totalWeights = torch::sum(weights_).item<double>();
weights_ = weights_ / totalWeights;
// Step 3.4: Store classifier and its accuracy to weigh its future vote
if (!repeatSparent) {
models[feature] = std::move(model);
significanceModels[feature] = significance;
} else {
models.push_back(std::move(model));
significanceModels.push_back(significance);
n_models++;
}
exitCondition = n_models == max_models;
}
weights.copy_(weights_);
}
vector<string> BoostAODE::graph(const string& title) const
{
return Ensemble::graph(title);
}
}
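
The loop above follows the usual AdaBoost recipe: the weighted error of the freshly fitted SPODE determines its amount of say, alpha = 0.5 * log((1 - eps) / eps), and only the misclassified samples get their weights boosted before renormalisation. A standalone sketch of that update with hypothetical numbers:

#include <torch/torch.h>
#include <cmath>
#include <iostream>

int main()
{
    int m = 6;
    auto weights = torch::full({ m }, 1.0 / m, torch::kFloat64);
    // Suppose the current model misclassified samples 1 and 4
    auto mask_wrong = torch::tensor({ 0., 1., 0., 0., 1., 0. }, torch::kFloat64);
    double eps = (weights * mask_wrong).sum().item<double>();      // weighted error (1/3 here)
    double alpha = eps == 0 ? 1 : 0.5 * std::log((1 - eps) / eps); // amount of say
    weights += mask_wrong * std::exp(alpha) * weights;             // boost the wrong samples
    weights /= weights.sum().item<double>();                       // renormalise
    std::cout << "alpha = " << alpha << std::endl << weights << std::endl;
}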

src/BayesNet/BoostAODE.h Normal file

@@ -0,0 +1,16 @@
#ifndef BOOSTAODE_H
#define BOOSTAODE_H
#include "Ensemble.h"
#include "SPODE.h"
namespace bayesnet {
class BoostAODE : public Ensemble {
protected:
void buildModel(const torch::Tensor& weights) override;
void trainModel(const torch::Tensor& weights) override;
public:
BoostAODE();
virtual ~BoostAODE() {};
vector<string> graph(const string& title = "BoostAODE") const override;
};
}
#endif

src/BayesNet/CMakeLists.txt

@@ -1,2 +1,8 @@
-add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc Metrics.cc BaseClassifier.cc KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc Mst.cc)
-target_link_libraries(BayesNet "${TORCH_LIBRARIES}")
+include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
+include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
+include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
+include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
+add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc
+KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc
+Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
+target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}")

src/BayesNet/Classifier.cc Normal file

@@ -0,0 +1,155 @@
#include "Classifier.h"
#include "bayesnetUtils.h"
namespace bayesnet {
using namespace torch;
Classifier::Classifier(Network model) : model(model), m(0), n(0), metrics(Metrics()), fitted(false) {}
Classifier& Classifier::build(vector<string>& features, string className, map<string, vector<int>>& states, const torch::Tensor& weights)
{
this->features = features;
this->className = className;
this->states = states;
m = dataset.size(1);
n = dataset.size(0) - 1;
checkFitParameters();
auto n_classes = states[className].size();
metrics = Metrics(dataset, features, className, n_classes);
model.initialize();
buildModel(weights);
trainModel(weights);
fitted = true;
return *this;
}
void Classifier::buildDataset(Tensor& ytmp)
{
try {
auto yresized = torch::transpose(ytmp.view({ ytmp.size(0), 1 }), 0, 1);
dataset = torch::cat({ dataset, yresized }, 0);
}
catch (const std::exception& e) {
std::cerr << e.what() << '\n';
cout << "X dimensions: " << dataset.sizes() << "\n";
cout << "y dimensions: " << ytmp.sizes() << "\n";
exit(1);
}
}
void Classifier::trainModel(const torch::Tensor& weights)
{
model.fit(dataset, weights, features, className, states);
}
// X is nxm where n is the number of features and m the number of samples
Classifier& Classifier::fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states)
{
dataset = X;
buildDataset(y);
const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kDouble);
return build(features, className, states, weights);
}
// X is nxm where n is the number of features and m the number of samples
Classifier& Classifier::fit(vector<vector<int>>& X, vector<int>& y, vector<string>& features, string className, map<string, vector<int>>& states)
{
dataset = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, kInt32);
for (int i = 0; i < X.size(); ++i) {
dataset.index_put_({ i, "..." }, torch::tensor(X[i], kInt32));
}
auto ytmp = torch::tensor(y, kInt32);
buildDataset(ytmp);
const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kDouble);
return build(features, className, states, weights);
}
Classifier& Classifier::fit(torch::Tensor& dataset, vector<string>& features, string className, map<string, vector<int>>& states)
{
this->dataset = dataset;
const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kDouble);
return build(features, className, states, weights);
}
Classifier& Classifier::fit(torch::Tensor& dataset, vector<string>& features, string className, map<string, vector<int>>& states, const torch::Tensor& weights)
{
this->dataset = dataset;
return build(features, className, states, weights);
}
void Classifier::checkFitParameters()
{
if (n != features.size()) {
throw invalid_argument("X " + to_string(n) + " and features " + to_string(features.size()) + " must have the same number of features");
}
if (states.find(className) == states.end()) {
throw invalid_argument("className not found in states");
}
for (auto feature : features) {
if (states.find(feature) == states.end()) {
throw invalid_argument("feature [" + feature + "] not found in states");
}
}
}
Tensor Classifier::predict(Tensor& X)
{
if (!fitted) {
throw logic_error("Classifier has not been fitted");
}
return model.predict(X);
}
vector<int> Classifier::predict(vector<vector<int>>& X)
{
if (!fitted) {
throw logic_error("Classifier has not been fitted");
}
auto m_ = X[0].size();
auto n_ = X.size();
vector<vector<int>> Xd(n_, vector<int>(m_, 0));
for (auto i = 0; i < n_; i++) {
Xd[i] = vector<int>(X[i].begin(), X[i].end());
}
auto yp = model.predict(Xd);
return yp;
}
float Classifier::score(Tensor& X, Tensor& y)
{
if (!fitted) {
throw logic_error("Classifier has not been fitted");
}
Tensor y_pred = predict(X);
return (y_pred == y).sum().item<float>() / y.size(0);
}
float Classifier::score(vector<vector<int>>& X, vector<int>& y)
{
if (!fitted) {
throw logic_error("Classifier has not been fitted");
}
return model.score(X, y);
}
vector<string> Classifier::show() const
{
return model.show();
}
void Classifier::addNodes()
{
// Add all nodes to the network
for (const auto& feature : features) {
model.addNode(feature);
}
model.addNode(className);
}
int Classifier::getNumberOfNodes() const
{
// Features does not include class
return fitted ? model.getFeatures().size() + 1 : 0;
}
int Classifier::getNumberOfEdges() const
{
return fitted ? model.getNumEdges() : 0;
}
int Classifier::getNumberOfStates() const
{
return fitted ? model.getStates() : 0;
}
vector<string> Classifier::topological_order()
{
return model.topological_sort();
}
void Classifier::dump_cpt() const
{
model.dump_cpt();
}
}
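
A minimal fit/score sketch for one of the concrete classifiers built on this base (SPODE here; data, names and states are hypothetical):

#include "SPODE.h"
#include <map>
#include <vector>
#include <string>
using namespace std;

int main()
{
    // Three binary features in rows, five samples in columns (nxm layout)
    vector<vector<int>> X = { { 0, 1, 0, 1, 1 }, { 1, 1, 0, 0, 1 }, { 0, 0, 1, 1, 0 } };
    vector<int> y = { 0, 1, 0, 1, 1 };
    vector<string> features = { "f1", "f2", "f3" };
    map<string, vector<int>> states = {
        { "f1", { 0, 1 } }, { "f2", { 0, 1 } }, { "f3", { 0, 1 } }, { "class", { 0, 1 } }
    };
    auto clf = bayesnet::SPODE(0);
    clf.fit(X, y, features, "class", states); // builds the (n+1)xm dataset and uniform weights internally
    float acc = clf.score(X, y);
}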

src/BayesNet/Classifier.h Normal file

@@ -0,0 +1,52 @@
#ifndef CLASSIFIER_H
#define CLASSIFIER_H
#include <torch/torch.h>
#include "BaseClassifier.h"
#include "Network.h"
#include "BayesMetrics.h"
using namespace std;
using namespace torch;
namespace bayesnet {
class Classifier : public BaseClassifier {
private:
void buildDataset(torch::Tensor& y);
Classifier& build(vector<string>& features, string className, map<string, vector<int>>& states, const torch::Tensor& weights);
protected:
bool fitted;
int m, n; // m: number of samples, n: number of features
Network model;
Metrics metrics;
vector<string> features;
string className;
map<string, vector<int>> states;
Tensor dataset; // (n+1)xm tensor
void checkFitParameters();
virtual void buildModel(const torch::Tensor& weights) = 0;
void trainModel(const torch::Tensor& weights) override;
public:
Classifier(Network model);
virtual ~Classifier() = default;
Classifier& fit(vector<vector<int>>& X, vector<int>& y, vector<string>& features, string className, map<string, vector<int>>& states) override;
Classifier& fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states) override;
Classifier& fit(torch::Tensor& dataset, vector<string>& features, string className, map<string, vector<int>>& states) override;
Classifier& fit(torch::Tensor& dataset, vector<string>& features, string className, map<string, vector<int>>& states, const torch::Tensor& weights) override;
void addNodes();
int getNumberOfNodes() const override;
int getNumberOfEdges() const override;
int getNumberOfStates() const override;
Tensor predict(Tensor& X) override;
vector<int> predict(vector<vector<int>>& X) override;
float score(Tensor& X, Tensor& y) override;
float score(vector<vector<int>>& X, vector<int>& y) override;
vector<string> show() const override;
vector<string> topological_order() override;
void dump_cpt() const override;
};
}
#endif

src/BayesNet/Ensemble.cc

@@ -1,64 +1,54 @@
#include "Ensemble.h" #include "Ensemble.h"
namespace bayesnet { namespace bayesnet {
using namespace std;
using namespace torch; using namespace torch;
Ensemble::Ensemble() : m(0), n(0), n_models(0), metrics(Metrics()), fitted(false) {} Ensemble::Ensemble() : Classifier(Network()) {}
Ensemble& Ensemble::build(vector<string>& features, string className, map<string, vector<int>>& states)
void Ensemble::trainModel(const torch::Tensor& weights)
{ {
dataset = cat({ X, y.view({y.size(0), 1}) }, 1);
this->features = features;
this->className = className;
this->states = states;
auto n_classes = states[className].size();
metrics = Metrics(dataset, features, className, n_classes);
// Build models
train();
// Train models
n_models = models.size(); n_models = models.size();
for (auto i = 0; i < n_models; ++i) { for (auto i = 0; i < n_models; ++i) {
models[i]->fit(Xv, yv, features, className, states); // fit with vectors
models[i]->fit(dataset, features, className, states);
} }
fitted = true;
return *this;
} }
Ensemble& Ensemble::fit(vector<vector<int>>& X, vector<int>& y, vector<string>& features, string className, map<string, vector<int>>& states) vector<int> Ensemble::voting(Tensor& y_pred)
{ {
this->X = torch::zeros({ static_cast<int64_t>(X[0].size()), static_cast<int64_t>(X.size()) }, kInt64); auto y_pred_ = y_pred.accessor<int, 2>();
Xv = X; vector<int> y_pred_final;
for (int i = 0; i < X.size(); ++i) { for (int i = 0; i < y_pred.size(0); ++i) {
this->X.index_put_({ "...", i }, torch::tensor(X[i], kInt64)); vector<double> votes(y_pred.size(1), 0);
for (int j = 0; j < y_pred.size(1); ++j) {
votes[y_pred_[i][j]] += significanceModels[j];
} }
this->y = torch::tensor(y, kInt64); // argsort in descending order
yv = y; auto indices = argsort(votes);
return build(features, className, states); y_pred_final.push_back(indices[0]);
}
return y_pred_final;
} }
Tensor Ensemble::predict(Tensor& X) Tensor Ensemble::predict(Tensor& X)
{ {
if (!fitted) { if (!fitted) {
throw logic_error("Ensemble has not been fitted"); throw logic_error("Ensemble has not been fitted");
} }
Tensor y_pred = torch::zeros({ X.size(0), n_models }, kInt64); Tensor y_pred = torch::zeros({ X.size(1), n_models }, kInt32);
//Create a threadpool
auto threads{ vector<thread>() };
mutex mtx;
for (auto i = 0; i < n_models; ++i) { for (auto i = 0; i < n_models; ++i) {
y_pred.index_put_({ "...", i }, models[i]->predict(X)); threads.push_back(thread([&, i]() {
auto ypredict = models[i]->predict(X);
lock_guard<mutex> lock(mtx);
y_pred.index_put_({ "...", i }, ypredict);
}));
}
for (auto& thread : threads) {
thread.join();
} }
return torch::tensor(voting(y_pred)); return torch::tensor(voting(y_pred));
} }
vector<int> Ensemble::voting(Tensor& y_pred)
{
auto y_pred_ = y_pred.accessor<int64_t, 2>();
vector<int> y_pred_final;
for (int i = 0; i < y_pred.size(0); ++i) {
vector<float> votes(states[className].size(), 0);
for (int j = 0; j < y_pred.size(1); ++j) {
votes[y_pred_[i][j]] += 1;
}
auto indices = argsort(votes);
y_pred_final.push_back(indices[0]);
}
return y_pred_final;
}
vector<int> Ensemble::predict(vector<vector<int>>& X) vector<int> Ensemble::predict(vector<vector<int>>& X)
{ {
if (!fitted) { if (!fitted) {
@@ -70,12 +60,26 @@ namespace bayesnet {
for (auto i = 0; i < n_; i++) { for (auto i = 0; i < n_; i++) {
Xd[i] = vector<int>(X[i].begin(), X[i].end()); Xd[i] = vector<int>(X[i].begin(), X[i].end());
} }
Tensor y_pred = torch::zeros({ m_, n_models }, kInt64); Tensor y_pred = torch::zeros({ m_, n_models }, kInt32);
for (auto i = 0; i < n_models; ++i) { for (auto i = 0; i < n_models; ++i) {
y_pred.index_put_({ "...", i }, torch::tensor(models[i]->predict(Xd), kInt64)); y_pred.index_put_({ "...", i }, torch::tensor(models[i]->predict(Xd), kInt32));
} }
return voting(y_pred); return voting(y_pred);
} }
float Ensemble::score(Tensor& X, Tensor& y)
{
if (!fitted) {
throw logic_error("Ensemble has not been fitted");
}
auto y_pred = predict(X);
int correct = 0;
for (int i = 0; i < y_pred.size(0); ++i) {
if (y_pred[i].item<int>() == y[i].item<int>()) {
correct++;
}
}
return (double)correct / y_pred.size(0);
}
float Ensemble::score(vector<vector<int>>& X, vector<int>& y) float Ensemble::score(vector<vector<int>>& X, vector<int>& y)
{ {
if (!fitted) { if (!fitted) {
@@ -89,9 +93,8 @@ namespace bayesnet {
} }
} }
return (double)correct / y_pred.size(); return (double)correct / y_pred.size();
} }
vector<string> Ensemble::show() vector<string> Ensemble::show() const
{ {
auto result = vector<string>(); auto result = vector<string>();
for (auto i = 0; i < n_models; ++i) { for (auto i = 0; i < n_models; ++i) {
@@ -100,7 +103,7 @@ namespace bayesnet {
} }
return result; return result;
} }
vector<string> Ensemble::graph(string title) vector<string> Ensemble::graph(const string& title) const
{ {
auto result = vector<string>(); auto result = vector<string>();
for (auto i = 0; i < n_models; ++i) { for (auto i = 0; i < n_models; ++i) {
@@ -109,4 +112,28 @@ namespace bayesnet {
} }
return result; return result;
} }
int Ensemble::getNumberOfNodes() const
{
int nodes = 0;
for (auto i = 0; i < n_models; ++i) {
nodes += models[i]->getNumberOfNodes();
}
return nodes;
}
int Ensemble::getNumberOfEdges() const
{
int edges = 0;
for (auto i = 0; i < n_models; ++i) {
edges += models[i]->getNumberOfEdges();
}
return edges;
}
int Ensemble::getNumberOfStates() const
{
int nstates = 0;
for (auto i = 0; i < n_models; ++i) {
nstates += models[i]->getNumberOfStates();
}
return nstates;
}
} }
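
The rewritten voting weighs each model's vote by its significance instead of counting one vote per model; a standalone illustration with hypothetical predictions and significances:

#include <vector>
#include <algorithm>
#include <iostream>

int main()
{
    // Three models predict classes {0, 1, 1} with significances 0.2, 0.9 and 0.4
    std::vector<int> preds = { 0, 1, 1 };
    std::vector<double> significance = { 0.2, 0.9, 0.4 };
    std::vector<double> votes(2, 0.0); // one accumulator per class
    for (size_t j = 0; j < preds.size(); ++j) {
        votes[preds[j]] += significance[j];
    }
    auto winner = std::max_element(votes.begin(), votes.end()) - votes.begin();
    std::cout << "predicted class " << winner << std::endl; // class 1 wins: 1.3 > 0.2
}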

src/BayesNet/Ensemble.h

@@ -1,42 +1,41 @@
#ifndef ENSEMBLE_H
#define ENSEMBLE_H
#include <torch/torch.h>
-#include "BaseClassifier.h"
-#include "Metrics.hpp"
+#include "Classifier.h"
+#include "BayesMetrics.h"
#include "bayesnetUtils.h"
using namespace std;
using namespace torch;
namespace bayesnet {
-class Ensemble {
+class Ensemble : public Classifier {
private:
-bool fitted;
-long n_models;
Ensemble& build(vector<string>& features, string className, map<string, vector<int>>& states);
protected:
-vector<unique_ptr<BaseClassifier>> models;
-int m, n; // m: number of samples, n: number of features
-Tensor X;
-vector<vector<int>> Xv;
-Tensor y;
-vector<int> yv;
-Tensor dataset;
-Metrics metrics;
-vector<string> features;
-string className;
-map<string, vector<int>> states;
-void virtual train() = 0;
+unsigned n_models;
+vector<unique_ptr<Classifier>> models;
+vector<double> significanceModels;
+void trainModel(const torch::Tensor& weights) override;
vector<int> voting(Tensor& y_pred);
public:
Ensemble();
virtual ~Ensemble() = default;
-Ensemble& fit(vector<vector<int>>& X, vector<int>& y, vector<string>& features, string className, map<string, vector<int>>& states);
-Tensor predict(Tensor& X);
-vector<int> predict(vector<vector<int>>& X);
-float score(Tensor& X, Tensor& y);
-float score(vector<vector<int>>& X, vector<int>& y);
-vector<string> show();
-vector<string> graph(string title);
+Tensor predict(Tensor& X) override;
+vector<int> predict(vector<vector<int>>& X) override;
+float score(Tensor& X, Tensor& y) override;
+float score(vector<vector<int>>& X, vector<int>& y) override;
+int getNumberOfNodes() const override;
+int getNumberOfEdges() const override;
+int getNumberOfStates() const override;
+vector<string> show() const override;
+vector<string> graph(const string& title = "Ensemble") const override;
+vector<string> topological_order() override
+{
+return vector<string>();
+}
+void dump_cpt() const override
+{
+}
};
}
#endif

src/BayesNet/KDB.cc

@@ -1,11 +1,10 @@
#include "KDB.h" #include "KDB.h"
namespace bayesnet { namespace bayesnet {
using namespace std;
using namespace torch; using namespace torch;
KDB::KDB(int k, float theta) : BaseClassifier(Network()), k(k), theta(theta) {} KDB::KDB(int k, float theta) : Classifier(Network()), k(k), theta(theta) {}
void KDB::train() void KDB::buildModel(const torch::Tensor& weights)
{ {
/* /*
1. For each feature Xi, compute mutual information, I(X;C), 1. For each feature Xi, compute mutual information, I(X;C),
@@ -28,25 +27,25 @@ namespace bayesnet {
*/ */
// 1. For each feature Xi, compute mutual information, I(X;C), // 1. For each feature Xi, compute mutual information, I(X;C),
// where C is the class. // where C is the class.
vector <float> mi; addNodes();
const Tensor& y = dataset.index({ -1, "..." });
vector<double> mi;
for (auto i = 0; i < features.size(); i++) { for (auto i = 0; i < features.size(); i++) {
Tensor firstFeature = X.index({ "...", i }); Tensor firstFeature = dataset.index({ i, "..." });
mi.push_back(metrics.mutualInformation(firstFeature, y)); mi.push_back(metrics.mutualInformation(firstFeature, y, weights));
} }
// 2. Compute class conditional mutual information I(Xi;XjIC), f or each // 2. Compute class conditional mutual information I(Xi;XjIC), f or each
auto conditionalEdgeWeights = metrics.conditionalEdge(); auto conditionalEdgeWeights = metrics.conditionalEdge(weights);
// 3. Let the used variable list, S, be empty. // 3. Let the used variable list, S, be empty.
vector<int> S; vector<int> S;
// 4. Let the DAG network being constructed, BN, begin with a single // 4. Let the DAG network being constructed, BN, begin with a single
// class node, C. // class node, C.
model.addNode(className, states[className].size());
// 5. Repeat until S includes all domain features // 5. Repeat until S includes all domain features
// 5.1. Select feature Xmax which is not in S and has the largest value // 5.1. Select feature Xmax which is not in S and has the largest value
// I(Xmax;C). // I(Xmax;C).
auto order = argsort(mi); auto order = argsort(mi);
for (auto idx : order) { for (auto idx : order) {
// 5.2. Add a node to BN representing Xmax. // 5.2. Add a node to BN representing Xmax.
model.addNode(features[idx], states[features[idx]].size());
// 5.3. Add an arc from C to Xmax in BN. // 5.3. Add an arc from C to Xmax in BN.
model.addEdge(className, features[idx]); model.addEdge(className, features[idx]);
// 5.4. Add m = min(lSl,/c) arcs from m distinct features Xj in S with // 5.4. Add m = min(lSl,/c) arcs from m distinct features Xj in S with
@@ -80,11 +79,12 @@ namespace bayesnet {
exit_cond = num == n_edges || candidates.size(0) == 0; exit_cond = num == n_edges || candidates.size(0) == 0;
} }
} }
vector<string> KDB::graph(string title) vector<string> KDB::graph(const string& title) const
{ {
string header{ title };
if (title == "KDB") { if (title == "KDB") {
title += " (k=" + to_string(k) + ", theta=" + to_string(theta) + ")"; header += " (k=" + to_string(k) + ", theta=" + to_string(theta) + ")";
} }
return model.graph(title); return model.graph(header);
} }
} }

src/BayesNet/KDB.h

@@ -1,20 +1,22 @@
#ifndef KDB_H
#define KDB_H
-#include "BaseClassifier.h"
+#include <torch/torch.h>
+#include "Classifier.h"
#include "bayesnetUtils.h"
namespace bayesnet {
using namespace std;
using namespace torch;
-class KDB : public BaseClassifier {
+class KDB : public Classifier {
private:
int k;
float theta;
void add_m_edges(int idx, vector<int>& S, Tensor& weights);
protected:
-void train() override;
+void buildModel(const torch::Tensor& weights) override;
public:
-KDB(int k, float theta = 0.03);
-vector<string> graph(string name = "KDB") override;
+explicit KDB(int k, float theta = 0.03);
+virtual ~KDB() {};
+vector<string> graph(const string& name = "KDB") const override;
};
}
#endif

src/BayesNet/KDBLd.cc Normal file

@@ -0,0 +1,30 @@
#include "KDBLd.h"
namespace bayesnet {
using namespace std;
KDBLd::KDBLd(int k) : KDB(k), Proposal(dataset, features, className) {}
KDBLd& KDBLd::fit(torch::Tensor& X_, torch::Tensor& y_, vector<string>& features_, string className_, map<string, vector<int>>& states_)
{
// This first part should go in a Classifier method called fit_local_discretization or fit_float...
features = features_;
className = className_;
Xf = X_;
y = y_;
// Builds the discretized dataset from tensors Xf (continuous) & y
states = fit_local_discretization(y);
// We have discretized the input data
// 1st we need to fit the model to build the normal KDB structure, KDB::fit initializes the base Bayesian network
KDB::fit(dataset, features, className, states);
states = localDiscretizationProposal(states, model);
return *this;
}
Tensor KDBLd::predict(Tensor& X)
{
auto Xt = prepareX(X);
return KDB::predict(Xt);
}
vector<string> KDBLd::graph(const string& name) const
{
return KDB::graph(name);
}
}
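
A hedged usage sketch of the Ld variants (hypothetical continuous data; states is filled by fit_local_discretization inside fit):

#include "KDBLd.h"
using namespace std;

int main()
{
    // Continuous features as an nxm float tensor, integer labels
    auto Xf = torch::rand({ 3, 100 }, torch::kFloat32);
    auto y = torch::randint(0, 2, { 100 }, torch::kInt32);
    vector<string> features = { "f1", "f2", "f3" };
    map<string, vector<int>> states; // computed during fit
    auto clf = bayesnet::KDBLd(2);
    clf.fit(Xf, y, features, "class", states);
    auto ypred = clf.predict(Xf); // prepareX re-discretizes with the fitted cut points
}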

src/BayesNet/KDBLd.h Normal file

@@ -0,0 +1,19 @@
#ifndef KDBLD_H
#define KDBLD_H
#include "KDB.h"
#include "Proposal.h"
namespace bayesnet {
using namespace std;
class KDBLd : public KDB, public Proposal {
private:
public:
explicit KDBLd(int k);
virtual ~KDBLd() = default;
KDBLd& fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states) override;
vector<string> graph(const string& name = "KDB") const override;
Tensor predict(Tensor& X) override;
static inline string version() { return "0.0.1"; };
};
}
#endif // !KDBLD_H

src/BayesNet/Metrics.hpp

@@ -1,28 +0,0 @@
#ifndef BAYESNET_METRICS_H
#define BAYESNET_METRICS_H
#include <torch/torch.h>
#include <vector>
#include <string>
namespace bayesnet {
using namespace std;
using namespace torch;
class Metrics {
private:
Tensor samples;
vector<string> features;
string className;
int classNumStates;
public:
Metrics() = default;
Metrics(Tensor&, vector<string>&, string&, int);
Metrics(const vector<vector<int>>&, const vector<int>&, const vector<string>&, const string&, const int);
double entropy(Tensor&);
double conditionalEntropy(Tensor&, Tensor&);
double mutualInformation(Tensor&, Tensor&);
vector<float> conditionalEdgeWeights();
Tensor conditionalEdge();
vector<pair<string, string>> doCombinations(const vector<string>&);
vector<pair<int, int>> maximumSpanningTree(vector<string> features, Tensor& weights, int root);
};
}
#endif

src/BayesNet/Mst.cc

@@ -7,9 +7,8 @@
namespace bayesnet {
using namespace std;
-Graph::Graph(int V)
+Graph::Graph(int V) : V(V), parent(vector<int>(V))
{
-parent = vector<int>(V);
for (int i = 0; i < V; i++)
parent[i] = i;
G.clear();
@@ -34,10 +33,10 @@ namespace bayesnet {
}
void Graph::kruskal_algorithm()
{
-int i, uSt, vEd;
// sort the edges ordered on decreasing weight
-sort(G.begin(), G.end(), [](auto& left, auto& right) {return left.first > right.first;});
-for (i = 0; i < G.size(); i++) {
+sort(G.begin(), G.end(), [](const auto& left, const auto& right) {return left.first > right.first;});
+for (int i = 0; i < G.size(); i++) {
+int uSt, vEd;
uSt = find_set(G[i].second.first);
vEd = find_set(G[i].second.second);
if (uSt != vEd) {
@@ -95,7 +94,7 @@ namespace bayesnet {
return result;
}
-MST::MST(vector<string>& features, Tensor& weights, int root) : features(features), weights(weights), root(root) {}
+MST::MST(const vector<string>& features, const Tensor& weights, const int root) : features(features), weights(weights), root(root) {}
vector<pair<int, int>> MST::maximumSpanningTree()
{
auto num_features = features.size();
@@ -103,7 +102,7 @@ namespace bayesnet {
// Make a complete graph
for (int i = 0; i < num_features - 1; ++i) {
-for (int j = i; j < num_features; ++j) {
+for (int j = i + 1; j < num_features; ++j) {
g.addEdge(i, j, weights[i][j].item<float>());
}
}
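
Kruskal's algorithm yields a maximum spanning tree here simply because the edges are sorted on decreasing weight before the union-find pass; a tiny standalone sketch with hypothetical edge weights:

#include <vector>
#include <algorithm>
#include <numeric>
#include <iostream>

int find_set(std::vector<int>& parent, int i)
{
    return parent[i] == i ? i : parent[i] = find_set(parent, parent[i]);
}

int main()
{
    std::vector<int> parent(4);
    std::iota(parent.begin(), parent.end(), 0);
    // (weight, (u, v)) edges
    std::vector<std::pair<float, std::pair<int, int>>> G = {
        { 0.9f, { 0, 1 } }, { 0.1f, { 0, 2 } }, { 0.7f, { 1, 2 } }, { 0.4f, { 2, 3 } }
    };
    std::sort(G.begin(), G.end(), [](const auto& l, const auto& r) { return l.first > r.first; });
    for (const auto& edge : G) {
        int u = find_set(parent, edge.second.first);
        int v = find_set(parent, edge.second.second);
        if (u != v) { // keep the edge and merge the sets
            parent[u] = v;
            std::cout << edge.second.first << " - " << edge.second.second << " (" << edge.first << ")" << std::endl;
        }
    }
}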

src/BayesNet/Mst.h

@@ -10,10 +10,10 @@ namespace bayesnet {
private:
Tensor weights;
vector<string> features;
-int root;
+int root = 0;
public:
MST() = default;
-MST(vector<string>& features, Tensor& weights, int root);
+MST(const vector<string>& features, const Tensor& weights, const int root);
vector<pair<int, int>> maximumSpanningTree();
};
class Graph {
@@ -23,7 +23,7 @@ namespace bayesnet {
vector <pair<float, pair<int, int>>> T; // vector for mst
vector<int> parent;
public:
-Graph(int V);
+explicit Graph(int V);
void addEdge(int u, int v, float wt);
int find_set(int i);
void union_set(int u, int v);

src/BayesNet/Network.cc

@@ -1,16 +1,26 @@
#include <thread>
#include <mutex>
#include "Network.h"
+#include "bayesnetUtils.h"
namespace bayesnet {
-Network::Network() : laplaceSmoothing(1), features(vector<string>()), className(""), classNumStates(0), maxThreads(0.8), fitted(false) {}
-Network::Network(float maxT) : laplaceSmoothing(1), features(vector<string>()), className(""), classNumStates(0), maxThreads(maxT), fitted(false) {}
-Network::Network(float maxT, int smoothing) : laplaceSmoothing(smoothing), features(vector<string>()), className(""), classNumStates(0), maxThreads(maxT), fitted(false) {}
+Network::Network() : features(vector<string>()), className(""), classNumStates(0), fitted(false) {}
+Network::Network(float maxT) : features(vector<string>()), className(""), classNumStates(0), maxThreads(maxT), fitted(false) {}
Network::Network(Network& other) : laplaceSmoothing(other.laplaceSmoothing), features(other.features), className(other.className), classNumStates(other.getClassNumStates()), maxThreads(other.getmaxThreads()), fitted(other.fitted)
{
-for (auto& pair : other.nodes) {
-nodes[pair.first] = make_unique<Node>(*pair.second);
+for (const auto& pair : other.nodes) {
+nodes[pair.first] = std::make_unique<Node>(*pair.second);
}
}
+void Network::initialize()
+{
+features = vector<string>();
+className = "";
+classNumStates = 0;
+fitted = false;
+nodes.clear();
+samples = torch::Tensor();
+}
float Network::getmaxThreads()
{
return maxThreads;
@@ -19,27 +29,28 @@ namespace bayesnet {
{
return samples;
}
-void Network::addNode(string name, int numStates)
+void Network::addNode(const string& name)
{
+if (name == "") {
+throw invalid_argument("Node name cannot be empty");
+}
+if (nodes.find(name) != nodes.end()) {
+return;
+}
if (find(features.begin(), features.end(), name) == features.end()) {
features.push_back(name);
}
-if (nodes.find(name) != nodes.end()) {
-// if node exists update its number of states
-nodes[name]->setNumStates(numStates);
-return;
-}
-nodes[name] = make_unique<Node>(name, numStates);
+nodes[name] = std::make_unique<Node>(name);
}
-vector<string> Network::getFeatures()
+vector<string> Network::getFeatures() const
{
return features;
}
-int Network::getClassNumStates()
+int Network::getClassNumStates() const
{
return classNumStates;
}
-int Network::getStates()
+int Network::getStates() const
{
int result = 0;
for (auto& node : nodes) {
@@ -47,7 +58,7 @@ namespace bayesnet {
}
return result;
}
-string Network::getClassName()
+string Network::getClassName() const
{
return className;
}
@@ -67,7 +78,7 @@ namespace bayesnet {
recStack.erase(nodeId); // remove node from recursion stack before function ends
return false;
}
-void Network::addEdge(const string parent, const string child)
+void Network::addEdge(const string& parent, const string& child)
{
if (nodes.find(parent) == nodes.end()) {
throw invalid_argument("Parent node " + parent + " does not exist");
@@ -87,27 +98,82 @@ namespace bayesnet {
nodes[child]->removeParent(nodes[parent].get());
throw invalid_argument("Adding this edge forms a cycle in the graph.");
}
}
map<string, std::unique_ptr<Node>>& Network::getNodes()
{
return nodes;
}
-void Network::fit(const vector<vector<int>>& input_data, const vector<int>& labels, const vector<string>& featureNames, const string& className)
+void Network::checkFitData(int n_samples, int n_features, int n_samples_y, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states, const torch::Tensor& weights)
{
-features = featureNames;
-this->className = className;
-dataset.clear();
-// Build dataset & tensor of samples
-samples = torch::zeros({ static_cast<int64_t>(input_data[0].size()), static_cast<int64_t>(input_data.size() + 1) }, torch::kInt64);
-for (int i = 0; i < featureNames.size(); ++i) {
-dataset[featureNames[i]] = input_data[i];
-samples.index_put_({ "...", i }, torch::tensor(input_data[i], torch::kInt64));
+if (weights.size(0) != n_samples) {
+throw invalid_argument("Weights (" + to_string(weights.size(0)) + ") must have the same number of elements as samples (" + to_string(n_samples) + ") in Network::fit");
}
-dataset[className] = labels;
-samples.index_put_({ "...", -1 }, torch::tensor(labels, torch::kInt64));
-classNumStates = *max_element(labels.begin(), labels.end()) + 1;
+if (n_samples != n_samples_y) {
+throw invalid_argument("X and y must have the same number of samples in Network::fit (" + to_string(n_samples) + " != " + to_string(n_samples_y) + ")");
+}
+if (n_features != featureNames.size()) {
+throw invalid_argument("X and features must have the same number of features in Network::fit (" + to_string(n_features) + " != " + to_string(featureNames.size()) + ")");
+}
+if (n_features != features.size() - 1) {
+throw invalid_argument("X and local features must have the same number of features in Network::fit (" + to_string(n_features) + " != " + to_string(features.size() - 1) + ")");
+}
+if (find(features.begin(), features.end(), className) == features.end()) {
+throw invalid_argument("className not found in Network::features");
+}
+for (auto& feature : featureNames) {
+if (find(features.begin(), features.end(), feature) == features.end()) {
+throw invalid_argument("Feature " + feature + " not found in Network::features");
+}
+if (states.find(feature) == states.end()) {
+throw invalid_argument("Feature " + feature + " not found in states");
+}
+}
+}
+void Network::setStates(const map<string, vector<int>>& states)
+{
+// Set states to every Node in the network
+for (int i = 0; i < features.size(); ++i) {
+nodes[features[i]]->setNumStates(states.at(features[i]).size());
+}
+classNumStates = nodes[className]->getNumStates();
+}
+// X comes in nxm, where n is the number of features and m the number of samples
+void Network::fit(const torch::Tensor& X, const torch::Tensor& y, const torch::Tensor& weights, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states)
+{
+checkFitData(X.size(1), X.size(0), y.size(0), featureNames, className, states, weights);
+this->className = className;
+Tensor ytmp = torch::transpose(y.view({ y.size(0), 1 }), 0, 1);
+samples = torch::cat({ X , ytmp }, 0);
+for (int i = 0; i < featureNames.size(); ++i) {
+auto row_feature = X.index({ i, "..." });
+}
+completeFit(states, weights);
+}
+void Network::fit(const torch::Tensor& samples, const torch::Tensor& weights, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states)
+{
+checkFitData(samples.size(1), samples.size(0) - 1, samples.size(1), featureNames, className, states, weights);
+this->className = className;
+this->samples = samples;
+completeFit(states, weights);
+}
+// input_data comes in nxm, where n is the number of features and m the number of samples
+void Network::fit(const vector<vector<int>>& input_data, const vector<int>& labels, const vector<float>& weights_, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states)
+{
+const torch::Tensor weights = torch::tensor(weights_, torch::kFloat64);
+checkFitData(input_data[0].size(), input_data.size(), labels.size(), featureNames, className, states, weights);
+this->className = className;
+// Build tensor of samples (n+1)xm (n+1 because of the class row)
+samples = torch::zeros({ static_cast<int>(input_data.size() + 1), static_cast<int>(input_data[0].size()) }, torch::kInt32);
+for (int i = 0; i < featureNames.size(); ++i) {
+samples.index_put_({ i, "..." }, torch::tensor(input_data[i], torch::kInt32));
+}
+samples.index_put_({ -1, "..." }, torch::tensor(labels, torch::kInt32));
+completeFit(states, weights);
+}
+void Network::completeFit(const map<string, vector<int>>& states, const torch::Tensor& weights)
+{
+setStates(states);
+laplaceSmoothing = 1.0 / samples.size(1); // To use in CPT computation
int maxThreadsRunning = static_cast<int>(std::thread::hardware_concurrency() * maxThreads);
if (maxThreadsRunning < 1) {
maxThreadsRunning = 1;
@@ -117,16 +183,10 @@ namespace bayesnet {
condition_variable cv;
int activeThreads = 0;
int nextNodeIndex = 0;
while (nextNodeIndex < nodes.size()) {
unique_lock<mutex> lock(mtx);
cv.wait(lock, [&activeThreads, &maxThreadsRunning]() { return activeThreads < maxThreadsRunning; });
-if (nextNodeIndex >= nodes.size()) {
-break; // No more work remaining
-}
-threads.emplace_back([this, &nextNodeIndex, &mtx, &cv, &activeThreads]() {
+threads.emplace_back([this, &nextNodeIndex, &mtx, &cv, &activeThreads, &weights]() {
while (true) {
unique_lock<mutex> lock(mtx);
if (nextNodeIndex >= nodes.size()) {
@@ -135,8 +195,7 @@ namespace bayesnet {
auto& pair = *std::next(nodes.begin(), nextNodeIndex);
++nextNodeIndex;
lock.unlock();
-pair.second->computeCPT(dataset, laplaceSmoothing);
+pair.second->computeCPT(samples, features, laplaceSmoothing, weights);
lock.lock();
nodes[pair.first] = std::move(pair.second);
lock.unlock();
@@ -145,7 +204,6 @@ namespace bayesnet {
--activeThreads;
cv.notify_one();
});
++activeThreads;
}
for (auto& thread : threads) {
@@ -153,7 +211,39 @@ namespace bayesnet {
}
fitted = true;
}
+torch::Tensor Network::predict_tensor(const torch::Tensor& samples, const bool proba)
+{
+if (!fitted) {
+throw logic_error("You must call fit() before calling predict()");
+}
+torch::Tensor result;
+result = torch::zeros({ samples.size(1), classNumStates }, torch::kFloat64);
+for (int i = 0; i < samples.size(1); ++i) {
+const Tensor sample = samples.index({ "...", i });
+auto psample = predict_sample(sample);
+auto temp = torch::tensor(psample, torch::kFloat64);
+result.index_put_({ i, "..." }, temp);
+}
+if (proba)
+return result;
+else
+return result.argmax(1);
+}
+// Return mxn tensor of probabilities
+Tensor Network::predict_proba(const Tensor& samples)
+{
+return predict_tensor(samples, true);
+}
+// Return mx1 tensor of predictions
+Tensor Network::predict(const Tensor& samples)
+{
+return predict_tensor(samples, false);
+}
+// Return mx1 vector of predictions
+// tsamples is nxm vector of samples
vector<int> Network::predict(const vector<vector<int>>& tsamples)
{
if (!fitted) {
@@ -174,6 +264,7 @@ namespace bayesnet {
}
return predictions;
}
+// Return mxn vector of probabilities
vector<vector<double>> Network::predict_proba(const vector<vector<int>>& tsamples)
{
if (!fitted) {
@@ -201,19 +292,33 @@ namespace bayesnet {
}
return (double)correct / y_pred.size();
}
+// Return 1xn vector of probabilities
vector<double> Network::predict_sample(const vector<int>& sample)
{
// Ensure the sample size is equal to the number of features
-if (sample.size() != features.size()) {
+if (sample.size() != features.size() - 1) {
throw invalid_argument("Sample size (" + to_string(sample.size()) +
-") does not match the number of features (" + to_string(features.size()) + ")");
+") does not match the number of features (" + to_string(features.size() - 1) + ")");
}
map<string, int> evidence;
for (int i = 0; i < sample.size(); ++i) {
evidence[features[i]] = sample[i];
}
return exactInference(evidence);
}
+// Return 1xn vector of probabilities
+vector<double> Network::predict_sample(const Tensor& sample)
+{
+// Ensure the sample size is equal to the number of features
+if (sample.size(0) != features.size() - 1) {
+throw invalid_argument("Sample size (" + to_string(sample.size(0)) +
+") does not match the number of features (" + to_string(features.size() - 1) + ")");
+}
+map<string, int> evidence;
+for (int i = 0; i < sample.size(0); ++i) {
+evidence[features[i]] = sample[i].item<int>();
+}
+return exactInference(evidence);
+}
double Network::computeFactor(map<string, int>& completeEvidence)
{
@@ -240,15 +345,12 @@ namespace bayesnet {
for (auto& thread : threads) {
thread.join();
}
// Normalize result
double sum = accumulate(result.begin(), result.end(), 0.0);
-for (double& value : result) {
-value /= sum;
-}
+transform(result.begin(), result.end(), result.begin(), [sum](const double& value) { return value / sum; });
return result;
}
-vector<string> Network::show()
+vector<string> Network::show() const
{
vector<string> result;
// Draw the network
@@ -261,7 +363,7 @@ namespace bayesnet {
}
return result;
}
-vector<string> Network::graph(string title)
+vector<string> Network::graph(const string& title) const
{
auto output = vector<string>();
auto prefix = "digraph BayesNet {\nlabel=<BayesNet ";
@@ -275,7 +377,7 @@ namespace bayesnet {
output.push_back("}\n");
return output;
}
-vector<pair<string, string>> Network::getEdges()
+vector<pair<string, string>> Network::getEdges() const
{
auto edges = vector<pair<string, string>>();
for (const auto& node : nodes) {
@@ -287,5 +389,53 @@ namespace bayesnet {
}
return edges;
}
+int Network::getNumEdges() const
+{
+return getEdges().size();
+}
+vector<string> Network::topological_sort()
+{
+/* Check that all the fathers of every node are placed before the node */
+auto result = features;
+result.erase(remove(result.begin(), result.end(), className), result.end());
+bool ending{ false };
+int idx = 0;
+while (!ending) {
+ending = true;
+for (auto feature : features) {
+auto fathers = nodes[feature]->getParents();
+for (const auto& father : fathers) {
+auto fatherName = father->getName();
+if (fatherName == className) {
+continue;
+}
+// Check if the father is placed before the actual feature
+auto it = find(result.begin(), result.end(), fatherName);
+if (it != result.end()) {
+auto it2 = find(result.begin(), result.end(), feature);
+if (it2 != result.end()) {
+if (distance(it, it2) < 0) {
+// if it is not, move it before the feature
+result.erase(remove(result.begin(), result.end(), fatherName), result.end());
+result.insert(it2, fatherName);
+ending = false;
+}
+} else {
+throw logic_error("Error in topological sort because node " + feature + " is not in result");
+}
+} else {
+throw logic_error("Error in topological sort because father node " + fatherName + " is not in result");
+}
+}
+}
+}
+return result;
+}
+void Network::dump_cpt() const
+{
+for (auto& node : nodes) {
+cout << "* " << node.first << ": (" << node.second->getNumStates() << ") : " << node.second->getCPT().sizes() << endl;
+cout << node.second->getCPT() << endl;
+}
+}
}
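
A minimal sketch of the new weighted fit entry point (a hypothetical two-node network; data and names are made up):

#include "Network.h"
using namespace std;

int main()
{
    auto net = bayesnet::Network();
    net.addNode("A");
    net.addNode("class");
    net.addEdge("class", "A");
    // samples is (n+1)xm: one feature row plus the class row, four samples
    auto samples = torch::tensor({ { 0, 1, 1, 0 }, { 0, 1, 1, 1 } }, torch::kInt32);
    auto weights = torch::full({ 4 }, 0.25, torch::kFloat64);
    vector<string> features = { "A" };
    map<string, vector<int>> states = { { "A", { 0, 1 } }, { "class", { 0, 1 } } };
    net.fit(samples, weights, features, "class", states);
    auto X = samples.index({ torch::indexing::Slice(0, 1), "..." }); // feature rows only
    auto proba = net.predict_proba(X); // 4x2 tensor of class probabilities
}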

src/BayesNet/Network.h

@@ -7,46 +7,51 @@
namespace bayesnet {
class Network {
private:
-map<string, std::unique_ptr<Node>> nodes;
-map<string, vector<int>> dataset;
+map<string, unique_ptr<Node>> nodes;
bool fitted;
-float maxThreads;
+float maxThreads = 0.95;
int classNumStates;
-vector<string> features;
+vector<string> features; // Including classname
string className;
-int laplaceSmoothing;
-torch::Tensor samples;
+double laplaceSmoothing;
+torch::Tensor samples; // nxm tensor used to fit the model
bool isCyclic(const std::string&, std::unordered_set<std::string>&, std::unordered_set<std::string>&);
vector<double> predict_sample(const vector<int>&);
+vector<double> predict_sample(const torch::Tensor&);
vector<double> exactInference(map<string, int>&);
double computeFactor(map<string, int>&);
-double mutual_info(torch::Tensor&, torch::Tensor&);
-double entropy(torch::Tensor&);
-double conditionalEntropy(torch::Tensor&, torch::Tensor&);
-double mutualInformation(torch::Tensor&, torch::Tensor&);
+void completeFit(const map<string, vector<int>>& states, const torch::Tensor& weights);
+void checkFitData(int n_features, int n_samples, int n_samples_y, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states, const torch::Tensor& weights);
+void setStates(const map<string, vector<int>>&);
public:
Network();
-Network(float, int);
-Network(float);
-Network(Network&);
+explicit Network(float);
+explicit Network(Network&);
torch::Tensor& getSamples();
float getmaxThreads();
-void addNode(string, int);
-void addEdge(const string, const string);
+void addNode(const string&);
+void addEdge(const string&, const string&);
map<string, std::unique_ptr<Node>>& getNodes();
-vector<string> getFeatures();
-int getStates();
-vector<pair<string, string>> getEdges();
-int getClassNumStates();
-string getClassName();
-void fit(const vector<vector<int>>&, const vector<int>&, const vector<string>&, const string&);
-vector<int> predict(const vector<vector<int>>&);
-//Computes the conditional edge weight of variable index u and v conditioned on class_node
-torch::Tensor conditionalEdgeWeight();
-vector<vector<double>> predict_proba(const vector<vector<int>>&);
+vector<string> getFeatures() const;
+int getStates() const;
+vector<pair<string, string>> getEdges() const;
+int getNumEdges() const;
+int getClassNumStates() const;
+string getClassName() const;
+void fit(const vector<vector<int>>& input_data, const vector<int>& labels, const vector<float>& weights, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states);
+void fit(const torch::Tensor& X, const torch::Tensor& y, const torch::Tensor& weights, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states);
+void fit(const torch::Tensor& samples, const torch::Tensor& weights, const vector<string>& featureNames, const string& className, const map<string, vector<int>>& states);
+vector<int> predict(const vector<vector<int>>&); // Return mx1 vector of predictions
+torch::Tensor predict(const torch::Tensor&); // Return mx1 tensor of predictions
+torch::Tensor predict_tensor(const torch::Tensor& samples, const bool proba);
+vector<vector<double>> predict_proba(const vector<vector<int>>&); // Return mxn vector of probabilities
+torch::Tensor predict_proba(const torch::Tensor&); // Return mxn tensor of probabilities
double score(const vector<vector<int>>&, const vector<int>&);
-vector<string> show();
-vector<string> graph(string title); // Returns a vector of strings representing the graph in graphviz format
+vector<string> topological_sort();
+vector<string> show() const;
+vector<string> graph(const string& title) const; // Returns a vector of strings representing the graph in graphviz format
+void initialize();
+void dump_cpt() const;
inline string version() { return "0.1.0"; }
};
}

src/BayesNet/Node.cc

@@ -2,10 +2,18 @@
namespace bayesnet {
-Node::Node(const std::string& name, int numStates)
-: name(name), numStates(numStates), cpTable(torch::Tensor()), parents(vector<Node*>()), children(vector<Node*>())
+Node::Node(const std::string& name)
+: name(name), numStates(0), cpTable(torch::Tensor()), parents(vector<Node*>()), children(vector<Node*>())
{
}
+void Node::clear()
+{
+parents.clear();
+children.clear();
+cpTable = torch::Tensor();
+dimensions.clear();
+numStates = 0;
+}
string Node::getName() const
{
return name;
@@ -76,25 +84,34 @@ namespace bayesnet {
}
return result;
}
-void Node::computeCPT(map<string, vector<int>>& dataset, const int laplaceSmoothing)
+void Node::computeCPT(const torch::Tensor& dataset, const vector<string>& features, const double laplaceSmoothing, const torch::Tensor& weights)
{
+dimensions.clear();
// Get dimensions of the CPT
dimensions.push_back(numStates);
-for (auto father : getParents()) {
-dimensions.push_back(father->getNumStates());
-}
-auto length = dimensions.size();
+transform(parents.begin(), parents.end(), back_inserter(dimensions), [](const auto& parent) { return parent->getNumStates(); });
// Create a tensor of zeros with the dimensions of the CPT
cpTable = torch::zeros(dimensions, torch::kFloat) + laplaceSmoothing;
// Fill table with counts
-for (int n_sample = 0; n_sample < dataset[name].size(); ++n_sample) {
+auto pos = find(features.begin(), features.end(), name);
+if (pos == features.end()) {
+throw logic_error("Feature " + name + " not found in dataset");
+}
+int name_index = pos - features.begin();
+for (int n_sample = 0; n_sample < dataset.size(1); ++n_sample) {
torch::List<c10::optional<torch::Tensor>> coordinates;
-coordinates.push_back(torch::tensor(dataset[name][n_sample]));
-for (auto father : getParents()) {
-coordinates.push_back(torch::tensor(dataset[father->getName()][n_sample]));
+coordinates.push_back(dataset.index({ name_index, n_sample }));
+for (auto parent : parents) {
+pos = find(features.begin(), features.end(), parent->getName());
+if (pos == features.end()) {
+throw logic_error("Feature parent " + parent->getName() + " not found in dataset");
+}
+int parent_index = pos - features.begin();
+coordinates.push_back(dataset.index({ parent_index, n_sample }));
}
// Increment the count of the corresponding coordinate
-cpTable.index_put_({ coordinates }, cpTable.index({ coordinates }) + 1);
+cpTable.index_put_({ coordinates }, cpTable.index({ coordinates }) + weights.index({ n_sample }).item<double>());
}
// Normalize the counts
cpTable = cpTable / cpTable.sum(0);
@@ -104,19 +121,15 @@ namespace bayesnet {
torch::List<c10::optional<torch::Tensor>> coordinates;
// following predetermined order of indices in the cpTable (see Node.h)
coordinates.push_back(torch::tensor(evidence[name]));
-for (auto parent : getParents()) {
-coordinates.push_back(torch::tensor(evidence[parent->getName()]));
-}
+transform(parents.begin(), parents.end(), back_inserter(coordinates), [&evidence](const auto& parent) { return torch::tensor(evidence[parent->getName()]); });
return cpTable.index({ coordinates }).item<float>();
}
-vector<string> Node::graph(string className)
+vector<string> Node::graph(const string& className)
{
auto output = vector<string>();
auto suffix = name == className ? ", fontcolor=red, fillcolor=lightblue, style=filled " : "";
output.push_back(name + " [shape=circle" + suffix + "] \n");
-for (auto& child : children) {
-output.push_back(name + " -> " + child->getName());
-}
+transform(children.begin(), children.end(), back_inserter(output), [this](const auto& child) { return name + " -> " + child->getName(); });
return output;
}
}
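
computeCPT now adds the sample weight instead of a unit count on top of the Laplace term that Network::completeFit sets to 1/m; a standalone illustration with hypothetical values:

#include <torch/torch.h>
#include <iostream>

int main()
{
    double laplaceSmoothing = 1.0 / 4; // 1/m, as in Network::completeFit
    auto cpTable = torch::zeros({ 2, 2 }, torch::kFloat) + laplaceSmoothing;
    // node value, parent value and weight of each of the four samples
    int node[] = { 0, 1, 1, 0 };
    int parent[] = { 0, 0, 1, 1 };
    auto weights = torch::full({ 4 }, 0.25, torch::kFloat64);
    for (int n_sample = 0; n_sample < 4; ++n_sample) {
        cpTable[node[n_sample]][parent[n_sample]] += weights[n_sample].item<double>();
    }
    cpTable = cpTable / cpTable.sum(0); // each column now holds P(node | parent)
    std::cout << cpTable << std::endl;
}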

src/BayesNet/Node.h

@@ -16,7 +16,8 @@ namespace bayesnet {
vector<int64_t> dimensions; // dimensions of the cpTable
public:
vector<pair<string, string>> combinations(const vector<string>&);
-Node(const std::string&, int);
+explicit Node(const string&);
+void clear();
void addParent(Node*);
void addChild(Node*);
void removeParent(Node*);
@@ -25,11 +26,11 @@ namespace bayesnet {
vector<Node*>& getParents();
vector<Node*>& getChildren();
torch::Tensor& getCPT();
-void computeCPT(map<string, vector<int>>&, const int);
+void computeCPT(const torch::Tensor& dataset, const vector<string>& features, const double laplaceSmoothing, const torch::Tensor& weights);
int getNumStates() const;
void setNumStates(int);
unsigned minFill();
-vector<string> graph(string className); // Returns a vector of strings representing the graph in graphviz format
+vector<string> graph(const string& className); // Returns a vector of strings representing the graph in graphviz format
float getFactorValue(map<string, int>&);
};
}

src/BayesNet/Proposal.cc Normal file

@@ -0,0 +1,110 @@
#include "Proposal.h"
#include "ArffFiles.h"
namespace bayesnet {
Proposal::Proposal(torch::Tensor& dataset_, vector<string>& features_, string& className_) : pDataset(dataset_), pFeatures(features_), pClassName(className_) {}
Proposal::~Proposal()
{
for (auto& [key, value] : discretizers) {
delete value;
}
}
map<string, vector<int>> Proposal::localDiscretizationProposal(const map<string, vector<int>>& oldStates, Network& model)
{
// The order of the local discretization is important: a plain 0, 1, 2... ordering is not good enough,
// although features are re-discretized after the local discretization of every feature
auto order = model.topological_sort();
auto& nodes = model.getNodes();
map<string, vector<int>> states = oldStates;
vector<int> indicesToReDiscretize;
bool upgrade = false; // Flag to check if we need to upgrade the model
for (auto feature : order) {
auto nodeParents = nodes[feature]->getParents();
if (nodeParents.size() < 2) continue; // Only has class as parent
upgrade = true;
int index = find(pFeatures.begin(), pFeatures.end(), feature) - pFeatures.begin();
indicesToReDiscretize.push_back(index); // We need to re-discretize this feature
vector<string> parents;
transform(nodeParents.begin(), nodeParents.end(), back_inserter(parents), [](const auto& p) { return p->getName(); });
// Remove class as parent as it will be added later
parents.erase(remove(parents.begin(), parents.end(), pClassName), parents.end());
// Get the indices of the parents
vector<int> indices;
indices.push_back(-1); // Add class index
transform(parents.begin(), parents.end(), back_inserter(indices), [&](const auto& p) {return find(pFeatures.begin(), pFeatures.end(), p) - pFeatures.begin(); });
// Now we fit the discretizer of the feature, conditioned on its parents and the class i.e. discretizer.fit(X[index], X[indices] + y)
vector<string> yJoinParents(Xf.size(1));
for (auto idx : indices) {
for (int i = 0; i < Xf.size(1); ++i) {
yJoinParents[i] += to_string(pDataset.index({ idx, i }).item<int>());
}
}
auto arff = ArffFiles();
auto yxv = arff.factorize(yJoinParents);
auto xvf_ptr = Xf.index({ index }).data_ptr<float>();
auto xvf = vector<mdlp::precision_t>(xvf_ptr, xvf_ptr + Xf.size(1));
discretizers[feature]->fit(xvf, yxv);
//
//
//
// auto tmp = discretizers[feature]->transform(xvf);
// Xv[index] = tmp;
// auto xStates = vector<int>(discretizers[pFeatures[index]]->getCutPoints().size() + 1);
// iota(xStates.begin(), xStates.end(), 0);
// //Update new states of the feature/node
// states[feature] = xStates;
}
if (upgrade) {
// Discretize again X (only the affected indices) with the new fitted discretizers
for (auto index : indicesToReDiscretize) {
auto Xt_ptr = Xf.index({ index }).data_ptr<float>();
auto Xt = vector<float>(Xt_ptr, Xt_ptr + Xf.size(1));
pDataset.index_put_({ index, "..." }, torch::tensor(discretizers[pFeatures[index]]->transform(Xt)));
auto xStates = vector<int>(discretizers[pFeatures[index]]->getCutPoints().size() + 1);
iota(xStates.begin(), xStates.end(), 0);
//Update new states of the feature/node
states[pFeatures[index]] = xStates;
}
const torch::Tensor weights = torch::full({ pDataset.size(1) }, 1.0 / pDataset.size(1), torch::kDouble);
model.fit(pDataset, weights, pFeatures, pClassName, states);
}
return states;
}
map<string, vector<int>> Proposal::fit_local_discretization(const torch::Tensor& y)
{
// Discretize the continuous input data and build pDataset (Classifier::dataset)
int m = Xf.size(1);
int n = Xf.size(0);
map<string, vector<int>> states;
pDataset = torch::zeros({ n + 1, m }, kInt32);
auto yv = vector<int>(y.data_ptr<int>(), y.data_ptr<int>() + y.size(0));
// discretize input data by feature(row)
for (auto i = 0; i < pFeatures.size(); ++i) {
auto* discretizer = new mdlp::CPPFImdlp();
auto Xt_ptr = Xf.index({ i }).data_ptr<float>();
auto Xt = vector<float>(Xt_ptr, Xt_ptr + Xf.size(1));
discretizer->fit(Xt, yv);
pDataset.index_put_({ i, "..." }, torch::tensor(discretizer->transform(Xt)));
auto xStates = vector<int>(discretizer->getCutPoints().size() + 1);
iota(xStates.begin(), xStates.end(), 0);
states[pFeatures[i]] = xStates;
discretizers[pFeatures[i]] = discretizer;
}
int n_classes = torch::max(y).item<int>() + 1;
auto yStates = vector<int>(n_classes);
iota(yStates.begin(), yStates.end(), 0);
states[pClassName] = yStates;
pDataset.index_put_({ n, "..." }, y);
return states;
}
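
Each per-feature discretizer above is an independent mdlp::CPPFImdlp. A minimal usage sketch of one of them, assuming the typedefs from the mdlp library (samples_t a vector of floats, labels_t a vector of ints) and illustrative data:

    #include "CPPFImdlp.h"
    #include <iostream>
    int main()
    {
        mdlp::samples_t X = { 4.7f, 4.9f, 5.0f, 5.1f, 6.3f, 6.5f, 6.7f, 7.0f };
        mdlp::labels_t y = { 0, 0, 0, 0, 1, 1, 1, 1 };
        auto discretizer = mdlp::CPPFImdlp();
        discretizer.fit(X, y);                  // computes the MDLP cut points
        auto Xd = discretizer.transform(X);     // one integer bin per sample
        // a feature has getCutPoints().size() + 1 states, as in the loop above
        std::cout << discretizer.getCutPoints().size() + 1 << " states" << std::endl;
        return 0;
    }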
torch::Tensor Proposal::prepareX(torch::Tensor& X)
{
auto Xtd = torch::zeros_like(X, torch::kInt32);
for (int i = 0; i < X.size(0); ++i) {
auto Xt = vector<float>(X[i].data_ptr<float>(), X[i].data_ptr<float>() + X.size(1));
auto Xd = discretizers[pFeatures[i]]->transform(Xt);
Xtd.index_put_({ i }, torch::tensor(Xd, torch::kInt32));
}
return Xtd;
}
}

src/BayesNet/Proposal.h Normal file (+29 lines)

@@ -0,0 +1,29 @@
#ifndef PROPOSAL_H
#define PROPOSAL_H
#include <string>
#include <map>
#include <torch/torch.h>
#include "Network.h"
#include "CPPFImdlp.h"
#include "Classifier.h"
namespace bayesnet {
class Proposal {
public:
Proposal(torch::Tensor& pDataset, vector<string>& features_, string& className_);
virtual ~Proposal();
protected:
torch::Tensor prepareX(torch::Tensor& X);
map<string, vector<int>> localDiscretizationProposal(const map<string, vector<int>>& states, Network& model);
map<string, vector<int>> fit_local_discretization(const torch::Tensor& y);
torch::Tensor Xf; // X continuous nxm tensor
torch::Tensor y; // y discrete nx1 tensor
map<string, mdlp::CPPFImdlp*> discretizers;
private:
torch::Tensor& pDataset; // (n+1)xm tensor
vector<string>& pFeatures;
string& pClassName;
};
}
#endif

src/BayesNet/SPODE.cc (modified)

@@ -2,9 +2,9 @@
 namespace bayesnet {
-    SPODE::SPODE(int root) : BaseClassifier(Network()), root(root) {}
-    void SPODE::train()
+    SPODE::SPODE(int root) : Classifier(Network()), root(root) {}
+    void SPODE::buildModel(const torch::Tensor& weights)
     {
         // 0. Add all nodes to the model
         addNodes();
@@ -17,7 +17,7 @@ namespace bayesnet {
         }
     }
-    vector<string> SPODE::graph(string name )
+    vector<string> SPODE::graph(const string& name) const
     {
         return model.graph(name);
     }

src/BayesNet/SPODE.h (modified)

@@ -1,15 +1,17 @@
#ifndef SPODE_H #ifndef SPODE_H
#define SPODE_H #define SPODE_H
#include "BaseClassifier.h" #include "Classifier.h"
namespace bayesnet { namespace bayesnet {
class SPODE : public BaseClassifier { class SPODE : public Classifier {
private: private:
int root; int root;
protected: protected:
void train() override; void buildModel(const torch::Tensor& weights) override;
public: public:
SPODE(int root); explicit SPODE(int root);
vector<string> graph(string name = "SPODE") override; virtual ~SPODE() {};
vector<string> graph(const string& name = "SPODE") const override;
}; };
} }
#endif #endif

src/BayesNet/SPODELd.cc Normal file (+46 lines)

@@ -0,0 +1,46 @@
#include "SPODELd.h"
namespace bayesnet {
using namespace std;
SPODELd::SPODELd(int root) : SPODE(root), Proposal(dataset, features, className) {}
SPODELd& SPODELd::fit(torch::Tensor& X_, torch::Tensor& y_, vector<string>& features_, string className_, map<string, vector<int>>& states_)
{
// This first part should go in a Classifier method called fit_local_discretization or fit_float...
features = features_;
className = className_;
Xf = X_;
y = y_;
// Fills vectors Xv & yv with the data from tensors X_ (discretized) & y
states = fit_local_discretization(y);
// We have discretized the input data
// 1st we need to fit the model to build the normal SPODE structure, SPODE::fit initializes the base Bayesian network
SPODE::fit(dataset, features, className, states);
states = localDiscretizationProposal(states, model);
return *this;
}
SPODELd& SPODELd::fit(torch::Tensor& dataset, vector<string>& features_, string className_, map<string, vector<int>>& states_)
{
Xf = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." }).clone();
y = dataset.index({ -1, "..." }).clone();
// This first part should go in a Classifier method called fit_local_discretization or fit_float...
features = features_;
className = className_;
// Fills vectors Xv & yv with the data from tensors X_ (discretized) & y
states = fit_local_discretization(y);
// We have discretized the input data
// 1st we need to fit the model to build the normal SPODE structure, SPODE::fit initializes the base Bayesian network
SPODE::fit(dataset, features, className, states);
states = localDiscretizationProposal(states, model);
return *this;
}
Tensor SPODELd::predict(Tensor& X)
{
auto Xt = prepareX(X);
return SPODE::predict(Xt);
}
vector<string> SPODELd::graph(const string& name) const
{
return SPODE::graph(name);
}
}

src/BayesNet/SPODELd.h Normal file (+19 lines)

@@ -0,0 +1,19 @@
#ifndef SPODELD_H
#define SPODELD_H
#include "SPODE.h"
#include "Proposal.h"
namespace bayesnet {
using namespace std;
class SPODELd : public SPODE, public Proposal {
public:
explicit SPODELd(int root);
virtual ~SPODELd() = default;
SPODELd& fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states) override;
SPODELd& fit(torch::Tensor& dataset, vector<string>& features, string className, map<string, vector<int>>& states) override;
vector<string> graph(const string& name = "SPODE") const override;
Tensor predict(Tensor& X) override;
static inline string version() { return "0.0.1"; };
};
}
#endif // !SPODELD_H
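
A hypothetical call site for the classifier declared above, where Xf holds continuous features as an n_features x n_samples float tensor and Xt is a test slice with the same layout:

    auto clf = bayesnet::SPODELd(0);               // feature 0 acts as the super-parent
    clf.fit(Xf, y, features, className, states);   // discretizes, fits SPODE, refines locally
    auto y_pred = clf.predict(Xt);                 // prepareX discretizes Xt with the fitted cut points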

src/BayesNet/TAN.cc (modified)

@@ -1,30 +1,29 @@
 #include "TAN.h"
 namespace bayesnet {
-    using namespace std;
     using namespace torch;
-    TAN::TAN() : BaseClassifier(Network()) {}
-    void TAN::train()
+    TAN::TAN() : Classifier(Network()) {}
+    void TAN::buildModel(const torch::Tensor& weights)
     {
         // 0. Add all nodes to the model
         addNodes();
         // 1. Compute mutual information between each feature and the class and set the root node
         // as the highest mutual information with the class
         auto mi = vector <pair<int, float >>();
-        Tensor class_dataset = dataset.index({ "...", -1 });
+        Tensor class_dataset = dataset.index({ -1, "..." });
         for (int i = 0; i < static_cast<int>(features.size()); ++i) {
-            Tensor feature_dataset = dataset.index({ "...", i });
-            auto mi_value = metrics.mutualInformation(class_dataset, feature_dataset);
+            Tensor feature_dataset = dataset.index({ i, "..." });
+            auto mi_value = metrics.mutualInformation(class_dataset, feature_dataset, weights);
             mi.push_back({ i, mi_value });
         }
-        sort(mi.begin(), mi.end(), [](auto& left, auto& right) {return left.second < right.second;});
+        sort(mi.begin(), mi.end(), [](const auto& left, const auto& right) {return left.second < right.second;});
         auto root = mi[mi.size() - 1].first;
         // 2. Compute mutual information between each feature and the class
-        auto weights = metrics.conditionalEdge();
+        auto weights_matrix = metrics.conditionalEdge(weights);
         // 3. Compute the maximum spanning tree
-        auto mst = metrics.maximumSpanningTree(features, weights, root);
+        auto mst = metrics.maximumSpanningTree(features, weights_matrix, root);
         // 4. Add edges from the maximum spanning tree to the model
         for (auto i = 0; i < mst.size(); ++i) {
             auto [from, to] = mst[i];
@@ -35,7 +34,7 @@ namespace bayesnet {
             model.addEdge(className, feature);
         }
     }
-    vector<string> TAN::graph(string title)
+    vector<string> TAN::graph(const string& title) const
     {
         return model.graph(title);
     }

src/BayesNet/TAN.h (modified)

@@ -1,16 +1,17 @@
 #ifndef TAN_H
 #define TAN_H
-#include "BaseClassifier.h"
+#include "Classifier.h"
 namespace bayesnet {
     using namespace std;
     using namespace torch;
-    class TAN : public BaseClassifier {
+    class TAN : public Classifier {
     private:
     protected:
-        void train() override;
+        void buildModel(const torch::Tensor& weights) override;
     public:
         TAN();
-        vector<string> graph(string name = "TAN") override;
+        virtual ~TAN() {};
+        vector<string> graph(const string& name = "TAN") const override;
     };
 }
 #endif

src/BayesNet/TANLd.cc Normal file (+31 lines)

@@ -0,0 +1,31 @@
#include "TANLd.h"
namespace bayesnet {
using namespace std;
TANLd::TANLd() : TAN(), Proposal(dataset, features, className) {}
TANLd& TANLd::fit(torch::Tensor& X_, torch::Tensor& y_, vector<string>& features_, string className_, map<string, vector<int>>& states_)
{
// This first part should go in a Classifier method called fit_local_discretization or fit_float...
features = features_;
className = className_;
Xf = X_;
y = y_;
// Fills vectors Xv & yv with the data from tensors X_ (discretized) & y
states = fit_local_discretization(y);
// We have discretized the input data
// 1st we need to fit the model to build the normal TAN structure, TAN::fit initializes the base Bayesian network
TAN::fit(dataset, features, className, states);
states = localDiscretizationProposal(states, model);
return *this;
}
Tensor TANLd::predict(Tensor& X)
{
auto Xt = prepareX(X);
return TAN::predict(Xt);
}
vector<string> TANLd::graph(const string& name) const
{
return TAN::graph(name);
}
}

src/BayesNet/TANLd.h Normal file (+19 lines)

@@ -0,0 +1,19 @@
#ifndef TANLD_H
#define TANLD_H
#include "TAN.h"
#include "Proposal.h"
namespace bayesnet {
using namespace std;
class TANLd : public TAN, public Proposal {
private:
public:
TANLd();
virtual ~TANLd() = default;
TANLd& fit(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states) override;
vector<string> graph(const string& name = "TAN") const override;
Tensor predict(Tensor& X) override;
static inline string version() { return "0.0.1"; };
};
}
#endif // !TANLD_H

src/BayesNet/bayesnetUtils.cc (modified)

@@ -3,7 +3,8 @@
 namespace bayesnet {
     using namespace std;
     using namespace torch;
-    vector<int> argsort(vector<float>& nums)
+    // Return the indices in descending order
+    vector<int> argsort(vector<double>& nums)
     {
         int n = nums.size();
         vector<int> indices(n);
@@ -11,21 +12,16 @@ namespace bayesnet {
         sort(indices.begin(), indices.end(), [&nums](int i, int j) {return nums[i] > nums[j];});
         return indices;
     }
-    vector<vector<int>> tensorToVector(const Tensor& tensor)
+    vector<vector<int>> tensorToVector(Tensor& tensor)
     {
         // convert mxn tensor to nxm vector
         vector<vector<int>> result;
-        auto tensor_accessor = tensor.accessor<int, 2>();
-        // Iterate over columns and rows of the tensor
-        for (int j = 0; j < tensor.size(1); ++j) {
-            vector<int> column;
-            for (int i = 0; i < tensor.size(0); ++i) {
-                column.push_back(tensor_accessor[i][j]);
-            }
-            result.push_back(column);
-        }
+        // Iterate over cols
+        for (int i = 0; i < tensor.size(1); ++i) {
+            auto col_tensor = tensor.index({ "...", i });
+            auto col = vector<int>(col_tensor.data_ptr<int>(), col_tensor.data_ptr<int>() + tensor.size(0));
+            result.push_back(col);
+        }
         return result;
     }
 }

src/BayesNet/bayesnetUtils.h (modified)

@@ -5,7 +5,7 @@
 namespace bayesnet {
     using namespace std;
     using namespace torch;
-    vector<int> argsort(vector<float>& nums);
-    vector<vector<int>> tensorToVector(const Tensor& tensor);
+    vector<int> argsort(vector<double>& nums);
+    vector<vector<int>> tensorToVector(Tensor& tensor);
 }
 #endif //BAYESNET_UTILS_H

src/Platform/BestResult.h Normal file (+10 lines)

@@ -0,0 +1,10 @@
#ifndef BESTRESULT_H
#define BESTRESULT_H
#include <string>
class BestResult {
public:
static std::string title() { return "STree_default (linear-ovo)"; }
static double score() { return 22.109799; }
static std::string scoreName() { return "accuracy"; }
};
#endif

src/Platform/CMakeLists.txt (modified)

@@ -1,4 +1,12 @@
 include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
 include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
-add_executable(main Experiment.cc ArffFiles.cc CPPFImdlp.cpp Metrics.cpp platformUtils.cc)
-target_link_libraries(main BayesNet "${TORCH_LIBRARIES}")
+include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
+include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
+include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include)
+include_directories(${BayesNet_SOURCE_DIR}/lib/json/include)
+add_executable(main main.cc Folding.cc platformUtils.cc Experiment.cc Datasets.cc Models.cc Report.cc)
+add_executable(manage manage.cc Results.cc Report.cc)
+add_executable(list list.cc platformUtils Datasets.cc)
+target_link_libraries(main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}")
+target_link_libraries(manage "${TORCH_LIBRARIES}")
+target_link_libraries(list ArffFiles mdlp "${TORCH_LIBRARIES}")

src/Platform/CPPFImdlp.cpp (deleted)

@@ -1,221 +0,0 @@
#include <numeric>
#include <algorithm>
#include <set>
#include <cmath>
#include "CPPFImdlp.h"
#include "Metrics.h"
namespace mdlp {
CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed) : min_length(min_length_),
max_depth(max_depth_),
proposed_cuts(proposed)
{
}
CPPFImdlp::CPPFImdlp() = default;
CPPFImdlp::~CPPFImdlp() = default;
size_t CPPFImdlp::compute_max_num_cut_points() const
{
// Set the actual maximum number of cut points as a number or as a percentage of the number of samples
if (proposed_cuts == 0) {
return numeric_limits<size_t>::max();
}
if (proposed_cuts < 0 || proposed_cuts > static_cast<float>(X.size())) {
throw invalid_argument("wrong proposed num_cuts value");
}
if (proposed_cuts < 1)
return static_cast<size_t>(round(static_cast<float>(X.size()) * proposed_cuts));
return static_cast<size_t>(proposed_cuts);
}
void CPPFImdlp::fit(samples_t& X_, labels_t& y_)
{
X = X_;
y = y_;
num_cut_points = compute_max_num_cut_points();
depth = 0;
discretizedData.clear();
cutPoints.clear();
if (X.size() != y.size()) {
throw invalid_argument("X and y must have the same size");
}
if (X.empty() || y.empty()) {
throw invalid_argument("X and y must have at least one element");
}
if (min_length < 3) {
throw invalid_argument("min_length must be greater than 2");
}
if (max_depth < 1) {
throw invalid_argument("max_depth must be greater than 0");
}
indices = sortIndices(X_, y_);
metrics.setData(y, indices);
computeCutPoints(0, X.size(), 1);
sort(cutPoints.begin(), cutPoints.end());
if (num_cut_points > 0) {
// Select the best (with lower entropy) cut points
while (cutPoints.size() > num_cut_points) {
resizeCutPoints();
}
}
}
pair<precision_t, size_t> CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end)
{
size_t n;
size_t m;
size_t idxPrev = cut - 1 >= start ? cut - 1 : cut;
size_t idxNext = cut + 1 < end ? cut + 1 : cut;
bool backWall; // true if duplicates reach beginning of the interval
precision_t previous;
precision_t actual;
precision_t next;
previous = X[indices[idxPrev]];
actual = X[indices[cut]];
next = X[indices[idxNext]];
// definition 2 of the paper => X[t-1] < X[t]
// get the first equal value of X in the interval
while (idxPrev > start && actual == previous) {
previous = X[indices[--idxPrev]];
}
backWall = idxPrev == start && actual == previous;
// get the last equal value of X in the interval
while (idxNext < end - 1 && actual == next) {
next = X[indices[++idxNext]];
}
// # of duplicates before cutpoint
n = cut - 1 - idxPrev;
// # of duplicates after cutpoint
m = idxNext - cut - 1;
// Decide which values to use
cut = cut + (backWall ? m + 1 : -n);
actual = X[indices[cut]];
return { (actual + previous) / 2, cut };
}
void CPPFImdlp::computeCutPoints(size_t start, size_t end, int depth_)
{
size_t cut;
pair<precision_t, size_t> result;
// Check if the interval length and the depth are Ok
if (end - start < min_length || depth_ > max_depth)
return;
depth = depth_ > depth ? depth_ : depth;
cut = getCandidate(start, end);
if (cut == numeric_limits<size_t>::max())
return;
if (mdlp(start, cut, end)) {
result = valueCutPoint(start, cut, end);
cut = result.second;
cutPoints.push_back(result.first);
computeCutPoints(start, cut, depth_ + 1);
computeCutPoints(cut, end, depth_ + 1);
}
}
size_t CPPFImdlp::getCandidate(size_t start, size_t end)
{
/* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which
E(A, TA; S) is minimal amongst all the candidate cut points. */
size_t candidate = numeric_limits<size_t>::max();
size_t elements = end - start;
bool sameValues = true;
precision_t entropy_left;
precision_t entropy_right;
precision_t minEntropy;
// Check if all the values of the variable in the interval are the same
for (size_t idx = start + 1; idx < end; idx++) {
if (X[indices[idx]] != X[indices[start]]) {
sameValues = false;
break;
}
}
if (sameValues)
return candidate;
minEntropy = metrics.entropy(start, end);
for (size_t idx = start + 1; idx < end; idx++) {
// Cutpoints are always on boundaries (definition 2)
if (y[indices[idx]] == y[indices[idx - 1]])
continue;
entropy_left = precision_t(idx - start) / static_cast<precision_t>(elements) * metrics.entropy(start, idx);
entropy_right = precision_t(end - idx) / static_cast<precision_t>(elements) * metrics.entropy(idx, end);
if (entropy_left + entropy_right < minEntropy) {
minEntropy = entropy_left + entropy_right;
candidate = idx;
}
}
return candidate;
}
bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end)
{
int k;
int k1;
int k2;
precision_t ig;
precision_t delta;
precision_t ent;
precision_t ent1;
precision_t ent2;
auto N = precision_t(end - start);
k = metrics.computeNumClasses(start, end);
k1 = metrics.computeNumClasses(start, cut);
k2 = metrics.computeNumClasses(cut, end);
ent = metrics.entropy(start, end);
ent1 = metrics.entropy(start, cut);
ent2 = metrics.entropy(cut, end);
ig = metrics.informationGain(start, cut, end);
delta = static_cast<precision_t>(log2(pow(3, precision_t(k)) - 2) -
(precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2));
precision_t term = 1 / N * (log2(N - 1) + delta);
return ig > term;
}
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_)
{
indices_t idx(X_.size());
iota(idx.begin(), idx.end(), 0);
stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) {
if (X_[i1] == X_[i2])
return y_[i1] < y_[i2];
else
return X_[i1] < X_[i2];
});
return idx;
}
void CPPFImdlp::resizeCutPoints()
{
//Compute the entropy of each interval of the whole cutpoint set and discard the one with the biggest value
precision_t maxEntropy = 0;
precision_t entropy;
size_t maxEntropyIdx = 0;
size_t begin = 0;
size_t end;
for (size_t idx = 0; idx < cutPoints.size(); idx++) {
end = begin;
while (X[indices[end]] < cutPoints[idx] && end < X.size())
end++;
entropy = metrics.entropy(begin, end);
if (entropy > maxEntropy) {
maxEntropy = entropy;
maxEntropyIdx = idx;
}
begin = end;
}
cutPoints.erase(cutPoints.begin() + static_cast<long>(maxEntropyIdx));
}
labels_t& CPPFImdlp::transform(const samples_t& data)
{
discretizedData.reserve(data.size());
for (const precision_t& item : data) {
auto upper = upper_bound(cutPoints.begin(), cutPoints.end(), item);
discretizedData.push_back(upper - cutPoints.begin());
}
return discretizedData;
}
}
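
For reference, the mdlp() test above is the Fayyad-Irani MDLPC criterion: a candidate cut point T on attribute A, splitting the N samples of S into S_1 and S_2 with k, k_1 and k_2 distinct classes respectively, is accepted when

    Gain(A, T; S) > \frac{\log_2(N - 1) + \Delta(A, T; S)}{N}

    \Delta(A, T; S) = \log_2(3^k - 2) - \left[ k\,Ent(S) - k_1\,Ent(S_1) - k_2\,Ent(S_2) \right]

which is exactly ig > term in the code, with term = 1/N * (log2(N-1) + delta).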

src/Platform/CPPFImdlp.h (deleted)

@@ -1,45 +0,0 @@
#ifndef CPPFIMDLP_H
#define CPPFIMDLP_H
#include "typesFImdlp.h"
#include "Metrics.h"
#include <limits>
#include <utility>
#include <string>
namespace mdlp {
class CPPFImdlp {
protected:
size_t min_length = 3;
int depth = 0;
int max_depth = numeric_limits<int>::max();
float proposed_cuts = 0;
indices_t indices = indices_t();
samples_t X = samples_t();
labels_t y = labels_t();
Metrics metrics = Metrics(y, indices);
cutPoints_t cutPoints;
size_t num_cut_points = numeric_limits<size_t>::max();
labels_t discretizedData = labels_t();
static indices_t sortIndices(samples_t&, labels_t&);
void computeCutPoints(size_t, size_t, int);
void resizeCutPoints();
bool mdlp(size_t, size_t, size_t);
size_t getCandidate(size_t, size_t);
size_t compute_max_num_cut_points() const;
pair<precision_t, size_t> valueCutPoint(size_t, size_t, size_t);
public:
CPPFImdlp();
CPPFImdlp(size_t, int, float);
~CPPFImdlp();
void fit(samples_t&, labels_t&);
inline cutPoints_t getCutPoints() const { return cutPoints; };
labels_t& transform(const samples_t&);
inline int get_depth() const { return depth; };
static inline string version() { return "1.1.2"; };
};
}
#endif

src/Platform/Colors.h Normal file (+14 lines)

@@ -0,0 +1,14 @@
#ifndef COLORS_H
#define COLORS_H
class Colors {
public:
static std::string MAGENTA() { return "\033[1;35m"; }
static std::string BLUE() { return "\033[1;34m"; }
static std::string CYAN() { return "\033[1;36m"; }
static std::string GREEN() { return "\033[1;32m"; }
static std::string YELLOW() { return "\033[1;33m"; }
static std::string RED() { return "\033[1;31m"; }
static std::string WHITE() { return "\033[1;37m"; }
static std::string RESET() { return "\033[0m"; }
};
#endif // COLORS_H
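
These are standard ANSI terminal escape sequences; a typical use, as in the report code further down:

    std::cout << Colors::GREEN() << "everything ok" << Colors::RESET() << std::endl;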

src/Platform/Datasets.cc Normal file (+266 lines)

@@ -0,0 +1,266 @@
#include "Datasets.h"
#include "platformUtils.h"
#include "ArffFiles.h"
namespace platform {
void Datasets::load()
{
ifstream catalog(path + "/all.txt");
if (catalog.is_open()) {
string line;
while (getline(catalog, line)) {
vector<string> tokens = split(line, ',');
string name = tokens[0];
string className = tokens[1];
datasets[name] = make_unique<Dataset>(path, name, className, discretize, fileType);
}
catalog.close();
} else {
throw invalid_argument("Unable to open catalog file. [" + path + "/all.txt" + "]");
}
}
vector<string> Datasets::getNames()
{
vector<string> result;
transform(datasets.begin(), datasets.end(), back_inserter(result), [](const auto& d) { return d.first; });
return result;
}
vector<string> Datasets::getFeatures(const string& name) const
{
if (datasets.at(name)->isLoaded()) {
return datasets.at(name)->getFeatures();
} else {
throw invalid_argument("Dataset not loaded.");
}
}
map<string, vector<int>> Datasets::getStates(const string& name) const
{
if (datasets.at(name)->isLoaded()) {
return datasets.at(name)->getStates();
} else {
throw invalid_argument("Dataset not loaded.");
}
}
void Datasets::loadDataset(const string& name) const
{
if (datasets.at(name)->isLoaded()) {
return;
} else {
datasets.at(name)->load();
}
}
string Datasets::getClassName(const string& name) const
{
if (datasets.at(name)->isLoaded()) {
return datasets.at(name)->getClassName();
} else {
throw invalid_argument("Dataset not loaded.");
}
}
int Datasets::getNSamples(const string& name) const
{
if (datasets.at(name)->isLoaded()) {
return datasets.at(name)->getNSamples();
} else {
throw invalid_argument("Dataset not loaded.");
}
}
int Datasets::getNClasses(const string& name)
{
if (datasets.at(name)->isLoaded()) {
auto className = datasets.at(name)->getClassName();
if (discretize) {
auto states = getStates(name);
return states.at(className).size();
}
auto [Xv, yv] = getVectors(name);
return *max_element(yv.begin(), yv.end()) + 1;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
vector<int> Datasets::getClassesCounts(const string& name) const
{
if (datasets.at(name)->isLoaded()) {
auto [Xv, yv] = datasets.at(name)->getVectors();
vector<int> counts(*max_element(yv.begin(), yv.end()) + 1);
for (auto y : yv) {
counts[y]++;
}
return counts;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<vector<vector<float>>&, vector<int>&> Datasets::getVectors(const string& name)
{
if (!datasets[name]->isLoaded()) {
datasets[name]->load();
}
return datasets[name]->getVectors();
}
pair<vector<vector<int>>&, vector<int>&> Datasets::getVectorsDiscretized(const string& name)
{
if (!datasets[name]->isLoaded()) {
datasets[name]->load();
}
return datasets[name]->getVectorsDiscretized();
}
pair<torch::Tensor&, torch::Tensor&> Datasets::getTensors(const string& name)
{
if (!datasets[name]->isLoaded()) {
datasets[name]->load();
}
return datasets[name]->getTensors();
}
bool Datasets::isDataset(const string& name) const
{
return datasets.find(name) != datasets.end();
}
Dataset::Dataset(const Dataset& dataset) : path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples), n_features(dataset.n_features), features(dataset.features), states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv), fileType(dataset.fileType)
{
}
string Dataset::getName() const
{
return name;
}
string Dataset::getClassName() const
{
return className;
}
vector<string> Dataset::getFeatures() const
{
if (loaded) {
return features;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
int Dataset::getNFeatures() const
{
if (loaded) {
return n_features;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
int Dataset::getNSamples() const
{
if (loaded) {
return n_samples;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
map<string, vector<int>> Dataset::getStates() const
{
if (loaded) {
return states;
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<vector<vector<float>>&, vector<int>&> Dataset::getVectors()
{
if (loaded) {
return { Xv, yv };
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<vector<vector<int>>&, vector<int>&> Dataset::getVectorsDiscretized()
{
if (loaded) {
return { Xd, yv };
} else {
throw invalid_argument("Dataset not loaded.");
}
}
pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
{
if (loaded) {
buildTensors();
return { X, y };
} else {
throw invalid_argument("Dataset not loaded.");
}
}
void Dataset::load_csv()
{
ifstream file(path + "/" + name + ".csv");
if (file.is_open()) {
string line;
getline(file, line);
vector<string> tokens = split(line, ',');
features = vector<string>(tokens.begin(), tokens.end() - 1);
className = tokens.back();
for (auto i = 0; i < features.size(); ++i) {
Xv.push_back(vector<float>());
}
while (getline(file, line)) {
tokens = split(line, ',');
for (auto i = 0; i < features.size(); ++i) {
Xv[i].push_back(stof(tokens[i]));
}
yv.push_back(stoi(tokens.back()));
}
file.close();
} else {
throw invalid_argument("Unable to open dataset file.");
}
}
void Dataset::computeStates()
{
for (int i = 0; i < features.size(); ++i) {
states[features[i]] = vector<int>(*max_element(Xd[i].begin(), Xd[i].end()) + 1);
iota(begin(states[features[i]]), end(states[features[i]]), 0);
}
states[className] = vector<int>(*max_element(yv.begin(), yv.end()) + 1);
iota(begin(states[className]), end(states[className]), 0);
}
void Dataset::load_arff()
{
auto arff = ArffFiles();
arff.load(path + "/" + name + ".arff", className);
// Get Dataset X, y
Xv = arff.getX();
yv = arff.getY();
// Get className & Features
className = arff.getClassName();
auto attributes = arff.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& attribute) { return attribute.first; });
}
void Dataset::load()
{
if (loaded) {
return;
}
if (fileType == CSV) {
load_csv();
} else if (fileType == ARFF) {
load_arff();
}
if (discretize) {
Xd = discretizeDataset(Xv, yv);
computeStates();
}
n_samples = Xv[0].size();
n_features = Xv.size();
loaded = true;
}
void Dataset::buildTensors()
{
if (discretize) {
X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kInt32);
} else {
X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kFloat32);
}
for (int i = 0; i < features.size(); ++i) {
if (discretize) {
X.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
} else {
X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
}
}
y = torch::tensor(yv, torch::kInt32);
}
}

src/Platform/Datasets.h Normal file (+68 lines)

@@ -0,0 +1,68 @@
#ifndef DATASETS_H
#define DATASETS_H
#include <torch/torch.h>
#include <map>
#include <vector>
#include <string>
namespace platform {
using namespace std;
enum fileType_t { CSV, ARFF };
class Dataset {
private:
string path;
string name;
fileType_t fileType;
string className;
int n_samples{ 0 }, n_features{ 0 };
vector<string> features;
map<string, vector<int>> states;
bool loaded;
bool discretize;
torch::Tensor X, y;
vector<vector<float>> Xv;
vector<vector<int>> Xd;
vector<int> yv;
void buildTensors();
void load_csv();
void load_arff();
void computeStates();
public:
Dataset(const string& path, const string& name, const string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {};
explicit Dataset(const Dataset&);
string getName() const;
string getClassName() const;
vector<string> getFeatures() const;
map<string, vector<int>> getStates() const;
pair<vector<vector<float>>&, vector<int>&> getVectors();
pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized();
pair<torch::Tensor&, torch::Tensor&> getTensors();
int getNFeatures() const;
int getNSamples() const;
void load();
const bool inline isLoaded() const { return loaded; };
};
class Datasets {
private:
string path;
fileType_t fileType;
map<string, unique_ptr<Dataset>> datasets;
bool discretize;
void load(); // Loads the list of datasets
public:
explicit Datasets(const string& path, bool discretize = false, fileType_t fileType = ARFF) : path(path), discretize(discretize), fileType(fileType) { load(); };
vector<string> getNames();
vector<string> getFeatures(const string& name) const;
int getNSamples(const string& name) const;
string getClassName(const string& name) const;
int getNClasses(const string& name);
vector<int> getClassesCounts(const string& name) const;
map<string, vector<int>> getStates(const string& name) const;
pair<vector<vector<float>>&, vector<int>&> getVectors(const string& name);
pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized(const string& name);
pair<torch::Tensor&, torch::Tensor&> getTensors(const string& name);
bool isDataset(const string& name) const;
void loadDataset(const string& name) const;
};
};
#endif
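
A hypothetical usage of the catalog above, assuming a datasets folder containing an all.txt with name,className lines plus the matching .arff files:

    auto datasets = platform::Datasets(platform::Paths::datasets(), true, platform::ARFF);
    for (const auto& name : datasets.getNames()) {
        auto [X, y] = datasets.getTensors(name);   // loads and discretizes on first access
        std::cout << name << ": " << datasets.getNSamples(name) << " samples, "
            << datasets.getNClasses(name) << " classes" << std::endl;
    }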

src/Platform/DotEnv.h Normal file (+62 lines)

@@ -0,0 +1,62 @@
#ifndef DOTENV_H
#define DOTENV_H
#include <string>
#include <map>
#include <fstream>
#include <sstream>
#include "platformUtils.h"
namespace platform {
class DotEnv {
private:
std::map<std::string, std::string> env;
std::string trim(const std::string& str)
{
std::string result = str;
result.erase(result.begin(), std::find_if(result.begin(), result.end(), [](int ch) {
return !std::isspace(ch);
}));
result.erase(std::find_if(result.rbegin(), result.rend(), [](int ch) {
return !std::isspace(ch);
}).base(), result.end());
return result;
}
public:
DotEnv()
{
std::ifstream file(".env");
if (!file.is_open()) {
std::cerr << "File .env not found" << std::endl;
exit(1);
}
std::string line;
while (std::getline(file, line)) {
line = trim(line);
if (line.empty() || line[0] == '#') {
continue;
}
std::istringstream iss(line);
std::string key, value;
if (std::getline(iss, key, '=') && std::getline(iss, value)) {
env[key] = value;
}
}
}
std::string get(const std::string& key)
{
return env[key];
}
std::vector<int> getSeeds()
{
auto seeds = std::vector<int>();
auto seeds_str = env["seeds"];
seeds_str = trim(seeds_str);
seeds_str = seeds_str.substr(1, seeds_str.size() - 2);
auto seeds_str_split = split(seeds_str, ',');
transform(seeds_str_split.begin(), seeds_str_split.end(), back_inserter(seeds), [](const std::string& str) {
return stoi(str);
});
return seeds;
}
};
}
#endif
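
A hypothetical .env file this parser would accept (keys and values are illustrative; only "seeds" is implied by the code above):

    # lines starting with # and blank lines are ignored
    platform=laptop
    seeds=[271,314,17]

which would then be read as:

    auto env = platform::DotEnv();
    auto machine = env.get("platform");   // "laptop"
    auto seeds = env.getSeeds();          // {271, 314, 17}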

src/Platform/Experiment.cc (modified)

@@ -1,201 +1,184 @@
-#include <iostream>
-#include <string>
-#include <torch/torch.h>
-#include <thread>
-#include <getopt.h>
-#include "ArffFiles.h"
-#include "Network.h"
-#include "Metrics.hpp"
-#include "CPPFImdlp.h"
-#include "KDB.h"
-#include "SPODE.h"
-#include "AODE.h"
-#include "TAN.h"
-#include "platformUtils.h"
-using namespace std;
-/* print a description of all supported options */
-void usage(const char* path)
-{
-    /* take only the last portion of the path */
-    const char* basename = strrchr(path, '/');
-    basename = basename ? basename + 1 : path;
-    cout << "usage: " << basename << "[OPTION]" << endl;
-    cout << " -h, --help\t\t Print this help and exit." << endl;
-    cout
-        << " -f, --file[=FILENAME]\t {diabetes, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}."
-        << endl;
-    cout << " -p, --path[=FILENAME]\t folder where the data files are located, default " << PATH << endl;
-    cout << " -m, --model={AODE, KDB, SPODE, TAN}\t " << endl;
-}
-tuple<string, string, string> parse_arguments(int argc, char** argv)
-{
-    string file_name;
-    string model_name;
-    string path = PATH;
-    const vector<struct option> long_options = {
-        {"help", no_argument, nullptr, 'h'},
-        {"file", required_argument, nullptr, 'f'},
-        {"path", required_argument, nullptr, 'p'},
-        {"model", required_argument, nullptr, 'm'},
-        {nullptr, no_argument, nullptr, 0}
-    };
-    while (true) {
-        const auto c = getopt_long(argc, argv, "hf:p:m:", long_options.data(), nullptr);
-        if (c == -1)
-            break;
-        switch (c) {
-            case 'h':
-                usage(argv[0]);
-                exit(0);
-            case 'f':
-                file_name = string(optarg);
-                break;
-            case 'm':
-                model_name = string(optarg);
-                break;
-            case 'p':
-                path = optarg;
-                if (path.back() != '/')
-                    path += '/';
-                break;
-            case '?':
-                usage(argv[0]);
-                exit(1);
-            default:
-                abort();
-        }
-    }
-    if (file_name.empty()) {
-        usage(argv[0]);
-        exit(1);
-    }
-    return make_tuple(file_name, path, model_name);
-}
-inline constexpr auto hash_conv(const std::string_view sv)
-{
-    unsigned long hash{ 5381 };
-    for (unsigned char c : sv) {
-        hash = ((hash << 5) + hash) ^ c;
-    }
-    return hash;
-}
-inline constexpr auto operator"" _sh(const char* str, size_t len)
-{
-    return hash_conv(std::string_view{ str, len });
-}
-tuple<string, string, string> get_options(int argc, char** argv)
-{
-    map<string, bool> datasets = {
-        {"diabetes", true},
-        {"ecoli", true},
-        {"glass", true},
-        {"iris", true},
-        {"kdd_JapaneseVowels", false},
-        {"letter", true},
-        {"liver-disorders", true},
-        {"mfeat-factors", true},
-    };
-    vector <string> models = { "AODE", "KDB", "SPODE", "TAN" };
-    string file_name;
-    string path;
-    string model_name;
-    tie(file_name, path, model_name) = parse_arguments(argc, argv);
-    if (datasets.find(file_name) == datasets.end()) {
-        cout << "Invalid file name: " << file_name << endl;
-        usage(argv[0]);
-        exit(1);
-    }
-    if (!file_exists(path + file_name + ".arff")) {
-        cout << "Data File " << path + file_name + ".arff" << " does not exist" << endl;
-        usage(argv[0]);
-        exit(1);
-    }
-    if (find(models.begin(), models.end(), model_name) == models.end()) {
-        cout << "Invalid model name: " << model_name << endl;
-        usage(argv[0]);
-        exit(1);
-    }
-    return { file_name, path, model_name };
-}
-int main(int argc, char** argv)
-{
-    string file_name, path, model_name;
-    tie(file_name, path, model_name) = get_options(argc, argv);
-    auto handler = ArffFiles();
-    handler.load(path + file_name + ".arff");
-    // Get Dataset X, y
-    vector<mdlp::samples_t>& X = handler.getX();
-    mdlp::labels_t& y = handler.getY();
-    // Get className & Features
-    auto className = handler.getClassName();
-    vector<string> features;
-    for (auto feature : handler.getAttributes()) {
-        features.push_back(feature.first);
-    }
-    // Discretize Dataset
-    vector<mdlp::labels_t> Xd;
-    map<string, int> maxes;
-    tie(Xd, maxes) = discretize(X, y, features);
-    maxes[className] = *max_element(y.begin(), y.end()) + 1;
-    map<string, vector<int>> states;
-    for (auto feature : features) {
-        states[feature] = vector<int>(maxes[feature]);
-    }
-    states[className] = vector<int>(
-        maxes[className]);
-    double score;
-    vector<string> lines;
-    vector<string> graph;
-    auto kdb = bayesnet::KDB(2);
-    auto aode = bayesnet::AODE();
-    auto spode = bayesnet::SPODE(2);
-    auto tan = bayesnet::TAN();
-    switch (hash_conv(model_name)) {
-        case "AODE"_sh:
-            aode.fit(Xd, y, features, className, states);
-            lines = aode.show();
-            score = aode.score(Xd, y);
-            graph = aode.graph();
-            break;
-        case "KDB"_sh:
-            kdb.fit(Xd, y, features, className, states);
-            lines = kdb.show();
-            score = kdb.score(Xd, y);
-            graph = kdb.graph();
-            break;
-        case "SPODE"_sh:
-            spode.fit(Xd, y, features, className, states);
-            lines = spode.show();
-            score = spode.score(Xd, y);
-            graph = spode.graph();
-            break;
-        case "TAN"_sh:
-            tan.fit(Xd, y, features, className, states);
-            lines = tan.show();
-            score = tan.score(Xd, y);
-            graph = tan.graph();
-            break;
-    }
-    for (auto line : lines) {
-        cout << line << endl;
-    }
-    cout << "Score: " << score << endl;
-    auto dot_file = model_name + "_" + file_name;
-    ofstream file(dot_file + ".dot");
-    file << graph;
-    file.close();
-    cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << endl;
-    cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl;
-    return 0;
-}
+#include "Experiment.h"
+#include "Datasets.h"
+#include "Models.h"
+#include "Report.h"
+namespace platform {
+    using json = nlohmann::json;
+    string get_date()
+    {
+        time_t rawtime;
+        tm* timeinfo;
+        time(&rawtime);
+        timeinfo = std::localtime(&rawtime);
+        std::ostringstream oss;
+        oss << std::put_time(timeinfo, "%Y-%m-%d");
+        return oss.str();
+    }
+    string get_time()
+    {
+        time_t rawtime;
+        tm* timeinfo;
+        time(&rawtime);
+        timeinfo = std::localtime(&rawtime);
+        std::ostringstream oss;
+        oss << std::put_time(timeinfo, "%H:%M:%S");
+        return oss.str();
+    }
+    string Experiment::get_file_name()
+    {
+        string result = "results_" + score_name + "_" + model + "_" + platform + "_" + get_date() + "_" + get_time() + "_" + (stratified ? "1" : "0") + ".json";
+        return result;
+    }
+    json Experiment::build_json()
+    {
+        json result;
+        result["title"] = title;
+        result["date"] = get_date();
+        result["time"] = get_time();
+        result["model"] = model;
+        result["version"] = model_version;
+        result["platform"] = platform;
+        result["score_name"] = score_name;
+        result["language"] = language;
+        result["language_version"] = language_version;
+        result["discretized"] = discretized;
+        result["stratified"] = stratified;
+        result["folds"] = nfolds;
+        result["seeds"] = randomSeeds;
+        result["duration"] = duration;
+        result["results"] = json::array();
+        for (const auto& r : results) {
+            json j;
+            j["dataset"] = r.getDataset();
+            j["hyperparameters"] = r.getHyperparameters();
+            j["samples"] = r.getSamples();
+            j["features"] = r.getFeatures();
+            j["classes"] = r.getClasses();
+            j["score_train"] = r.getScoreTrain();
+            j["score_test"] = r.getScoreTest();
+            j["score"] = r.getScoreTest();
+            j["score_std"] = r.getScoreTestStd();
+            j["score_train_std"] = r.getScoreTrainStd();
+            j["score_test_std"] = r.getScoreTestStd();
+            j["train_time"] = r.getTrainTime();
+            j["train_time_std"] = r.getTrainTimeStd();
+            j["test_time"] = r.getTestTime();
+            j["test_time_std"] = r.getTestTimeStd();
+            j["time"] = r.getTestTime() + r.getTrainTime();
+            j["time_std"] = r.getTestTimeStd() + r.getTrainTimeStd();
+            j["scores_train"] = r.getScoresTrain();
+            j["scores_test"] = r.getScoresTest();
+            j["times_train"] = r.getTimesTrain();
+            j["times_test"] = r.getTimesTest();
+            j["nodes"] = r.getNodes();
+            j["leaves"] = r.getLeaves();
+            j["depth"] = r.getDepth();
+            result["results"].push_back(j);
+        }
+        return result;
+    }
+    void Experiment::save(const string& path)
+    {
+        json data = build_json();
+        ofstream file(path + "/" + get_file_name());
+        file << data;
+        file.close();
+    }
+    void Experiment::report()
+    {
+        json data = build_json();
+        Report report(data);
+        report.show();
+    }
+    void Experiment::show()
+    {
+        json data = build_json();
+        cout << data.dump(4) << endl;
+    }
+    void Experiment::go(vector<string> filesToProcess, const string& path)
+    {
+        cout << "*** Starting experiment: " << title << " ***" << endl;
+        for (auto fileName : filesToProcess) {
+            cout << "- " << setw(20) << left << fileName << " " << right << flush;
+            cross_validation(path, fileName);
+            cout << endl;
+        }
+    }
+    void Experiment::cross_validation(const string& path, const string& fileName)
+    {
+        auto datasets = platform::Datasets(path, discretized, platform::ARFF);
+        // Get dataset
+        auto [X, y] = datasets.getTensors(fileName);
+        auto states = datasets.getStates(fileName);
+        auto features = datasets.getFeatures(fileName);
+        auto samples = datasets.getNSamples(fileName);
+        auto className = datasets.getClassName(fileName);
+        cout << " (" << setw(5) << samples << "," << setw(3) << features.size() << ") " << flush;
+        // Prepare Result
+        auto result = Result();
+        auto [values, counts] = at::_unique(y);
+        result.setSamples(X.size(1)).setFeatures(X.size(0)).setClasses(values.size(0));
+        int nResults = nfolds * static_cast<int>(randomSeeds.size());
+        auto accuracy_test = torch::zeros({ nResults }, torch::kFloat64);
+        auto accuracy_train = torch::zeros({ nResults }, torch::kFloat64);
+        auto train_time = torch::zeros({ nResults }, torch::kFloat64);
+        auto test_time = torch::zeros({ nResults }, torch::kFloat64);
+        auto nodes = torch::zeros({ nResults }, torch::kFloat64);
+        auto edges = torch::zeros({ nResults }, torch::kFloat64);
+        auto num_states = torch::zeros({ nResults }, torch::kFloat64);
+        Timer train_timer, test_timer;
+        int item = 0;
+        for (auto seed : randomSeeds) {
+            cout << "(" << seed << ") doing Fold: " << flush;
+            Fold* fold;
+            if (stratified)
+                fold = new StratifiedKFold(nfolds, y, seed);
+            else
+                fold = new KFold(nfolds, y.size(0), seed);
+            for (int nfold = 0; nfold < nfolds; nfold++) {
+                auto clf = Models::instance()->create(model);
+                setModelVersion(clf->getVersion());
+                train_timer.start();
+                auto [train, test] = fold->getFold(nfold);
+                auto train_t = torch::tensor(train);
+                auto test_t = torch::tensor(test);
+                auto X_train = X.index({ "...", train_t });
+                auto y_train = y.index({ train_t });
+                auto X_test = X.index({ "...", test_t });
+                auto y_test = y.index({ test_t });
+                cout << nfold + 1 << ", " << flush;
+                clf->fit(X_train, y_train, features, className, states);
+                nodes[item] = clf->getNumberOfNodes();
+                edges[item] = clf->getNumberOfEdges();
+                num_states[item] = clf->getNumberOfStates();
+                train_time[item] = train_timer.getDuration();
+                auto accuracy_train_value = clf->score(X_train, y_train);
+                test_timer.start();
+                auto accuracy_test_value = clf->score(X_test, y_test);
+                test_time[item] = test_timer.getDuration();
+                accuracy_train[item] = accuracy_train_value;
+                accuracy_test[item] = accuracy_test_value;
+                // Store results and times in vector
+                result.addScoreTrain(accuracy_train_value);
+                result.addScoreTest(accuracy_test_value);
+                result.addTimeTrain(train_time[item].item<double>());
+                result.addTimeTest(test_time[item].item<double>());
+                item++;
+            }
+            cout << "end. " << flush;
+            delete fold;
+        }
+        result.setScoreTest(torch::mean(accuracy_test).item<double>()).setScoreTrain(torch::mean(accuracy_train).item<double>());
+        result.setScoreTestStd(torch::std(accuracy_test).item<double>()).setScoreTrainStd(torch::std(accuracy_train).item<double>());
+        result.setTrainTime(torch::mean(train_time).item<double>()).setTestTime(torch::mean(test_time).item<double>());
+        result.setNodes(torch::mean(nodes).item<double>()).setLeaves(torch::mean(edges).item<double>()).setDepth(torch::mean(num_states).item<double>());
+        result.setDataset(fileName);
+        addResult(result);
+    }
+}

src/Platform/Experiment.h Normal file (+114 lines)

@@ -0,0 +1,114 @@
#ifndef EXPERIMENT_H
#define EXPERIMENT_H
#include <torch/torch.h>
#include <nlohmann/json.hpp>
#include <string>
#include <chrono>
#include "Folding.h"
#include "BaseClassifier.h"
#include "TAN.h"
#include "KDB.h"
#include "AODE.h"
using namespace std;
namespace platform {
using json = nlohmann::json;
class Timer {
private:
chrono::high_resolution_clock::time_point begin;
public:
Timer() = default;
~Timer() = default;
void start() { begin = chrono::high_resolution_clock::now(); }
double getDuration()
{
chrono::high_resolution_clock::time_point end = chrono::high_resolution_clock::now();
chrono::duration<double> time_span = chrono::duration_cast<chrono::duration<double>>(end - begin);
return time_span.count();
}
};
class Result {
private:
string dataset, hyperparameters, model_version;
int samples{ 0 }, features{ 0 }, classes{ 0 };
double score_train{ 0 }, score_test{ 0 }, score_train_std{ 0 }, score_test_std{ 0 }, train_time{ 0 }, train_time_std{ 0 }, test_time{ 0 }, test_time_std{ 0 };
float nodes{ 0 }, leaves{ 0 }, depth{ 0 };
vector<double> scores_train, scores_test, times_train, times_test;
public:
Result() = default;
Result& setDataset(const string& dataset) { this->dataset = dataset; return *this; }
Result& setHyperparameters(const string& hyperparameters) { this->hyperparameters = hyperparameters; return *this; }
Result& setSamples(int samples) { this->samples = samples; return *this; }
Result& setFeatures(int features) { this->features = features; return *this; }
Result& setClasses(int classes) { this->classes = classes; return *this; }
Result& setScoreTrain(double score) { this->score_train = score; return *this; }
Result& setScoreTest(double score) { this->score_test = score; return *this; }
Result& setScoreTrainStd(double score_std) { this->score_train_std = score_std; return *this; }
Result& setScoreTestStd(double score_std) { this->score_test_std = score_std; return *this; }
Result& setTrainTime(double train_time) { this->train_time = train_time; return *this; }
Result& setTrainTimeStd(double train_time_std) { this->train_time_std = train_time_std; return *this; }
Result& setTestTime(double test_time) { this->test_time = test_time; return *this; }
Result& setTestTimeStd(double test_time_std) { this->test_time_std = test_time_std; return *this; }
Result& setNodes(float nodes) { this->nodes = nodes; return *this; }
Result& setLeaves(float leaves) { this->leaves = leaves; return *this; }
Result& setDepth(float depth) { this->depth = depth; return *this; }
Result& addScoreTrain(double score) { scores_train.push_back(score); return *this; }
Result& addScoreTest(double score) { scores_test.push_back(score); return *this; }
Result& addTimeTrain(double time) { times_train.push_back(time); return *this; }
Result& addTimeTest(double time) { times_test.push_back(time); return *this; }
const float get_score_train() const { return score_train; }
float get_score_test() { return score_test; }
const string& getDataset() const { return dataset; }
const string& getHyperparameters() const { return hyperparameters; }
const int getSamples() const { return samples; }
const int getFeatures() const { return features; }
const int getClasses() const { return classes; }
const double getScoreTrain() const { return score_train; }
const double getScoreTest() const { return score_test; }
const double getScoreTrainStd() const { return score_train_std; }
const double getScoreTestStd() const { return score_test_std; }
const double getTrainTime() const { return train_time; }
const double getTrainTimeStd() const { return train_time_std; }
const double getTestTime() const { return test_time; }
const double getTestTimeStd() const { return test_time_std; }
const float getNodes() const { return nodes; }
const float getLeaves() const { return leaves; }
const float getDepth() const { return depth; }
const vector<double>& getScoresTrain() const { return scores_train; }
const vector<double>& getScoresTest() const { return scores_test; }
const vector<double>& getTimesTrain() const { return times_train; }
const vector<double>& getTimesTest() const { return times_test; }
};
class Experiment {
private:
string title, model, platform, score_name, model_version, language_version, language;
bool discretized{ false }, stratified{ false };
vector<Result> results;
vector<int> randomSeeds;
int nfolds{ 0 };
float duration{ 0 };
json build_json();
public:
Experiment() = default;
Experiment& setTitle(const string& title) { this->title = title; return *this; }
Experiment& setModel(const string& model) { this->model = model; return *this; }
Experiment& setPlatform(const string& platform) { this->platform = platform; return *this; }
Experiment& setScoreName(const string& score_name) { this->score_name = score_name; return *this; }
Experiment& setModelVersion(const string& model_version) { this->model_version = model_version; return *this; }
Experiment& setLanguage(const string& language) { this->language = language; return *this; }
Experiment& setLanguageVersion(const string& language_version) { this->language_version = language_version; return *this; }
Experiment& setDiscretized(bool discretized) { this->discretized = discretized; return *this; }
Experiment& setStratified(bool stratified) { this->stratified = stratified; return *this; }
Experiment& setNFolds(int nfolds) { this->nfolds = nfolds; return *this; }
Experiment& addResult(Result result) { results.push_back(result); return *this; }
Experiment& addRandomSeed(int randomSeed) { randomSeeds.push_back(randomSeed); return *this; }
Experiment& setDuration(float duration) { this->duration = duration; return *this; }
string get_file_name();
void save(const string& path);
void cross_validation(const string& path, const string& fileName);
void go(vector<string> filesToProcess, const string& path);
void show();
void report();
};
}
#endif
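
A sketch of how main.cc presumably wires these pieces together with the fluent setters above (argument values are illustrative):

    auto experiment = platform::Experiment();
    experiment.setTitle("discretization test").setLanguage("cpp").setLanguageVersion("14.0.3");
    experiment.setDiscretized(true).setModel("BoostAODE").setPlatform("laptop");
    experiment.setStratified(true).setNFolds(5).setScoreName("accuracy");
    experiment.addRandomSeed(271).addRandomSeed(314);
    experiment.go({ "iris", "glass" }, platform::Paths::datasets());
    experiment.report();
    experiment.save(platform::Paths::results());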

src/Platform/Folding.cc Normal file (+95 lines)

@@ -0,0 +1,95 @@
#include "Folding.h"
#include <algorithm>
#include <map>
Fold::Fold(int k, int n, int seed) : k(k), n(n), seed(seed)
{
random_device rd;
random_seed = default_random_engine(seed == -1 ? rd() : seed);
srand(seed == -1 ? time(0) : seed);
}
KFold::KFold(int k, int n, int seed) : Fold(k, n, seed), indices(vector<int>(n))
{
iota(begin(indices), end(indices), 0); // fill with 0, 1, ..., n - 1
shuffle(indices.begin(), indices.end(), random_seed);
}
pair<vector<int>, vector<int>> KFold::getFold(int nFold)
{
if (nFold >= k || nFold < 0) {
throw out_of_range("nFold (" + to_string(nFold) + ") must be less than k (" + to_string(k) + ")");
}
int nTest = n / k;
auto train = vector<int>();
auto test = vector<int>();
for (int i = 0; i < n; i++) {
if (i >= nTest * nFold && i < nTest * (nFold + 1)) {
test.push_back(indices[i]);
} else {
train.push_back(indices[i]);
}
}
return { train, test };
}
StratifiedKFold::StratifiedKFold(int k, torch::Tensor& y, int seed) : Fold(k, y.numel(), seed)
{
n = y.numel();
this->y = vector<int>(y.data_ptr<int>(), y.data_ptr<int>() + n);
build();
}
StratifiedKFold::StratifiedKFold(int k, const vector<int>& y, int seed)
: Fold(k, y.size(), seed)
{
this->y = y;
n = y.size();
build();
}
void StratifiedKFold::build()
{
stratified_indices = vector<vector<int>>(k);
int fold_size = n / k;
// Compute class counts and indices
auto class_indices = map<int, vector<int>>();
vector<int> class_counts(*max_element(y.begin(), y.end()) + 1, 0);
for (auto i = 0; i < n; ++i) {
class_counts[y[i]]++;
class_indices[y[i]].push_back(i);
}
// Shuffle class indices
for (auto& [cls, indices] : class_indices) {
shuffle(indices.begin(), indices.end(), random_seed);
}
// Assign indices to folds
for (auto label = 0; label < class_counts.size(); ++label) {
auto num_samples_to_take = class_counts[label] / k;
if (num_samples_to_take == 0)
continue;
auto remainder_samples_to_take = class_counts[label] % k;
for (auto fold = 0; fold < k; ++fold) {
auto it = next(class_indices[label].begin(), num_samples_to_take);
move(class_indices[label].begin(), it, back_inserter(stratified_indices[fold])); // ##
class_indices[label].erase(class_indices[label].begin(), it);
}
while (remainder_samples_to_take > 0) {
int fold = (rand() % static_cast<int>(k));
if (stratified_indices[fold].size() == fold_size + 1) {
continue;
}
auto it = next(class_indices[label].begin(), 1);
stratified_indices[fold].push_back(*class_indices[label].begin());
class_indices[label].erase(class_indices[label].begin(), it);
remainder_samples_to_take--;
}
}
}
pair<vector<int>, vector<int>> StratifiedKFold::getFold(int nFold)
{
if (nFold >= k || nFold < 0) {
throw out_of_range("nFold (" + to_string(nFold) + ") must be less than k (" + to_string(k) + ")");
}
vector<int> test_indices = stratified_indices[nFold];
vector<int> train_indices;
for (int i = 0; i < k; ++i) {
if (i == nFold) continue;
train_indices.insert(train_indices.end(), stratified_indices[i].begin(), stratified_indices[i].end());
}
return { train_indices, test_indices };
}

src/Platform/Folding.h Normal file (+37 lines)

@@ -0,0 +1,37 @@
#ifndef FOLDING_H
#define FOLDING_H
#include <torch/torch.h>
#include <vector>
#include <random>
using namespace std;
class Fold {
protected:
int k;
int n;
int seed;
default_random_engine random_seed;
public:
Fold(int k, int n, int seed = -1);
virtual pair<vector<int>, vector<int>> getFold(int nFold) = 0;
virtual ~Fold() = default;
int getNumberOfFolds() { return k; }
};
class KFold : public Fold {
private:
vector<int> indices;
public:
KFold(int k, int n, int seed = -1);
pair<vector<int>, vector<int>> getFold(int nFold) override;
};
class StratifiedKFold : public Fold {
private:
vector<int> y;
vector<vector<int>> stratified_indices;
void build();
public:
StratifiedKFold(int k, const vector<int>& y, int seed = -1);
StratifiedKFold(int k, torch::Tensor& y, int seed = -1);
pair<vector<int>, vector<int>> getFold(int nFold) override;
};
#endif
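
A minimal sketch of a 5-fold stratified split over an integer label vector; train and test hold disjoint row indices into the original data:

    auto y = std::vector<int>{ 0, 0, 0, 0, 0, 1, 1, 1, 1, 1 };
    auto fold = StratifiedKFold(5, y, 17);
    for (int i = 0; i < fold.getNumberOfFolds(); ++i) {
        auto [train, test] = fold.getFold(i);
        // every test fold keeps (roughly) the class proportions of y
    }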

src/Platform/Metrics.cpp (deleted)

@@ -1,78 +0,0 @@
#include "Metrics.h"
#include <set>
#include <cmath>
using namespace std;
namespace mdlp {
Metrics::Metrics(labels_t& y_, indices_t& indices_): y(y_), indices(indices_),
numClasses(computeNumClasses(0, indices.size()))
{
}
int Metrics::computeNumClasses(size_t start, size_t end)
{
set<int> nClasses;
for (auto i = start; i < end; ++i) {
nClasses.insert(y[indices[i]]);
}
return static_cast<int>(nClasses.size());
}
void Metrics::setData(const labels_t& y_, const indices_t& indices_)
{
indices = indices_;
y = y_;
numClasses = computeNumClasses(0, indices.size());
entropyCache.clear();
igCache.clear();
}
precision_t Metrics::entropy(size_t start, size_t end)
{
precision_t p;
precision_t ventropy = 0;
int nElements = 0;
labels_t counts(numClasses + 1, 0);
if (end - start < 2)
return 0;
if (entropyCache.find({ start, end }) != entropyCache.end()) {
return entropyCache[{start, end}];
}
for (auto i = &indices[start]; i != &indices[end]; ++i) {
counts[y[*i]]++;
nElements++;
}
for (auto count : counts) {
if (count > 0) {
p = static_cast<precision_t>(count) / static_cast<precision_t>(nElements);
ventropy -= p * log2(p);
}
}
entropyCache[{start, end}] = ventropy;
return ventropy;
}
precision_t Metrics::informationGain(size_t start, size_t cut, size_t end)
{
precision_t iGain;
precision_t entropyInterval;
precision_t entropyLeft;
precision_t entropyRight;
size_t nElementsLeft = cut - start;
size_t nElementsRight = end - cut;
size_t nElements = end - start;
if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) {
return igCache[make_tuple(start, cut, end)];
}
entropyInterval = entropy(start, end);
entropyLeft = entropy(start, cut);
entropyRight = entropy(cut, end);
iGain = entropyInterval -
(static_cast<precision_t>(nElementsLeft) * entropyLeft +
static_cast<precision_t>(nElementsRight) * entropyRight) /
static_cast<precision_t>(nElements);
igCache[make_tuple(start, cut, end)] = iGain;
return iGain;
}
}

src/Platform/Metrics.h (deleted)

@@ -1,22 +0,0 @@
#ifndef CCMETRICS_H
#define CCMETRICS_H
#include "typesFImdlp.h"
namespace mdlp {
class Metrics {
protected:
labels_t& y;
indices_t& indices;
int numClasses;
cacheEnt_t entropyCache = cacheEnt_t();
cacheIg_t igCache = cacheIg_t();
public:
Metrics(labels_t&, indices_t&);
void setData(const labels_t&, const indices_t&);
int computeNumClasses(size_t, size_t);
precision_t entropy(size_t, size_t);
precision_t informationGain(size_t, size_t, size_t);
};
}
#endif

src/Platform/Models.cc Normal file (+54 lines)

@@ -0,0 +1,54 @@
#include "Models.h"
namespace platform {
using namespace std;
// Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory
Models* Models::factory = nullptr;
Models* Models::instance()
{
//manages singleton
if (factory == nullptr)
factory = new Models();
return factory;
}
void Models::registerFactoryFunction(const string& name,
function<bayesnet::BaseClassifier* (void)> classFactoryFunction)
{
// register the class factory function
functionRegistry[name] = classFactoryFunction;
}
shared_ptr<bayesnet::BaseClassifier> Models::create(const string& name)
{
bayesnet::BaseClassifier* instance = nullptr;
// find name in the registry and call factory method.
auto it = functionRegistry.find(name);
if (it != functionRegistry.end())
instance = it->second();
// wrap instance in a shared ptr and return
if (instance != nullptr)
return shared_ptr<bayesnet::BaseClassifier>(instance);
else
return nullptr;
}
vector<string> Models::getNames()
{
vector<string> names;
transform(functionRegistry.begin(), functionRegistry.end(), back_inserter(names),
[](const pair<string, function<bayesnet::BaseClassifier* (void)>>& pair) { return pair.first; });
return names;
}
string Models::toString()
{
string result = "";
for (const auto& pair : functionRegistry) {
result += pair.first + ", ";
}
return "{" + result.substr(0, result.size() - 2) + "}";
}
Registrar::Registrar(const string& name, function<bayesnet::BaseClassifier* (void)> classFactoryFunction)
{
// register the class factory function
Models::instance()->registerFactoryFunction(name, classFactoryFunction);
}
}
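A sketch of how the registry is consumed; the "MyTAN" alias below is hypothetical, and real registrations happen once through the static Registrar objects in modelRegister.h:

#include "Models.h"
// Hypothetical extra registration; the bundled ones live in modelRegister.h.
static platform::Registrar registrarMyTAN("MyTAN",
    [](void) -> bayesnet::BaseClassifier* { return new bayesnet::TAN(); });
int main()
{
    // create() looks the name up in the registry and wraps the product in a shared_ptr
    auto clf = platform::Models::instance()->create("MyTAN");
    if (clf == nullptr) {
        return 1; // the name was never registered
    }
    // getNames()/toString() expose everything currently registered
    auto registered = platform::Models::instance()->toString();
    return 0;
}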

src/Platform/Models.h Normal file (37 lines)

@@ -0,0 +1,37 @@
#ifndef MODELS_H
#define MODELS_H
#include <map>
#include "BaseClassifier.h"
#include "AODE.h"
#include "TAN.h"
#include "KDB.h"
#include "SPODE.h"
#include "TANLd.h"
#include "KDBLd.h"
#include "SPODELd.h"
#include "AODELd.h"
#include "BoostAODE.h"
namespace platform {
class Models {
private:
map<string, function<bayesnet::BaseClassifier* (void)>> functionRegistry;
static Models* factory; //singleton
Models() {};
public:
Models(Models&) = delete;
void operator=(const Models&) = delete;
// Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory
static Models* instance();
shared_ptr<bayesnet::BaseClassifier> create(const string& name);
void registerFactoryFunction(const string& name,
function<bayesnet::BaseClassifier* (void)> classFactoryFunction);
vector<string> getNames();
string toString();
};
class Registrar {
public:
Registrar(const string& className, function<bayesnet::BaseClassifier* (void)> classFactoryFunction);
};
}
#endif

src/Platform/Paths.h Normal file (11 lines)

@@ -0,0 +1,11 @@
#ifndef PATHS_H
#define PATHS_H
#include <string>
namespace platform {
class Paths {
public:
static std::string datasets() { return "datasets/"; }
static std::string results() { return "results/"; }
};
}
#endif

src/Platform/Report.cc Normal file (115 lines)

@@ -0,0 +1,115 @@
#include <sstream>
#include <locale>
#include "Report.h"
#include "BestResult.h"
namespace platform {
string headerLine(const string& text)
{
int n = MAXL - static_cast<int>(text.length()) - 3;
n = n < 0 ? 0 : n;
return "* " + text + string(n, ' ') + "*\n";
}
string Report::fromVector(const string& key)
{
stringstream oss;
string sep = "";
oss << "[";
for (auto& item : data[key]) {
oss << sep << item.get<double>();
sep = ", ";
}
oss << "]";
return oss.str();
}
string fVector(const string& title, const json& data, const int width, const int precision)
{
stringstream oss;
string sep = "";
oss << title << "[";
for (const auto& item : data) {
oss << sep << fixed << setw(width) << setprecision(precision) << item.get<double>();
sep = ", ";
}
oss << "]";
return oss.str();
}
void Report::show()
{
header();
body();
footer();
}
struct separated : numpunct<char> {
char do_decimal_point() const { return ','; }
char do_thousands_sep() const { return '.'; }
string do_grouping() const { return "\03"; }
};
void Report::header()
{
locale mylocale(cout.getloc(), new separated);
locale::global(mylocale);
cout.imbue(mylocale);
stringstream oss;
cout << Colors::MAGENTA() << string(MAXL, '*') << endl;
cout << headerLine("Report " + data["model"].get<string>() + " ver. " + data["version"].get<string>() + " with " + to_string(data["folds"].get<int>()) + " Folds cross validation and " + to_string(data["seeds"].size()) + " random seeds. " + data["date"].get<string>() + " " + data["time"].get<string>());
cout << headerLine(data["title"].get<string>());
cout << headerLine("Random seeds: " + fromVector("seeds") + " Stratified: " + (data["stratified"].get<bool>() ? "True" : "False"));
oss << "Execution took " << setprecision(2) << fixed << data["duration"].get<float>() << " seconds, " << data["duration"].get<float>() / 3600 << " hours, on " << data["platform"].get<string>();
cout << headerLine(oss.str());
cout << headerLine("Score is " + data["score_name"].get<string>());
cout << string(MAXL, '*') << endl;
cout << endl;
}
void Report::body()
{
cout << Colors::GREEN() << "Dataset Sampl. Feat. Cls Nodes Edges States Score Time Hyperparameters" << endl;
cout << "============================== ====== ===== === ========= ========= ========= =============== ================== ===============" << endl;
json lastResult;
totalScore = 0;
bool odd = true;
for (const auto& r : data["results"]) {
auto color = odd ? Colors::CYAN() : Colors::BLUE();
cout << color << setw(30) << left << r["dataset"].get<string>() << " ";
cout << setw(6) << right << r["samples"].get<int>() << " ";
cout << setw(5) << right << r["features"].get<int>() << " ";
cout << setw(3) << right << r["classes"].get<int>() << " ";
cout << setw(9) << setprecision(2) << fixed << r["nodes"].get<float>() << " ";
cout << setw(9) << setprecision(2) << fixed << r["leaves"].get<float>() << " ";
cout << setw(9) << setprecision(2) << fixed << r["depth"].get<float>() << " ";
cout << setw(8) << right << setprecision(6) << fixed << r["score"].get<double>() << "±" << setw(6) << setprecision(4) << fixed << r["score_std"].get<double>() << " ";
cout << setw(11) << right << setprecision(6) << fixed << r["time"].get<double>() << "±" << setw(6) << setprecision(4) << fixed << r["time_std"].get<double>() << " ";
try {
cout << r["hyperparameters"].get<string>();
}
catch (const exception& err) {
cout << r["hyperparameters"];
}
cout << endl;
lastResult = r;
totalScore += r["score"].get<double>();
odd = !odd;
}
if (data["results"].size() == 1) {
cout << string(MAXL, '*') << endl;
cout << headerLine(fVector("Train scores: ", lastResult["scores_train"], 14, 12));
cout << headerLine(fVector("Test scores: ", lastResult["scores_test"], 14, 12));
cout << headerLine(fVector("Train times: ", lastResult["times_train"], 10, 3));
cout << headerLine(fVector("Test times: ", lastResult["times_test"], 10, 3));
cout << string(MAXL, '*') << endl;
}
}
void Report::footer()
{
cout << Colors::MAGENTA() << string(MAXL, '*') << endl;
auto score = data["score_name"].get<string>();
if (score == BestResult::scoreName()) {
stringstream oss;
oss << score << " compared to " << BestResult::title() << " .: " << totalScore / BestResult::score();
cout << headerLine(oss.str());
}
cout << string(MAXL, '*') << endl << Colors::RESET();
}
}
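The separated facet used by Report::header() switches cout to dotted thousands and a comma decimal point; a self-contained illustration of the same facet:

#include <iostream>
#include <locale>
using namespace std;
struct separated : numpunct<char> {
    char do_decimal_point() const { return ','; }
    char do_thousands_sep() const { return '.'; }
    string do_grouping() const { return "\03"; } // group digits in threes
};
int main()
{
    cout.imbue(locale(cout.getloc(), new separated)); // the locale takes ownership of the facet
    cout << fixed << 1234567.89 << endl;              // prints 1.234.567,890000
    return 0;
}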

src/Platform/Report.h Normal file (26 lines)

@@ -0,0 +1,26 @@
#ifndef REPORT_H
#define REPORT_H
#include <string>
#include <iostream>
#include <nlohmann/json.hpp>
#include "Colors.h"
using json = nlohmann::json;
const int MAXL = 128;
namespace platform {
using namespace std;
class Report {
public:
explicit Report(json data_) { data = data_; };
virtual ~Report() = default;
void show();
private:
void header();
void body();
void footer();
string fromVector(const string& key);
json data;
double totalScore; // Total score of all results in a report
};
};
#endif

src/Platform/Results.cc Normal file (239 lines)

@@ -0,0 +1,239 @@
#include <filesystem>
#include "platformUtils.h"
#include "Results.h"
#include "Report.h"
#include "BestResult.h"
#include "Colors.h"
namespace platform {
Result::Result(const string& path, const string& filename)
: path(path)
, filename(filename)
{
auto data = load();
date = data["date"];
score = 0;
for (const auto& result : data["results"]) {
score += result["score"].get<double>();
}
scoreName = data["score_name"];
if (scoreName == BestResult::scoreName()) {
score /= BestResult::score();
}
title = data["title"];
duration = data["duration"];
model = data["model"];
}
json Result::load() const
{
ifstream resultData(path + "/" + filename);
if (resultData.is_open()) {
json data = json::parse(resultData);
return data;
}
throw invalid_argument("Unable to open result file. [" + path + "/" + filename + "]");
}
void Results::load()
{
using std::filesystem::directory_iterator;
for (const auto& file : directory_iterator(path)) {
auto filename = file.path().filename().string();
if (filename.find(".json") != string::npos && filename.find("results_") == 0) {
auto result = Result(path, filename);
bool addResult = true;
if ((model != "any" && result.getModel() != model) || (scoreName != "any" && scoreName != result.getScoreName()))
addResult = false;
if (addResult)
files.push_back(result);
}
}
}
string Result::to_string() const
{
stringstream oss;
oss << date << " ";
oss << setw(12) << left << model << " ";
oss << setw(11) << left << scoreName << " ";
oss << right << setw(11) << setprecision(7) << fixed << score << " ";
oss << setw(9) << setprecision(3) << fixed << duration << " ";
oss << setw(50) << left << title << " ";
return oss.str();
}
void Results::show() const
{
cout << Colors::GREEN() << "Results found: " << files.size() << endl;
cout << "-------------------" << endl;
auto i = 0;
cout << " # Date Model Score Name Score Duration Title" << endl;
cout << "=== ========== ============ =========== =========== ========= =============================================================" << endl;
bool odd = true;
for (const auto& result : files) {
auto color = odd ? Colors::BLUE() : Colors::CYAN();
cout << color << setw(3) << fixed << right << i++ << " ";
cout << result.to_string() << endl;
if (i == max && max != 0) {
break;
}
odd = !odd;
}
}
int Results::getIndex(const string& intent) const
{
string color;
if (intent == "delete") {
color = Colors::RED();
} else {
color = Colors::YELLOW();
}
cout << color << "Choose result to " << intent << " (cancel=-1): ";
string line;
getline(cin, line);
int index;
try {
index = stoi(line);
}
catch (const exception&) {
index = static_cast<int>(files.size()); // non-numeric input falls through to the invalid branch
}
if (index >= -1 && index < static_cast<int>(files.size())) {
return index;
}
cout << "Invalid index" << endl;
return -1;
}
void Results::report(const int index) const
{
cout << Colors::YELLOW() << "Reporting " << files.at(index).getFilename() << endl;
auto data = files.at(index).load();
Report report(data);
report.show();
}
void Results::menu()
{
char option;
int index;
bool finished = false;
string filename, line, options = "qldhsr";
while (!finished) {
cout << Colors::RESET() << "Choose option (quit='q', list='l', delete='d', hide='h', sort='s', report='r'): ";
getline(cin, line);
if (line.size() == 0)
continue;
if (options.find(line[0]) != string::npos) {
if (line.size() > 1) {
cout << "Invalid option" << endl;
continue;
}
option = line[0];
} else {
try {
index = stoi(line);
}
catch (const exception&) {
index = -1;
}
if (index >= 0 && index < static_cast<int>(files.size())) {
report(index);
} else {
cout << "Invalid option" << endl;
}
continue;
}
switch (option) {
case 'q':
finished = true;
break;
case 'l':
show();
break;
case 'd':
index = getIndex("delete");
if (index == -1)
break;
filename = files[index].getFilename();
cout << "Deleting " << filename << endl;
remove((path + "/" + filename).c_str());
files.erase(files.begin() + index);
cout << "File: " + filename + " deleted!" << endl;
show();
break;
case 'h':
index = getIndex("hide");
if (index == -1)
break;
filename = files[index].getFilename();
cout << "Hiding " << filename << endl;
rename((path + "/" + filename).c_str(), (path + "/." + filename).c_str());
files.erase(files.begin() + index);
show();
break;
case 's':
sortList();
show();
break;
case 'r':
index = getIndex("report");
if (index == -1)
break;
report(index);
break;
default:
cout << "Invalid option" << endl;
}
}
}
void Results::sortList()
{
cout << Colors::YELLOW() << "Choose sorting field (date='d', score='s', duration='u', model='m'): ";
string line;
char option;
getline(cin, line);
if (line.size() == 0)
return;
if (line.size() > 1) {
cout << "Invalid option" << endl;
return;
}
option = line[0];
switch (option) {
case 'd':
sortDate();
break;
case 's':
sortScore();
break;
case 'u':
sortDuration();
break;
case 'm':
sortModel();
break;
default:
cout << "Invalid option" << endl;
}
}
void Results::sortDate()
{
sort(files.begin(), files.end(), [](const Result& a, const Result& b) {
return a.getDate() > b.getDate();
});
}
void Results::sortModel()
{
sort(files.begin(), files.end(), [](const Result& a, const Result& b) {
return a.getModel() > b.getModel();
});
}
void Results::sortDuration()
{
sort(files.begin(), files.end(), [](const Result& a, const Result& b) {
return a.getDuration() > b.getDuration();
});
}
void Results::sortScore()
{
sort(files.begin(), files.end(), [](const Result& a, const Result& b) {
return a.getScore() > b.getScore();
});
}
void Results::manage()
{
if (files.size() == 0) {
cout << "No results found!" << endl;
exit(0);
}
show();
menu();
cout << "Done!" << endl;
}
}

src/Platform/Results.h Normal file (56 lines)

@@ -0,0 +1,56 @@
#ifndef RESULTS_H
#define RESULTS_H
#include <map>
#include <vector>
#include <string>
#include <nlohmann/json.hpp>
namespace platform {
using namespace std;
using json = nlohmann::json;
class Result {
public:
Result(const string& path, const string& filename);
json load() const;
string to_string() const;
string getFilename() const { return filename; };
string getDate() const { return date; };
double getScore() const { return score; };
string getTitle() const { return title; };
double getDuration() const { return duration; };
string getModel() const { return model; };
string getScoreName() const { return scoreName; };
private:
string path;
string filename;
string date;
double score;
string title;
double duration;
string model;
string scoreName;
};
class Results {
public:
Results(const string& path, const int max, const string& model, const string& score) : path(path), max(max), model(model), scoreName(score) { load(); };
void manage();
private:
string path;
int max;
string model;
string scoreName;
vector<Result> files;
void load(); // Loads the list of results
void show() const;
void report(const int index) const;
int getIndex(const string& intent) const;
void menu();
void sortList();
void sortDate();
void sortScore();
void sortModel();
void sortDuration();
};
};
#endif

src/Platform/list.cc Normal file (57 lines)

@@ -0,0 +1,57 @@
#include <iostream>
#include <locale>
#include "Paths.h"
#include "Colors.h"
#include "Datasets.h"
using namespace std;
const int BALANCE_LENGTH = 75;
struct separated : numpunct<char> {
char do_decimal_point() const { return ','; }
char do_thousands_sep() const { return '.'; }
string do_grouping() const { return "\03"; }
};
void outputBalance(const string& balance)
{
auto temp = string(balance);
while (temp.size() > BALANCE_LENGTH - 1) {
auto part = temp.substr(0, BALANCE_LENGTH);
cout << part << endl;
cout << setw(48) << " ";
temp = temp.substr(BALANCE_LENGTH);
}
cout << temp << endl;
}
int main(int argc, char** argv)
{
auto data = platform::Datasets(platform::Paths().datasets(), false);
locale mylocale(cout.getloc(), new separated);
locale::global(mylocale);
cout.imbue(mylocale);
cout << Colors::GREEN() << "Dataset Sampl. Feat. Cls. Balance" << endl;
string balanceBars = string(BALANCE_LENGTH, '=');
cout << "============================== ====== ===== === " << balanceBars << endl;
bool odd = true;
for (const auto& dataset : data.getNames()) {
auto color = odd ? Colors::CYAN() : Colors::BLUE();
cout << color << setw(30) << left << dataset << " ";
data.loadDataset(dataset);
auto nSamples = data.getNSamples(dataset);
cout << setw(6) << right << nSamples << " ";
cout << setw(5) << right << data.getFeatures(dataset).size() << " ";
cout << setw(3) << right << data.getNClasses(dataset) << " ";
stringstream oss;
string sep = "";
for (auto number : data.getClassesCounts(dataset)) {
oss << sep << setprecision(2) << fixed << (float)number / nSamples * 100.0 << "% (" << number << ")";
sep = " / ";
}
outputBalance(oss.str());
odd = !odd;
}
cout << Colors::RESET() << endl;
return 0;
}

src/Platform/main.cc Normal file (122 lines)

@@ -0,0 +1,122 @@
#include <iostream>
#include <argparse/argparse.hpp>
#include "platformUtils.h"
#include "Experiment.h"
#include "Datasets.h"
#include "DotEnv.h"
#include "Models.h"
#include "modelRegister.h"
#include "Paths.h"
using namespace std;
argparse::ArgumentParser manageArguments(int argc, char** argv)
{
auto env = platform::DotEnv();
argparse::ArgumentParser program("main");
program.add_argument("-d", "--dataset").default_value("").help("Dataset file name");
program.add_argument("-p", "--path")
.help("folder where the data files are located, default")
.default_value(string{ platform::Paths::datasets() });
program.add_argument("-m", "--model")
.help("Model to use " + platform::Models::instance()->toString())
.action([](const std::string& value) {
static const vector<string> choices = platform::Models::instance()->getNames();
if (find(choices.begin(), choices.end(), value) != choices.end()) {
return value;
}
throw runtime_error("Model must be one of " + platform::Models::instance()->toString());
}
);
program.add_argument("--title").default_value("").help("Experiment title");
program.add_argument("--discretize").help("Discretize input dataset").default_value((bool)stoi(env.get("discretize"))).implicit_value(true);
program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value((bool)stoi(env.get("stratified"))).implicit_value(true);
program.add_argument("-f", "--folds").help("Number of folds").default_value(stoi(env.get("n_folds"))).scan<'i', int>().action([](const string& value) {
try {
auto k = stoi(value);
if (k < 2) {
throw runtime_error("Number of folds must be greater than 1");
}
return k;
}
catch (const runtime_error& err) {
throw runtime_error(err.what());
}
catch (...) {
throw runtime_error("Number of folds must be an integer");
}});
auto seed_values = env.getSeeds();
program.add_argument("-s", "--seeds").nargs(1, 10).help("Random seeds. Set to -1 to have pseudo random").scan<'i', int>().default_value(seed_values);
try {
program.parse_args(argc, argv);
auto file_name = program.get<string>("dataset");
auto path = program.get<string>("path");
auto model_name = program.get<string>("model");
auto discretize_dataset = program.get<bool>("discretize");
auto stratified = program.get<bool>("stratified");
auto n_folds = program.get<int>("folds");
auto seeds = program.get<vector<int>>("seeds");
auto complete_file_name = path + file_name + ".arff";
auto title = program.get<string>("title");
if (title == "" && file_name == "") {
throw runtime_error("title is mandatory if dataset is not provided");
}
}
catch (const exception& err) {
cerr << err.what() << endl;
cerr << program;
exit(1);
}
return program;
}
int main(int argc, char** argv)
{
auto program = manageArguments(argc, argv);
bool saveResults = false;
auto file_name = program.get<string>("dataset");
auto path = program.get<string>("path");
auto model_name = program.get<string>("model");
auto discretize_dataset = program.get<bool>("discretize");
auto stratified = program.get<bool>("stratified");
auto n_folds = program.get<int>("folds");
auto seeds = program.get<vector<int>>("seeds");
vector<string> filesToTest;
auto datasets = platform::Datasets(path, true, platform::ARFF);
auto title = program.get<string>("title");
if (file_name != "") {
if (!datasets.isDataset(file_name)) {
cerr << "Dataset " << file_name << " not found" << endl;
exit(1);
}
if (title == "") {
title = "Test " + file_name + " " + model_name + " " + to_string(n_folds) + " folds";
}
filesToTest.push_back(file_name);
} else {
filesToTest = datasets.getNames();
saveResults = true;
}
/*
* Begin Processing
*/
auto env = platform::DotEnv();
auto experiment = platform::Experiment();
experiment.setTitle(title).setLanguage("cpp").setLanguageVersion("14.0.3");
experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform(env.get("platform"));
experiment.setStratified(stratified).setNFolds(n_folds).setScoreName("accuracy");
for (auto seed : seeds) {
experiment.addRandomSeed(seed);
}
platform::Timer timer;
timer.start();
experiment.go(filesToTest, path);
experiment.setDuration(timer.getDuration());
if (saveResults)
experiment.save(platform::Paths::results());
else
experiment.report();
cout << "Done!" << endl;
return 0;
}

src/Platform/manage.cc Normal file (41 lines)

@@ -0,0 +1,41 @@
#include <iostream>
#include <argparse/argparse.hpp>
#include "platformUtils.h"
#include "Paths.h"
#include "Results.h"
using namespace std;
argparse::ArgumentParser manageArguments(int argc, char** argv)
{
argparse::ArgumentParser program("manage");
program.add_argument("-n", "--number").default_value(0).help("Number of results to show (0 = all)").scan<'i', int>();
program.add_argument("-m", "--model").default_value("any").help("Filter results of the selected model)");
program.add_argument("-s", "--score").default_value("any").help("Filter results of the score name supplied");
try {
program.parse_args(argc, argv);
auto number = program.get<int>("number");
if (number < 0) {
throw runtime_error("Number of results must be greater than or equal to 0");
}
auto model = program.get<string>("model");
auto score = program.get<string>("score");
}
catch (const exception& err) {
cerr << err.what() << endl;
cerr << program;
exit(1);
}
return program;
}
int main(int argc, char** argv)
{
auto program = manageArguments(argc, argv);
auto number = program.get<int>("number");
auto model = program.get<string>("model");
auto score = program.get<string>("score");
auto results = platform::Results(platform::Paths::results(), number, model, score);
results.manage();
return 0;
}


@@ -0,0 +1,21 @@
#ifndef MODEL_REGISTER_H
#define MODEL_REGISTER_H
static platform::Registrar registrarT("TAN",
[](void) -> bayesnet::BaseClassifier* { return new bayesnet::TAN();});
static platform::Registrar registrarTLD("TANLd",
[](void) -> bayesnet::BaseClassifier* { return new bayesnet::TANLd();});
static platform::Registrar registrarS("SPODE",
[](void) -> bayesnet::BaseClassifier* { return new bayesnet::SPODE(2);});
static platform::Registrar registrarSLD("SPODELd",
[](void) -> bayesnet::BaseClassifier* { return new bayesnet::SPODELd(2);});
static platform::Registrar registrarK("KDB",
[](void) -> bayesnet::BaseClassifier* { return new bayesnet::KDB(2);});
static platform::Registrar registrarKLD("KDBLd",
[](void) -> bayesnet::BaseClassifier* { return new bayesnet::KDBLd(2);});
static platform::Registrar registrarA("AODE",
[](void) -> bayesnet::BaseClassifier* { return new bayesnet::AODE();});
static platform::Registrar registrarALD("AODELd",
[](void) -> bayesnet::BaseClassifier* { return new bayesnet::AODELd();});
static platform::Registrar registrarBA("BoostAODE",
[](void) -> bayesnet::BaseClassifier* { return new bayesnet::BoostAODE();});
#endif


@@ -1,4 +1,18 @@
#include "platformUtils.h" #include "platformUtils.h"
#include "Paths.h"
using namespace torch;
vector<string> split(const string& text, char delimiter)
{
vector<string> result;
stringstream ss(text);
string token;
while (getline(ss, token, delimiter)) {
result.push_back(token);
}
return result;
}
pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t>& X, mdlp::labels_t& y, vector<string> features) pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t>& X, mdlp::labels_t& y, vector<string> features)
{ {
@@ -14,7 +28,19 @@ pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t
return { Xd, maxes }; return { Xd, maxes };
} }
bool file_exists(const std::string& name) vector<mdlp::labels_t> discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y)
{
vector<mdlp::labels_t> Xd;
auto fimdlp = mdlp::CPPFImdlp();
for (int i = 0; i < X.size(); i++) {
fimdlp.fit(X[i], y);
mdlp::labels_t& xd = fimdlp.transform(X[i]);
Xd.push_back(xd);
}
return Xd;
}
bool file_exists(const string& name)
{ {
if (FILE* file = fopen(name.c_str(), "r")) { if (FILE* file = fopen(name.c_str(), "r")) {
fclose(file); fclose(file);
@@ -24,19 +50,51 @@ bool file_exists(const std::string& name)
} }
} }
tuple<vector<vector<int>>, vector<int>, vector<string>, string, map<string, vector<int>>> loadFile(string name) tuple<Tensor, Tensor, vector<string>, string, map<string, vector<int>>> loadDataset(const string& path, const string& name, bool class_last, bool discretize_dataset)
{ {
auto handler = ArffFiles(); auto handler = ArffFiles();
handler.load(PATH + static_cast<string>(name) + ".arff"); handler.load(path + static_cast<string>(name) + ".arff", class_last);
// Get Dataset X, y // Get Dataset X, y
vector<mdlp::samples_t>& X = handler.getX(); vector<mdlp::samples_t>& X = handler.getX();
mdlp::labels_t& y = handler.getY(); mdlp::labels_t& y = handler.getY();
// Get className & Features // Get className & Features
auto className = handler.getClassName(); auto className = handler.getClassName();
vector<string> features; vector<string> features;
for (auto feature : handler.getAttributes()) { auto attributes = handler.getAttributes();
features.push_back(feature.first); transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
Tensor Xd;
auto states = map<string, vector<int>>();
if (discretize_dataset) {
auto Xr = discretizeDataset(X, y);
Xd = torch::zeros({ static_cast<int>(Xr[0].size()), static_cast<int>(Xr.size()) }, torch::kInt32);
for (int i = 0; i < features.size(); ++i) {
states[features[i]] = vector<int>(*max_element(Xr[i].begin(), Xr[i].end()) + 1);
iota(begin(states[features[i]]), end(states[features[i]]), 0);
Xd.index_put_({ "...", i }, torch::tensor(Xr[i], torch::kInt32));
} }
states[className] = vector<int>(*max_element(y.begin(), y.end()) + 1);
iota(begin(states[className]), end(states[className]), 0);
} else {
Xd = torch::zeros({ static_cast<int>(X[0].size()), static_cast<int>(X.size()) }, torch::kFloat32);
for (int i = 0; i < features.size(); ++i) {
Xd.index_put_({ "...", i }, torch::tensor(X[i]));
}
}
return { Xd, torch::tensor(y, torch::kInt32), features, className, states };
}
tuple<vector<vector<int>>, vector<int>, vector<string>, string, map<string, vector<int>>> loadFile(const string& name)
{
auto handler = ArffFiles();
handler.load(platform::Paths::datasets() + static_cast<string>(name) + ".arff");
// Get Dataset X, y
vector<mdlp::samples_t>& X = handler.getX();
mdlp::labels_t& y = handler.getY();
// Get className & Features
auto className = handler.getClassName();
vector<string> features;
auto attributes = handler.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
// Discretize Dataset // Discretize Dataset
vector<mdlp::labels_t> Xd; vector<mdlp::labels_t> Xd;
map<string, int> maxes; map<string, int> maxes;
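A sketch of calling the new loader; the dataset name is illustrative and the ARFF file is assumed to exist under the datasets folder:

#include "platformUtils.h"
#include "Paths.h"
int main()
{
    // class_last=true: the class attribute is the last ARFF column;
    // discretize_dataset=true: returns MDLP-discretized integer tensors plus per-feature states
    auto [X, y, features, className, states] = loadDataset(platform::Paths::datasets(), "iris", true, true);
    return X.size(0) == y.size(0) ? 0 : 1; // samples are rows, features are columns
}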


@@ -1,5 +1,6 @@
 #ifndef PLATFORM_UTILS_H
 #define PLATFORM_UTILS_H
+#include <torch/torch.h>
 #include <string>
 #include <vector>
 #include <map>
@@ -10,6 +11,11 @@ using namespace std;
 const string PATH = "../../data/";
 bool file_exists(const std::string& name);
+vector<string> split(const string& text, char delimiter);
 pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t>& X, mdlp::labels_t& y, vector<string> features);
-tuple<vector<vector<int>>, vector<int>, vector<string>, string, map<string, vector<int>>> loadFile(string name);
+vector<mdlp::labels_t> discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y);
+pair<torch::Tensor, map<string, vector<int>>> discretizeTorch(torch::Tensor& X, torch::Tensor& y, vector<string>& features, const string& className);
+tuple<vector<vector<int>>, vector<int>, vector<string>, string, map<string, vector<int>>> loadFile(const string& name);
+tuple<torch::Tensor, torch::Tensor, vector<string>, string, map<string, vector<int>>> loadDataset(const string& path, const string& name, bool class_last, bool discretize_dataset);
+map<string, vector<int>> get_states(vector<string>& features, string className, map<string, int>& maxes);
 #endif //PLATFORM_UTILS_H


@@ -1,18 +0,0 @@
#ifndef TYPES_H
#define TYPES_H
#include <vector>
#include <map>
#include <stdexcept>
using namespace std;
namespace mdlp {
typedef float precision_t;
typedef vector<precision_t> samples_t;
typedef vector<int> labels_t;
typedef vector<size_t> indices_t;
typedef vector<precision_t> cutPoints_t;
typedef map<pair<int, int>, precision_t> cacheEnt_t;
typedef map<tuple<int, int, int>, precision_t> cacheIg_t;
}
#endif


@@ -9,29 +9,21 @@ TEST_CASE("Test Bayesian Network")
 {
 auto [Xd, y, features, className, states] = loadFile("iris");
-SECTION("Test Update Nodes")
-{
-auto net = bayesnet::Network();
-net.addNode("A", 3);
-REQUIRE(net.getStates() == 3);
-net.addNode("A", 5);
-REQUIRE(net.getStates() == 5);
-}
 SECTION("Test get features")
 {
 auto net = bayesnet::Network();
-net.addNode("A", 3);
-net.addNode("B", 5);
+net.addNode("A");
+net.addNode("B");
 REQUIRE(net.getFeatures() == vector<string>{"A", "B"});
-net.addNode("C", 2);
+net.addNode("C");
 REQUIRE(net.getFeatures() == vector<string>{"A", "B", "C"});
 }
 SECTION("Test get edges")
 {
 auto net = bayesnet::Network();
-net.addNode("A", 3);
-net.addNode("B", 5);
-net.addNode("C", 2);
+net.addNode("A");
+net.addNode("B");
+net.addNode("C");
 net.addEdge("A", "B");
 net.addEdge("B", "C");
 REQUIRE(net.getEdges() == vector<pair<string, string>>{ {"A", "B"}, { "B", "C" } });


@@ -2,8 +2,10 @@ if(ENABLE_TESTING)
 set(TEST_MAIN "unit_tests")
 include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
 include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
-set(TEST_SOURCES BayesModels.cc BayesNetwork.cc ${BayesNet_SOURCES} ${Platform_SOURCES})
+include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
+include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
+set(TEST_SOURCES BayesModels.cc BayesNetwork.cc ${BayesNet_SOURCE_DIR}/src/Platform/platformUtils.cc ${BayesNet_SOURCES})
 add_executable(${TEST_MAIN} ${TEST_SOURCES})
-target_link_libraries(${TEST_MAIN} PUBLIC "${TORCH_LIBRARIES}" Catch2::Catch2WithMain)
+target_link_libraries(${TEST_MAIN} PUBLIC "${TORCH_LIBRARIES}" ArffFiles mdlp Catch2::Catch2WithMain)
 add_test(NAME ${TEST_MAIN} COMMAND ${TEST_MAIN})
 endif(ENABLE_TESTING)