Compare commits
2 Commits
5f70449091
...
f519003766
Author | SHA1 | Date | |
---|---|---|---|
f519003766
|
|||
8ddfd58a50
|
6
.gitmodules
vendored
Normal file
6
.gitmodules
vendored
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
[submodule "lib/mdlp"]
|
||||||
|
path = lib/mdlp
|
||||||
|
url = https://github.com/rmontanana/mdlp
|
||||||
|
[submodule "lib/catch2"]
|
||||||
|
path = lib/catch2
|
||||||
|
url = https://github.com/catchorg/Catch2.git
|
@@ -33,12 +33,22 @@ set(CMAKE_BUILD_TYPE "Debug")
|
|||||||
# --------------
|
# --------------
|
||||||
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH})
|
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH})
|
||||||
|
|
||||||
|
include(AddGitSubmodule)
|
||||||
|
include(StaticAnalyzers) # clang-tidy
|
||||||
|
include(CodeCoverage)
|
||||||
|
|
||||||
|
# External libraries - dependencies of BayesNet
|
||||||
|
# ---------------------------------------------
|
||||||
|
# include(FetchContent)
|
||||||
|
add_git_submodule("lib/mdlp")
|
||||||
|
add_git_submodule("lib/catch2")
|
||||||
|
|
||||||
# Subdirectories
|
# Subdirectories
|
||||||
# --------------
|
# --------------
|
||||||
add_subdirectory(config)
|
add_subdirectory(config)
|
||||||
add_subdirectory(${BayesNet_SOURCE_DIR}/src/BayesNet)
|
add_subdirectory(lib/Files)
|
||||||
add_subdirectory(${BayesNet_SOURCE_DIR}/src/Platform)
|
add_subdirectory(src/BayesNet)
|
||||||
|
add_subdirectory(src/Platform)
|
||||||
add_subdirectory(sample)
|
add_subdirectory(sample)
|
||||||
|
|
||||||
file(GLOB BayesNet_HEADERS CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/BayesNet/*.h ${BayesNet_SOURCE_DIR}/BayesNet/*.hpp)
|
file(GLOB BayesNet_HEADERS CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/BayesNet/*.h ${BayesNet_SOURCE_DIR}/BayesNet/*.hpp)
|
||||||
@@ -51,14 +61,14 @@ if (ENABLE_TESTING)
|
|||||||
MESSAGE("Testing enabled")
|
MESSAGE("Testing enabled")
|
||||||
enable_testing()
|
enable_testing()
|
||||||
if (CODE_COVERAGE)
|
if (CODE_COVERAGE)
|
||||||
include(CodeCoverage)
|
#include(CodeCoverage)
|
||||||
MESSAGE("Code coverage enabled")
|
MESSAGE("Code coverage enabled")
|
||||||
set(CMAKE_C_FLAGS " ${CMAKE_C_FLAGS} -fprofile-arcs -ftest-coverage")
|
set(CMAKE_C_FLAGS " ${CMAKE_C_FLAGS} -fprofile-arcs -ftest-coverage")
|
||||||
set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage")
|
set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage")
|
||||||
SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage")
|
SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage")
|
||||||
endif (CODE_COVERAGE)
|
endif (CODE_COVERAGE)
|
||||||
find_package(Catch2 3 REQUIRED)
|
#find_package(Catch2 3 REQUIRED)
|
||||||
include(CTest)
|
include(CTest)
|
||||||
include(Catch)
|
#include(Catch)
|
||||||
add_subdirectory(tests)
|
add_subdirectory(tests)
|
||||||
endif (ENABLE_TESTING)
|
endif (ENABLE_TESTING)
|
12
cmake/modules/AddGitSubmodule.cmake
Normal file
12
cmake/modules/AddGitSubmodule.cmake
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
|
||||||
|
# add_git_submodule(<dir>)
#
# Makes sure the git submodule at <dir> is checked out and then adds it to the
# build with add_subdirectory().
#
#   <dir>  Path of the submodule. A relative path is resolved against
#          PROJECT_SOURCE_DIR for the existence check, and is passed unchanged
#          to git (run from PROJECT_SOURCE_DIR) and to add_subdirectory().
#
# The submodule is only fetched when <dir>/CMakeLists.txt is missing, so an
# already populated checkout is never touched by a re-configure.
#
# Example:
#   add_git_submodule("lib/mdlp")
function(add_git_submodule dir)
  find_package(Git REQUIRED)

  # if(EXISTS) is only well-defined for full paths, so build one explicitly.
  if(IS_ABSOLUTE "${dir}")
    set(submodule_dir "${dir}")
  else()
    set(submodule_dir "${PROJECT_SOURCE_DIR}/${dir}")
  endif()

  if(NOT EXISTS "${submodule_dir}/CMakeLists.txt")
    message(STATUS "🚨 Adding git submodule => ${dir}")
    execute_process(
      COMMAND "${GIT_EXECUTABLE}" submodule update --init --recursive -- "${dir}"
      WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
      RESULT_VARIABLE git_result
    )
    # Stop here with a clear message instead of failing later in add_subdirectory().
    if(NOT git_result EQUAL 0)
      message(FATAL_ERROR
        "git submodule update --init --recursive -- ${dir} failed (${git_result})")
    endif()
  endif()

  add_subdirectory("${dir}")
endfunction()
|
22
cmake/modules/StaticAnalyzers.cmake
Normal file
22
cmake/modules/StaticAnalyzers.cmake
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
# Optional clang-tidy integration, active when ENABLE_CLANG_TIDY is true.
#
# When clang-tidy is found, CMAKE_CXX_CLANG_TIDY is set in the cache, a
# `clang-tidy` convenience target that builds ${CMAKE_PROJECT_NAME} is added,
# and CMAKE_EXPORT_COMPILE_COMMANDS is switched on. When it is not found, a
# warning is printed and CMAKE_CXX_CLANG_TIDY is cleared.
if(ENABLE_CLANG_TIDY)
  find_program(CLANG_TIDY_COMMAND NAMES clang-tidy)

  if(NOT CLANG_TIDY_COMMAND)
    # The message names the option that actually guards this block.
    message(WARNING "🔴 ENABLE_CLANG_TIDY is ON but clang-tidy is not found!")
    set(CMAKE_CXX_CLANG_TIDY "" CACHE STRING "" FORCE)
  else()
    message(STATUS "🟢 ENABLE_CLANG_TIDY is ON")
    set(CLANGTIDY_EXTRA_ARGS
      "-extra-arg=-Wno-unknown-warning-option"
    )
    set(CMAKE_CXX_CLANG_TIDY
      "${CLANG_TIDY_COMMAND};-p=${CMAKE_BINARY_DIR};${CLANGTIDY_EXTRA_ARGS}"
      CACHE STRING "" FORCE)

    # Building the project target is what triggers clang-tidy (through
    # CMAKE_CXX_CLANG_TIDY). The target must not build itself: the former
    # second command `--build ... --target clang-tidy` recursed into this
    # very target.
    add_custom_target(clang-tidy
      COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} --target ${CMAKE_PROJECT_NAME}
      COMMENT "Running clang-tidy..."
      VERBATIM
    )
    set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
  endif()
endif()
|
2
lib/Files/CMakeLists.txt
Normal file
2
lib/Files/CMakeLists.txt
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
# Library built from ArffFiles.cc.
# The source directory is exported as a PUBLIC include path so that targets
# linking ArffFiles can find its headers without a directory-scoped
# include_directories() of their own (assumes ArffFiles.h sits next to
# ArffFiles.cc - TODO confirm).
add_library(ArffFiles ArffFiles.cc)
target_include_directories(ArffFiles PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
|
1
lib/catch2
Submodule
1
lib/catch2
Submodule
Submodule lib/catch2 added at 4acc51828f
1
lib/mdlp
Submodule
1
lib/mdlp
Submodule
Submodule lib/mdlp added at fbffc3a9c4
@@ -1,4 +1,6 @@
|
|||||||
include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
|
include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
|
||||||
include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
|
include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
|
||||||
add_executable(sample sample.cc ${BayesNet_SOURCE_DIR}/src/Platform/ArffFiles.cc ${BayesNet_SOURCE_DIR}/src/Platform/CPPFImdlp.cpp ${BayesNet_SOURCE_DIR}/src/Platform/Metrics.cpp ${BayesNet_SOURCE_DIR}/src/Platform/typesFImdlp.h ${BayesNet_HEADERS})
|
include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
|
||||||
target_link_libraries(sample BayesNet "${TORCH_LIBRARIES}")
|
include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
|
||||||
|
add_executable(BayesNetSample sample.cc)
|
||||||
|
target_link_libraries(BayesNetSample BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}")
|
@@ -5,7 +5,7 @@
|
|||||||
#include <getopt.h>
|
#include <getopt.h>
|
||||||
#include "ArffFiles.h"
|
#include "ArffFiles.h"
|
||||||
#include "Network.h"
|
#include "Network.h"
|
||||||
#include "Metrics.hpp"
|
#include "BayesMetrics.h"
|
||||||
#include "CPPFImdlp.h"
|
#include "CPPFImdlp.h"
|
||||||
#include "KDB.h"
|
#include "KDB.h"
|
||||||
#include "SPODE.h"
|
#include "SPODE.h"
|
||||||
|
@@ -2,7 +2,7 @@
|
|||||||
#define CLASSIFIERS_H
|
#define CLASSIFIERS_H
|
||||||
#include <torch/torch.h>
|
#include <torch/torch.h>
|
||||||
#include "Network.h"
|
#include "Network.h"
|
||||||
#include "Metrics.hpp"
|
#include "BayesMetrics.h"
|
||||||
using namespace std;
|
using namespace std;
|
||||||
using namespace torch;
|
using namespace torch;
|
||||||
|
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
#include "Metrics.hpp"
|
#include "BayesMetrics.h"
|
||||||
#include "Mst.h"
|
#include "Mst.h"
|
||||||
using namespace std;
|
using namespace std;
|
||||||
namespace bayesnet {
|
namespace bayesnet {
|
@@ -1,2 +1,2 @@
|
|||||||
add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc Metrics.cc BaseClassifier.cc KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc Mst.cc)
|
add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc BaseClassifier.cc KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc Mst.cc)
|
||||||
target_link_libraries(BayesNet "${TORCH_LIBRARIES}")
|
target_link_libraries(BayesNet "${TORCH_LIBRARIES}")
|
@@ -2,7 +2,7 @@
|
|||||||
#define ENSEMBLE_H
|
#define ENSEMBLE_H
|
||||||
#include <torch/torch.h>
|
#include <torch/torch.h>
|
||||||
#include "BaseClassifier.h"
|
#include "BaseClassifier.h"
|
||||||
#include "Metrics.hpp"
|
#include "BayesMetrics.h"
|
||||||
#include "bayesnetUtils.h"
|
#include "bayesnetUtils.h"
|
||||||
using namespace std;
|
using namespace std;
|
||||||
using namespace torch;
|
using namespace torch;
|
||||||
|
@@ -1,4 +1,6 @@
|
|||||||
include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
|
include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
|
||||||
include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
|
include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
|
||||||
add_executable(main Experiment.cc ArffFiles.cc CPPFImdlp.cpp Metrics.cpp platformUtils.cc)
|
include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
|
||||||
target_link_libraries(main BayesNet "${TORCH_LIBRARIES}")
|
include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
|
||||||
|
add_executable(main Experiment.cc platformUtils.cc)
|
||||||
|
target_link_libraries(main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}")
|
@@ -1,221 +0,0 @@
|
|||||||
#include <numeric>
|
|
||||||
#include <algorithm>
|
|
||||||
#include <set>
|
|
||||||
#include <cmath>
|
|
||||||
#include "CPPFImdlp.h"
|
|
||||||
#include "Metrics.h"
|
|
||||||
|
|
||||||
namespace mdlp {
|
|
||||||
|
|
||||||
// Builds a discretizer with an explicit minimum interval length, maximum
// recursion depth and proposed number (or fraction) of cut points.
// The values are only stored here; they are validated later by fit().
CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed)
    : min_length(min_length_)
    , max_depth(max_depth_)
    , proposed_cuts(proposed)
{
}

// Default construction keeps the in-class member initializers.
CPPFImdlp::CPPFImdlp() = default;

CPPFImdlp::~CPPFImdlp() = default;
|
|
||||||
|
|
||||||
size_t CPPFImdlp::compute_max_num_cut_points() const
|
|
||||||
{
|
|
||||||
// Set the actual maximum number of cut points as a number or as a percentage of the number of samples
|
|
||||||
if (proposed_cuts == 0) {
|
|
||||||
return numeric_limits<size_t>::max();
|
|
||||||
}
|
|
||||||
if (proposed_cuts < 0 || proposed_cuts > static_cast<float>(X.size())) {
|
|
||||||
throw invalid_argument("wrong proposed num_cuts value");
|
|
||||||
}
|
|
||||||
if (proposed_cuts < 1)
|
|
||||||
return static_cast<size_t>(round(static_cast<float>(X.size()) * proposed_cuts));
|
|
||||||
return static_cast<size_t>(proposed_cuts);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Learns the cut points for the samples X_ with class labels y_.
// Throws invalid_argument when the cut-point proposal is out of range, when
// X_ and y_ differ in size or are empty, when min_length < 3 or when
// max_depth < 1.
void CPPFImdlp::fit(samples_t& X_, labels_t& y_)
{
    // Every precondition raises the same exception type with its own message.
    const auto require = [](bool satisfied, const char* message) {
        if (!satisfied) {
            throw invalid_argument(message);
        }
    };

    // Take a copy of the data and reset the results of any previous fit.
    X = X_;
    y = y_;
    num_cut_points = compute_max_num_cut_points();
    depth = 0;
    discretizedData.clear();
    cutPoints.clear();

    require(X.size() == y.size(), "X and y must have the same size");
    require(!(X.empty() || y.empty()), "X and y must have at least one element");
    require(!(min_length < 3), "min_length must be greater than 2");
    require(!(max_depth < 1), "max_depth must be greater than 0");

    indices = sortIndices(X_, y_);
    metrics.setData(y, indices);
    computeCutPoints(0, X.size(), 1);
    sort(cutPoints.begin(), cutPoints.end());

    // Drop cut points one at a time until the requested maximum is respected.
    while (num_cut_points > 0 && cutPoints.size() > num_cut_points) {
        resizeCutPoints();
    }
}
|
|
||||||
|
|
||||||
// Given a candidate position `cut` inside the sorted interval [start, end),
// moves it off a run of equal X values and returns the pair
// { midpoint between the value at the final position and the previous
//   distinct value, final position }.
// All index arithmetic is done on size_t and relies on modular wrap-around,
// exactly as the expressions below are written.
pair<precision_t, size_t> CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end)
{
    size_t lower = cut - 1 >= start ? cut - 1 : cut;
    size_t upper = cut + 1 < end ? cut + 1 : cut;
    const precision_t pivot = X[indices[cut]];
    precision_t before = X[indices[lower]];
    precision_t after = X[indices[upper]];

    // definition 2 of the paper => X[t-1] < X[t]
    // Walk left to the first value equal to the pivot.
    while (lower > start && pivot == before) {
        before = X[indices[--lower]];
    }
    // True when the duplicates reach the beginning of the interval.
    const bool backWall = lower == start && pivot == before;

    // Walk right to the last value equal to the pivot.
    while (upper < end - 1 && pivot == after) {
        after = X[indices[++upper]];
    }

    const size_t duplicatesBefore = cut - 1 - lower;
    const size_t duplicatesAfter = upper - cut - 1;

    // Against the back wall the cut moves past the run, otherwise to its start.
    size_t position;
    if (backWall) {
        position = cut + duplicatesAfter + 1;
    } else {
        position = cut - duplicatesBefore;
    }
    return { (X[indices[position]] + before) / 2, position };
}
|
|
||||||
|
|
||||||
void CPPFImdlp::computeCutPoints(size_t start, size_t end, int depth_)
|
|
||||||
{
|
|
||||||
size_t cut;
|
|
||||||
pair<precision_t, size_t> result;
|
|
||||||
// Check if the interval length and the depth are Ok
|
|
||||||
if (end - start < min_length || depth_ > max_depth)
|
|
||||||
return;
|
|
||||||
depth = depth_ > depth ? depth_ : depth;
|
|
||||||
cut = getCandidate(start, end);
|
|
||||||
if (cut == numeric_limits<size_t>::max())
|
|
||||||
return;
|
|
||||||
if (mdlp(start, cut, end)) {
|
|
||||||
result = valueCutPoint(start, cut, end);
|
|
||||||
cut = result.second;
|
|
||||||
cutPoints.push_back(result.first);
|
|
||||||
computeCutPoints(start, cut, depth_ + 1);
|
|
||||||
computeCutPoints(cut, end, depth_ + 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t CPPFImdlp::getCandidate(size_t start, size_t end)
|
|
||||||
{
|
|
||||||
/* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which
|
|
||||||
E(A, TA; S) is minimal amongst all the candidate cut points. */
|
|
||||||
size_t candidate = numeric_limits<size_t>::max();
|
|
||||||
size_t elements = end - start;
|
|
||||||
bool sameValues = true;
|
|
||||||
precision_t entropy_left;
|
|
||||||
precision_t entropy_right;
|
|
||||||
precision_t minEntropy;
|
|
||||||
// Check if all the values of the variable in the interval are the same
|
|
||||||
for (size_t idx = start + 1; idx < end; idx++) {
|
|
||||||
if (X[indices[idx]] != X[indices[start]]) {
|
|
||||||
sameValues = false;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (sameValues)
|
|
||||||
return candidate;
|
|
||||||
minEntropy = metrics.entropy(start, end);
|
|
||||||
for (size_t idx = start + 1; idx < end; idx++) {
|
|
||||||
// Cutpoints are always on boundaries (definition 2)
|
|
||||||
if (y[indices[idx]] == y[indices[idx - 1]])
|
|
||||||
continue;
|
|
||||||
entropy_left = precision_t(idx - start) / static_cast<precision_t>(elements) * metrics.entropy(start, idx);
|
|
||||||
entropy_right = precision_t(end - idx) / static_cast<precision_t>(elements) * metrics.entropy(idx, end);
|
|
||||||
if (entropy_left + entropy_right < minEntropy) {
|
|
||||||
minEntropy = entropy_left + entropy_right;
|
|
||||||
candidate = idx;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return candidate;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end)
|
|
||||||
{
|
|
||||||
int k;
|
|
||||||
int k1;
|
|
||||||
int k2;
|
|
||||||
precision_t ig;
|
|
||||||
precision_t delta;
|
|
||||||
precision_t ent;
|
|
||||||
precision_t ent1;
|
|
||||||
precision_t ent2;
|
|
||||||
auto N = precision_t(end - start);
|
|
||||||
k = metrics.computeNumClasses(start, end);
|
|
||||||
k1 = metrics.computeNumClasses(start, cut);
|
|
||||||
k2 = metrics.computeNumClasses(cut, end);
|
|
||||||
ent = metrics.entropy(start, end);
|
|
||||||
ent1 = metrics.entropy(start, cut);
|
|
||||||
ent2 = metrics.entropy(cut, end);
|
|
||||||
ig = metrics.informationGain(start, cut, end);
|
|
||||||
delta = static_cast<precision_t>(log2(pow(3, precision_t(k)) - 2) -
|
|
||||||
(precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2));
|
|
||||||
precision_t term = 1 / N * (log2(N - 1) + delta);
|
|
||||||
return ig > term;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
|
|
||||||
indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_)
|
|
||||||
{
|
|
||||||
indices_t idx(X_.size());
|
|
||||||
iota(idx.begin(), idx.end(), 0);
|
|
||||||
stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) {
|
|
||||||
if (X_[i1] == X_[i2])
|
|
||||||
return y_[i1] < y_[i2];
|
|
||||||
else
|
|
||||||
return X_[i1] < X_[i2];
|
|
||||||
});
|
|
||||||
return idx;
|
|
||||||
}
|
|
||||||
|
|
||||||
void CPPFImdlp::resizeCutPoints()
|
|
||||||
{
|
|
||||||
//Compute entropy of each of the whole cutpoint set and discards the biggest value
|
|
||||||
precision_t maxEntropy = 0;
|
|
||||||
precision_t entropy;
|
|
||||||
size_t maxEntropyIdx = 0;
|
|
||||||
size_t begin = 0;
|
|
||||||
size_t end;
|
|
||||||
for (size_t idx = 0; idx < cutPoints.size(); idx++) {
|
|
||||||
end = begin;
|
|
||||||
while (X[indices[end]] < cutPoints[idx] && end < X.size())
|
|
||||||
end++;
|
|
||||||
entropy = metrics.entropy(begin, end);
|
|
||||||
if (entropy > maxEntropy) {
|
|
||||||
maxEntropy = entropy;
|
|
||||||
maxEntropyIdx = idx;
|
|
||||||
}
|
|
||||||
begin = end;
|
|
||||||
}
|
|
||||||
cutPoints.erase(cutPoints.begin() + static_cast<long>(maxEntropyIdx));
|
|
||||||
}
|
|
||||||
// Discretizes `data` with the current cut points: each value is mapped to the
// number of cut points that are less than or equal to it (upper_bound).
// Returns a reference to the internal result buffer, which is overwritten by
// every call.
labels_t& CPPFImdlp::transform(const samples_t& data)
{
    // Start from an empty buffer; without this a second call appended its
    // results to those of the previous call.
    discretizedData.clear();
    discretizedData.reserve(data.size());
    for (const precision_t& item : data) {
        const auto upper = upper_bound(cutPoints.begin(), cutPoints.end(), item);
        discretizedData.push_back(static_cast<labels_t::value_type>(upper - cutPoints.begin()));
    }
    return discretizedData;
}
|
|
||||||
}
|
|
@@ -1,45 +0,0 @@
|
|||||||
#ifndef CPPFIMDLP_H
|
|
||||||
#define CPPFIMDLP_H
|
|
||||||
|
|
||||||
#include "typesFImdlp.h"
|
|
||||||
#include "Metrics.h"
|
|
||||||
#include <limits>
|
|
||||||
#include <utility>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
namespace mdlp {
|
|
||||||
class CPPFImdlp {
|
|
||||||
protected:
|
|
||||||
size_t min_length = 3;
|
|
||||||
int depth = 0;
|
|
||||||
int max_depth = numeric_limits<int>::max();
|
|
||||||
float proposed_cuts = 0;
|
|
||||||
indices_t indices = indices_t();
|
|
||||||
samples_t X = samples_t();
|
|
||||||
labels_t y = labels_t();
|
|
||||||
Metrics metrics = Metrics(y, indices);
|
|
||||||
cutPoints_t cutPoints;
|
|
||||||
size_t num_cut_points = numeric_limits<size_t>::max();
|
|
||||||
labels_t discretizedData = labels_t();
|
|
||||||
|
|
||||||
static indices_t sortIndices(samples_t&, labels_t&);
|
|
||||||
|
|
||||||
void computeCutPoints(size_t, size_t, int);
|
|
||||||
void resizeCutPoints();
|
|
||||||
bool mdlp(size_t, size_t, size_t);
|
|
||||||
size_t getCandidate(size_t, size_t);
|
|
||||||
size_t compute_max_num_cut_points() const;
|
|
||||||
pair<precision_t, size_t> valueCutPoint(size_t, size_t, size_t);
|
|
||||||
|
|
||||||
public:
|
|
||||||
CPPFImdlp();
|
|
||||||
CPPFImdlp(size_t, int, float);
|
|
||||||
~CPPFImdlp();
|
|
||||||
void fit(samples_t&, labels_t&);
|
|
||||||
inline cutPoints_t getCutPoints() const { return cutPoints; };
|
|
||||||
labels_t& transform(const samples_t&);
|
|
||||||
inline int get_depth() const { return depth; };
|
|
||||||
static inline string version() { return "1.1.2"; };
|
|
||||||
};
|
|
||||||
}
|
|
||||||
#endif
|
|
@@ -5,7 +5,7 @@
|
|||||||
#include <getopt.h>
|
#include <getopt.h>
|
||||||
#include "ArffFiles.h"
|
#include "ArffFiles.h"
|
||||||
#include "Network.h"
|
#include "Network.h"
|
||||||
#include "Metrics.hpp"
|
#include "BayesMetrics.h"
|
||||||
#include "CPPFImdlp.h"
|
#include "CPPFImdlp.h"
|
||||||
#include "KDB.h"
|
#include "KDB.h"
|
||||||
#include "SPODE.h"
|
#include "SPODE.h"
|
||||||
|
@@ -1,78 +0,0 @@
|
|||||||
#include "Metrics.h"
|
|
||||||
#include <set>
|
|
||||||
#include <cmath>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
namespace mdlp {
|
|
||||||
// Binds the metrics object to the caller's label and index vectors (they are
// kept as references) and counts the distinct classes over the whole index
// range.
Metrics::Metrics(labels_t& y_, indices_t& indices_)
    : y(y_)
    , indices(indices_)
    , numClasses(computeNumClasses(0, indices.size()))
{
}
|
|
||||||
|
|
||||||
int Metrics::computeNumClasses(size_t start, size_t end)
|
|
||||||
{
|
|
||||||
set<int> nClasses;
|
|
||||||
for (auto i = start; i < end; ++i) {
|
|
||||||
nClasses.insert(y[indices[i]]);
|
|
||||||
}
|
|
||||||
return static_cast<int>(nClasses.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
void Metrics::setData(const labels_t& y_, const indices_t& indices_)
|
|
||||||
{
|
|
||||||
indices = indices_;
|
|
||||||
y = y_;
|
|
||||||
numClasses = computeNumClasses(0, indices.size());
|
|
||||||
entropyCache.clear();
|
|
||||||
igCache.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Entropy (base 2) of the class labels at the sorted positions [start, end).
// Returns 0 for ranges with fewer than two elements (including inverted
// ranges). Results are memoised in entropyCache, keyed by (start, end).
precision_t Metrics::entropy(size_t start, size_t end)
{
    if (end <= start || end - start < 2) {
        return 0;
    }
    const cacheEnt_t::key_type key(start, end);
    const auto cached = entropyCache.find(key);
    if (cached != entropyCache.end()) {
        return cached->second;
    }
    // Count per label in an ordered map: unlike a vector of numClasses + 1
    // slots this cannot be overrun by labels outside 0..numClasses, and
    // iterating by ascending label keeps the summation order unchanged.
    map<int, size_t> counts;
    for (size_t i = start; i < end; ++i) {
        ++counts[y[indices[i]]];
    }
    const auto nElements = static_cast<precision_t>(end - start);
    precision_t ventropy = 0;
    for (const auto& labelCount : counts) {
        const precision_t p = static_cast<precision_t>(labelCount.second) / nElements;
        ventropy -= p * log2(p);
    }
    entropyCache[key] = ventropy;
    return ventropy;
}
|
|
||||||
|
|
||||||
// Information gain of splitting [start, end) at `cut`: the entropy of the
// whole interval minus the size-weighted entropies of [start, cut) and
// [cut, end). Results are memoised in igCache, keyed by (start, cut, end).
precision_t Metrics::informationGain(size_t start, size_t cut, size_t end)
{
    const cacheIg_t::key_type key(start, cut, end);
    const auto cached = igCache.find(key);
    if (cached != igCache.end()) {
        return cached->second;
    }
    const size_t nElementsLeft = cut - start;
    const size_t nElementsRight = end - cut;
    const size_t nElements = end - start;
    const precision_t entropyInterval = entropy(start, end);
    const precision_t entropyLeft = entropy(start, cut);
    const precision_t entropyRight = entropy(cut, end);
    const precision_t iGain = entropyInterval -
        (static_cast<precision_t>(nElementsLeft) * entropyLeft +
            static_cast<precision_t>(nElementsRight) * entropyRight) /
        static_cast<precision_t>(nElements);
    igCache[key] = iGain;
    return iGain;
}
|
|
||||||
|
|
||||||
}
|
|
@@ -1,22 +0,0 @@
|
|||||||
#ifndef CCMETRICS_H
|
|
||||||
#define CCMETRICS_H
|
|
||||||
|
|
||||||
#include "typesFImdlp.h"
|
|
||||||
|
|
||||||
namespace mdlp {
|
|
||||||
class Metrics {
|
|
||||||
protected:
|
|
||||||
labels_t& y;
|
|
||||||
indices_t& indices;
|
|
||||||
int numClasses;
|
|
||||||
cacheEnt_t entropyCache = cacheEnt_t();
|
|
||||||
cacheIg_t igCache = cacheIg_t();
|
|
||||||
public:
|
|
||||||
Metrics(labels_t&, indices_t&);
|
|
||||||
void setData(const labels_t&, const indices_t&);
|
|
||||||
int computeNumClasses(size_t, size_t);
|
|
||||||
precision_t entropy(size_t, size_t);
|
|
||||||
precision_t informationGain(size_t, size_t, size_t);
|
|
||||||
};
|
|
||||||
}
|
|
||||||
#endif
|
|
@@ -1,18 +0,0 @@
|
|||||||
#ifndef TYPES_H
#define TYPES_H

// Shared type aliases of the mdlp namespace.
// Every standard header whose names are used below is included explicitly;
// <tuple>, <utility> and <cstddef> were previously only reachable through
// other headers.
#include <cstddef>
#include <map>
#include <stdexcept>
#include <tuple>
#include <utility>
#include <vector>

// Kept on purpose: the headers and sources that include this file use
// unqualified standard names (numeric_limits, pair, string, ...).
using namespace std;
namespace mdlp {
    typedef float precision_t;
    typedef vector<precision_t> samples_t;
    typedef vector<int> labels_t;
    typedef vector<size_t> indices_t;
    typedef vector<precision_t> cutPoints_t;
    // Cache keyed by (start, end).
    typedef map<pair<int, int>, precision_t> cacheEnt_t;
    // Cache keyed by (start, cut, end).
    typedef map<tuple<int, int, int>, precision_t> cacheIg_t;
}
#endif
|
|
@@ -2,8 +2,10 @@ if(ENABLE_TESTING)
|
|||||||
set(TEST_MAIN "unit_tests")
|
set(TEST_MAIN "unit_tests")
|
||||||
include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
|
include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
|
||||||
include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
|
include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
|
||||||
set(TEST_SOURCES BayesModels.cc BayesNetwork.cc ${BayesNet_SOURCES} ${Platform_SOURCES})
|
include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
|
||||||
|
include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
|
||||||
|
set(TEST_SOURCES BayesModels.cc BayesNetwork.cc ${BayesNet_SOURCE_DIR}/src/Platform/platformUtils.cc ${BayesNet_SOURCES})
|
||||||
add_executable(${TEST_MAIN} ${TEST_SOURCES})
|
add_executable(${TEST_MAIN} ${TEST_SOURCES})
|
||||||
target_link_libraries(${TEST_MAIN} PUBLIC "${TORCH_LIBRARIES}" Catch2::Catch2WithMain)
|
target_link_libraries(${TEST_MAIN} PUBLIC "${TORCH_LIBRARIES}" ArffFiles mdlp Catch2::Catch2WithMain)
|
||||||
add_test(NAME ${TEST_MAIN} COMMAND ${TEST_MAIN})
|
add_test(NAME ${TEST_MAIN} COMMAND ${TEST_MAIN})
|
||||||
endif(ENABLE_TESTING)
|
endif(ENABLE_TESTING)
|
||||||
|
Reference in New Issue
Block a user