Fix CUDA and mdlp library issues
This commit is contained in:
parent
9a323cd7a3
commit
f0f3d9ad6e
@ -23,6 +23,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
|
|
||||||
### Internal
|
### Internal
|
||||||
|
|
||||||
|
- Ignore CUDA language in CMake CodeCoverage module.
|
||||||
|
- Remove mdlp library from submodules and add it as a dependency.
|
||||||
- Create library ShuffleArffFile to limit the number of samples with a parameter and shuffle them.
|
- Create library ShuffleArffFile to limit the number of samples with a parameter and shuffle them.
|
||||||
- Refactor catch2 library location to tests/lib
|
- Refactor catch2 library location to tests/lib
|
||||||
- Refactor loadDataset function in tests.
|
- Refactor loadDataset function in tests.
|
||||||
|
@ -49,11 +49,12 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug")
|
|||||||
set(CODE_COVERAGE ON)
|
set(CODE_COVERAGE ON)
|
||||||
endif (CMAKE_BUILD_TYPE STREQUAL "Debug")
|
endif (CMAKE_BUILD_TYPE STREQUAL "Debug")
|
||||||
|
|
||||||
|
get_property(LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES)
|
||||||
|
message(STATUS "Languages=${LANGUAGES}")
|
||||||
if (CODE_COVERAGE)
|
if (CODE_COVERAGE)
|
||||||
enable_testing()
|
enable_testing()
|
||||||
include(CodeCoverage)
|
include(CodeCoverage)
|
||||||
MESSAGE("Code coverage enabled")
|
MESSAGE(STATUS "Code coverage enabled")
|
||||||
SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage")
|
SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage")
|
||||||
endif (CODE_COVERAGE)
|
endif (CODE_COVERAGE)
|
||||||
|
|
||||||
@ -63,9 +64,16 @@ endif (ENABLE_CLANG_TIDY)
|
|||||||
|
|
||||||
# External libraries - dependencies of BayesNet
|
# External libraries - dependencies of BayesNet
|
||||||
# ---------------------------------------------
|
# ---------------------------------------------
|
||||||
|
|
||||||
|
find_library(FImdlp NAMES libfimdlp.a REQUIRED)
|
||||||
|
find_path(FImdlp_INCLUDE_DIRS REQUIRED NAMES fimdlp)
|
||||||
|
|
||||||
|
message(STATUS "FImdlp=${FImdlp}")
|
||||||
|
message(STATUS "FImdlp_INCLUDE_DIRS=${FImdlp_INCLUDE_DIRS}")
|
||||||
|
|
||||||
|
|
||||||
# include(FetchContent)
|
# include(FetchContent)
|
||||||
add_git_submodule("lib/json")
|
add_git_submodule("lib/json")
|
||||||
add_git_submodule("lib/mdlp")
|
|
||||||
|
|
||||||
# Subdirectories
|
# Subdirectories
|
||||||
# --------------
|
# --------------
|
||||||
@ -75,7 +83,7 @@ add_subdirectory(bayesnet)
|
|||||||
# Testing
|
# Testing
|
||||||
# -------
|
# -------
|
||||||
if (ENABLE_TESTING)
|
if (ENABLE_TESTING)
|
||||||
MESSAGE("Testing enabled")
|
MESSAGE(STATUS "Testing enabled")
|
||||||
add_subdirectory(tests/lib/catch2)
|
add_subdirectory(tests/lib/catch2)
|
||||||
include(CTest)
|
include(CTest)
|
||||||
add_subdirectory(tests)
|
add_subdirectory(tests)
|
||||||
|
@ -4,9 +4,10 @@ include_directories(
|
|||||||
${BayesNet_SOURCE_DIR}/lib/json/include
|
${BayesNet_SOURCE_DIR}/lib/json/include
|
||||||
${BayesNet_SOURCE_DIR}
|
${BayesNet_SOURCE_DIR}
|
||||||
${CMAKE_BINARY_DIR}/configured_files/include
|
${CMAKE_BINARY_DIR}/configured_files/include
|
||||||
|
${FImdlp_INCLUDE_DIRS}
|
||||||
)
|
)
|
||||||
|
|
||||||
file(GLOB_RECURSE Sources "*.cc")
|
file(GLOB_RECURSE Sources "*.cc")
|
||||||
|
|
||||||
add_library(BayesNet ${Sources})
|
add_library(BayesNet ${Sources})
|
||||||
target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}")
|
target_link_libraries(BayesNet ${FImdlp} "${TORCH_LIBRARIES}")
|
||||||
|
@ -9,7 +9,7 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <torch/torch.h>
|
#include <torch/torch.h>
|
||||||
#include <CPPFImdlp.h>
|
#include <fimdlp/CPPFImdlp.h>
|
||||||
#include "bayesnet/network/Network.h"
|
#include "bayesnet/network/Network.h"
|
||||||
#include "Classifier.h"
|
#include "Classifier.h"
|
||||||
|
|
||||||
|
@ -137,7 +137,7 @@
|
|||||||
|
|
||||||
include(CMakeParseArguments)
|
include(CMakeParseArguments)
|
||||||
|
|
||||||
option(CODE_COVERAGE_VERBOSE "Verbose information" FALSE)
|
option(CODE_COVERAGE_VERBOSE "Verbose information" TRUE)
|
||||||
|
|
||||||
# Check prereqs
|
# Check prereqs
|
||||||
find_program( GCOV_PATH gcov )
|
find_program( GCOV_PATH gcov )
|
||||||
@ -160,7 +160,11 @@ foreach(LANG ${LANGUAGES})
|
|||||||
endif()
|
endif()
|
||||||
elseif(NOT "${CMAKE_${LANG}_COMPILER_ID}" MATCHES "GNU"
|
elseif(NOT "${CMAKE_${LANG}_COMPILER_ID}" MATCHES "GNU"
|
||||||
AND NOT "${CMAKE_${LANG}_COMPILER_ID}" MATCHES "(LLVM)?[Ff]lang")
|
AND NOT "${CMAKE_${LANG}_COMPILER_ID}" MATCHES "(LLVM)?[Ff]lang")
|
||||||
message(FATAL_ERROR "Compiler is not GNU or Flang! Aborting...")
|
if ("${LANG}" MATCHES "CUDA")
|
||||||
|
message(STATUS "Ignoring CUDA")
|
||||||
|
else()
|
||||||
|
message(FATAL_ERROR "Compiler is not GNU or Flang! Aborting...")
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
endforeach()
|
endforeach()
|
||||||
|
|
||||||
|
2
lib/json
2
lib/json
@ -1 +1 @@
|
|||||||
Subproject commit b36f4c477c40356a0ae1204b567cca3c2a57d201
|
Subproject commit 378e091795a70fced276cd882bd8a6a428668fe5
|
@ -5,15 +5,22 @@ project(bayesnet_sample)
|
|||||||
set(CMAKE_CXX_STANDARD 17)
|
set(CMAKE_CXX_STANDARD 17)
|
||||||
|
|
||||||
find_package(Torch REQUIRED)
|
find_package(Torch REQUIRED)
|
||||||
find_library(BayesNet NAMES BayesNet.a libBayesNet.a REQUIRED)
|
find_library(BayesNet NAMES libBayesNet BayesNet libBayesNet.a REQUIRED)
|
||||||
|
find_path(Bayesnet_INCLUDE_DIRS REQUIRED NAMES bayesnet)
|
||||||
|
find_library(FImdlp NAMES libfimdlp.a PATHS REQUIRED)
|
||||||
|
find_path(FImdlp_INCLUDE_DIRS REQUIRED NAMES fimdlp)
|
||||||
|
|
||||||
|
message(STATUS "FImdlp=${FImdlp}")
|
||||||
|
message(STATUS "FImdlp_INCLUDE_DIRS=${FImdlp_INCLUDE_DIRS}")
|
||||||
|
message(STATUS "BayesNet=${BayesNet}")
|
||||||
|
message(STATUS "Bayesnet_INCLUDE_DIRS=${Bayesnet_INCLUDE_DIRS}")
|
||||||
|
|
||||||
include_directories(
|
include_directories(
|
||||||
../tests/lib/Files
|
../tests/lib/Files
|
||||||
lib/mdlp
|
|
||||||
lib/json/include
|
lib/json/include
|
||||||
/usr/local/include
|
/usr/local/include
|
||||||
|
${FImdlp_INCLUDE_DIRS}
|
||||||
)
|
)
|
||||||
|
|
||||||
add_subdirectory(lib/mdlp)
|
|
||||||
add_executable(bayesnet_sample sample.cc)
|
add_executable(bayesnet_sample sample.cc)
|
||||||
target_link_libraries(bayesnet_sample mdlp "${TORCH_LIBRARIES}" "${BayesNet}")
|
target_link_libraries(bayesnet_sample ${FImdlp} "${TORCH_LIBRARIES}" "${BayesNet}")
|
@ -1,11 +0,0 @@
|
|||||||
# Build script for the mdlp static/shared library (Fayyad & Irani discretizer).
cmake_minimum_required(VERSION 3.20)
project(mdlp)

# Opt in to the NEW behaviour of CMP0135 when the running CMake knows the policy.
if (POLICY CMP0135)
    cmake_policy(SET CMP0135 NEW)
endif ()

set(CMAKE_CXX_STANDARD 11)

add_library(mdlp CPPFImdlp.cpp Metrics.cpp)

# Publish the directory holding CPPFImdlp.h / Metrics.h / typesFImdlp.h as a usage
# requirement, so that targets linking `mdlp` no longer need to hard-code the
# path to this directory in their own include_directories() call.
target_include_directories(mdlp PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
|
||||||
|
|
@ -1,222 +0,0 @@
|
|||||||
#include <numeric>
|
|
||||||
#include <algorithm>
|
|
||||||
#include <set>
|
|
||||||
#include <cmath>
|
|
||||||
#include "CPPFImdlp.h"
|
|
||||||
#include "Metrics.h"
|
|
||||||
|
|
||||||
namespace mdlp {
|
|
||||||
|
|
||||||
// Builds a discretizer with explicit settings:
//   min_length_ -> stored in min_length (checked in fit(): must be > 2)
//   max_depth_  -> stored in max_depth  (checked in fit(): must be > 0)
//   proposed    -> stored in proposed_cuts (interpreted by compute_max_num_cut_points())
CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed)
    : min_length(min_length_), max_depth(max_depth_), proposed_cuts(proposed)
{
}

// Default construction relies on the in-class member initializers.
CPPFImdlp::CPPFImdlp() = default;

CPPFImdlp::~CPPFImdlp() = default;
|
|
||||||
|
|
||||||
size_t CPPFImdlp::compute_max_num_cut_points() const
|
|
||||||
{
|
|
||||||
// Set the actual maximum number of cut points as a number or as a percentage of the number of samples
|
|
||||||
if (proposed_cuts == 0) {
|
|
||||||
return numeric_limits<size_t>::max();
|
|
||||||
}
|
|
||||||
if (proposed_cuts < 0 || proposed_cuts > static_cast<float>(X.size())) {
|
|
||||||
throw invalid_argument("wrong proposed num_cuts value");
|
|
||||||
}
|
|
||||||
if (proposed_cuts < 1)
|
|
||||||
return static_cast<size_t>(round(static_cast<float>(X.size()) * proposed_cuts));
|
|
||||||
return static_cast<size_t>(proposed_cuts);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Learns the cut points of one variable.
//   X_ : sample values, y_ : class labels (same length as X_).
// Throws invalid_argument when proposed_cuts is out of range, when the sizes differ,
// when the input is empty, when min_length < 3 or when max_depth < 1.
// The resulting cut points are left sorted in cutPoints.
void CPPFImdlp::fit(samples_t& X_, labels_t& y_)
{
    // Work on private copies of the input
    X = X_;
    y = y_;
    // Evaluated before the other checks, so an invalid proposed_cuts is reported first
    num_cut_points = compute_max_num_cut_points();
    // Forget the results of any previous fit
    cutPoints.clear();
    discretizedData.clear();
    depth = 0;
    // Argument validation
    if (X.size() != y.size())
        throw invalid_argument("X and y must have the same size");
    if (X.empty() || y.empty())
        throw invalid_argument("X and y must have at least one element");
    if (min_length < 3)
        throw invalid_argument("min_length must be greater than 2");
    if (max_depth < 1)
        throw invalid_argument("max_depth must be greater than 0");
    // Sort once, hand the sorted view to the metrics helper and recurse over the whole range
    indices = sortIndices(X_, y_);
    metrics.setData(y, indices);
    computeCutPoints(0, X.size(), 1);
    sort(cutPoints.begin(), cutPoints.end());
    // Drop cut points one at a time until the requested maximum is honoured
    while (num_cut_points > 0 && cutPoints.size() > num_cut_points) {
        resizeCutPoints();
    }
}
|
|
||||||
|
|
||||||
// Moves a candidate cut so that it does not fall inside a run of equal X values and
// returns { cut value, adjusted cut index }.
//   start, end : half-open interval [start, end) of positions in `indices`
//   cut        : candidate position inside that interval
// The cut value is the midpoint between the value at the adjusted cut and the closest
// different value found before it (or the same value when the run reaches `start`).
pair<precision_t, size_t> CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end)
{
    // `cut > start` replaces `cut - 1 >= start`: with unsigned arithmetic the old test
    // wrapped around for cut == 0 and selected an out-of-range neighbour.
    size_t idxPrev = cut > start ? cut - 1 : cut;
    size_t idxNext = cut + 1 < end ? cut + 1 : cut;
    precision_t previous = X[indices[idxPrev]];
    precision_t actual = X[indices[cut]];
    precision_t next = X[indices[idxNext]];
    // definition 2 of the paper => X[t-1] < X[t]
    // walk back to the first equal value of X in the interval
    while (idxPrev > start && actual == previous) {
        previous = X[indices[--idxPrev]];
    }
    // true if the duplicates reach the beginning of the interval
    const bool backWall = idxPrev == start && actual == previous;
    // walk forward to the last equal value of X in the interval
    while (idxNext < end - 1 && actual == next) {
        next = X[indices[++idxNext]];
    }
    // # of duplicates before / after the cut point. Both are computed with unsigned
    // modular arithmetic, exactly as before; a wrapped value cancels out below.
    const size_t n = cut - 1 - idxPrev;
    const size_t m = idxNext - cut - 1;
    // Jump past the run when it touches the back wall, otherwise step back to its start
    cut = backWall ? cut + m + 1 : cut - n;
    actual = X[indices[cut]];
    return { (actual + previous) / 2, cut };
}
|
|
||||||
|
|
||||||
void CPPFImdlp::computeCutPoints(size_t start, size_t end, int depth_)
|
|
||||||
{
|
|
||||||
size_t cut;
|
|
||||||
pair<precision_t, size_t> result;
|
|
||||||
// Check if the interval length and the depth are Ok
|
|
||||||
if (end - start < min_length || depth_ > max_depth)
|
|
||||||
return;
|
|
||||||
depth = depth_ > depth ? depth_ : depth;
|
|
||||||
cut = getCandidate(start, end);
|
|
||||||
if (cut == numeric_limits<size_t>::max())
|
|
||||||
return;
|
|
||||||
if (mdlp(start, cut, end)) {
|
|
||||||
result = valueCutPoint(start, cut, end);
|
|
||||||
cut = result.second;
|
|
||||||
cutPoints.push_back(result.first);
|
|
||||||
computeCutPoints(start, cut, depth_ + 1);
|
|
||||||
computeCutPoints(cut, end, depth_ + 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t CPPFImdlp::getCandidate(size_t start, size_t end)
|
|
||||||
{
|
|
||||||
/* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which
|
|
||||||
E(A, TA; S) is minimal amongst all the candidate cut points. */
|
|
||||||
size_t candidate = numeric_limits<size_t>::max();
|
|
||||||
size_t elements = end - start;
|
|
||||||
bool sameValues = true;
|
|
||||||
precision_t entropy_left;
|
|
||||||
precision_t entropy_right;
|
|
||||||
precision_t minEntropy;
|
|
||||||
// Check if all the values of the variable in the interval are the same
|
|
||||||
for (size_t idx = start + 1; idx < end; idx++) {
|
|
||||||
if (X[indices[idx]] != X[indices[start]]) {
|
|
||||||
sameValues = false;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (sameValues)
|
|
||||||
return candidate;
|
|
||||||
minEntropy = metrics.entropy(start, end);
|
|
||||||
for (size_t idx = start + 1; idx < end; idx++) {
|
|
||||||
// Cutpoints are always on boundaries (definition 2)
|
|
||||||
if (y[indices[idx]] == y[indices[idx - 1]])
|
|
||||||
continue;
|
|
||||||
entropy_left = precision_t(idx - start) / static_cast<precision_t>(elements) * metrics.entropy(start, idx);
|
|
||||||
entropy_right = precision_t(end - idx) / static_cast<precision_t>(elements) * metrics.entropy(idx, end);
|
|
||||||
if (entropy_left + entropy_right < minEntropy) {
|
|
||||||
minEntropy = entropy_left + entropy_right;
|
|
||||||
candidate = idx;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return candidate;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end)
|
|
||||||
{
|
|
||||||
int k;
|
|
||||||
int k1;
|
|
||||||
int k2;
|
|
||||||
precision_t ig;
|
|
||||||
precision_t delta;
|
|
||||||
precision_t ent;
|
|
||||||
precision_t ent1;
|
|
||||||
precision_t ent2;
|
|
||||||
auto N = precision_t(end - start);
|
|
||||||
k = metrics.computeNumClasses(start, end);
|
|
||||||
k1 = metrics.computeNumClasses(start, cut);
|
|
||||||
k2 = metrics.computeNumClasses(cut, end);
|
|
||||||
ent = metrics.entropy(start, end);
|
|
||||||
ent1 = metrics.entropy(start, cut);
|
|
||||||
ent2 = metrics.entropy(cut, end);
|
|
||||||
ig = metrics.informationGain(start, cut, end);
|
|
||||||
delta = static_cast<precision_t>(log2(pow(3, precision_t(k)) - 2) -
|
|
||||||
(precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2));
|
|
||||||
precision_t term = 1 / N * (log2(N - 1) + delta);
|
|
||||||
return ig > term;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
|
|
||||||
indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_)
|
|
||||||
{
|
|
||||||
indices_t idx(X_.size());
|
|
||||||
iota(idx.begin(), idx.end(), 0);
|
|
||||||
stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) {
|
|
||||||
if (X_[i1] == X_[i2])
|
|
||||||
return y_[i1] < y_[i2];
|
|
||||||
else
|
|
||||||
return X_[i1] < X_[i2];
|
|
||||||
});
|
|
||||||
return idx;
|
|
||||||
}
|
|
||||||
|
|
||||||
void CPPFImdlp::resizeCutPoints()
|
|
||||||
{
|
|
||||||
//Compute entropy of each of the whole cutpoint set and discards the biggest value
|
|
||||||
precision_t maxEntropy = 0;
|
|
||||||
precision_t entropy;
|
|
||||||
size_t maxEntropyIdx = 0;
|
|
||||||
size_t begin = 0;
|
|
||||||
size_t end;
|
|
||||||
for (size_t idx = 0; idx < cutPoints.size(); idx++) {
|
|
||||||
end = begin;
|
|
||||||
while (X[indices[end]] < cutPoints[idx] && end < X.size())
|
|
||||||
end++;
|
|
||||||
entropy = metrics.entropy(begin, end);
|
|
||||||
if (entropy > maxEntropy) {
|
|
||||||
maxEntropy = entropy;
|
|
||||||
maxEntropyIdx = idx;
|
|
||||||
}
|
|
||||||
begin = end;
|
|
||||||
}
|
|
||||||
cutPoints.erase(cutPoints.begin() + static_cast<long>(maxEntropyIdx));
|
|
||||||
}
|
|
||||||
// Maps every value of `data` to the number of cut points that are not greater than it
// (the index of its interval) and returns a reference to the internal result buffer,
// which is overwritten by the next call.
labels_t& CPPFImdlp::transform(const samples_t& data)
{
    discretizedData.clear();
    discretizedData.reserve(data.size());
    const auto first = cutPoints.begin();
    const auto last = cutPoints.end();
    for (auto it = data.begin(); it != data.end(); ++it) {
        discretizedData.push_back(upper_bound(first, last, *it) - first);
    }
    return discretizedData;
}
|
|
||||||
}
|
|
@ -1,51 +0,0 @@
|
|||||||
// ***************************************************************
|
|
||||||
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
|
|
||||||
// SPDX-FileType: SOURCE
|
|
||||||
// SPDX-License-Identifier: MIT
|
|
||||||
// ***************************************************************
|
|
||||||
|
|
||||||
#ifndef CPPFIMDLP_H
|
|
||||||
#define CPPFIMDLP_H
|
|
||||||
|
|
||||||
#include "typesFImdlp.h"
|
|
||||||
#include "Metrics.h"
|
|
||||||
#include <limits>
|
|
||||||
#include <utility>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
namespace mdlp {
|
|
||||||
class CPPFImdlp {
|
|
||||||
protected:
|
|
||||||
size_t min_length = 3;
|
|
||||||
int depth = 0;
|
|
||||||
int max_depth = numeric_limits<int>::max();
|
|
||||||
float proposed_cuts = 0;
|
|
||||||
indices_t indices = indices_t();
|
|
||||||
samples_t X = samples_t();
|
|
||||||
labels_t y = labels_t();
|
|
||||||
Metrics metrics = Metrics(y, indices);
|
|
||||||
cutPoints_t cutPoints;
|
|
||||||
size_t num_cut_points = numeric_limits<size_t>::max();
|
|
||||||
labels_t discretizedData = labels_t();
|
|
||||||
|
|
||||||
static indices_t sortIndices(samples_t&, labels_t&);
|
|
||||||
|
|
||||||
void computeCutPoints(size_t, size_t, int);
|
|
||||||
void resizeCutPoints();
|
|
||||||
bool mdlp(size_t, size_t, size_t);
|
|
||||||
size_t getCandidate(size_t, size_t);
|
|
||||||
size_t compute_max_num_cut_points() const;
|
|
||||||
pair<precision_t, size_t> valueCutPoint(size_t, size_t, size_t);
|
|
||||||
|
|
||||||
public:
|
|
||||||
CPPFImdlp();
|
|
||||||
CPPFImdlp(size_t, int, float);
|
|
||||||
~CPPFImdlp();
|
|
||||||
void fit(samples_t&, labels_t&);
|
|
||||||
inline cutPoints_t getCutPoints() const { return cutPoints; };
|
|
||||||
labels_t& transform(const samples_t&);
|
|
||||||
inline int get_depth() const { return depth; };
|
|
||||||
static inline string version() { return "1.1.2"; };
|
|
||||||
};
|
|
||||||
}
|
|
||||||
#endif
|
|
@ -1,21 +0,0 @@
|
|||||||
MIT License
|
|
||||||
|
|
||||||
Copyright (c) 2022 Ricardo Montañana Gómez
|
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
|
||||||
in the Software without restriction, including without limitation the rights
|
|
||||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
copies of the Software, and to permit persons to whom the Software is
|
|
||||||
furnished to do so, subject to the following conditions:
|
|
||||||
|
|
||||||
The above copyright notice and this permission notice shall be included in all
|
|
||||||
copies or substantial portions of the Software.
|
|
||||||
|
|
||||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
SOFTWARE.
|
|
@ -1,78 +0,0 @@
|
|||||||
#include "Metrics.h"
|
|
||||||
#include <set>
|
|
||||||
#include <cmath>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
namespace mdlp {
|
|
||||||
// Binds the helper to the caller's label and index vectors (kept by reference) and
// counts the distinct labels they currently hold.
Metrics::Metrics(labels_t& y_, indices_t& indices_)
    : y(y_),
      indices(indices_),
      numClasses(computeNumClasses(0, indices.size()))
{
}
|
|
||||||
|
|
||||||
int Metrics::computeNumClasses(size_t start, size_t end)
|
|
||||||
{
|
|
||||||
set<int> nClasses;
|
|
||||||
for (auto i = start; i < end; ++i) {
|
|
||||||
nClasses.insert(y[indices[i]]);
|
|
||||||
}
|
|
||||||
return static_cast<int>(nClasses.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
void Metrics::setData(const labels_t& y_, const indices_t& indices_)
|
|
||||||
{
|
|
||||||
indices = indices_;
|
|
||||||
y = y_;
|
|
||||||
numClasses = computeNumClasses(0, indices.size());
|
|
||||||
entropyCache.clear();
|
|
||||||
igCache.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Entropy (base 2) of the labels found at positions [start, end) of `indices`.
// Returns 0 for ranges with fewer than two elements. Results are memoised in
// entropyCache under the key {start, end}.
precision_t Metrics::entropy(size_t start, size_t end)
{
    if (end - start < 2)
        return 0;
    // Single lookup instead of find() followed by operator[]
    const auto cached = entropyCache.find({ start, end });
    if (cached != entropyCache.end()) {
        return cached->second;
    }
    // Count per label value. The previous vector of numClasses + 1 slots was indexed by
    // the label itself, which overruns the buffer whenever a label is negative or larger
    // than the number of distinct labels (e.g. labels {0, 5}). A map keyed by label
    // accepts any int and is still visited in ascending label order.
    map<int, size_t> counts;
    for (size_t pos = start; pos < end; ++pos) {
        ++counts[y[indices[pos]]];
    }
    const auto nElements = static_cast<precision_t>(end - start);
    precision_t ventropy = 0;
    for (const auto& labelCount : counts) {
        const precision_t p = static_cast<precision_t>(labelCount.second) / nElements;
        ventropy -= p * log2(p);
    }
    entropyCache[{start, end}] = ventropy;
    return ventropy;
}
|
|
||||||
|
|
||||||
// Information gain obtained by splitting [start, end) at `cut`: the entropy of the whole
// range minus the size-weighted entropies of [start, cut) and [cut, end).
// Results are memoised in igCache under the key (start, cut, end).
precision_t Metrics::informationGain(size_t start, size_t cut, size_t end)
{
    const cacheIg_t::key_type key = make_tuple(start, cut, end);
    const auto hit = igCache.find(key);
    if (hit != igCache.end()) {
        return hit->second;
    }
    const auto weightLeft = static_cast<precision_t>(cut - start);
    const auto weightRight = static_cast<precision_t>(end - cut);
    const auto weightAll = static_cast<precision_t>(end - start);
    const precision_t entropyInterval = entropy(start, end);
    const precision_t entropyLeft = entropy(start, cut);
    const precision_t entropyRight = entropy(cut, end);
    const precision_t iGain = entropyInterval -
        (weightLeft * entropyLeft + weightRight * entropyRight) / weightAll;
    igCache[key] = iGain;
    return iGain;
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,28 +0,0 @@
|
|||||||
// ***************************************************************
|
|
||||||
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
|
|
||||||
// SPDX-FileType: SOURCE
|
|
||||||
// SPDX-License-Identifier: MIT
|
|
||||||
// ***************************************************************
|
|
||||||
|
|
||||||
#ifndef CCMETRICS_H
|
|
||||||
#define CCMETRICS_H
|
|
||||||
|
|
||||||
#include "typesFImdlp.h"
|
|
||||||
|
|
||||||
namespace mdlp {
|
|
||||||
class Metrics {
|
|
||||||
protected:
|
|
||||||
labels_t& y;
|
|
||||||
indices_t& indices;
|
|
||||||
int numClasses;
|
|
||||||
cacheEnt_t entropyCache = cacheEnt_t();
|
|
||||||
cacheIg_t igCache = cacheIg_t();
|
|
||||||
public:
|
|
||||||
Metrics(labels_t&, indices_t&);
|
|
||||||
void setData(const labels_t&, const indices_t&);
|
|
||||||
int computeNumClasses(size_t, size_t);
|
|
||||||
precision_t entropy(size_t, size_t);
|
|
||||||
precision_t informationGain(size_t, size_t, size_t);
|
|
||||||
};
|
|
||||||
}
|
|
||||||
#endif
|
|
@ -1,41 +0,0 @@
|
|||||||
[![Build](https://github.com/rmontanana/mdlp/actions/workflows/build.yml/badge.svg)](https://github.com/rmontanana/mdlp/actions/workflows/build.yml)
|
|
||||||
[![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=rmontanana_mdlp&metric=alert_status)](https://sonarcloud.io/summary/new_code?id=rmontanana_mdlp)
|
|
||||||
[![Reliability Rating](https://sonarcloud.io/api/project_badges/measure?project=rmontanana_mdlp&metric=reliability_rating)](https://sonarcloud.io/summary/new_code?id=rmontanana_mdlp)
|
|
||||||
|
|
||||||
# mdlp
|
|
||||||
|
|
||||||
Discretization algorithm based on the paper by Fayyad & Irani [Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning](https://www.ijcai.org/Proceedings/93-2/Papers/022.pdf)
|
|
||||||
|
|
||||||
The implementation tries to mitigate the problem of different label values with the same value of the variable:
|
|
||||||
|
|
||||||
- Sorts the values of the variable using the label values as a tie-breaker
|
|
||||||
- Once a valid candidate for the split is found, it checks whether the previous value is the same as the current one, and tries to use the previous one, or the next one if that is not possible.
|
|
||||||
|
|
||||||
Other features:
|
|
||||||
|
|
||||||
- Intervals with the same value of the variable are not taken into account for cutpoints.
|
|
||||||
- Intervals have to have more than two examples to be evaluated.
|
|
||||||
|
|
||||||
The algorithm returns the cut points for the variable.
|
|
||||||
|
|
||||||
## Sample
|
|
||||||
|
|
||||||
To run the sample, just execute the following commands:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd sample
|
|
||||||
cmake -B build
|
|
||||||
cd build
|
|
||||||
make
|
|
||||||
./sample -f iris -m 2
|
|
||||||
./sample -h
|
|
||||||
```
|
|
||||||
|
|
||||||
## Test
|
|
||||||
|
|
||||||
To run the tests and see coverage (llvm & gcovr have to be installed), execute the following commands:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd tests
|
|
||||||
./test
|
|
||||||
```
|
|
@ -1,24 +0,0 @@
|
|||||||
// ***************************************************************
|
|
||||||
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
|
|
||||||
// SPDX-FileType: SOURCE
|
|
||||||
// SPDX-License-Identifier: MIT
|
|
||||||
// ***************************************************************
|
|
||||||
|
|
||||||
#ifndef TYPES_H
|
|
||||||
#define TYPES_H
|
|
||||||
|
|
||||||
#include <vector>
|
|
||||||
#include <map>
|
|
||||||
#include <stdexcept>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
namespace mdlp {
|
|
||||||
typedef float precision_t;
|
|
||||||
typedef vector<precision_t> samples_t;
|
|
||||||
typedef vector<int> labels_t;
|
|
||||||
typedef vector<size_t> indices_t;
|
|
||||||
typedef vector<precision_t> cutPoints_t;
|
|
||||||
typedef map<pair<int, int>, precision_t> cacheEnt_t;
|
|
||||||
typedef map<tuple<int, int, int>, precision_t> cacheIg_t;
|
|
||||||
}
|
|
||||||
#endif
|
|
@ -5,7 +5,7 @@
|
|||||||
// ***************************************************************
|
// ***************************************************************
|
||||||
|
|
||||||
#include <ArffFiles.hpp>
|
#include <ArffFiles.hpp>
|
||||||
#include <CPPFImdlp.h>
|
#include <fimdlp/CPPFImdlp.h>
|
||||||
#include <bayesnet/ensembles/BoostAODE.h>
|
#include <bayesnet/ensembles/BoostAODE.h>
|
||||||
|
|
||||||
std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y)
|
std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y)
|
||||||
|
@ -6,6 +6,7 @@ if(ENABLE_TESTING)
|
|||||||
${BayesNet_SOURCE_DIR}/lib/json/include
|
${BayesNet_SOURCE_DIR}/lib/json/include
|
||||||
${BayesNet_SOURCE_DIR}
|
${BayesNet_SOURCE_DIR}
|
||||||
${CMAKE_BINARY_DIR}/configured_files/include
|
${CMAKE_BINARY_DIR}/configured_files/include
|
||||||
|
${FImdlp_INCLUDE_DIRS}
|
||||||
)
|
)
|
||||||
file(GLOB_RECURSE BayesNet_SOURCES "${BayesNet_SOURCE_DIR}/bayesnet/*.cc")
|
file(GLOB_RECURSE BayesNet_SOURCES "${BayesNet_SOURCE_DIR}/bayesnet/*.cc")
|
||||||
add_executable(TestBayesNet TestBayesNetwork.cc TestBayesNode.cc TestBayesClassifier.cc
|
add_executable(TestBayesNet TestBayesNetwork.cc TestBayesNode.cc TestBayesClassifier.cc
|
||||||
|
@ -7,7 +7,7 @@
|
|||||||
#include <catch2/catch_test_macros.hpp>
|
#include <catch2/catch_test_macros.hpp>
|
||||||
#include <catch2/matchers/catch_matchers.hpp>
|
#include <catch2/matchers/catch_matchers.hpp>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <CPPFImdlp.h>
|
#include <fimdlp/CPPFImdlp.h>
|
||||||
#include <folding.hpp>
|
#include <folding.hpp>
|
||||||
#include <nlohmann/json.hpp>
|
#include <nlohmann/json.hpp>
|
||||||
#define TO_STR2(x) #x
|
#define TO_STR2(x) #x
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
#include <map>
|
#include <map>
|
||||||
#include <tuple>
|
#include <tuple>
|
||||||
#include <ArffFiles.hpp>
|
#include <ArffFiles.hpp>
|
||||||
#include <CPPFImdlp.h>
|
#include <fimdlp/CPPFImdlp.h>
|
||||||
#include <folding.hpp>
|
#include <folding.hpp>
|
||||||
#include <bayesnet/network/Network.h>
|
#include <bayesnet/network/Network.h>
|
||||||
|
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit fa43b77429ba76c462b1898d6cd2f2d7a9416b14
|
Subproject commit 506276c59217429c93abd2fe9507c7f45eb81072
|
Loading…
Reference in New Issue
Block a user