23 Commits

Author SHA1 Message Date
7ee9896734 Fix mistake in github action 2024-06-08 12:36:56 +02:00
8f7f605670 Fix mistake in github action 2024-06-08 12:32:18 +02:00
2f55b27691 Fix mistake in github action 2024-06-08 12:28:23 +02:00
378fbd51ef Fix mistake in github action 2024-06-08 12:25:17 +02:00
402d0da878 Fix mistake in github action 2024-06-08 12:23:28 +02:00
f34bcc2ed7 Add libtorch to github action 2024-06-08 12:20:51 +02:00
c9ba35fb58 update test script 2024-06-08 12:02:16 +02:00
e205668906 Add torch methods to discretize / Add fit_transform methods 2024-06-07 23:54:42 +02:00
633aa52849 Refactor sample build 2024-06-06 12:04:55 +02:00
61de687476 Fix library creation problem 2024-06-06 11:13:50 +02:00
7ff88c8e4b Update Discretizer version 2024-06-05 17:55:45 +02:00
Ricardo Montañana Gómez
638bb2a59e Discretizer (#8)
* Add better check in testKBins.py

* Add Discretizer base class for Both discretizers

* Refactor order of constructors init
2024-06-05 17:53:08 +02:00
Ricardo Montañana Gómez
f258fc220f Merge pull request #7 from rmontanana/BinDisc
Implement BinDisc and tests
2024-06-05 11:08:56 +02:00
0beeda320d Update workflow build 2024-06-05 10:56:49 +02:00
6b68a41c42 Implement BinDisc and tests 2024-06-05 10:45:11 +02:00
236d1b2f8b Update sonarcloud github action 2024-05-02 12:51:40 +02:00
52ee93178f Update dockerfile 2024-05-02 10:46:29 +00:00
eeda4347e9 Add logo to README 2024-05-02 11:56:18 +02:00
5708dc3de9 Fix initialization mistake in transform 2023-08-01 17:30:37 +02:00
fbffc3a9c4 Remove sample from library binary file 2023-07-20 18:42:46 +02:00
ab3786e2a2 Add transform test 2023-07-06 16:40:58 +02:00
be1917d05b Fix attribute name extraction in ArffFiles 2023-07-06 16:15:23 +02:00
5679d607e5 Add transform method to discretize values using CutPoints 2023-07-06 16:06:52 +02:00
28 changed files with 1338 additions and 166 deletions

Dockerfile (dev container)

@@ -1,18 +1,16 @@
FROM mcr.microsoft.com/devcontainers/cpp:0-ubuntu-22.04
ARG REINSTALL_CMAKE_VERSION_FROM_SOURCE="3.22.2"
# Optionally install the cmake for vcpkg
COPY ./reinstall-cmake.sh /tmp/
RUN if [ "${REINSTALL_CMAKE_VERSION_FROM_SOURCE}" != "none" ]; then \
chmod +x /tmp/reinstall-cmake.sh && /tmp/reinstall-cmake.sh ${REINSTALL_CMAKE_VERSION_FROM_SOURCE}; \
fi \
&& rm -f /tmp/reinstall-cmake.sh
RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
&& apt-get -y install --no-install-recommends \
python3 \
python3-pip \
lcov \
cmake \
&& apt-get autoremove -y && apt-get clean -y && rm -rf /var/lib/apt/lists/*
RUN pip3 install --no-cache-dir \
cpplint \
cmake-format \
gcovr
# [Optional] Uncomment this section to install additional vcpkg ports.
# RUN su vscode -c "${VCPKG_ROOT}/vcpkg install <your-port-name-here>"
# [Optional] Uncomment this section to install additional packages.
# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
# && apt-get -y install --no-install-recommends <your-package-list-here>

GitHub workflow (SonarCloud analysis)

@@ -13,25 +13,28 @@ jobs:
env:
BUILD_WRAPPER_OUT_DIR: build_wrapper_output_directory # Directory where build-wrapper output will be placed
steps:
- uses: actions/checkout@v3.2.0
- uses: actions/checkout@v4.1.6
with:
fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis
- name: Install sonar-scanner and build-wrapper
uses: SonarSource/sonarcloud-github-c-cpp@v1
uses: SonarSource/sonarcloud-github-c-cpp@v2
- name: Install lcov & gcovr
run: |
sudo apt-get -y install lcov
sudo apt-get -y install gcovr
- name: Install Libtorch
run: |
wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.3.1%2Bcpu.zip
unzip libtorch-cxx11-abi-shared-with-deps-2.3.1+cpu.zip
- name: Tests & build-wrapper
run: |
cmake -S . -B build -Wno-dev
cmake -S . -B build -Wno-dev -DCMAKE_PREFIX_PATH=$(pwd)/libtorch
build-wrapper-linux-x86-64 --out-dir ${{ env.BUILD_WRAPPER_OUT_DIR }} cmake --build build/ --config Release
cd build
make
ctest -C Release --output-on-failure --test-dir tests
cd ..
# gcovr -f CPPFImdlp.cpp -f Metrics.cpp --merge-mode-functions=separate --txt --sonarqube=coverage.xml
gcovr -f CPPFImdlp.cpp -f Metrics.cpp --txt --sonarqube=coverage.xml
gcovr -f CPPFImdlp.cpp -f Metrics.cpp -f BinDisc.cpp -f Discretizer.cpp --txt --sonarqube=coverage.xml
- name: Run sonar-scanner
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

.gitignore (vendored, 2 lines changed)

@@ -31,6 +31,8 @@
*.out
*.app
**/build
build_Debug
build_Release
**/lcoverage
.idea
cmake-*

.vscode/launch.json (vendored, 11 lines changed)

@@ -8,15 +8,10 @@
"name": "C++ Launch config",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceFolder}/build/sample/sample",
"cwd": "${workspaceFolder}/build/sample",
"args": [
"-f",
"glass"
],
"targetArchitecture": "arm64",
"program": "${workspaceFolder}/tests/build/Metrics_unittest",
"cwd": "${workspaceFolder}/tests/build",
"args": [],
"launchCompleteCommand": "exec-run",
"preLaunchTask": "CMake: build",
"stopAtEntry": false,
"linux": {
"MIMode": "gdb",

.vscode/settings.json (vendored, 102 lines changed)

@@ -5,5 +5,105 @@
},
"C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools",
"cmake.configureOnOpen": true,
"sonarlint.pathToCompileCommands": "${workspaceFolder}/build/compile_commands.json"
"sonarlint.pathToCompileCommands": "${workspaceFolder}/build/compile_commands.json",
"files.associations": {
"*.rmd": "markdown",
"*.py": "python",
"vector": "cpp",
"__bit_reference": "cpp",
"__bits": "cpp",
"__config": "cpp",
"__debug": "cpp",
"__errc": "cpp",
"__hash_table": "cpp",
"__locale": "cpp",
"__mutex_base": "cpp",
"__node_handle": "cpp",
"__nullptr": "cpp",
"__split_buffer": "cpp",
"__string": "cpp",
"__threading_support": "cpp",
"__tuple": "cpp",
"array": "cpp",
"atomic": "cpp",
"bitset": "cpp",
"cctype": "cpp",
"chrono": "cpp",
"clocale": "cpp",
"cmath": "cpp",
"compare": "cpp",
"complex": "cpp",
"concepts": "cpp",
"cstdarg": "cpp",
"cstddef": "cpp",
"cstdint": "cpp",
"cstdio": "cpp",
"cstdlib": "cpp",
"cstring": "cpp",
"ctime": "cpp",
"cwchar": "cpp",
"cwctype": "cpp",
"exception": "cpp",
"initializer_list": "cpp",
"ios": "cpp",
"iosfwd": "cpp",
"istream": "cpp",
"limits": "cpp",
"locale": "cpp",
"memory": "cpp",
"mutex": "cpp",
"new": "cpp",
"optional": "cpp",
"ostream": "cpp",
"ratio": "cpp",
"sstream": "cpp",
"stdexcept": "cpp",
"streambuf": "cpp",
"string": "cpp",
"string_view": "cpp",
"system_error": "cpp",
"tuple": "cpp",
"type_traits": "cpp",
"typeinfo": "cpp",
"unordered_map": "cpp",
"variant": "cpp",
"algorithm": "cpp",
"iostream": "cpp",
"iomanip": "cpp",
"numeric": "cpp",
"set": "cpp",
"__tree": "cpp",
"deque": "cpp",
"list": "cpp",
"map": "cpp",
"unordered_set": "cpp",
"any": "cpp",
"condition_variable": "cpp",
"forward_list": "cpp",
"fstream": "cpp",
"stack": "cpp",
"thread": "cpp",
"__memory": "cpp",
"filesystem": "cpp",
"*.toml": "toml",
"utility": "cpp",
"span": "cpp",
"*.tcc": "cpp",
"bit": "cpp",
"charconv": "cpp",
"cinttypes": "cpp",
"codecvt": "cpp",
"functional": "cpp",
"iterator": "cpp",
"memory_resource": "cpp",
"random": "cpp",
"source_location": "cpp",
"format": "cpp",
"numbers": "cpp",
"semaphore": "cpp",
"stop_token": "cpp",
"text_encoding": "cpp",
"typeindex": "cpp",
"valarray": "cpp"
}
}

BinDisc.cpp (new file, 99 lines)

@@ -0,0 +1,99 @@
#include <algorithm>
#include <limits>
#include <cmath>
#include "BinDisc.h"
#include <iostream>
#include <string>
namespace mdlp {
BinDisc::BinDisc(int n_bins, strategy_t strategy) :
Discretizer(), n_bins{ n_bins }, strategy{ strategy }
{
if (n_bins < 3) {
throw std::invalid_argument("n_bins must be greater than 2");
}
}
BinDisc::~BinDisc() = default;
void BinDisc::fit(samples_t& X)
{
// y is included for compatibility with the Discretizer interface
cutPoints.clear();
if (X.empty()) {
cutPoints.push_back(std::numeric_limits<precision_t>::max());
return;
}
if (strategy == strategy_t::QUANTILE) {
fit_quantile(X);
} else if (strategy == strategy_t::UNIFORM) {
fit_uniform(X);
}
}
void BinDisc::fit(samples_t& X, labels_t& y)
{
fit(X);
}
std::vector<precision_t> linspace(precision_t start, precision_t end, int num)
{
// Doesn't include end point as it is not needed
if (start == end) {
return { 0 };
}
precision_t delta = (end - start) / static_cast<precision_t>(num - 1);
std::vector<precision_t> linspc;
for (size_t i = 0; i < num - 1; ++i) {
precision_t val = start + delta * static_cast<precision_t>(i);
linspc.push_back(val);
}
return linspc;
}
size_t clip(const size_t n, size_t lower, size_t upper)
{
return std::max(lower, std::min(n, upper));
}
std::vector<precision_t> percentile(samples_t& data, std::vector<precision_t>& percentiles)
{
// Implementation taken from https://dpilger26.github.io/NumCpp/doxygen/html/percentile_8hpp_source.html
std::vector<precision_t> results;
results.reserve(percentiles.size());
for (auto percentile : percentiles) {
const size_t i = static_cast<size_t>(std::floor(static_cast<double>(data.size() - 1) * percentile / 100.));
const auto indexLower = clip(i, 0, data.size() - 1);
const double percentI = static_cast<double>(indexLower) / static_cast<double>(data.size() - 1);
const double fraction =
(percentile / 100.0 - percentI) /
(static_cast<double>(indexLower + 1) / static_cast<double>(data.size() - 1) - percentI);
const auto value = data[indexLower] + (data[indexLower + 1] - data[indexLower]) * fraction;
if (results.empty() || value != results.back()) // keep cut points unique; back() on an empty vector is undefined
results.push_back(value);
}
return results;
}
void BinDisc::fit_quantile(samples_t& X)
{
auto quantiles = linspace(0.0, 100.0, n_bins + 1);
auto data = X;
std::sort(data.begin(), data.end());
if (data.front() == data.back() || data.size() == 1) {
// if X is constant
cutPoints.push_back(std::numeric_limits<precision_t>::max());
return;
}
cutPoints = percentile(data, quantiles);
normalizeCutPoints();
}
void BinDisc::fit_uniform(samples_t& X)
{
auto minmax = std::minmax_element(X.begin(), X.end());
cutPoints = linspace(*minmax.first, *minmax.second, n_bins + 1);
normalizeCutPoints();
}
void BinDisc::normalizeCutPoints()
{
// Add max value to the end
cutPoints.push_back(std::numeric_limits<precision_t>::max());
// Remove first as it is not needed
cutPoints.erase(cutPoints.begin());
}
}
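
For the quantile strategy, cut points are plain linear interpolations at evenly spaced percentiles. A self-contained sketch of that arithmetic (illustrative only, not the library code; names are made up) reproduces the cuts the Easy3BinsQuantile test below asserts for n_bins = 3 over X = {1..9}:

```cpp
#include <cmath>
#include <iostream>
#include <limits>
#include <vector>

// Illustrative re-derivation of the quantile cut points for n_bins = 3 over
// X = {1..9}: interpolate at percentiles 0, 33.33 and 66.67 (the 100% end
// point is skipped, as in linspace above), then drop the minimum and append
// the float maximum as the open upper bound (normalizeCutPoints).
int main()
{
    std::vector<float> data = { 1, 2, 3, 4, 5, 6, 7, 8, 9 }; // already sorted
    const int n_bins = 3;
    std::vector<float> cuts;
    for (int k = 0; k < n_bins; ++k) {
        double pos = (data.size() - 1) * (100.0 * k / n_bins) / 100.0; // fractional index
        auto lower = static_cast<size_t>(std::floor(pos));
        auto frac = static_cast<float>(pos - static_cast<double>(lower));
        // lower + 1 stays in range because the 100% percentile is never requested
        cuts.push_back(data[lower] + (data[lower + 1] - data[lower]) * frac);
    }
    cuts.erase(cuts.begin());                          // drop the minimum
    cuts.push_back(std::numeric_limits<float>::max()); // open upper bound
    for (float cut : cuts)
        std::cout << cut << " "; // prints: 3.66667 6.33333 3.40282e+38
    std::cout << std::endl;
    return 0;
}
```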

BinDisc.h (new file, 28 lines)

@@ -0,0 +1,28 @@
#ifndef BINDISC_H
#define BINDISC_H
#include "typesFImdlp.h"
#include "Discretizer.h"
#include <string>
namespace mdlp {
enum class strategy_t {
UNIFORM,
QUANTILE
};
class BinDisc : public Discretizer {
public:
BinDisc(int n_bins = 3, strategy_t strategy = strategy_t::UNIFORM);
~BinDisc();
// y is included for compatibility with the Discretizer interface
void fit(samples_t& X_, labels_t& y) override;
void fit(samples_t& X);
private:
void fit_uniform(samples_t&);
void fit_quantile(samples_t&);
void normalizeCutPoints();
int n_bins;
strategy_t strategy;
};
}
#endif

CMakeLists.txt

@@ -1,13 +1,9 @@
cmake_minimum_required(VERSION 3.20)
project(mdlp)
if (POLICY CMP0135)
cmake_policy(SET CMP0135 NEW)
endif ()
set(CMAKE_CXX_STANDARD 11)
add_library(mdlp CPPFImdlp.cpp Metrics.cpp sample/sample.cpp)
set(CMAKE_CXX_STANDARD 17)
find_package(Torch REQUIRED)
include_directories(${TORCH_INCLUDE_DIRS})
add_library(mdlp CPPFImdlp.cpp Metrics.cpp BinDisc.cpp Discretizer.cpp)
target_link_libraries(mdlp "${TORCH_LIBRARIES}")
add_subdirectory(sample)
add_subdirectory(tests)
add_subdirectory(tests)

CPPFImdlp.cpp

@@ -3,20 +3,17 @@
#include <set>
#include <cmath>
#include "CPPFImdlp.h"
#include "Metrics.h"
namespace mdlp {
CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed): min_length(min_length_),
CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed) :
Discretizer(),
min_length(min_length_),
max_depth(max_depth_),
proposed_cuts(proposed)
{
}
CPPFImdlp::CPPFImdlp() = default;
CPPFImdlp::~CPPFImdlp() = default;
size_t CPPFImdlp::compute_max_num_cut_points() const
{
// Set the actual maximum number of cut points as a number or as a percentage of the number of samples
@@ -37,6 +34,7 @@ namespace mdlp {
y = y_;
num_cut_points = compute_max_num_cut_points();
depth = 0;
discretizedData.clear();
cutPoints.clear();
if (X.size() != y.size()) {
throw invalid_argument("X and y must have the same size");
@@ -177,7 +175,7 @@ namespace mdlp {
indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_)
{
indices_t idx(X_.size());
iota(idx.begin(), idx.end(), 0);
std::iota(idx.begin(), idx.end(), 0);
stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) {
if (X_[i1] == X_[i2])
return y_[i1] < y_[i2];
@@ -208,4 +206,5 @@ namespace mdlp {
}
cutPoints.erase(cutPoints.begin() + static_cast<long>(maxEntropyIdx));
}
}

CPPFImdlp.h

@@ -2,13 +2,20 @@
#define CPPFIMDLP_H
#include "typesFImdlp.h"
#include "Metrics.h"
#include <limits>
#include <utility>
#include <string>
#include "Metrics.h"
#include "Discretizer.h"
namespace mdlp {
class CPPFImdlp {
class CPPFImdlp : public Discretizer {
public:
CPPFImdlp() = default;
CPPFImdlp(size_t min_length_, int max_depth_, float proposed);
virtual ~CPPFImdlp() = default;
void fit(samples_t& X_, labels_t& y_) override;
inline int get_depth() const { return depth; };
protected:
size_t min_length = 3;
int depth = 0;
@@ -18,26 +25,14 @@ namespace mdlp {
samples_t X = samples_t();
labels_t y = labels_t();
Metrics metrics = Metrics(y, indices);
cutPoints_t cutPoints;
size_t num_cut_points = numeric_limits<size_t>::max();
static indices_t sortIndices(samples_t&, labels_t&);
void computeCutPoints(size_t, size_t, int);
void resizeCutPoints();
bool mdlp(size_t, size_t, size_t);
size_t getCandidate(size_t, size_t);
size_t compute_max_num_cut_points() const;
pair<precision_t, size_t> valueCutPoint(size_t, size_t, size_t);
public:
CPPFImdlp();
CPPFImdlp(size_t, int, float);
~CPPFImdlp();
void fit(samples_t&, labels_t&);
inline cutPoints_t getCutPoints() const { return cutPoints; };
inline int get_depth() const { return depth; };
static inline string version() { return "1.1.2"; };
};
}
#endif

Discretizer.cpp (new file, 41 lines)

@@ -0,0 +1,41 @@
#include "Discretizer.h"
namespace mdlp {
labels_t& Discretizer::transform(const samples_t& data)
{
discretizedData.clear();
discretizedData.reserve(data.size());
for (const precision_t& item : data) {
auto upper = std::upper_bound(cutPoints.begin(), cutPoints.end(), item);
discretizedData.push_back(upper - cutPoints.begin());
}
return discretizedData;
}
labels_t& Discretizer::fit_transform(samples_t& X_, labels_t& y_)
{
fit(X_, y_);
return transform(X_);
}
void Discretizer::fit_t(torch::Tensor& X_, torch::Tensor& y_)
{
auto num_elements = X_.numel();
samples_t X(X_.data_ptr<precision_t>(), X_.data_ptr<precision_t>() + num_elements);
labels_t y(y_.data_ptr<int64_t>(), y_.data_ptr<int64_t>() + num_elements);
fit(X, y);
}
torch::Tensor Discretizer::transform_t(torch::Tensor& X_)
{
auto num_elements = X_.numel();
samples_t X(X_.data_ptr<float>(), X_.data_ptr<float>() + num_elements);
auto result = transform(X);
return torch::tensor(result, torch::kInt64);
}
torch::Tensor Discretizer::fit_transform_t(torch::Tensor& X_, torch::Tensor& y_)
{
auto num_elements = X_.numel();
samples_t X(X_.data_ptr<precision_t>(), X_.data_ptr<precision_t>() + num_elements);
labels_t y(y_.data_ptr<int64_t>(), y_.data_ptr<int64_t>() + num_elements);
auto result = fit_transform(X, y);
return torch::tensor(result, torch::kInt64);
}
}
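
transform maps each value to the index of the first cut point strictly greater than it, one std::upper_bound per value; the numeric_limits max() sentinel appended by normalizeCutPoints guarantees every value lands in some bin. A minimal sketch of that lookup (the cut points are assumed from the 3-bin quantile example above):

```cpp
#include <algorithm>
#include <iostream>
#include <limits>
#include <vector>

// Sketch of the binning rule in Discretizer::transform: each value is
// labeled with the index of the first cut point greater than it.
int main()
{
    std::vector<float> cuts = { 3.66667f, 6.33333f, std::numeric_limits<float>::max() };
    for (float value : { 1.0f, 4.0f, 6.5f, 9.0f }) {
        auto upper = std::upper_bound(cuts.begin(), cuts.end(), value);
        // prints: 1 -> 0, 4 -> 1, 6.5 -> 2, 9 -> 2
        std::cout << value << " -> " << (upper - cuts.begin()) << std::endl;
    }
    return 0;
}
```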

Discretizer.h (new file, 27 lines)

@@ -0,0 +1,27 @@
#ifndef DISCRETIZER_H
#define DISCRETIZER_H
#include <string>
#include <algorithm>
#include <torch/torch.h>
#include "typesFImdlp.h"
namespace mdlp {
class Discretizer {
public:
Discretizer() = default;
virtual ~Discretizer() = default;
inline cutPoints_t getCutPoints() const { return cutPoints; };
virtual void fit(samples_t& X_, labels_t& y_) = 0;
labels_t& transform(const samples_t& data);
labels_t& fit_transform(samples_t& X_, labels_t& y_);
void fit_t(torch::Tensor& X_, torch::Tensor& y_);
torch::Tensor transform_t(torch::Tensor& X_);
torch::Tensor fit_transform_t(torch::Tensor& X_, torch::Tensor& y_);
static inline std::string version() { return "1.2.1"; };
protected:
labels_t discretizedData = labels_t();
cutPoints_t cutPoints;
};
}
#endif

Metrics.cpp

@@ -4,8 +4,8 @@
using namespace std;
namespace mdlp {
Metrics::Metrics(labels_t& y_, indices_t& indices_): y(y_), indices(indices_),
numClasses(computeNumClasses(0, indices.size()))
Metrics::Metrics(labels_t& y_, indices_t& indices_) : y(y_), indices(indices_),
numClasses(computeNumClasses(0, indices_.size()))
{
}

README.md

@@ -2,7 +2,7 @@
[![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=rmontanana_mdlp&metric=alert_status)](https://sonarcloud.io/summary/new_code?id=rmontanana_mdlp)
[![Reliability Rating](https://sonarcloud.io/api/project_badges/measure?project=rmontanana_mdlp&metric=reliability_rating)](https://sonarcloud.io/summary/new_code?id=rmontanana_mdlp)
# mdlp
# <img src="logo.png" alt="logo" width="50"/> mdlp
Discretization algorithm based on the paper by Fayyad &amp; Irani [Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning](https://www.ijcai.org/Proceedings/93-2/Papers/022.pdf)
@@ -23,12 +23,10 @@ The algorithm returns the cut points for the variable.
To run the sample, just execute the following commands:
```bash
cd sample
cmake -B build
cd build
make
./sample -f iris -m 2
./sample -h
cmake -B build -S .
cmake --build build
build/sample/sample -f iris -m 2
build/sample/sample -h
```
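
The sample exercises the API end to end; the library can also be called directly. A minimal sketch based on the headers in this changeset (the toy data is invented, not taken from iris):

```cpp
#include <iostream>
#include "CPPFImdlp.h"
#include "BinDisc.h"

// Minimal usage sketch of the two discretizers through the common
// Discretizer interface introduced in this changeset; the toy data is
// invented for illustration.
int main()
{
    mdlp::samples_t X = { 4.3f, 5.0f, 5.8f, 6.1f, 6.4f, 7.9f };
    mdlp::labels_t y = { 0, 0, 1, 1, 2, 2 };

    mdlp::CPPFImdlp mdlp_disc;                // entropy-based MDLP cut points
    mdlp::labels_t Xt = mdlp_disc.fit_transform(X, y);

    mdlp::BinDisc bin_disc(3, mdlp::strategy_t::QUANTILE); // 3 equal-frequency bins
    bin_disc.fit(X, y);                       // y is ignored, kept for interface parity
    mdlp::labels_t Xb = bin_disc.transform(X);

    for (size_t i = 0; i < X.size(); ++i)
        std::cout << X[i] << " -> mdlp bin " << Xt[i]
                  << ", quantile bin " << Xb[i] << std::endl;
    return 0;
}
```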
## Test

logo.png (new binary file, 1.0 MiB; content not shown)

launch.json (deleted file)

@@ -1,21 +0,0 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "lldb puro",
"type": "cppdbg",
// "targetArchitecture": "arm64",
"request": "launch",
"program": "${workspaceRoot}/build/sample",
"args": [
"-f",
"iris"
],
"stopAtEntry": false,
"cwd": "${workspaceRoot}/build/",
"environment": [],
"externalConsole": false,
"MIMode": "lldb"
},
]
}

sample/CMakeLists.txt

@@ -1,5 +1,6 @@
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Debug)
add_executable(sample sample.cpp ../tests/ArffFiles.cpp ../Metrics.cpp ../CPPFImdlp.cpp)
add_executable(sample sample.cpp ../tests/ArffFiles.cpp)
target_link_libraries(sample mdlp "${TORCH_LIBRARIES}")

sample/sample.cpp

@@ -5,13 +5,13 @@
#include <algorithm>
#include <cstring>
#include <getopt.h>
#include <torch/torch.h>
#include "../Discretizer.h"
#include "../CPPFImdlp.h"
#include "../BinDisc.h"
#include "../tests/ArffFiles.h"
using namespace std;
using namespace mdlp;
const string PATH = "../../tests/datasets/";
const string PATH = "tests/datasets/";
/* print a description of all supported options */
void usage(const char* path)
@@ -20,17 +20,17 @@ void usage(const char* path)
const char* basename = strrchr(path, '/');
basename = basename ? basename + 1 : path;
cout << "usage: " << basename << "[OPTION]" << endl;
cout << " -h, --help\t\t Print this help and exit." << endl;
cout
std::cout << "usage: " << basename << "[OPTION]" << std::endl;
std::cout << " -h, --help\t\t Print this help and exit." << std::endl;
std::cout
<< " -f, --file[=FILENAME]\t {all, diabetes, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}."
<< endl;
cout << " -p, --path[=FILENAME]\t folder where the arff dataset is located, default " << PATH << endl;
cout << " -m, --max_depth=INT\t max_depth pased to discretizer. Default = MAX_INT" << endl;
cout
<< std::endl;
std::cout << " -p, --path[=FILENAME]\t folder where the arff dataset is located, default " << PATH << std::endl;
std::cout << " -m, --max_depth=INT\t max_depth pased to discretizer. Default = MAX_INT" << std::endl;
std::cout
<< " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 -> any"
<< endl;
cout << " -n, --min_length=INT\t interval min_length pased to discretizer. Default = 3" << endl;
<< std::endl;
std::cout << " -n, --min_length=INT\t interval min_length pased to discretizer. Default = 3" << std::endl;
}
tuple<string, string, int, int, float> parse_arguments(int argc, char** argv)
@@ -96,56 +96,79 @@ void process_file(const string& path, const string& file_name, bool class_last,
file.load(path + file_name + ".arff", class_last);
const auto attributes = file.getAttributes();
const auto items = file.getSize();
cout << "Number of lines: " << items << endl;
cout << "Attributes: " << endl;
std::cout << "Number of lines: " << items << std::endl;
std::cout << "Attributes: " << std::endl;
for (auto attribute : attributes) {
cout << "Name: " << get<0>(attribute) << " Type: " << get<1>(attribute) << endl;
std::cout << "Name: " << get<0>(attribute) << " Type: " << get<1>(attribute) << std::endl;
}
cout << "Class name: " << file.getClassName() << endl;
cout << "Class type: " << file.getClassType() << endl;
cout << "Data: " << endl;
vector<samples_t>& X = file.getX();
labels_t& y = file.getY();
std::cout << "Class name: " << file.getClassName() << std::endl;
std::cout << "Class type: " << file.getClassType() << std::endl;
std::cout << "Data: " << std::endl;
std::vector<mdlp::samples_t>& X = file.getX();
mdlp::labels_t& y = file.getY();
for (int i = 0; i < 5; i++) {
for (auto feature : X) {
cout << fixed << setprecision(1) << feature[i] << " ";
std::cout << fixed << setprecision(1) << feature[i] << " ";
}
cout << y[i] << endl;
std::cout << y[i] << std::endl;
}
auto test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints);
size_t total = 0;
for (auto i = 0; i < attributes.size(); i++) {
auto min_max = minmax_element(X[i].begin(), X[i].end());
cout << "Cut points for feature " << get<0>(attributes[i]) << ": [" << setprecision(3);
std::cout << "Cut points for feature " << get<0>(attributes[i]) << ": [" << setprecision(3);
test.fit(X[i], y);
auto cut_points = test.getCutPoints();
for (auto item : cut_points) {
cout << item;
std::cout << item;
if (item != cut_points.back())
cout << ", ";
std::cout << ", ";
}
total += test.getCutPoints().size();
cout << "]" << endl;
cout << "Min: " << *min_max.first << " Max: " << *min_max.second << endl;
cout << "--------------------------" << endl;
std::cout << "]" << std::endl;
std::cout << "Min: " << *min_max.first << " Max: " << *min_max.second << std::endl;
std::cout << "--------------------------" << std::endl;
}
std::cout << "Total cut points ...: " << total << std::endl;
std::cout << "Total feature states: " << total + attributes.size() << std::endl;
std::cout << "Version ............: " << test.version() << std::endl;
std::cout << "Transformed data (vector)..: " << std::endl;
test.fit(X[0], y);
auto data = test.transform(X[0]);
for (int i = 130; i < 135; i++) {
std::cout << std::fixed << std::setprecision(1) << X[0][i] << " " << data[i] << std::endl;
}
auto Xt = torch::tensor(X[0], torch::kFloat32);
auto yt = torch::tensor(y, torch::kInt64);
//test.fit_t(Xt, yt);
auto result = test.fit_transform_t(Xt, yt);
std::cout << "Transformed data (torch)...: " << std::endl;
for (int i = 130; i < 135; i++) {
std::cout << std::fixed << std::setprecision(1) << Xt[i].item<float>() << " " << result[i].item<int64_t>() << std::endl;
}
auto disc = mdlp::BinDisc(3);
auto res_v = disc.fit_transform(X[0], y);
disc.fit_t(Xt, yt);
auto res_t = disc.transform_t(Xt);
std::cout << "Transformed data (BinDisc)...: " << std::endl;
for (int i = 130; i < 135; i++) {
std::cout << std::fixed << std::setprecision(1) << Xt[i].item<float>() << " " << res_v[i] << " " << res_t[i].item<int64_t>() << std::endl;
}
cout << "Total cut points ...: " << total << endl;
cout << "Total feature states: " << total + attributes.size() << endl;
}
void process_all_files(const map<string, bool>& datasets, const string& path, int max_depth, int min_length,
float max_cutpoints)
{
cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << " Max_cutpoints: "
<< max_cutpoints << endl << endl;
std::cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << " Max_cutpoints: "
<< max_cutpoints << std::endl << std::endl;
printf("%-20s %4s %4s\n", "Dataset", "Feat", "Cuts Time(ms)");
printf("==================== ==== ==== ========\n");
for (const auto& dataset : datasets) {
ArffFiles file;
file.load(path + dataset.first + ".arff", dataset.second);
auto attributes = file.getAttributes();
vector<samples_t>& X = file.getX();
labels_t& y = file.getY();
std::vector<mdlp::samples_t>& X = file.getX();
mdlp::labels_t& y = file.getY();
size_t timing = 0;
size_t cut_points = 0;
for (auto i = 0; i < attributes.size(); i++) {
@@ -163,7 +186,7 @@ void process_all_files(const map<string, bool>& datasets, const string& path, in
int main(int argc, char** argv)
{
map<string, bool> datasets = {
std::map<std::string, bool> datasets = {
{"diabetes", true},
{"glass", true},
{"iris", true},
@@ -173,14 +196,14 @@ int main(int argc, char** argv)
{"mfeat-factors", true},
{"test", true}
};
string file_name;
string path;
std::string file_name;
std::string path;
int max_depth;
int min_length;
float max_cutpoints;
tie(file_name, path, max_depth, min_length, max_cutpoints) = parse_arguments(argc, argv);
if (datasets.find(file_name) == datasets.end() && file_name != "all") {
cout << "Invalid file name: " << file_name << endl;
std::cout << "Invalid file name: " << file_name << std::endl;
usage(argv[0]);
exit(1);
}
@@ -188,10 +211,10 @@ int main(int argc, char** argv)
process_all_files(datasets, path, max_depth, min_length, max_cutpoints);
else {
process_file(path, file_name, datasets[file_name], max_depth, min_length, max_cutpoints);
cout << "File name ....: " << file_name << endl;
cout << "Max depth ....: " << max_depth << endl;
cout << "Min length ...: " << min_length << endl;
cout << "Max cutpoints : " << max_cutpoints << endl;
std::cout << "File name ....: " << file_name << std::endl;
std::cout << "Max depth ....: " << max_depth << std::endl;
std::cout << "Min length ...: " << min_length << std::endl;
std::cout << "Max cutpoints : " << max_cutpoints << std::endl;
}
return 0;
}

sonar-project.properties

@@ -3,7 +3,7 @@ sonar.organization=rmontanana
# This is the name and version displayed in the SonarCloud UI.
sonar.projectName=mdlp
sonar.projectVersion=1.0.2
sonar.projectVersion=1.1.3
# sonar.test.exclusions=tests/**
# sonar.tests=tests/
# sonar.coverage.exclusions=tests/**,sample/**

tests/ArffFiles.cpp

@@ -63,7 +63,7 @@ void ArffFiles::load(const string& fileName, bool classLast)
type = "";
while (ss >> type_w)
type += type_w + " ";
attributes.emplace_back(attribute, trim(type));
attributes.emplace_back(trim(attribute), trim(type));
continue;
}
if (line[0] == '@') {
@@ -111,8 +111,8 @@ void ArffFiles::generateDataset(bool classLast)
string ArffFiles::trim(const string& source)
{
string s(source);
s.erase(0, s.find_first_not_of(" \n\r\t"));
s.erase(s.find_last_not_of(" \n\r\t") + 1);
s.erase(0, s.find_first_not_of(" '\n\r\t"));
s.erase(s.find_last_not_of(" '\n\r\t") + 1);
return s;
}

tests/BinDisc_unittest.cpp (new file, 364 lines)

@@ -0,0 +1,364 @@
#include <fstream>
#include <string>
#include <iostream>
#include "gtest/gtest.h"
#include "ArffFiles.h"
#include "../BinDisc.h"
namespace mdlp {
const float margin = 1e-4;
static std::string set_data_path()
{
std::string path = "../datasets/";
std::ifstream file(path + "iris.arff");
if (file.is_open()) {
file.close();
return path;
}
return "../../tests/datasets/";
}
const std::string data_path = set_data_path();
class TestBinDisc3U : public BinDisc, public testing::Test {
public:
TestBinDisc3U(int n_bins = 3) : BinDisc(n_bins, strategy_t::UNIFORM) {};
};
class TestBinDisc3Q : public BinDisc, public testing::Test {
public:
TestBinDisc3Q(int n_bins = 3) : BinDisc(n_bins, strategy_t::QUANTILE) {};
};
class TestBinDisc4U : public BinDisc, public testing::Test {
public:
TestBinDisc4U(int n_bins = 4) : BinDisc(n_bins, strategy_t::UNIFORM) {};
};
class TestBinDisc4Q : public BinDisc, public testing::Test {
public:
TestBinDisc4Q(int n_bins = 4) : BinDisc(n_bins, strategy_t::QUANTILE) {};
};
TEST_F(TestBinDisc3U, Easy3BinsUniform)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
auto y = labels_t();
fit(X, y);
auto cuts = getCutPoints();
ASSERT_EQ(3, cuts.size());
EXPECT_NEAR(3.66667, cuts.at(0), margin);
EXPECT_NEAR(6.33333, cuts.at(1), margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts.at(2));
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc3Q, Easy3BinsQuantile)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
fit(X);
auto cuts = getCutPoints();
ASSERT_EQ(3, cuts.size());
EXPECT_NEAR(3.666667, cuts[0], margin);
EXPECT_NEAR(6.333333, cuts[1], margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc3U, X10BinsUniform)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 };
fit(X);
auto cuts = getCutPoints();
ASSERT_EQ(3, cuts.size());
EXPECT_EQ(4.0, cuts[0]);
EXPECT_EQ(7.0, cuts[1]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc3Q, X10BinsQuantile)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 };
fit(X);
auto cuts = getCutPoints();
ASSERT_EQ(3, cuts.size());
EXPECT_EQ(4, cuts[0]);
EXPECT_EQ(7, cuts[1]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc3U, X11BinsUniform)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 };
fit(X);
auto cuts = getCutPoints();
ASSERT_EQ(3, cuts.size());
EXPECT_NEAR(4.33333, cuts[0], margin);
EXPECT_NEAR(7.66667, cuts[1], margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc3Q, X11BinsQuantile)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 };
fit(X);
auto cuts = getCutPoints();
ASSERT_EQ(3, cuts.size());
EXPECT_NEAR(4.33333, cuts[0], margin);
EXPECT_NEAR(7.66667, cuts[1], margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc3U, ConstantUniform)
{
samples_t X = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
fit(X);
auto cuts = getCutPoints();
ASSERT_EQ(1, cuts.size());
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 0, 0 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc3Q, ConstantQuantile)
{
samples_t X = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(1, cuts.size());
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 0, 0 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc3U, EmptyUniform)
{
samples_t X = {};
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(1, cuts.size());
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
}
TEST_F(TestBinDisc3Q, EmptyQuantile)
{
samples_t X = {};
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(1, cuts.size());
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]);
}
TEST(TestBinDisc3, ExceptionNumberBins)
{
EXPECT_THROW(BinDisc(2), std::invalid_argument);
}
TEST_F(TestBinDisc3U, EasyRepeated)
{
samples_t X = { 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0 };
fit(X);
auto cuts = getCutPoints();
ASSERT_EQ(3, cuts.size());
EXPECT_NEAR(1.66667, cuts[0], margin);
EXPECT_NEAR(2.33333, cuts[1], margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
auto labels = transform(X);
labels_t expected = { 2, 0, 0, 2, 0, 0, 2, 0, 0 };
EXPECT_EQ(expected, labels);
ASSERT_EQ(3.0, X[0]); // X is not modified
}
TEST_F(TestBinDisc3Q, EasyRepeated)
{
samples_t X = { 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(2, cuts.size());
EXPECT_NEAR(1.66667, cuts[0], margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[1]);
auto labels = transform(X);
labels_t expected = { 1, 0, 0, 1, 0, 0, 1, 0, 0 };
EXPECT_EQ(expected, labels);
ASSERT_EQ(3.0, X[0]); // X is not modified
}
TEST_F(TestBinDisc4U, Easy4BinsUniform)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(4, cuts.size());
ASSERT_EQ(3.75, cuts[0]);
EXPECT_EQ(6.5, cuts[1]);
EXPECT_EQ(9.25, cuts[2]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc4Q, Easy4BinsQuantile)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(4, cuts.size());
ASSERT_EQ(3.75, cuts[0]);
EXPECT_EQ(6.5, cuts[1]);
EXPECT_EQ(9.25, cuts[2]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc4U, X13BinsUniform)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(4, cuts.size());
EXPECT_EQ(4.0, cuts[0]);
EXPECT_EQ(7.0, cuts[1]);
EXPECT_EQ(10.0, cuts[2]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc4Q, X13BinsQuantile)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(4, cuts.size());
EXPECT_EQ(4.0, cuts[0]);
EXPECT_EQ(7.0, cuts[1]);
EXPECT_EQ(10.0, cuts[2]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc4U, X14BinsUniform)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(4, cuts.size());
EXPECT_EQ(4.25, cuts[0]);
EXPECT_EQ(7.5, cuts[1]);
EXPECT_EQ(10.75, cuts[2]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc4Q, X14BinsQuantile)
{
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(4, cuts.size());
EXPECT_EQ(4.25, cuts[0]);
EXPECT_EQ(7.5, cuts[1]);
EXPECT_EQ(10.75, cuts[2]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc4U, X15BinsUniform)
{
samples_t X = { 15.0, 8.0, 12.0, 14.0, 6.0, 1.0, 13.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(4, cuts.size());
EXPECT_EQ(4.5, cuts[0]);
EXPECT_EQ(8, cuts[1]);
EXPECT_EQ(11.5, cuts[2]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
auto labels = transform(X);
labels_t expected = { 3, 2, 3, 3, 1, 0, 3, 2, 2, 2, 1, 0, 0, 1, 0 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc4Q, X15BinsQuantile)
{
samples_t X = { 15.0, 13.0, 12.0, 14.0, 6.0, 1.0, 8.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0 };
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(4, cuts.size());
EXPECT_EQ(4.5, cuts[0]);
EXPECT_EQ(8, cuts[1]);
EXPECT_EQ(11.5, cuts[2]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
auto labels = transform(X);
labels_t expected = { 3, 3, 3, 3, 1, 0, 2, 2, 2, 2, 1, 0, 0, 1, 0 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc4U, RepeatedValuesUniform)
{
samples_t X = { 0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0 };
// 0 1 2 3 4 5 6 7 8 9
fit(X);
auto cuts = getCutPoints();
EXPECT_EQ(4, cuts.size());
EXPECT_EQ(1.0, cuts[0]);
EXPECT_EQ(2.0, cuts[1]);
ASSERT_EQ(3.0, cuts[2]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]);
auto labels = transform(X);
labels_t expected = { 0, 1, 1, 1, 2, 2, 3, 3, 3, 3 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc4Q, RepeatedValuesQuantile)
{
samples_t X = { 0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0 };
// 0 1 2 3 4 5 6 7 8 9
fit(X);
auto cuts = getCutPoints();
ASSERT_EQ(3, cuts.size());
EXPECT_EQ(2.0, cuts[0]);
ASSERT_EQ(3.0, cuts[1]);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]);
auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 1, 1, 2, 2, 2, 2 };
EXPECT_EQ(expected, labels);
}
TEST_F(TestBinDisc4U, irisUniform)
{
ArffFiles file;
file.load(data_path + "iris.arff", true);
vector<samples_t>& X = file.getX();
fit(X[0]);
auto Xt = transform(X[0]);
labels_t expected = { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 0, 1, 2, 1, 3, 2, 2, 3, 0, 3, 2, 3, 2, 2, 2, 1, 1, 2, 2, 3, 3, 1, 2, 1, 3, 2, 2, 3, 2, 1, 2, 3, 3, 3, 2, 2, 1, 3, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 };
EXPECT_EQ(expected, Xt);
auto Xtt = fit_transform(X[0], file.getY());
EXPECT_EQ(expected, Xtt);
auto Xt_t = torch::tensor(X[0], torch::kFloat32);
auto y_t = torch::tensor(file.getY(), torch::kInt64);
auto Xtt_t = fit_transform_t(Xt_t, y_t);
for (int i = 0; i < expected.size(); i++)
EXPECT_EQ(expected[i], Xtt_t[i].item<int64_t>());
}
TEST_F(TestBinDisc4Q, irisQuantile)
{
ArffFiles file;
file.load(data_path + "iris.arff", true);
vector<samples_t>& X = file.getX();
fit(X[0]);
auto Xt = transform(X[0]);
labels_t expected = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 3, 3, 3, 1, 3, 1, 2, 0, 3, 1, 0, 2, 2, 2, 1, 3, 1, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 3, 2, 1, 1, 1, 2, 2, 1, 2, 3, 2, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2, 1, 1, 2, 2, 3, 2, 3, 3, 0, 3, 3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 3, 2, 3, 1, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 2 };
EXPECT_EQ(expected, Xt);
auto Xtt = fit_transform(X[0], file.getY());
EXPECT_EQ(expected, Xtt);
auto Xt_t = torch::tensor(X[0], torch::kFloat32);
auto y_t = torch::tensor(file.getY(), torch::kInt64);
auto Xtt_t = fit_transform_t(Xt_t, y_t);
for (int i = 0; i < expected.size(); i++)
EXPECT_EQ(expected[i], Xtt_t[i].item<int64_t>());
fit_t(Xt_t, y_t);
auto Xt_t2 = transform_t(Xt_t);
for (int i = 0; i < expected.size(); i++)
EXPECT_EQ(expected[i], Xt_t2[i].item<int64_t>());
}
}

tests/CMakeLists.txt

@@ -1,9 +1,8 @@
set(CMAKE_CXX_STANDARD 11)
cmake_minimum_required(VERSION 3.20)
set(CMAKE_CXX_STANDARD 17)
cmake_policy(SET CMP0135 NEW)
include(FetchContent)
include_directories(${GTEST_INCLUDE_DIRS})
FetchContent_Declare(
googletest
URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip
@@ -12,18 +11,35 @@ FetchContent_Declare(
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)
find_package(Torch REQUIRED)
enable_testing()
include_directories(${TORCH_INCLUDE_DIRS})
add_executable(Metrics_unittest ../Metrics.cpp Metrics_unittest.cpp)
add_executable(FImdlp_unittest ../CPPFImdlp.cpp ArffFiles.cpp ../Metrics.cpp FImdlp_unittest.cpp)
target_link_libraries(Metrics_unittest GTest::gtest_main)
target_link_libraries(FImdlp_unittest GTest::gtest_main)
target_compile_options(Metrics_unittest PRIVATE --coverage)
target_compile_options(FImdlp_unittest PRIVATE --coverage)
target_link_options(Metrics_unittest PRIVATE --coverage)
add_executable(FImdlp_unittest ../CPPFImdlp.cpp ArffFiles.cpp ../Metrics.cpp FImdlp_unittest.cpp ../Discretizer.cpp)
target_link_libraries(FImdlp_unittest GTest::gtest_main "${TORCH_LIBRARIES}")
target_compile_options(FImdlp_unittest PRIVATE --coverage)
target_link_options(FImdlp_unittest PRIVATE --coverage)
add_executable(BinDisc_unittest ../BinDisc.cpp ArffFiles.cpp BinDisc_unittest.cpp ../Discretizer.cpp)
target_link_libraries(BinDisc_unittest GTest::gtest_main "${TORCH_LIBRARIES}")
target_compile_options(BinDisc_unittest PRIVATE --coverage)
target_link_options(BinDisc_unittest PRIVATE --coverage)
add_executable(Discretizer_unittest ../BinDisc.cpp ../CPPFImdlp.cpp ArffFiles.cpp ../Metrics.cpp ../Discretizer.cpp Discretizer_unittest.cpp)
target_link_libraries(Discretizer_unittest GTest::gtest_main "${TORCH_LIBRARIES}")
target_compile_options(Discretizer_unittest PRIVATE --coverage)
target_link_options(Discretizer_unittest PRIVATE --coverage)
include(GoogleTest)
gtest_discover_tests(Metrics_unittest)
gtest_discover_tests(FImdlp_unittest)
gtest_discover_tests(BinDisc_unittest)
gtest_discover_tests(Discretizer_unittest)

tests/Discretizer_unittest.cpp (new file, 74 lines)

@@ -0,0 +1,74 @@
#include <fstream>
#include <string>
#include <iostream>
#include "gtest/gtest.h"
#include "ArffFiles.h"
#include "../Discretizer.h"
#include "../BinDisc.h"
#include "../CPPFImdlp.h"
namespace mdlp {
const float margin = 1e-4;
static std::string set_data_path()
{
std::string path = "../datasets/";
std::ifstream file(path + "iris.arff");
if (file.is_open()) {
file.close();
return path;
}
return "../../tests/datasets/";
}
const std::string data_path = set_data_path();
TEST(Discretizer, BinIrisUniform)
{
ArffFiles file;
Discretizer* disc = new BinDisc(4, strategy_t::UNIFORM);
file.load(data_path + "iris.arff", true);
vector<samples_t>& X = file.getX();
auto y = labels_t();
disc->fit(X[0], y);
auto Xt = disc->transform(X[0]);
labels_t expected = { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 0, 1, 2, 1, 3, 2, 2, 3, 0, 3, 2, 3, 2, 2, 2, 1, 1, 2, 2, 3, 3, 1, 2, 1, 3, 2, 2, 3, 2, 1, 2, 3, 3, 3, 2, 2, 1, 3, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 };
delete disc;
EXPECT_EQ(expected, Xt);
}
TEST(Discretizer, BinIrisQuantile)
{
ArffFiles file;
Discretizer* disc = new BinDisc(4, strategy_t::QUANTILE);
file.load(data_path + "iris.arff", true);
vector<samples_t>& X = file.getX();
auto y = labels_t();
disc->fit(X[0], y);
auto Xt = disc->transform(X[0]);
labels_t expected = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 3, 3, 3, 1, 3, 1, 2, 0, 3, 1, 0, 2, 2, 2, 1, 3, 1, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 3, 2, 1, 1, 1, 2, 2, 1, 2, 3, 2, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2, 1, 1, 2, 2, 3, 2, 3, 3, 0, 3, 3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 3, 2, 3, 1, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 2 };
delete disc;
EXPECT_EQ(expected, Xt);
}
TEST(Discretizer, FImdlpIris)
{
labels_t expected = {
5, 3, 4, 4, 5, 5, 5, 5, 2, 4, 5, 5, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5,
5, 4, 5, 3, 5, 5, 5, 4, 4, 5, 5, 5, 4, 4, 5, 4, 3, 5, 5, 0, 4, 5,
5, 3, 5, 4, 5, 4, 4, 4, 4, 0, 1, 1, 4, 0, 2, 0, 0, 3, 0, 2, 2, 4,
3, 0, 0, 0, 4, 1, 0, 1, 2, 3, 1, 3, 2, 0, 0, 0, 0, 0, 3, 5, 4, 0,
3, 0, 0, 3, 0, 0, 0, 3, 2, 2, 0, 1, 4, 0, 3, 2, 3, 3, 0, 2, 0, 5,
4, 0, 3, 0, 1, 4, 3, 5, 0, 0, 4, 1, 1, 0, 4, 4, 1, 3, 1, 3, 1, 5,
1, 1, 0, 3, 5, 4, 3, 4, 4, 4, 0, 4, 4, 3, 0, 3, 5, 3
};
ArffFiles file;
Discretizer* disc = new CPPFImdlp();
file.load(data_path + "iris.arff", true);
vector<samples_t>& X = file.getX();
labels_t& y = file.getY();
disc->fit(X[1], y);
auto computed = disc->transform(X[1]);
delete disc;
EXPECT_EQ(computed.size(), expected.size());
for (unsigned long i = 0; i < computed.size(); i++) {
EXPECT_EQ(computed[i], expected[i]);
}
}
}

tests/FImdlp_unittest.cpp

@@ -15,11 +15,11 @@ throw; \
, etype)
namespace mdlp {
class TestFImdlp: public CPPFImdlp, public testing::Test {
class TestFImdlp : public CPPFImdlp, public testing::Test {
public:
precision_t precision = 0.000001f;
TestFImdlp(): CPPFImdlp() {}
TestFImdlp() : CPPFImdlp() {}
string data_path;
@@ -329,4 +329,31 @@ namespace mdlp {
}
}
TEST_F(TestFImdlp, TransformTest)
{
labels_t expected = {
5, 3, 4, 4, 5, 5, 5, 5, 2, 4, 5, 5, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5,
5, 4, 5, 3, 5, 5, 5, 4, 4, 5, 5, 5, 4, 4, 5, 4, 3, 5, 5, 0, 4, 5,
5, 3, 5, 4, 5, 4, 4, 4, 4, 0, 1, 1, 4, 0, 2, 0, 0, 3, 0, 2, 2, 4,
3, 0, 0, 0, 4, 1, 0, 1, 2, 3, 1, 3, 2, 0, 0, 0, 0, 0, 3, 5, 4, 0,
3, 0, 0, 3, 0, 0, 0, 3, 2, 2, 0, 1, 4, 0, 3, 2, 3, 3, 0, 2, 0, 5,
4, 0, 3, 0, 1, 4, 3, 5, 0, 0, 4, 1, 1, 0, 4, 4, 1, 3, 1, 3, 1, 5,
1, 1, 0, 3, 5, 4, 3, 4, 4, 4, 0, 4, 4, 3, 0, 3, 5, 3
};
ArffFiles file;
file.load(data_path + "iris.arff", true);
vector<samples_t>& X = file.getX();
labels_t& y = file.getY();
fit(X[1], y);
// auto computed = transform(X[1]);
// EXPECT_EQ(computed.size(), expected.size());
// for (unsigned long i = 0; i < computed.size(); i++) {
// EXPECT_EQ(computed[i], expected[i]);
// }
// auto computed_ft = fit_transform(X[1], y);
// EXPECT_EQ(computed_ft.size(), expected.size());
// for (unsigned long i = 0; i < computed_ft.size(); i++) {
// EXPECT_EQ(computed_ft[i], expected[i]);
// }
}
}

tests/Metrics_unittest.cpp

@@ -2,13 +2,13 @@
#include "../Metrics.h"
namespace mdlp {
class TestMetrics: public Metrics, public testing::Test {
class TestMetrics : public Metrics, public testing::Test {
public:
labels_t y_ = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
indices_t indices_ = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
precision_t precision = 0.000001f;
precision_t precision = 1e-6;
TestMetrics(): Metrics(y_, indices_) {};
TestMetrics() : Metrics(y_, indices_) {};
void SetUp() override
{

build and coverage script (shell)

@@ -1,20 +1,15 @@
if [ -d build ] ; then
#!/bin/bash
if [ -d build ] && [ "$1" != "run" ]; then
rm -fr build
fi
if [ -d gcovr-report ] ; then
rm -fr gcovr-report
fi
cmake -S . -B build -Wno-dev
cmake -S . -B build -Wno-dev -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS="--coverage" -DCMAKE_C_FLAGS="--coverage"
cmake --build build
cd build
ctest --output-on-failure
cd ..
if [ ! -d gcovr-report ] ; then
mkdir gcovr-report
fi
rm -fr gcovr-report/* 2>/dev/null
#lcov --capture --directory ./ --output-file lcoverage/main_coverage.info
#lcov --remove lcoverage/main_coverage.info 'v1/*' '/Applications/*' '*/tests/*' --output-file lcoverage/main_coverage.info -q
#lcov --list lcoverage/main_coverage.info
mkdir gcovr-report
cd ..
gcovr --gcov-filter "CPPFImdlp.cpp" --gcov-filter "Metrics.cpp" --txt --sonarqube=tests/gcovr-report/coverage.xml
gcovr --gcov-filter "CPPFImdlp.cpp" --gcov-filter "Metrics.cpp" --gcov-filter "BinDisc.cpp" --gcov-filter "Discretizer.cpp" --txt --sonarqube=tests/gcovr-report/coverage.xml --exclude-noncode-lines

tests/testKbins.py (new file, 412 lines)

@@ -0,0 +1,412 @@
from scipy.io.arff import loadarff
from sklearn.preprocessing import KBinsDiscretizer
def test(clf, X, expected, title):
X = [[x] for x in X]
clf.fit(X)
computed = [int(x[0]) for x in clf.transform(X)]
print(f"{title}")
print(f"{computed=}")
print(f"{expected=}")
assert computed == expected
print("-" * 80)
# Test Uniform Strategy
clf3u = KBinsDiscretizer(
n_bins=3, encode="ordinal", strategy="uniform", subsample=200_000
)
clf3q = KBinsDiscretizer(
n_bins=3, encode="ordinal", strategy="quantile", subsample=200_000
)
clf4u = KBinsDiscretizer(
n_bins=4, encode="ordinal", strategy="uniform", subsample=200_000
)
clf4q = KBinsDiscretizer(
n_bins=4, encode="ordinal", strategy="quantile", subsample=200_000
)
#
X = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
labels = [0, 0, 0, 1, 1, 1, 2, 2, 2]
test(clf3u, X, labels, title="Easy3BinsUniform")
test(clf3q, X, labels, title="Easy3BinsQuantile")
#
X = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
labels = [0, 0, 0, 1, 1, 1, 2, 2, 2, 2]
# In C++ both strategies give the same result, unlike here
labels2 = [0, 0, 0, 1, 1, 1, 1, 2, 2, 2]
test(clf3u, X, labels, title="X10BinsUniform")
test(clf3q, X, labels2, title="X10BinsQuantile")
#
X = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]
labels = [0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2]
# In C++ both strategies give the same result, unlike here
# labels2 = [0, 0, 0, 1, 1, 1, 1, 2, 2, 2]
test(clf3u, X, labels, title="X11BinsUniform")
test(clf3q, X, labels, title="X11BinsQuantile")
#
X = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
labels = [0, 0, 0, 0, 0, 0]
test(clf3u, X, labels, title="ConstantUniform")
test(clf3q, X, labels, title="ConstantQuantile")
#
X = [3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0]
labels = [2, 0, 0, 2, 0, 0, 2, 0, 0]
labels2 = [1, 0, 0, 1, 0, 0, 1, 0, 0]  # same as in C++
test(clf3u, X, labels, title="EasyRepeatedUniform")
test(clf3q, X, labels2, title="EasyRepeatedQuantile")
#
X = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]
labels = [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
test(clf4u, X, labels, title="Easy4BinsUniform")
test(clf4q, X, labels, title="Easy4BinsQuantile")
#
X = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0]
labels = [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
test(clf4u, X, labels, title="X13BinsUniform")
test(clf4q, X, labels, title="X13BinsQuantile")
#
X = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0]
labels = [0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
test(clf4u, X, labels, title="X14BinsUniform")
test(clf4q, X, labels, title="X14BinsQuantile")
#
X1 = [15.0, 8.0, 12.0, 14.0, 6.0, 1.0, 13.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0]
X2 = [15.0, 13.0, 12.0, 14.0, 6.0, 1.0, 8.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0]
labels1 = [3, 2, 3, 3, 1, 0, 3, 2, 2, 2, 1, 0, 0, 1, 0]
labels2 = [3, 3, 3, 3, 1, 0, 2, 2, 2, 2, 1, 0, 0, 1, 0]
test(clf4u, X1, labels1, title="X15BinsUniform")
test(clf4q, X2, labels2, title="X15BinsQuantile")
#
X = [0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0]
labels = [0, 1, 1, 1, 2, 2, 3, 3, 3, 3]
test(clf4u, X, labels, title="RepeatedValuesUniform")
test(clf4q, X, labels, title="RepeatedValuesQuantile")
print(f"Uniform {clf4u.bin_edges_=}")
print(f"Quaintile {clf4q.bin_edges_=}")
print("-" * 80)
#
data, meta = loadarff("tests/datasets/iris.arff")
labelsu = [
    0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
    3, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2,
    2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 0, 1,
    2, 1, 3, 2, 2, 3, 0, 3, 2, 3, 2, 2, 2, 1, 1, 2, 2, 3, 3, 1, 2, 1, 3, 2, 2,
    3, 2, 1, 2, 3, 3, 3, 2, 2, 1, 3, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1,
]
labelsq = [
    1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
    0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
    3, 3, 3, 1, 3, 1, 2, 0, 3, 1, 0, 2, 2, 2, 1, 3, 1, 2, 2, 1, 2, 2, 2, 2, 3,
    3, 3, 3, 2, 1, 1, 1, 2, 2, 1, 2, 3, 2, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2, 1, 1,
    2, 2, 3, 2, 3, 3, 0, 3, 3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 3, 2, 3, 1, 3, 2, 3,
    3, 2, 2, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 2,
]
# test(clf4u, data["sepallength"], labelsu, title="IrisUniform")
# test(clf4q, data["sepallength"], labelsq, title="IrisQuantile")
sepallength = [[x] for x in data["sepallength"]]
clf4u.fit(sepallength)
clf4q.fit(sepallength)
computedu = clf4u.transform(sepallength)
computedq = clf4q.transform(sepallength)
wrongu = 0
wrongq = 0
for i in range(len(labelsu)):
if labelsu[i] != computedu[i]:
wrongu += 1
if labelsq[i] != computedq[i]:
wrongq += 1
print(f"Iris sepallength diff. between BinDisc & sklearn::KBins Uniform ={wrongu:3d}")
print(f"Iris sepallength diff. between BinDisc & sklearn::KBins Quantile ={wrongq:3d}")

typesFImdlp.h

@@ -8,11 +8,11 @@
using namespace std;
namespace mdlp {
typedef float precision_t;
typedef vector<precision_t> samples_t;
typedef vector<int> labels_t;
typedef vector<size_t> indices_t;
typedef vector<precision_t> cutPoints_t;
typedef map<pair<int, int>, precision_t> cacheEnt_t;
typedef map<tuple<int, int, int>, precision_t> cacheIg_t;
typedef std::vector<precision_t> samples_t;
typedef std::vector<int> labels_t;
typedef std::vector<size_t> indices_t;
typedef std::vector<precision_t> cutPoints_t;
typedef std::map<std::pair<int, int>, precision_t> cacheEnt_t;
typedef std::map<std::tuple<int, int, int>, precision_t> cacheIg_t;
}
#endif