22 Commits

Author SHA1 Message Date
Ricardo Montañana Gómez
cb9babace1 Merge c488ace719 into 7b0673fd4b 2024-07-02 11:50:55 +02:00
c488ace719 Fix FImdlp tests 2024-07-02 11:50:42 +02:00
8f6e16f04f Fix BinDisc quantile mistakes 2024-07-02 09:40:06 +02:00
7b0673fd4b Update README 2024-06-24 11:47:03 +02:00
a1346e1943 Fix Error in percentile method 2024-06-24 10:55:26 +02:00
b3fc598c29 Update build.yml 2024-06-14 22:04:29 +02:00
cc1efa0b4e Update README 2024-06-14 22:01:11 +02:00
90965877eb Add Makefile with build & test actions 2024-06-14 21:17:30 +02:00
c4e6c041fe Fix int type 2024-06-09 00:29:55 +02:00
7938df7f0f Update sonar mdlp version 2024-06-08 13:25:28 +02:00
7ee9896734 Fix mistake in github action 2024-06-08 12:36:56 +02:00
8f7f605670 Fix mistake in github action 2024-06-08 12:32:18 +02:00
2f55b27691 Fix mistake in github action 2024-06-08 12:28:23 +02:00
378fbd51ef Fix mistake in github action 2024-06-08 12:25:17 +02:00
402d0da878 Fix mistake in github action 2024-06-08 12:23:28 +02:00
f34bcc2ed7 Add libtorch to github action 2024-06-08 12:20:51 +02:00
c9ba35fb58 update test script 2024-06-08 12:02:16 +02:00
e205668906 Add torch methods to discretize
Add fit_transform methods
2024-06-07 23:54:42 +02:00
633aa52849 Refactor sample build 2024-06-06 12:04:55 +02:00
61de687476 Fix library creation problem 2024-06-06 11:13:50 +02:00
7ff88c8e4b Update Discretizer version 2024-06-05 17:55:45 +02:00
Ricardo Montañana Gómez
638bb2a59e Discretizer (#8)
* Add better check in testKBins.py

* Add Discretizer base class for Both discretizers

* Refactor order of constructors init
2024-06-05 17:53:08 +02:00
33 changed files with 1259 additions and 379 deletions

View File

@@ -22,15 +22,19 @@ jobs:
run: | run: |
sudo apt-get -y install lcov sudo apt-get -y install lcov
sudo apt-get -y install gcovr sudo apt-get -y install gcovr
- name: Install Libtorch
run: |
wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.3.1%2Bcpu.zip
unzip libtorch-cxx11-abi-shared-with-deps-2.3.1+cpu.zip
- name: Tests & build-wrapper - name: Tests & build-wrapper
run: | run: |
cmake -S . -B build -Wno-dev cmake -S . -B build -Wno-dev -DCMAKE_PREFIX_PATH=$(pwd)/libtorch -DENABLE_TESTING=ON
build-wrapper-linux-x86-64 --out-dir ${{ env.BUILD_WRAPPER_OUT_DIR }} cmake --build build/ --config Release build-wrapper-linux-x86-64 --out-dir ${{ env.BUILD_WRAPPER_OUT_DIR }} cmake --build build/ --config Release
cd build cd build
make make
ctest -C Release --output-on-failure --test-dir tests ctest -C Release --output-on-failure --test-dir tests
cd .. cd ..
gcovr -f CPPFImdlp.cpp -f Metrics.cpp -f BinDisc.cpp --txt --sonarqube=coverage.xml gcovr -f CPPFImdlp.cpp -f Metrics.cpp -f BinDisc.cpp -f Discretizer.cpp --txt --sonarqube=coverage.xml
- name: Run sonar-scanner - name: Run sonar-scanner
env: env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

2
.gitignore vendored
View File

@@ -33,6 +33,8 @@
**/build **/build
build_Debug build_Debug
build_Release build_Release
build_debug
build_release
**/lcoverage **/lcoverage
.idea .idea
cmake-* cmake-*

11
.vscode/launch.json vendored
View File

@@ -8,15 +8,10 @@
"name": "C++ Launch config", "name": "C++ Launch config",
"type": "cppdbg", "type": "cppdbg",
"request": "launch", "request": "launch",
"program": "${workspaceFolder}/build/sample/sample", "program": "${workspaceFolder}/tests/build/BinDisc_unittest",
"cwd": "${workspaceFolder}/build/sample", "cwd": "${workspaceFolder}/tests/build",
"args": [ "args": [],
"-f",
"glass"
],
"targetArchitecture": "arm64",
"launchCompleteCommand": "exec-run", "launchCompleteCommand": "exec-run",
"preLaunchTask": "CMake: build",
"stopAtEntry": false, "stopAtEntry": false,
"linux": { "linux": {
"MIMode": "gdb", "MIMode": "gdb",

102
.vscode/settings.json vendored
View File

@@ -5,5 +5,105 @@
}, },
"C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools", "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools",
"cmake.configureOnOpen": true, "cmake.configureOnOpen": true,
"sonarlint.pathToCompileCommands": "${workspaceFolder}/build/compile_commands.json" "sonarlint.pathToCompileCommands": "${workspaceFolder}/build/compile_commands.json",
"files.associations": {
"*.rmd": "markdown",
"*.py": "python",
"vector": "cpp",
"__bit_reference": "cpp",
"__bits": "cpp",
"__config": "cpp",
"__debug": "cpp",
"__errc": "cpp",
"__hash_table": "cpp",
"__locale": "cpp",
"__mutex_base": "cpp",
"__node_handle": "cpp",
"__nullptr": "cpp",
"__split_buffer": "cpp",
"__string": "cpp",
"__threading_support": "cpp",
"__tuple": "cpp",
"array": "cpp",
"atomic": "cpp",
"bitset": "cpp",
"cctype": "cpp",
"chrono": "cpp",
"clocale": "cpp",
"cmath": "cpp",
"compare": "cpp",
"complex": "cpp",
"concepts": "cpp",
"cstdarg": "cpp",
"cstddef": "cpp",
"cstdint": "cpp",
"cstdio": "cpp",
"cstdlib": "cpp",
"cstring": "cpp",
"ctime": "cpp",
"cwchar": "cpp",
"cwctype": "cpp",
"exception": "cpp",
"initializer_list": "cpp",
"ios": "cpp",
"iosfwd": "cpp",
"istream": "cpp",
"limits": "cpp",
"locale": "cpp",
"memory": "cpp",
"mutex": "cpp",
"new": "cpp",
"optional": "cpp",
"ostream": "cpp",
"ratio": "cpp",
"sstream": "cpp",
"stdexcept": "cpp",
"streambuf": "cpp",
"string": "cpp",
"string_view": "cpp",
"system_error": "cpp",
"tuple": "cpp",
"type_traits": "cpp",
"typeinfo": "cpp",
"unordered_map": "cpp",
"variant": "cpp",
"algorithm": "cpp",
"iostream": "cpp",
"iomanip": "cpp",
"numeric": "cpp",
"set": "cpp",
"__tree": "cpp",
"deque": "cpp",
"list": "cpp",
"map": "cpp",
"unordered_set": "cpp",
"any": "cpp",
"condition_variable": "cpp",
"forward_list": "cpp",
"fstream": "cpp",
"stack": "cpp",
"thread": "cpp",
"__memory": "cpp",
"filesystem": "cpp",
"*.toml": "toml",
"utility": "cpp",
"span": "cpp",
"*.tcc": "cpp",
"bit": "cpp",
"charconv": "cpp",
"cinttypes": "cpp",
"codecvt": "cpp",
"functional": "cpp",
"iterator": "cpp",
"memory_resource": "cpp",
"random": "cpp",
"source_location": "cpp",
"format": "cpp",
"numbers": "cpp",
"semaphore": "cpp",
"stop_token": "cpp",
"text_encoding": "cpp",
"typeindex": "cpp",
"valarray": "cpp"
}
} }

View File

@@ -1,5 +1,4 @@
#include <algorithm> #include <algorithm>
#include <limits>
#include <cmath> #include <cmath>
#include "BinDisc.h" #include "BinDisc.h"
#include <iostream> #include <iostream>
@@ -7,7 +6,8 @@
namespace mdlp { namespace mdlp {
BinDisc::BinDisc(int n_bins, strategy_t strategy) : n_bins{ n_bins }, strategy{ strategy } BinDisc::BinDisc(int n_bins, strategy_t strategy) :
Discretizer(), n_bins{ n_bins }, strategy{ strategy }
{ {
if (n_bins < 3) { if (n_bins < 3) {
throw std::invalid_argument("n_bins must be greater than 2"); throw std::invalid_argument("n_bins must be greater than 2");
@@ -16,9 +16,11 @@ namespace mdlp {
BinDisc::~BinDisc() = default; BinDisc::~BinDisc() = default;
void BinDisc::fit(samples_t& X) void BinDisc::fit(samples_t& X)
{ {
// y is included for compatibility with the Discretizer interface
cutPoints.clear(); cutPoints.clear();
if (X.empty()) { if (X.empty()) {
cutPoints.push_back(std::numeric_limits<precision_t>::max()); cutPoints.push_back(0.0);
cutPoints.push_back(0.0);
return; return;
} }
if (strategy == strategy_t::QUANTILE) { if (strategy == strategy_t::QUANTILE) {
@@ -27,15 +29,18 @@ namespace mdlp {
fit_uniform(X); fit_uniform(X);
} }
} }
void BinDisc::fit(samples_t& X, labels_t& y)
{
fit(X);
}
std::vector<precision_t> linspace(precision_t start, precision_t end, int num) std::vector<precision_t> linspace(precision_t start, precision_t end, int num)
{ {
// Doesn't include end point as it is not needed
if (start == end) { if (start == end) {
return { 0 }; return { start, end };
} }
precision_t delta = (end - start) / static_cast<precision_t>(num - 1); precision_t delta = (end - start) / static_cast<precision_t>(num - 1);
std::vector<precision_t> linspc; std::vector<precision_t> linspc;
for (size_t i = 0; i < num - 1; ++i) { for (size_t i = 0; i < num; ++i) {
precision_t val = start + delta * static_cast<precision_t>(i); precision_t val = start + delta * static_cast<precision_t>(i);
linspc.push_back(val); linspc.push_back(val);
} }
@@ -49,17 +54,19 @@ namespace mdlp {
{ {
// Implementation taken from https://dpilger26.github.io/NumCpp/doxygen/html/percentile_8hpp_source.html // Implementation taken from https://dpilger26.github.io/NumCpp/doxygen/html/percentile_8hpp_source.html
std::vector<precision_t> results; std::vector<precision_t> results;
bool first = true;
results.reserve(percentiles.size()); results.reserve(percentiles.size());
for (auto percentile : percentiles) { for (auto percentile : percentiles) {
const size_t i = static_cast<size_t>(std::floor(static_cast<double>(data.size() - 1) * percentile / 100.)); const size_t i = static_cast<size_t>(std::floor(static_cast<double>(data.size() - 1) * percentile / 100.));
const auto indexLower = clip(i, 0, data.size() - 1); const auto indexLower = clip(i, 0, data.size() - 2);
const double percentI = static_cast<double>(indexLower) / static_cast<double>(data.size() - 1); const double percentI = static_cast<double>(indexLower) / static_cast<double>(data.size() - 1);
const double fraction = const double fraction =
(percentile / 100.0 - percentI) / (percentile / 100.0 - percentI) /
(static_cast<double>(indexLower + 1) / static_cast<double>(data.size() - 1) - percentI); (static_cast<double>(indexLower + 1) / static_cast<double>(data.size() - 1) - percentI);
const auto value = data[indexLower] + (data[indexLower + 1] - data[indexLower]) * fraction; const auto value = data[indexLower] + (data[indexLower + 1] - data[indexLower]) * fraction;
if (value != results.back()) if (value != results.back() || first) // first needed as results.back() return is undefined for empty vectors
results.push_back(value); results.push_back(value);
first = false;
} }
return results; return results;
} }
@@ -69,70 +76,16 @@ namespace mdlp {
auto data = X; auto data = X;
std::sort(data.begin(), data.end()); std::sort(data.begin(), data.end());
if (data.front() == data.back() || data.size() == 1) { if (data.front() == data.back() || data.size() == 1) {
// if X is constant // if X is constant, pass any two given points that shall be ignored in transform
cutPoints.push_back(std::numeric_limits<precision_t>::max()); cutPoints.push_back(data.front());
cutPoints.push_back(data.front());
return; return;
} }
cutPoints = percentile(data, quantiles); cutPoints = percentile(data, quantiles);
normalizeCutPoints();
} }
void BinDisc::fit_uniform(samples_t& X) void BinDisc::fit_uniform(samples_t& X)
{ {
auto minmax = std::minmax_element(X.begin(), X.end()); auto minmax = std::minmax_element(X.begin(), X.end());
cutPoints = linspace(*minmax.first, *minmax.second, n_bins + 1); cutPoints = linspace(*minmax.first, *minmax.second, n_bins + 1);
normalizeCutPoints();
}
void BinDisc::normalizeCutPoints()
{
// Add max value to the end
cutPoints.push_back(std::numeric_limits<precision_t>::max());
// Remove first as it is not needed
cutPoints.erase(cutPoints.begin());
}
labels_t& BinDisc::transform(const samples_t& X)
{
discretizedData.clear();
discretizedData.reserve(X.size());
for (const precision_t& item : X) {
auto upper = std::upper_bound(cutPoints.begin(), cutPoints.end(), item);
discretizedData.push_back(upper - cutPoints.begin());
}
return discretizedData;
} }
} }
// void BinDisc::fit_quantile(samples_t& X)
// {
// cutPoints.clear();
// if (X.empty()) {
// cutPoints.push_back(std::numeric_limits<float>::max());
// return;
// }
// samples_t data = X;
// std::sort(data.begin(), data.end());
// float min_val = data.front();
// float max_val = data.back();
// // Handle case of all data points having the same value
// if (min_val == max_val) {
// cutPoints.push_back(std::numeric_limits<float>::max());
// return;
// }
// int first = X.size() / n_bins;
// cutPoints.push_back(data.at(first - 1));
// int bins_done = 1;
// int prev = first - 1;
// while (bins_done < n_bins) {
// int next = first * (bins_done + 1) - 1;
// while (next < X.size() && data.at(next) == data[prev]) {
// ++next;
// }
// if (next == X.size() || bins_done == n_bins - 1) {
// cutPoints.push_back(std::numeric_limits<float>::max());
// break;
// } else {
// cutPoints.push_back(data[next]);
// bins_done++;
// prev = next;
// }
// }
// }

View File

@@ -2,30 +2,26 @@
#define BINDISC_H #define BINDISC_H
#include "typesFImdlp.h" #include "typesFImdlp.h"
#include "Discretizer.h"
#include <string> #include <string>
namespace mdlp { namespace mdlp {
enum class strategy_t { enum class strategy_t {
UNIFORM, UNIFORM,
QUANTILE QUANTILE
}; };
class BinDisc { class BinDisc : public Discretizer {
public: public:
BinDisc(int n_bins = 3, strategy_t strategy = strategy_t::UNIFORM); BinDisc(int n_bins = 3, strategy_t strategy = strategy_t::UNIFORM);
~BinDisc(); ~BinDisc();
void fit(samples_t&); // y is included for compatibility with the Discretizer interface
inline cutPoints_t getCutPoints() const { return cutPoints; }; void fit(samples_t& X_, labels_t& y) override;
labels_t& transform(const samples_t&); void fit(samples_t& X);
static inline std::string version() { return "1.0.0"; };
private: private:
void fit_uniform(samples_t&); void fit_uniform(samples_t&);
void fit_quantile(samples_t&); void fit_quantile(samples_t&);
void normalizeCutPoints();
int n_bins; int n_bins;
strategy_t strategy; strategy_t strategy;
labels_t discretizedData = labels_t();
cutPoints_t cutPoints;
}; };
} }
#endif #endif

View File

@@ -1,13 +1,11 @@
cmake_minimum_required(VERSION 3.20) cmake_minimum_required(VERSION 3.20)
project(mdlp) project(mdlp)
set(CMAKE_CXX_STANDARD 17)
if (POLICY CMP0135) find_package(Torch REQUIRED)
cmake_policy(SET CMP0135 NEW) include_directories(${TORCH_INCLUDE_DIRS})
endif () add_library(mdlp CPPFImdlp.cpp Metrics.cpp BinDisc.cpp Discretizer.cpp)
target_link_libraries(mdlp "${TORCH_LIBRARIES}")
set(CMAKE_CXX_STANDARD 11)
add_library(mdlp CPPFImdlp.cpp Metrics.cpp)
add_subdirectory(sample) add_subdirectory(sample)
add_subdirectory(tests) if (ENABLE_TESTING)
add_subdirectory(tests)
endif(ENABLE_TESTING)

View File

@@ -6,16 +6,14 @@
namespace mdlp { namespace mdlp {
CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed) : min_length(min_length_), CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed) :
Discretizer(),
min_length(min_length_),
max_depth(max_depth_), max_depth(max_depth_),
proposed_cuts(proposed) proposed_cuts(proposed)
{ {
} }
CPPFImdlp::CPPFImdlp() = default;
CPPFImdlp::~CPPFImdlp() = default;
size_t CPPFImdlp::compute_max_num_cut_points() const size_t CPPFImdlp::compute_max_num_cut_points() const
{ {
// Set the actual maximum number of cut points as a number or as a percentage of the number of samples // Set the actual maximum number of cut points as a number or as a percentage of the number of samples
@@ -27,7 +25,7 @@ namespace mdlp {
} }
if (proposed_cuts < 1) if (proposed_cuts < 1)
return static_cast<size_t>(round(static_cast<float>(X.size()) * proposed_cuts)); return static_cast<size_t>(round(static_cast<float>(X.size()) * proposed_cuts));
return static_cast<size_t>(proposed_cuts); return static_cast<size_t>(proposed_cuts); // The 2 extra cutpoints should not be considered here as this parameter is considered before they are added
} }
void CPPFImdlp::fit(samples_t& X_, labels_t& y_) void CPPFImdlp::fit(samples_t& X_, labels_t& y_)
@@ -60,6 +58,10 @@ namespace mdlp {
resizeCutPoints(); resizeCutPoints();
} }
} }
// Insert first & last X value to the cutpoints as them shall be ignored in transform
auto minmax = std::minmax_element(X.begin(), X.end());
cutPoints.push_back(*minmax.second);
cutPoints.insert(cutPoints.begin(), *minmax.first);
} }
pair<precision_t, size_t> CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end) pair<precision_t, size_t> CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end)
@@ -208,14 +210,5 @@ namespace mdlp {
} }
cutPoints.erase(cutPoints.begin() + static_cast<long>(maxEntropyIdx)); cutPoints.erase(cutPoints.begin() + static_cast<long>(maxEntropyIdx));
} }
labels_t& CPPFImdlp::transform(const samples_t& data)
{
discretizedData.clear();
discretizedData.reserve(data.size());
for (const precision_t& item : data) {
auto upper = std::upper_bound(cutPoints.begin(), cutPoints.end(), item);
discretizedData.push_back(upper - cutPoints.begin());
}
return discretizedData;
}
} }

View File

@@ -6,18 +6,16 @@
#include <utility> #include <utility>
#include <string> #include <string>
#include "Metrics.h" #include "Metrics.h"
#include "Discretizer.h"
namespace mdlp { namespace mdlp {
class CPPFImdlp { class CPPFImdlp : public Discretizer {
public: public:
CPPFImdlp(); CPPFImdlp() = default;
CPPFImdlp(size_t, int, float); CPPFImdlp(size_t min_length_, int max_depth_, float proposed);
~CPPFImdlp(); virtual ~CPPFImdlp() = default;
void fit(samples_t&, labels_t&); void fit(samples_t& X_, labels_t& y_) override;
inline cutPoints_t getCutPoints() const { return cutPoints; };
labels_t& transform(const samples_t&);
inline int get_depth() const { return depth; }; inline int get_depth() const { return depth; };
static inline std::string version() { return "1.1.3"; };
protected: protected:
size_t min_length = 3; size_t min_length = 3;
int depth = 0; int depth = 0;
@@ -27,9 +25,7 @@ namespace mdlp {
samples_t X = samples_t(); samples_t X = samples_t();
labels_t y = labels_t(); labels_t y = labels_t();
Metrics metrics = Metrics(y, indices); Metrics metrics = Metrics(y, indices);
cutPoints_t cutPoints;
size_t num_cut_points = numeric_limits<size_t>::max(); size_t num_cut_points = numeric_limits<size_t>::max();
labels_t discretizedData = labels_t();
static indices_t sortIndices(samples_t&, labels_t&); static indices_t sortIndices(samples_t&, labels_t&);
void computeCutPoints(size_t, size_t, int); void computeCutPoints(size_t, size_t, int);
void resizeCutPoints(); void resizeCutPoints();

51
Discretizer.cpp Normal file
View File

@@ -0,0 +1,51 @@
#include <stdexcept>
#include "Discretizer.h"
namespace mdlp {
/**
 * Map every sample to the index of the interval it falls into.
 *
 * The first and last stored cut points are boundary markers and are skipped.
 * Each value is located among the remaining (interior) cut points with
 * std::lower_bound, so its label is the number of interior cut points that
 * are strictly smaller than the value.
 *
 * NOTE(review): the README describes `cut[i - 1] <= x < cut[i]` obtained with
 * std::upper_bound, whereas this code uses std::lower_bound (a value equal to
 * a cut point goes to the lower interval) -- confirm which one is intended.
 *
 * @param data values to discretize
 * @return reference to the internal label buffer; it is overwritten by the
 *         next call to transform
 * @throws std::runtime_error if fewer than two cut points are stored, e.g.
 *         when transform is called before fit
 */
labels_t& Discretizer::transform(const samples_t& data)
{
    // begin() + 1 and end() - 1 only form a valid range when at least two
    // cut points exist; otherwise the iterator arithmetic is undefined
    if (cutPoints.size() < 2) {
        throw std::runtime_error("discretizer::transform needs at least two cut points, call fit first");
    }
    discretizedData.clear();
    discretizedData.reserve(data.size());
    // Ignore the first and last cut points provided
    const auto first = cutPoints.begin() + 1;
    const auto last = cutPoints.end() - 1;
    for (const precision_t& item : data) {
        const auto bound = std::lower_bound(first, last, item);
        // bound always lies in [first, last], so the distance is never negative
        discretizedData.push_back(static_cast<int>(bound - first));
    }
    return discretizedData;
}
/**
 * Fit the discretizer on the given samples and labels, then discretize
 * those same samples.
 *
 * @param X_ samples used both for fitting and for the transformation
 * @param y_ labels forwarded to fit
 * @return reference to the internal label buffer filled by transform
 */
labels_t& Discretizer::fit_transform(samples_t& X_, labels_t& y_)
{
    this->fit(X_, y_);
    labels_t& discretized = this->transform(X_);
    return discretized;
}
/**
 * Fit the discretizer from torch tensors.
 *
 * The tensor contents are copied into a samples_t / labels_t pair and passed
 * to fit(samples_t&, labels_t&).
 *
 * NOTE(review): data_ptr() is dereferenced on the host, so the tensors are
 * assumed to live on the CPU -- confirm against callers.
 *
 * @param X_ tensor of samples, read as precision_t elements
 * @param y_ tensor of labels, read as int elements
 * @throws std::invalid_argument if X_ and y_ hold a different number of elements
 */
void Discretizer::fit_t(torch::Tensor& X_, torch::Tensor& y_)
{
    // The labels are copied using the sample count, so a shorter y_ would be
    // read past its end
    if (X_.numel() != y_.numel()) {
        throw std::invalid_argument("X and y must have the same number of elements");
    }
    // Raw storage only matches the logical element order for contiguous tensors
    auto X_c = X_.contiguous();
    auto y_c = y_.contiguous();
    const auto num_elements = X_c.numel();
    samples_t X(X_c.data_ptr<precision_t>(), X_c.data_ptr<precision_t>() + num_elements);
    labels_t y(y_c.data_ptr<int>(), y_c.data_ptr<int>() + num_elements);
    fit(X, y);
}
/**
 * Discretize the samples held in a torch tensor.
 *
 * The tensor contents are copied into a samples_t, passed to transform() and
 * the resulting labels are returned as a new one-dimensional int32 tensor.
 *
 * NOTE(review): data_ptr() is dereferenced on the host, so the tensor is
 * assumed to live on the CPU -- confirm against callers.
 *
 * @param X_ tensor of samples, read as precision_t elements (the same element
 *           type fit_t and fit_transform_t read)
 * @return int32 tensor with one label per element of X_
 */
torch::Tensor Discretizer::transform_t(torch::Tensor& X_)
{
    // Raw storage only matches the logical element order for contiguous tensors
    auto X_c = X_.contiguous();
    const auto num_elements = X_c.numel();
    samples_t X(X_c.data_ptr<precision_t>(), X_c.data_ptr<precision_t>() + num_elements);
    // Bind by reference: transform returns the internal buffer, no copy needed
    const auto& result = transform(X);
    return torch::tensor(result, torch::kInt32);
}
/**
 * Fit the discretizer from torch tensors and discretize the same samples.
 *
 * The tensor contents are copied into a samples_t / labels_t pair, passed to
 * fit_transform() and the resulting labels are returned as a new
 * one-dimensional int32 tensor.
 *
 * NOTE(review): data_ptr() is dereferenced on the host, so the tensors are
 * assumed to live on the CPU -- confirm against callers.
 *
 * @param X_ tensor of samples, read as precision_t elements
 * @param y_ tensor of labels, read as int elements
 * @return int32 tensor with one label per element of X_
 * @throws std::invalid_argument if X_ and y_ hold a different number of elements
 */
torch::Tensor Discretizer::fit_transform_t(torch::Tensor& X_, torch::Tensor& y_)
{
    // The labels are copied using the sample count, so a shorter y_ would be
    // read past its end
    if (X_.numel() != y_.numel()) {
        throw std::invalid_argument("X and y must have the same number of elements");
    }
    // Raw storage only matches the logical element order for contiguous tensors
    auto X_c = X_.contiguous();
    auto y_c = y_.contiguous();
    const auto num_elements = X_c.numel();
    samples_t X(X_c.data_ptr<precision_t>(), X_c.data_ptr<precision_t>() + num_elements);
    labels_t y(y_c.data_ptr<int>(), y_c.data_ptr<int>() + num_elements);
    // Bind by reference: fit_transform returns the internal buffer, no copy needed
    const auto& result = fit_transform(X, y);
    return torch::tensor(result, torch::kInt32);
}
}

27
Discretizer.h Normal file
View File

@@ -0,0 +1,27 @@
#ifndef DISCRETIZER_H
#define DISCRETIZER_H
#include <string>
#include <algorithm>
#include <torch/torch.h>
#include "typesFImdlp.h"
namespace mdlp {
class Discretizer {
public:
Discretizer() = default;
virtual ~Discretizer() = default;
inline cutPoints_t getCutPoints() const { return cutPoints; };
virtual void fit(samples_t& X_, labels_t& y_) = 0;
labels_t& transform(const samples_t& data);
labels_t& fit_transform(samples_t& X_, labels_t& y_);
void fit_t(torch::Tensor& X_, torch::Tensor& y_);
torch::Tensor transform_t(torch::Tensor& X_);
torch::Tensor fit_transform_t(torch::Tensor& X_, torch::Tensor& y_);
static inline std::string version() { return "1.2.3"; };
protected:
labels_t discretizedData = labels_t();
cutPoints_t cutPoints; // At least two cutpoints must be provided, the first and the last will be ignored in transform
};
}
#endif

13
Makefile Normal file
View File

@@ -0,0 +1,13 @@
# Recipes are executed with bash.
SHELL := /bin/bash
# Running `make` without a target runs `build`.
.DEFAULT_GOAL := build
# build and test are commands, not files produced by the recipes.
.PHONY: build test
# build: recreate the build_release directory from scratch, configure a Release
# build with testing disabled and compile it.
build:
@if [ -d build_release ]; then rm -fr build_release; fi
@mkdir build_release
@cmake -B build_release -S . -DCMAKE_BUILD_TYPE=Release -DENABLE_TESTING=OFF
@cmake --build build_release
# test: run the ./test script from inside the tests directory.
test:
@echo "Testing..."
@cd tests && ./test

View File

@@ -4,8 +4,8 @@
using namespace std; using namespace std;
namespace mdlp { namespace mdlp {
Metrics::Metrics(labels_t& y_, indices_t& indices_): y(y_), indices(indices_), Metrics::Metrics(labels_t& y_, indices_t& indices_) : y(y_), indices(indices_),
numClasses(computeNumClasses(0, indices.size())) numClasses(computeNumClasses(0, indices_.size()))
{ {
} }

View File

@@ -14,21 +14,27 @@ The implementation tries to mitigate the problem of different label values with
Other features: Other features:
- Intervals with the same value of the variable are not taken into account for cutpoints. - Intervals with the same value of the variable are not taken into account for cutpoints.
- Intervals have to have more than two examples to be evaluated. - Intervals have to have more than two examples to be evaluated (mdlp).
The algorithm returns the cut points for the variable. - The algorithm returns the cut points for the variable.
- The transform method uses the cut points returning its index in the following way:
cut[i - 1] <= x < cut[i]
using the [std::upper_bound](https://en.cppreference.com/w/cpp/algorithm/upper_bound) method
- K-Bins discretization is also implemented, and "quantile" and "uniform" strategies are available.
## Sample ## Sample
To run the sample, just execute the following commands: To run the sample, just execute the following commands:
```bash ```bash
cd sample cmake -B build -S .
cmake -B build cmake --build build
cd build build/sample/sample -f iris -m 2
make build/sample/sample -h
./sample -f iris -m 2
./sample -h
``` ```
## Test ## Test
@@ -36,6 +42,5 @@ make
To run the tests and see coverage (llvm & gcovr have to be installed), execute the following commands: To run the tests and see coverage (llvm & gcovr have to be installed), execute the following commands:
```bash ```bash
cd tests make test
./test
``` ```

View File

@@ -1,21 +0,0 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "lldb puro",
"type": "cppdbg",
// "targetArchitecture": "arm64",
"request": "launch",
"program": "${workspaceRoot}/build/sample",
"args": [
"-f",
"iris"
],
"stopAtEntry": false,
"cwd": "${workspaceRoot}/build/",
"environment": [],
"externalConsole": false,
"MIMode": "lldb"
},
]
}

View File

@@ -1,5 +1,6 @@
set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Debug) set(CMAKE_BUILD_TYPE Debug)
add_executable(sample sample.cpp ../tests/ArffFiles.cpp ../Metrics.cpp ../CPPFImdlp.cpp) add_executable(sample sample.cpp ../tests/ArffFiles.cpp)
target_link_libraries(sample mdlp "${TORCH_LIBRARIES}")

View File

@@ -5,13 +5,13 @@
#include <algorithm> #include <algorithm>
#include <cstring> #include <cstring>
#include <getopt.h> #include <getopt.h>
#include <torch/torch.h>
#include "../Discretizer.h"
#include "../CPPFImdlp.h" #include "../CPPFImdlp.h"
#include "../BinDisc.h"
#include "../tests/ArffFiles.h" #include "../tests/ArffFiles.h"
using namespace std; const string PATH = "tests/datasets/";
using namespace mdlp;
const string PATH = "../../tests/datasets/";
/* print a description of all supported options */ /* print a description of all supported options */
void usage(const char* path) void usage(const char* path)
@@ -20,17 +20,17 @@ void usage(const char* path)
const char* basename = strrchr(path, '/'); const char* basename = strrchr(path, '/');
basename = basename ? basename + 1 : path; basename = basename ? basename + 1 : path;
cout << "usage: " << basename << "[OPTION]" << endl; std::cout << "usage: " << basename << "[OPTION]" << std::endl;
cout << " -h, --help\t\t Print this help and exit." << endl; std::cout << " -h, --help\t\t Print this help and exit." << std::endl;
cout std::cout
<< " -f, --file[=FILENAME]\t {all, diabetes, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}." << " -f, --file[=FILENAME]\t {all, diabetes, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}."
<< endl; << std::endl;
cout << " -p, --path[=FILENAME]\t folder where the arff dataset is located, default " << PATH << endl; std::cout << " -p, --path[=FILENAME]\t folder where the arff dataset is located, default " << PATH << std::endl;
cout << " -m, --max_depth=INT\t max_depth pased to discretizer. Default = MAX_INT" << endl; std::cout << " -m, --max_depth=INT\t max_depth pased to discretizer. Default = MAX_INT" << std::endl;
cout std::cout
<< " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 -> any" << " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 -> any"
<< endl; << std::endl;
cout << " -n, --min_length=INT\t interval min_length pased to discretizer. Default = 3" << endl; std::cout << " -n, --min_length=INT\t interval min_length pased to discretizer. Default = 3" << std::endl;
} }
tuple<string, string, int, int, float> parse_arguments(int argc, char** argv) tuple<string, string, int, int, float> parse_arguments(int argc, char** argv)
@@ -96,56 +96,79 @@ void process_file(const string& path, const string& file_name, bool class_last,
file.load(path + file_name + ".arff", class_last); file.load(path + file_name + ".arff", class_last);
const auto attributes = file.getAttributes(); const auto attributes = file.getAttributes();
const auto items = file.getSize(); const auto items = file.getSize();
cout << "Number of lines: " << items << endl; std::cout << "Number of lines: " << items << std::endl;
cout << "Attributes: " << endl; std::cout << "Attributes: " << std::endl;
for (auto attribute : attributes) { for (auto attribute : attributes) {
cout << "Name: " << get<0>(attribute) << " Type: " << get<1>(attribute) << endl; std::cout << "Name: " << get<0>(attribute) << " Type: " << get<1>(attribute) << std::endl;
} }
cout << "Class name: " << file.getClassName() << endl; std::cout << "Class name: " << file.getClassName() << std::endl;
cout << "Class type: " << file.getClassType() << endl; std::cout << "Class type: " << file.getClassType() << std::endl;
cout << "Data: " << endl; std::cout << "Data: " << std::endl;
vector<samples_t>& X = file.getX(); std::vector<mdlp::samples_t>& X = file.getX();
labels_t& y = file.getY(); mdlp::labels_t& y = file.getY();
for (int i = 0; i < 5; i++) { for (int i = 0; i < 5; i++) {
for (auto feature : X) { for (auto feature : X) {
cout << fixed << setprecision(1) << feature[i] << " "; std::cout << fixed << setprecision(1) << feature[i] << " ";
} }
cout << y[i] << endl; std::cout << y[i] << std::endl;
} }
auto test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints); auto test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints);
size_t total = 0; size_t total = 0;
for (auto i = 0; i < attributes.size(); i++) { for (auto i = 0; i < attributes.size(); i++) {
auto min_max = minmax_element(X[i].begin(), X[i].end()); auto min_max = minmax_element(X[i].begin(), X[i].end());
cout << "Cut points for feature " << get<0>(attributes[i]) << ": [" << setprecision(3); std::cout << "Cut points for feature " << get<0>(attributes[i]) << ": [" << setprecision(3);
test.fit(X[i], y); test.fit(X[i], y);
auto cut_points = test.getCutPoints(); auto cut_points = test.getCutPoints();
for (auto item : cut_points) { for (auto item : cut_points) {
cout << item; std::cout << item;
if (item != cut_points.back()) if (item != cut_points.back())
cout << ", "; std::cout << ", ";
} }
total += test.getCutPoints().size(); total += test.getCutPoints().size();
cout << "]" << endl; std::cout << "]" << std::endl;
cout << "Min: " << *min_max.first << " Max: " << *min_max.second << endl; std::cout << "Min: " << *min_max.first << " Max: " << *min_max.second << std::endl;
cout << "--------------------------" << endl; std::cout << "--------------------------" << std::endl;
}
std::cout << "Total cut points ...: " << total << std::endl;
std::cout << "Total feature states: " << total + attributes.size() << std::endl;
std::cout << "Version ............: " << test.version() << std::endl;
std::cout << "Transformed data (vector)..: " << std::endl;
test.fit(X[0], y);
auto data = test.transform(X[0]);
for (int i = 130; i < 135; i++) {
std::cout << std::fixed << std::setprecision(1) << X[0][i] << " " << data[i] << std::endl;
}
auto Xt = torch::tensor(X[0], torch::kFloat32);
auto yt = torch::tensor(y, torch::kInt32);
//test.fit_t(Xt, yt);
auto result = test.fit_transform_t(Xt, yt);
std::cout << "Transformed data (torch)...: " << std::endl;
for (int i = 130; i < 135; i++) {
std::cout << std::fixed << std::setprecision(1) << Xt[i].item<float>() << " " << result[i].item<int>() << std::endl;
}
auto disc = mdlp::BinDisc(3);
auto res_v = disc.fit_transform(X[0], y);
disc.fit_t(Xt, yt);
auto res_t = disc.transform_t(Xt);
std::cout << "Transformed data (BinDisc)...: " << std::endl;
for (int i = 130; i < 135; i++) {
std::cout << std::fixed << std::setprecision(1) << Xt[i].item<float>() << " " << res_v[i] << " " << res_t[i].item<int>() << std::endl;
} }
cout << "Total cut points ...: " << total << endl;
cout << "Total feature states: " << total + attributes.size() << endl;
} }
void process_all_files(const map<string, bool>& datasets, const string& path, int max_depth, int min_length, void process_all_files(const map<string, bool>& datasets, const string& path, int max_depth, int min_length,
float max_cutpoints) float max_cutpoints)
{ {
cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << " Max_cutpoints: " std::cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << " Max_cutpoints: "
<< max_cutpoints << endl << endl; << max_cutpoints << std::endl << std::endl;
printf("%-20s %4s %4s\n", "Dataset", "Feat", "Cuts Time(ms)"); printf("%-20s %4s %4s\n", "Dataset", "Feat", "Cuts Time(ms)");
printf("==================== ==== ==== ========\n"); printf("==================== ==== ==== ========\n");
for (const auto& dataset : datasets) { for (const auto& dataset : datasets) {
ArffFiles file; ArffFiles file;
file.load(path + dataset.first + ".arff", dataset.second); file.load(path + dataset.first + ".arff", dataset.second);
auto attributes = file.getAttributes(); auto attributes = file.getAttributes();
vector<samples_t>& X = file.getX(); std::vector<mdlp::samples_t>& X = file.getX();
labels_t& y = file.getY(); mdlp::labels_t& y = file.getY();
size_t timing = 0; size_t timing = 0;
size_t cut_points = 0; size_t cut_points = 0;
for (auto i = 0; i < attributes.size(); i++) { for (auto i = 0; i < attributes.size(); i++) {
@@ -163,7 +186,7 @@ void process_all_files(const map<string, bool>& datasets, const string& path, in
int main(int argc, char** argv) int main(int argc, char** argv)
{ {
map<string, bool> datasets = { std::map<std::string, bool> datasets = {
{"diabetes", true}, {"diabetes", true},
{"glass", true}, {"glass", true},
{"iris", true}, {"iris", true},
@@ -173,14 +196,14 @@ int main(int argc, char** argv)
{"mfeat-factors", true}, {"mfeat-factors", true},
{"test", true} {"test", true}
}; };
string file_name; std::string file_name;
string path; std::string path;
int max_depth; int max_depth;
int min_length; int min_length;
float max_cutpoints; float max_cutpoints;
tie(file_name, path, max_depth, min_length, max_cutpoints) = parse_arguments(argc, argv); tie(file_name, path, max_depth, min_length, max_cutpoints) = parse_arguments(argc, argv);
if (datasets.find(file_name) == datasets.end() && file_name != "all") { if (datasets.find(file_name) == datasets.end() && file_name != "all") {
cout << "Invalid file name: " << file_name << endl; std::cout << "Invalid file name: " << file_name << std::endl;
usage(argv[0]); usage(argv[0]);
exit(1); exit(1);
} }
@@ -188,10 +211,10 @@ int main(int argc, char** argv)
process_all_files(datasets, path, max_depth, min_length, max_cutpoints); process_all_files(datasets, path, max_depth, min_length, max_cutpoints);
else { else {
process_file(path, file_name, datasets[file_name], max_depth, min_length, max_cutpoints); process_file(path, file_name, datasets[file_name], max_depth, min_length, max_cutpoints);
cout << "File name ....: " << file_name << endl; std::cout << "File name ....: " << file_name << std::endl;
cout << "Max depth ....: " << max_depth << endl; std::cout << "Max depth ....: " << max_depth << std::endl;
cout << "Min length ...: " << min_length << endl; std::cout << "Min length ...: " << min_length << std::endl;
cout << "Max cutpoints : " << max_cutpoints << endl; std::cout << "Max cutpoints : " << max_cutpoints << std::endl;
} }
return 0; return 0;
} }

View File

@@ -3,7 +3,7 @@ sonar.organization=rmontanana
# This is the name and version displayed in the SonarCloud UI. # This is the name and version displayed in the SonarCloud UI.
sonar.projectName=mdlp sonar.projectName=mdlp
sonar.projectVersion=1.1.3 sonar.projectVersion=1.2.1
# sonar.test.exclusions=tests/** # sonar.test.exclusions=tests/**
# sonar.tests=tests/ # sonar.tests=tests/
# sonar.coverage.exclusions=tests/**,sample/** # sonar.coverage.exclusions=tests/**,sample/**

View File

@@ -4,6 +4,7 @@
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "ArffFiles.h" #include "ArffFiles.h"
#include "../BinDisc.h" #include "../BinDisc.h"
#include "Experiments.hpp"
namespace mdlp { namespace mdlp {
const float margin = 1e-4; const float margin = 1e-4;
@@ -37,12 +38,14 @@ namespace mdlp {
TEST_F(TestBinDisc3U, Easy3BinsUniform) TEST_F(TestBinDisc3U, Easy3BinsUniform)
{ {
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 }; samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
fit(X); auto y = labels_t();
fit(X, y);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_NEAR(3.66667, cuts[0], margin); ASSERT_EQ(4, cuts.size());
EXPECT_NEAR(6.33333, cuts[1], margin); EXPECT_NEAR(1, cuts.at(0), margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]); EXPECT_NEAR(3.66667, cuts.at(1), margin);
EXPECT_EQ(3, cuts.size()); EXPECT_NEAR(6.33333, cuts.at(2), margin);
EXPECT_NEAR(9.0, cuts.at(3), margin);
auto labels = transform(X); auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2 }; labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2 };
EXPECT_EQ(expected, labels); EXPECT_EQ(expected, labels);
@@ -52,10 +55,11 @@ namespace mdlp {
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 }; samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_NEAR(3.666667, cuts[0], margin); ASSERT_EQ(4, cuts.size());
EXPECT_NEAR(6.333333, cuts[1], margin); EXPECT_NEAR(1, cuts[0], margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]); EXPECT_NEAR(3.666667, cuts[1], margin);
EXPECT_EQ(3, cuts.size()); EXPECT_NEAR(6.333333, cuts[2], margin);
EXPECT_NEAR(9, cuts[3], margin);
auto labels = transform(X); auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2 }; labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2 };
EXPECT_EQ(expected, labels); EXPECT_EQ(expected, labels);
@@ -65,12 +69,13 @@ namespace mdlp {
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 }; samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 };
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_EQ(4.0, cuts[0]); ASSERT_EQ(4, cuts.size());
EXPECT_EQ(7.0, cuts[1]); EXPECT_NEAR(1, cuts.at(0), margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]); EXPECT_NEAR(4.0, cuts.at(1), margin);
EXPECT_EQ(3, cuts.size()); EXPECT_NEAR(7.0, cuts.at(2), margin);
EXPECT_NEAR(10.0, cuts.at(3), margin);
auto labels = transform(X); auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 }; labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2 };
EXPECT_EQ(expected, labels); EXPECT_EQ(expected, labels);
} }
TEST_F(TestBinDisc3Q, X10BinsQuantile) TEST_F(TestBinDisc3Q, X10BinsQuantile)
@@ -78,12 +83,13 @@ namespace mdlp {
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 }; samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 };
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_EQ(4, cuts[0]); ASSERT_EQ(4, cuts.size());
EXPECT_EQ(7, cuts[1]); EXPECT_NEAR(1, cuts.at(0), margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]); EXPECT_NEAR(4.0, cuts.at(1), margin);
EXPECT_EQ(3, cuts.size()); EXPECT_NEAR(7.0, cuts.at(2), margin);
EXPECT_NEAR(10.0, cuts.at(3), margin);
auto labels = transform(X); auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 }; labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2 };
EXPECT_EQ(expected, labels); EXPECT_EQ(expected, labels);
} }
TEST_F(TestBinDisc3U, X11BinsUniform) TEST_F(TestBinDisc3U, X11BinsUniform)
@@ -91,10 +97,11 @@ namespace mdlp {
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 }; samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 };
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_NEAR(4.33333, cuts[0], margin); ASSERT_EQ(4, cuts.size());
EXPECT_NEAR(7.66667, cuts[1], margin); EXPECT_NEAR(1, cuts.at(0), margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]); EXPECT_NEAR(4.33333, cuts.at(1), margin);
EXPECT_EQ(3, cuts.size()); EXPECT_NEAR(7.66667, cuts.at(2), margin);
EXPECT_NEAR(11.0, cuts.at(3), margin);
auto labels = transform(X); auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 }; labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
EXPECT_EQ(expected, labels); EXPECT_EQ(expected, labels);
@@ -104,10 +111,11 @@ namespace mdlp {
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 }; samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 };
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_NEAR(4.33333, cuts[0], margin); ASSERT_EQ(4, cuts.size());
EXPECT_NEAR(7.66667, cuts[1], margin); EXPECT_NEAR(1, cuts.at(0), margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]); EXPECT_NEAR(4.33333, cuts.at(1), margin);
EXPECT_EQ(3, cuts.size()); EXPECT_NEAR(7.66667, cuts.at(2), margin);
EXPECT_NEAR(11.0, cuts.at(3), margin);
auto labels = transform(X); auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 }; labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 };
EXPECT_EQ(expected, labels); EXPECT_EQ(expected, labels);
@@ -117,8 +125,9 @@ namespace mdlp {
samples_t X = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 }; samples_t X = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]); ASSERT_EQ(2, cuts.size());
EXPECT_EQ(1, cuts.size()); EXPECT_NEAR(1, cuts.at(0), margin);
EXPECT_NEAR(1, cuts.at(1), margin);
auto labels = transform(X); auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 0, 0 }; labels_t expected = { 0, 0, 0, 0, 0, 0 };
EXPECT_EQ(expected, labels); EXPECT_EQ(expected, labels);
@@ -128,8 +137,9 @@ namespace mdlp {
samples_t X = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 }; samples_t X = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]); ASSERT_EQ(2, cuts.size());
EXPECT_EQ(1, cuts.size()); EXPECT_NEAR(1, cuts.at(0), margin);
EXPECT_NEAR(1, cuts.at(1), margin);
auto labels = transform(X); auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 0, 0 }; labels_t expected = { 0, 0, 0, 0, 0, 0 };
EXPECT_EQ(expected, labels); EXPECT_EQ(expected, labels);
@@ -139,16 +149,18 @@ namespace mdlp {
samples_t X = {}; samples_t X = {};
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]); ASSERT_EQ(2, cuts.size());
EXPECT_EQ(1, cuts.size()); EXPECT_NEAR(0, cuts.at(0), margin);
EXPECT_NEAR(0, cuts.at(1), margin);
} }
TEST_F(TestBinDisc3Q, EmptyQuantile) TEST_F(TestBinDisc3Q, EmptyQuantile)
{ {
samples_t X = {}; samples_t X = {};
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_EQ(numeric_limits<float>::max(), cuts[0]); ASSERT_EQ(2, cuts.size());
EXPECT_EQ(1, cuts.size()); EXPECT_NEAR(0, cuts.at(0), margin);
EXPECT_NEAR(0, cuts.at(1), margin);
} }
TEST(TestBinDisc3, ExceptionNumberBins) TEST(TestBinDisc3, ExceptionNumberBins)
{ {
@@ -159,44 +171,41 @@ namespace mdlp {
samples_t X = { 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0 }; samples_t X = { 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0 };
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_NEAR(1.66667, cuts[0], margin); ASSERT_EQ(4, cuts.size());
EXPECT_NEAR(2.33333, cuts[1], margin); EXPECT_NEAR(1, cuts.at(0), margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]); EXPECT_NEAR(1.66667, cuts.at(1), margin);
EXPECT_EQ(3, cuts.size()); EXPECT_NEAR(2.33333, cuts.at(2), margin);
EXPECT_NEAR(3.0, cuts.at(3), margin);
auto labels = transform(X); auto labels = transform(X);
labels_t expected = { 2, 0, 0, 2, 0, 0, 2, 0, 0 }; labels_t expected = { 2, 0, 0, 2, 0, 0, 2, 0, 0 };
EXPECT_EQ(expected, labels); EXPECT_EQ(expected, labels);
EXPECT_EQ(3.0, X[0]); // X is not modified ASSERT_EQ(3.0, X[0]); // X is not modified
} }
TEST_F(TestBinDisc3Q, EasyRepeated) TEST_F(TestBinDisc3Q, EasyRepeated)
{ {
samples_t X = { 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0 }; samples_t X = { 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0 };
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
std::cout << "cuts: "; ASSERT_EQ(3, cuts.size());
for (auto cut : cuts) { EXPECT_NEAR(1, cuts.at(0), margin);
std::cout << cut << " "; EXPECT_NEAR(1.66667, cuts.at(1), margin);
} EXPECT_NEAR(3.0, cuts.at(2), margin);
std::cout << std::endl;
std::cout << std::string(80, '-') << std::endl;
EXPECT_NEAR(1.66667, cuts[0], margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[1]);
EXPECT_EQ(2, cuts.size());
auto labels = transform(X); auto labels = transform(X);
labels_t expected = { 1, 0, 0, 1, 0, 0, 1, 0, 0 }; labels_t expected = { 1, 0, 0, 1, 0, 0, 1, 0, 0 };
EXPECT_EQ(expected, labels); EXPECT_EQ(expected, labels);
EXPECT_EQ(3.0, X[0]); // X is not modified ASSERT_EQ(3.0, X[0]); // X is not modified
} }
TEST_F(TestBinDisc4U, Easy4BinsUniform) TEST_F(TestBinDisc4U, Easy4BinsUniform)
{ {
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 }; samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 };
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_EQ(3.75, cuts[0]); ASSERT_EQ(5, cuts.size());
EXPECT_EQ(6.5, cuts[1]); EXPECT_NEAR(1.0, cuts.at(0), margin);
EXPECT_EQ(9.25, cuts[2]); EXPECT_NEAR(3.75, cuts.at(1), margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]); EXPECT_NEAR(6.5, cuts.at(2), margin);
EXPECT_EQ(4, cuts.size()); EXPECT_NEAR(9.25, cuts.at(3), margin);
EXPECT_NEAR(12.0, cuts.at(4), margin);
auto labels = transform(X); auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 }; labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 };
EXPECT_EQ(expected, labels); EXPECT_EQ(expected, labels);
@@ -206,11 +215,12 @@ namespace mdlp {
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 }; samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 };
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_EQ(3.75, cuts[0]); ASSERT_EQ(5, cuts.size());
EXPECT_EQ(6.5, cuts[1]); EXPECT_NEAR(1.0, cuts.at(0), margin);
EXPECT_EQ(9.25, cuts[2]); EXPECT_NEAR(3.75, cuts.at(1), margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]); EXPECT_NEAR(6.5, cuts.at(2), margin);
EXPECT_EQ(4, cuts.size()); EXPECT_NEAR(9.25, cuts.at(3), margin);
EXPECT_NEAR(12.0, cuts.at(4), margin);
auto labels = transform(X); auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 }; labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 };
EXPECT_EQ(expected, labels); EXPECT_EQ(expected, labels);
@@ -220,13 +230,14 @@ namespace mdlp {
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0 }; samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0 };
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_EQ(4.0, cuts[0]); ASSERT_EQ(5, cuts.size());
EXPECT_EQ(7.0, cuts[1]); EXPECT_NEAR(1.0, cuts.at(0), margin);
EXPECT_EQ(10.0, cuts[2]); EXPECT_NEAR(4.0, cuts.at(1), margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]); EXPECT_NEAR(7.0, cuts.at(2), margin);
EXPECT_EQ(4, cuts.size()); EXPECT_NEAR(10.0, cuts.at(3), margin);
EXPECT_NEAR(13.0, cuts.at(4), margin);
auto labels = transform(X); auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 }; labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 };
EXPECT_EQ(expected, labels); EXPECT_EQ(expected, labels);
} }
TEST_F(TestBinDisc4Q, X13BinsQuantile) TEST_F(TestBinDisc4Q, X13BinsQuantile)
@@ -234,13 +245,14 @@ namespace mdlp {
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0 }; samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0 };
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_EQ(4.0, cuts[0]); ASSERT_EQ(5, cuts.size());
EXPECT_EQ(7.0, cuts[1]); EXPECT_NEAR(1.0, cuts.at(0), margin);
EXPECT_EQ(10.0, cuts[2]); EXPECT_NEAR(4.0, cuts.at(1), margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]); EXPECT_NEAR(7.0, cuts.at(2), margin);
EXPECT_EQ(4, cuts.size()); EXPECT_NEAR(10.0, cuts.at(3), margin);
EXPECT_NEAR(13.0, cuts.at(4), margin);
auto labels = transform(X); auto labels = transform(X);
labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 }; labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 };
EXPECT_EQ(expected, labels); EXPECT_EQ(expected, labels);
} }
TEST_F(TestBinDisc4U, X14BinsUniform) TEST_F(TestBinDisc4U, X14BinsUniform)
@@ -248,11 +260,12 @@ namespace mdlp {
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 }; samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 };
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_EQ(4.25, cuts[0]); ASSERT_EQ(5, cuts.size());
EXPECT_EQ(7.5, cuts[1]); EXPECT_NEAR(1.0, cuts.at(0), margin);
EXPECT_EQ(10.75, cuts[2]); EXPECT_NEAR(4.25, cuts.at(1), margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]); EXPECT_NEAR(7.5, cuts.at(2), margin);
EXPECT_EQ(4, cuts.size()); EXPECT_NEAR(10.75, cuts.at(3), margin);
EXPECT_NEAR(14.0, cuts.at(4), margin);
auto labels = transform(X); auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 }; labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
EXPECT_EQ(expected, labels); EXPECT_EQ(expected, labels);
@@ -262,11 +275,12 @@ namespace mdlp {
samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 }; samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 };
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_EQ(4.25, cuts[0]); ASSERT_EQ(5, cuts.size());
EXPECT_EQ(7.5, cuts[1]); EXPECT_NEAR(1.0, cuts.at(0), margin);
EXPECT_EQ(10.75, cuts[2]); EXPECT_NEAR(4.25, cuts.at(1), margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]); EXPECT_NEAR(7.5, cuts.at(2), margin);
EXPECT_EQ(4, cuts.size()); EXPECT_NEAR(10.75, cuts.at(3), margin);
EXPECT_NEAR(14.0, cuts.at(4), margin);
auto labels = transform(X); auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 }; labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
EXPECT_EQ(expected, labels); EXPECT_EQ(expected, labels);
@@ -276,13 +290,14 @@ namespace mdlp {
samples_t X = { 15.0, 8.0, 12.0, 14.0, 6.0, 1.0, 13.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0 }; samples_t X = { 15.0, 8.0, 12.0, 14.0, 6.0, 1.0, 13.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0 };
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_EQ(4.5, cuts[0]); ASSERT_EQ(5, cuts.size());
EXPECT_EQ(8, cuts[1]); EXPECT_NEAR(1.0, cuts.at(0), margin);
EXPECT_EQ(11.5, cuts[2]); EXPECT_NEAR(4.5, cuts.at(1), margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]); EXPECT_NEAR(8, cuts.at(2), margin);
EXPECT_EQ(4, cuts.size()); EXPECT_NEAR(11.5, cuts.at(3), margin);
EXPECT_NEAR(15.0, cuts.at(4), margin);
auto labels = transform(X); auto labels = transform(X);
labels_t expected = { 3, 2, 3, 3, 1, 0, 3, 2, 2, 2, 1, 0, 0, 1, 0 }; labels_t expected = { 3, 1, 3, 3, 1, 0, 3, 2, 2, 2, 1, 0, 0, 1, 0 };
EXPECT_EQ(expected, labels); EXPECT_EQ(expected, labels);
} }
TEST_F(TestBinDisc4Q, X15BinsQuantile) TEST_F(TestBinDisc4Q, X15BinsQuantile)
@@ -290,13 +305,14 @@ namespace mdlp {
samples_t X = { 15.0, 13.0, 12.0, 14.0, 6.0, 1.0, 8.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0 }; samples_t X = { 15.0, 13.0, 12.0, 14.0, 6.0, 1.0, 8.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0 };
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_EQ(4.5, cuts[0]); ASSERT_EQ(5, cuts.size());
EXPECT_EQ(8, cuts[1]); EXPECT_NEAR(1.0, cuts.at(0), margin);
EXPECT_EQ(11.5, cuts[2]); EXPECT_NEAR(4.5, cuts.at(1), margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]); EXPECT_NEAR(8, cuts.at(2), margin);
EXPECT_EQ(4, cuts.size()); EXPECT_NEAR(11.5, cuts.at(3), margin);
EXPECT_NEAR(15.0, cuts.at(4), margin);
auto labels = transform(X); auto labels = transform(X);
labels_t expected = { 3, 3, 3, 3, 1, 0, 2, 2, 2, 2, 1, 0, 0, 1, 0 }; labels_t expected = { 3, 3, 3, 3, 1, 0, 1, 2, 2, 2, 1, 0, 0, 1, 0 };
EXPECT_EQ(expected, labels); EXPECT_EQ(expected, labels);
} }
TEST_F(TestBinDisc4U, RepeatedValuesUniform) TEST_F(TestBinDisc4U, RepeatedValuesUniform)
@@ -305,13 +321,14 @@ namespace mdlp {
// 0 1 2 3 4 5 6 7 8 9 // 0 1 2 3 4 5 6 7 8 9
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_EQ(1.0, cuts[0]); ASSERT_EQ(5, cuts.size());
EXPECT_EQ(2.0, cuts[1]); EXPECT_NEAR(0.0, cuts.at(0), margin);
EXPECT_EQ(3.0, cuts[2]); EXPECT_NEAR(1.0, cuts.at(1), margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[3]); EXPECT_NEAR(2.0, cuts.at(2), margin);
EXPECT_EQ(4, cuts.size()); EXPECT_NEAR(3.0, cuts.at(3), margin);
EXPECT_NEAR(4.0, cuts.at(4), margin);
auto labels = transform(X); auto labels = transform(X);
labels_t expected = { 0, 1, 1, 1, 2, 2, 3, 3, 3, 3 }; labels_t expected = { 0, 0, 0, 0, 1, 1, 2, 2, 2, 3 };
EXPECT_EQ(expected, labels); EXPECT_EQ(expected, labels);
} }
TEST_F(TestBinDisc4Q, RepeatedValuesQuantile) TEST_F(TestBinDisc4Q, RepeatedValuesQuantile)
@@ -320,12 +337,14 @@ namespace mdlp {
// 0 1 2 3 4 5 6 7 8 9 // 0 1 2 3 4 5 6 7 8 9
fit(X); fit(X);
auto cuts = getCutPoints(); auto cuts = getCutPoints();
EXPECT_EQ(2.0, cuts[0]); ASSERT_EQ(5, cuts.size());
EXPECT_EQ(3.0, cuts[1]); EXPECT_NEAR(0.0, cuts.at(0), margin);
EXPECT_EQ(numeric_limits<float>::max(), cuts[2]); EXPECT_NEAR(1.0, cuts.at(1), margin);
EXPECT_EQ(3, cuts.size()); EXPECT_NEAR(2.0, cuts.at(2), margin);
EXPECT_NEAR(3.0, cuts.at(3), margin);
EXPECT_NEAR(4.0, cuts.at(4), margin);
auto labels = transform(X); auto labels = transform(X);
labels_t expected = { 0, 0, 0, 0, 1, 1, 2, 2, 2, 2 }; labels_t expected = { 0, 0, 0, 0, 1, 1, 2, 2, 2, 3 };
EXPECT_EQ(expected, labels); EXPECT_EQ(expected, labels);
} }
TEST_F(TestBinDisc4U, irisUniform) TEST_F(TestBinDisc4U, irisUniform)
@@ -337,6 +356,13 @@ namespace mdlp {
auto Xt = transform(X[0]); auto Xt = transform(X[0]);
labels_t expected = { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 0, 1, 2, 1, 3, 2, 2, 3, 0, 3, 2, 3, 2, 2, 2, 1, 1, 2, 2, 3, 3, 1, 2, 1, 3, 2, 2, 3, 2, 1, 2, 3, 3, 3, 2, 2, 1, 3, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 }; labels_t expected = { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 0, 1, 2, 1, 3, 2, 2, 3, 0, 3, 2, 3, 2, 2, 2, 1, 1, 2, 2, 3, 3, 1, 2, 1, 3, 2, 2, 3, 2, 1, 2, 3, 3, 3, 2, 2, 1, 3, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 };
EXPECT_EQ(expected, Xt); EXPECT_EQ(expected, Xt);
auto Xtt = fit_transform(X[0], file.getY());
EXPECT_EQ(expected, Xtt);
auto Xt_t = torch::tensor(X[0], torch::kFloat32);
auto y_t = torch::tensor(file.getY(), torch::kInt32);
auto Xtt_t = fit_transform_t(Xt_t, y_t);
for (int i = 0; i < expected.size(); i++)
EXPECT_EQ(expected[i], Xtt_t[i].item<int>());
} }
TEST_F(TestBinDisc4Q, irisQuantile) TEST_F(TestBinDisc4Q, irisQuantile)
{ {
@@ -347,5 +373,44 @@ namespace mdlp {
auto Xt = transform(X[0]); auto Xt = transform(X[0]);
labels_t expected = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 3, 3, 3, 1, 3, 1, 2, 0, 3, 1, 0, 2, 2, 2, 1, 3, 1, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 3, 2, 1, 1, 1, 2, 2, 1, 2, 3, 2, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2, 1, 1, 2, 2, 3, 2, 3, 3, 0, 3, 3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 3, 2, 3, 1, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 2 }; labels_t expected = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 3, 3, 3, 1, 3, 1, 2, 0, 3, 1, 0, 2, 2, 2, 1, 3, 1, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 3, 2, 1, 1, 1, 2, 2, 1, 2, 3, 2, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2, 1, 1, 2, 2, 3, 2, 3, 3, 0, 3, 3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 3, 2, 3, 1, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 2 };
EXPECT_EQ(expected, Xt); EXPECT_EQ(expected, Xt);
auto Xtt = fit_transform(X[0], file.getY());
EXPECT_EQ(expected, Xtt);
auto Xt_t = torch::tensor(X[0], torch::kFloat32);
auto y_t = torch::tensor(file.getY(), torch::kInt32);
auto Xtt_t = fit_transform_t(Xt_t, y_t);
for (int i = 0; i < expected.size(); i++)
EXPECT_EQ(expected[i], Xtt_t[i].item<int>());
fit_t(Xt_t, y_t);
auto Xt_t2 = transform_t(Xt_t);
for (int i = 0; i < expected.size(); i++)
EXPECT_EQ(expected[i], Xt_t2[i].item<int>());
}
TEST(TestBinDiscGeneric, Fileset)
{
Experiments exps(data_path + "tests.txt");
int num = 0;
while (exps.is_next()) {
Experiment exp = exps.next();
std::cout << "Exp #: " << ++num << " From: " << exp.from_ << " To: " << exp.to_ << " Step: " << exp.step_ << " Bins: " << exp.n_bins_ << " Strategy: " << exp.strategy_ << std::endl;
BinDisc disc(exp.n_bins_, exp.strategy_ == "Q" ? strategy_t::QUANTILE : strategy_t::UNIFORM);
std::vector<float> test;
for (float i = exp.from_; i < exp.to_; i += exp.step_) {
test.push_back(i);
}
// show_vector(test, "Test");
auto empty = std::vector<int>();
auto Xt = disc.fit_transform(test, empty);
auto cuts = disc.getCutPoints();
EXPECT_EQ(exp.discretized_data_.size(), Xt.size());
for (int i = 0; i < exp.discretized_data_.size(); ++i) {
if (exp.discretized_data_.at(i) != Xt.at(i)) {
std::cout << "Error at " << i << " Expected: " << exp.discretized_data_.at(i) << " Got: " << Xt.at(i) << std::endl;
}
}
EXPECT_EQ(exp.cutpoints_.size(), cuts.size());
for (int i = 0; i < exp.cutpoints_.size(); ++i) {
EXPECT_NEAR(exp.cutpoints_.at(i), cuts.at(i), margin);
}
}
} }
} }

View File

@@ -1,10 +1,8 @@
cmake_minimum_required(VERSION 3.20) cmake_minimum_required(VERSION 3.20)
set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD 17)
cmake_policy(SET CMP0135 NEW)
include(FetchContent) include(FetchContent)
include_directories(${GTEST_INCLUDE_DIRS}) include_directories(${GTEST_INCLUDE_DIRS})
FetchContent_Declare( FetchContent_Declare(
googletest googletest
URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip
@@ -13,22 +11,35 @@ FetchContent_Declare(
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest) FetchContent_MakeAvailable(googletest)
find_package(Torch REQUIRED)
enable_testing() enable_testing()
include_directories(${TORCH_INCLUDE_DIRS})
add_executable(Metrics_unittest ../Metrics.cpp Metrics_unittest.cpp) add_executable(Metrics_unittest ../Metrics.cpp Metrics_unittest.cpp)
add_executable(FImdlp_unittest ../CPPFImdlp.cpp ArffFiles.cpp ../Metrics.cpp FImdlp_unittest.cpp)
add_executable(BinDisc_unittest ../BinDisc.cpp ArffFiles.cpp BinDisc_unittest.cpp)
target_link_libraries(Metrics_unittest GTest::gtest_main) target_link_libraries(Metrics_unittest GTest::gtest_main)
target_link_libraries(FImdlp_unittest GTest::gtest_main)
target_link_libraries(BinDisc_unittest GTest::gtest_main)
target_compile_options(Metrics_unittest PRIVATE --coverage) target_compile_options(Metrics_unittest PRIVATE --coverage)
target_compile_options(FImdlp_unittest PRIVATE --coverage)
target_compile_options(BinDisc_unittest PRIVATE --coverage)
target_link_options(Metrics_unittest PRIVATE --coverage) target_link_options(Metrics_unittest PRIVATE --coverage)
add_executable(FImdlp_unittest ../CPPFImdlp.cpp ArffFiles.cpp ../Metrics.cpp FImdlp_unittest.cpp ../Discretizer.cpp)
target_link_libraries(FImdlp_unittest GTest::gtest_main "${TORCH_LIBRARIES}")
target_compile_options(FImdlp_unittest PRIVATE --coverage)
target_link_options(FImdlp_unittest PRIVATE --coverage) target_link_options(FImdlp_unittest PRIVATE --coverage)
add_executable(BinDisc_unittest ../BinDisc.cpp ArffFiles.cpp BinDisc_unittest.cpp ../Discretizer.cpp)
target_link_libraries(BinDisc_unittest GTest::gtest_main "${TORCH_LIBRARIES}")
target_compile_options(BinDisc_unittest PRIVATE --coverage)
target_link_options(BinDisc_unittest PRIVATE --coverage) target_link_options(BinDisc_unittest PRIVATE --coverage)
add_executable(Discretizer_unittest ../BinDisc.cpp ../CPPFImdlp.cpp ArffFiles.cpp ../Metrics.cpp ../Discretizer.cpp Discretizer_unittest.cpp)
target_link_libraries(Discretizer_unittest GTest::gtest_main "${TORCH_LIBRARIES}")
target_compile_options(Discretizer_unittest PRIVATE --coverage)
target_link_options(Discretizer_unittest PRIVATE --coverage)
include(GoogleTest) include(GoogleTest)
gtest_discover_tests(Metrics_unittest) gtest_discover_tests(Metrics_unittest)
gtest_discover_tests(FImdlp_unittest) gtest_discover_tests(FImdlp_unittest)
gtest_discover_tests(BinDisc_unittest) gtest_discover_tests(BinDisc_unittest)
gtest_discover_tests(Discretizer_unittest)

View File

@@ -0,0 +1,83 @@
#include <fstream>
#include <string>
#include <iostream>
#include "gtest/gtest.h"
#include "ArffFiles.h"
#include "../Discretizer.h"
#include "../BinDisc.h"
#include "../CPPFImdlp.h"
namespace mdlp {
const float margin = 1e-4;
// Pick the directory the .arff datasets are loaded from.
// "../datasets/" is used when iris.arff can be opened there; otherwise the
// path falls back to "../../tests/datasets/".
static std::string set_data_path()
{
    const std::string local_dir = "../datasets/";
    const std::string fallback_dir = "../../tests/datasets/";
    // The stream is only a probe; it is closed when it goes out of scope.
    std::ifstream probe(local_dir + "iris.arff");
    return probe.is_open() ? local_dir : fallback_dir;
}
// Dataset directory resolved once, at static-initialization time.
const std::string data_path = set_data_path();
// The version string must be reachable through the Discretizer base pointer
// and match the expected release number.
TEST(Discretizer, Version)
{
    Discretizer* discretizer = new BinDisc(4, strategy_t::UNIFORM);
    // Read the value first so the object can be released before asserting.
    auto reported = discretizer->version();
    delete discretizer;
    std::cout << "Version computed: " << reported;
    EXPECT_EQ("1.2.3", reported);
}
// Discretizes the first iris feature with a 4-bin UNIFORM BinDisc, driven
// through the Discretizer base pointer, and compares against the reference
// labels. An empty label vector is passed to fit().
TEST(Discretizer, BinIrisUniform)
{
    ArffFiles file;
    Discretizer* disc = new BinDisc(4, strategy_t::UNIFORM);
    file.load(data_path + "iris.arff", true);
    // Qualified explicitly: this file must not depend on a `using namespace std`
    // leaking out of a project header.
    std::vector<samples_t>& X = file.getX();
    auto y = labels_t();
    disc->fit(X[0], y);
    auto Xt = disc->transform(X[0]);
    labels_t expected = { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 0, 1, 2, 1, 3, 2, 2, 3, 0, 3, 2, 3, 2, 2, 2, 1, 1, 2, 2, 3, 3, 1, 2, 1, 3, 2, 2, 3, 2, 1, 2, 3, 3, 3, 2, 2, 1, 3, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 };
    // Release the discretizer before asserting; Xt is an independent copy.
    delete disc;
    EXPECT_EQ(expected, Xt);
}
// Discretizes the first iris feature with a 4-bin QUANTILE BinDisc, driven
// through the Discretizer base pointer, and compares against the reference
// labels. An empty label vector is passed to fit().
TEST(Discretizer, BinIrisQuantile)
{
    ArffFiles file;
    Discretizer* disc = new BinDisc(4, strategy_t::QUANTILE);
    file.load(data_path + "iris.arff", true);
    // Qualified explicitly: this file must not depend on a `using namespace std`
    // leaking out of a project header.
    std::vector<samples_t>& X = file.getX();
    auto y = labels_t();
    disc->fit(X[0], y);
    auto Xt = disc->transform(X[0]);
    labels_t expected = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 3, 3, 3, 1, 3, 1, 2, 0, 3, 1, 0, 2, 2, 2, 1, 3, 1, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 3, 2, 1, 1, 1, 2, 2, 1, 2, 3, 2, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2, 1, 1, 2, 2, 3, 2, 3, 3, 0, 3, 3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 3, 2, 3, 1, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 2 };
    // Release the discretizer before asserting; Xt is an independent copy.
    delete disc;
    EXPECT_EQ(expected, Xt);
}
// Fits CPPFImdlp (default-constructed) on the second iris feature with the
// dataset labels, through the Discretizer base pointer, and checks every
// discretized value against the reference labels.
TEST(Discretizer, FImdlpIris)
{
    labels_t expected = {
        5, 3, 4, 4, 5, 5, 5, 5, 2, 4, 5, 5, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 4, 5, 3, 5, 5, 5, 4, 4, 5, 5, 5, 4, 4, 5, 4, 3, 5, 5, 0, 4, 5,
        5, 3, 5, 4, 5, 4, 4, 4, 4, 0, 1, 1, 4, 0, 2, 0, 0, 3, 0, 2, 2, 4,
        3, 0, 0, 0, 4, 1, 0, 1, 2, 3, 1, 3, 2, 0, 0, 0, 0, 0, 3, 5, 4, 0,
        3, 0, 0, 3, 0, 0, 0, 3, 2, 2, 0, 1, 4, 0, 3, 2, 3, 3, 0, 2, 0, 5,
        4, 0, 3, 0, 1, 4, 3, 5, 0, 0, 4, 1, 1, 0, 4, 4, 1, 3, 1, 3, 1, 5,
        1, 1, 0, 3, 5, 4, 3, 4, 4, 4, 0, 4, 4, 3, 0, 3, 5, 3
    };
    ArffFiles file;
    Discretizer* disc = new CPPFImdlp();
    file.load(data_path + "iris.arff", true);
    // Qualified explicitly: this file must not depend on a `using namespace std`
    // leaking out of a project header.
    std::vector<samples_t>& X = file.getX();
    labels_t& y = file.getY();
    disc->fit(X[1], y);
    auto computed = disc->transform(X[1]);
    delete disc;
    // Fatal on mismatch: the loop below indexes both vectors with the same
    // index, so continuing with different sizes would read out of bounds.
    ASSERT_EQ(computed.size(), expected.size());
    for (unsigned long i = 0; i < computed.size(); i++) {
        EXPECT_EQ(computed[i], expected[i]);
    }
}
}

108
tests/Experiments.hpp Normal file
View File

@@ -0,0 +1,108 @@
#ifndef EXPERIMENTS_HPP
#define EXPERIMENTS_HPP
#include<fstream>
#include<iostream>
#include<sstream>
#include<stdexcept>
#include<string>
#include<tuple>
#include<type_traits>
#include<utility>
#include<vector>
#include "../typesFImdlp.h"
/**
 * One discretization test case: the parameters of the input, the binning
 * configuration, and the expected results.
 */
class Experiment {
public:
    /**
     * @param from_            first value of the generated input
     * @param to_              upper bound of the generated input
     * @param step_            increment between consecutive input values
     * @param n_bins           number of bins requested from the discretizer
     * @param strategy         binning strategy code, either "Q" or "U"
     * @param data_discretized expected discretized labels
     * @param cutpoints        expected cut points
     * @throws std::invalid_argument if strategy is neither "Q" nor "U"
     */
    Experiment(float from_, float to_, float step_, int n_bins, std::string strategy, std::vector<int> data_discretized, std::vector<float> cutpoints) :
        from_{ from_ }, to_{ to_ }, step_{ step_ }, n_bins_{ n_bins }, strategy_{ std::move(strategy) },
        discretized_data_{ std::move(data_discretized) }, cutpoints_{ std::move(cutpoints) }
    {
        // The by-value arguments were moved into the members above, so the
        // validation must read the member, not the (now empty) parameter.
        if (strategy_ != "Q" && strategy_ != "U") {
            throw std::invalid_argument("Invalid strategy " + strategy_);
        }
    }
    float from_;
    float to_;
    float step_;
    int n_bins_;
    std::string strategy_;
    std::vector<int> discretized_data_;
    std::vector<float> cutpoints_;
};
/**
 * Sequential reader of the test cases stored in a text file.
 *
 * Lines starting with '#' and blank lines are skipped. Each test case is a
 * "RANGE" marker line followed by three lines: the header
 * (from, to, step, n_bins, strategy), the expected discretized data, and the
 * expected cut points.
 *
 * Usage: while (exps.is_next()) { Experiment e = exps.next(); ... }
 */
class Experiments {
public:
    /**
     * Open the experiments file.
     * @throws std::runtime_error if the file cannot be opened.
     */
    Experiments(const std::string& filename) : filename{ filename }, exp_end{ false }
    {
        test_file.open(filename);
        if (!test_file.is_open()) {
            throw std::runtime_error("File " + filename + " not found");
        }
    }
    // No user-declared destructor: std::ifstream closes the file on destruction.

    /** True once is_next() has run out of experiments. */
    bool end() const
    {
        return exp_end;
    }
    /**
     * Advance to the first line of the next experiment.
     * @return true if such a line was read (next() may then be called),
     *         false when no experiment is left.
     */
    bool is_next()
    {
        // Decide on the success of the read itself, not on eof(): a final line
        // without a trailing newline sets eof but is still valid, and a failed
        // stream must never be reported as "another experiment available".
        while (read_raw(line)) {
            if (!line.empty() && line[0] != '#') {
                return true;
            }
        }
        exp_end = true;
        return false;
    }
    /**
     * Parse the experiment whose first line was located by is_next().
     * @throws std::invalid_argument on an unsupported experiment marker or
     *         malformed numbers; std::runtime_error on a truncated file.
     */
    Experiment next()
    {
        return parse_experiment(line);
    }
private:
    /** Read one line, dropping a trailing '\r' left by CRLF line endings. */
    bool read_raw(std::string& target)
    {
        if (!std::getline(test_file, target)) {
            return false;
        }
        if (!target.empty() && target.back() == '\r') {
            target.pop_back();
        }
        return true;
    }
    /** Read one mandatory line of an experiment; `what` names it in the error. */
    void read_required(std::string& target, const std::string& what)
    {
        if (!read_raw(target)) {
            throw std::runtime_error("File " + filename + " ended while reading " + what);
        }
    }
    /** Split the header line into from, to, step, n_bins and strategy. */
    std::tuple<float, float, float, int, std::string> parse_header(const std::string& line)
    {
        std::istringstream iss(line);
        std::string from_, to_, step_, n_bins, strategy;
        iss >> from_ >> to_ >> step_ >> n_bins >> strategy;
        return { std::stof(from_), std::stof(to_), std::stof(step_), std::stoi(n_bins), strategy };
    }
    /** Parse a whitespace-separated list of numbers into a vector of T. */
    template <typename T>
    std::vector<T> parse_vector(const std::string& line)
    {
        std::istringstream iss(line);
        std::vector<T> data;
        std::string d;
        while (iss >> d) {
            // Select the conversion at compile time so integers are not
            // round-tripped through float.
            if constexpr (std::is_same_v<T, float>) {
                data.push_back(std::stof(d));
            } else {
                data.push_back(std::stoi(d));
            }
        }
        return data;
    }
    /**
     * Build an Experiment from the marker line in `line` and the three lines
     * that follow it in the file. `line` is reused as the read buffer.
     */
    Experiment parse_experiment(std::string& line)
    {
        // NOTE(review): only the "RANGE" marker had a defined header in the
        // original; any other marker left the header values undeclared.
        // Confirm whether a second experiment type is planned.
        if (line != "RANGE") {
            throw std::invalid_argument("Unsupported experiment type: " + line);
        }
        read_required(line, "experiment header");
        auto [from_, to_, step_, n_bins, strategy] = parse_header(line);
        read_required(line, "discretized data");
        auto data_discretized = parse_vector<int>(line);
        read_required(line, "cut points");
        auto cutpoints = parse_vector<float>(line);
        return Experiment{ from_, to_, step_, n_bins, strategy, std::move(data_discretized), std::move(cutpoints) };
    }
    std::ifstream test_file;
    std::string filename;
    std::string line;
    bool exp_end;
};
// Writes "<title>: " followed by the elements of `data` joined with ", " to
// std::cout, then ends the line (std::endl, so the stream is flushed).
template <typename T>
void show_vector(const std::vector<T>& data, std::string title)
{
    std::cout << title << ": ";
    for (auto it = data.begin(); it != data.end(); ++it) {
        if (it != data.begin()) {
            std::cout << ", ";
        }
        std::cout << *it;
    }
    std::cout << std::endl;
}
#endif

View File

@@ -124,7 +124,7 @@ namespace mdlp {
{ {
samples_t X_ = { 1, 2, 2, 3, 4, 2, 3 }; samples_t X_ = { 1, 2, 2, 3, 4, 2, 3 };
labels_t y_ = { 0, 0, 1, 2, 3, 4, 5 }; labels_t y_ = { 0, 0, 1, 2, 3, 4, 5 };
cutPoints_t expected = { 1.5f, 2.5f }; cutPoints_t expected = { 1.0, 1.5f, 2.5f, 4.0 };
fit(X_, y_); fit(X_, y_);
auto computed = getCutPoints(); auto computed = getCutPoints();
EXPECT_EQ(computed.size(), expected.size()); EXPECT_EQ(computed.size(), expected.size());
@@ -167,29 +167,31 @@ namespace mdlp {
y = { 1 }; y = { 1 };
fit(X, y); fit(X, y);
computed = getCutPoints(); computed = getCutPoints();
EXPECT_EQ(computed.size(), 0); EXPECT_EQ(computed.size(), 2);
X = { 1, 3 }; X = { 1, 3 };
y = { 1, 2 }; y = { 1, 2 };
fit(X, y); fit(X, y);
computed = getCutPoints(); computed = getCutPoints();
EXPECT_EQ(computed.size(), 0); EXPECT_EQ(computed.size(), 2);
X = { 2, 4 }; X = { 2, 4 };
y = { 1, 2 }; y = { 1, 2 };
fit(X, y); fit(X, y);
computed = getCutPoints(); computed = getCutPoints();
EXPECT_EQ(computed.size(), 0); EXPECT_EQ(computed.size(), 2);
X = { 1, 2, 3 }; X = { 1, 2, 3 };
y = { 1, 2, 2 }; y = { 1, 2, 2 };
fit(X, y); fit(X, y);
computed = getCutPoints(); computed = getCutPoints();
EXPECT_EQ(computed.size(), 1); EXPECT_EQ(computed.size(), 3);
EXPECT_NEAR(computed[0], 1.5, precision); EXPECT_NEAR(computed[0], 1, precision);
EXPECT_NEAR(computed[1], 1.5, precision);
EXPECT_NEAR(computed[2], 3, precision);
} }
TEST_F(TestFImdlp, TestArtificialDataset) TEST_F(TestFImdlp, TestArtificialDataset)
{ {
fit(X, y); fit(X, y);
cutPoints_t expected = { 5.05f }; cutPoints_t expected = { 4.7, 5.05, 6.0 };
vector<precision_t> computed = getCutPoints(); vector<precision_t> computed = getCutPoints();
EXPECT_EQ(computed.size(), expected.size()); EXPECT_EQ(computed.size(), expected.size());
for (unsigned long i = 0; i < computed.size(); i++) { for (unsigned long i = 0; i < computed.size(); i++) {
@@ -200,10 +202,10 @@ namespace mdlp {
TEST_F(TestFImdlp, TestIris) TEST_F(TestFImdlp, TestIris)
{ {
vector<cutPoints_t> expected = { vector<cutPoints_t> expected = {
{5.45f, 5.75f}, {4.3, 5.45f, 5.75f, 7.9},
{2.75f, 2.85f, 2.95f, 3.05f, 3.35f}, {2, 2.75f, 2.85f, 2.95f, 3.05f, 3.35f, 4.4},
{2.45f, 4.75f, 5.05f}, {1, 2.45f, 4.75f, 5.05f, 6.9},
{0.8f, 1.75f} {0.1, 0.8f, 1.75f, 2.5}
}; };
vector<int> depths = { 3, 5, 4, 3 }; vector<int> depths = { 3, 5, 4, 3 };
auto test = CPPFImdlp(); auto test = CPPFImdlp();
@@ -213,7 +215,7 @@ namespace mdlp {
TEST_F(TestFImdlp, ComputeCutPointsGCase) TEST_F(TestFImdlp, ComputeCutPointsGCase)
{ {
cutPoints_t expected; cutPoints_t expected;
expected = { 1.5 }; expected = { 0, 1.5, 2 };
samples_t X_ = { 0, 1, 2, 2, 2 }; samples_t X_ = { 0, 1, 2, 2, 2 };
labels_t y_ = { 1, 1, 1, 2, 2 }; labels_t y_ = { 1, 1, 1, 2, 2 };
fit(X_, y_); fit(X_, y_);
@@ -247,10 +249,10 @@ namespace mdlp {
// Set max_depth to 1 // Set max_depth to 1
auto test = CPPFImdlp(3, 1, 0); auto test = CPPFImdlp(3, 1, 0);
vector<cutPoints_t> expected = { vector<cutPoints_t> expected = {
{5.45f}, {4.3, 5.45f, 7.9},
{3.35f}, {2, 3.35f, 4.4},
{2.45f}, {1, 2.45f, 6.9},
{0.8f} {0.1, 0.8f, 2.5}
}; };
vector<int> depths = { 1, 1, 1, 1 }; vector<int> depths = { 1, 1, 1, 1 };
test_dataset(test, "iris", expected, depths); test_dataset(test, "iris", expected, depths);
@@ -261,10 +263,10 @@ namespace mdlp {
auto test = CPPFImdlp(75, 100, 0); auto test = CPPFImdlp(75, 100, 0);
// Set min_length to 75 // Set min_length to 75
vector<cutPoints_t> expected = { vector<cutPoints_t> expected = {
{5.45f, 5.75f}, {4.3, 5.45f, 5.75f, 7.9},
{2.85f, 3.35f}, {2, 2.85f, 3.35f, 4.4},
{2.45f, 4.75f}, {1, 2.45f, 4.75f, 6.9},
{0.8f, 1.75f} {0.1, 0.8f, 1.75f, 2.5}
}; };
vector<int> depths = { 3, 2, 2, 2 }; vector<int> depths = { 3, 2, 2, 2 };
test_dataset(test, "iris", expected, depths); test_dataset(test, "iris", expected, depths);
@@ -275,10 +277,10 @@ namespace mdlp {
// Set min_length to 75 // Set min_length to 75
auto test = CPPFImdlp(75, 2, 0); auto test = CPPFImdlp(75, 2, 0);
vector<cutPoints_t> expected = { vector<cutPoints_t> expected = {
{5.45f, 5.75f}, {4.3, 5.45f, 5.75f, 7.9},
{2.85f, 3.35f}, {2, 2.85f, 3.35f, 4.4},
{2.45f, 4.75f}, {1, 2.45f, 4.75f, 6.9},
{0.8f, 1.75f} {0.1, 0.8f, 1.75f, 2.5}
}; };
vector<int> depths = { 2, 2, 2, 2 }; vector<int> depths = { 2, 2, 2, 2 };
test_dataset(test, "iris", expected, depths); test_dataset(test, "iris", expected, depths);
@@ -289,10 +291,10 @@ namespace mdlp {
// Set min_length to 75 // Set min_length to 75
auto test = CPPFImdlp(75, 2, 1); auto test = CPPFImdlp(75, 2, 1);
vector<cutPoints_t> expected = { vector<cutPoints_t> expected = {
{5.45f}, {4.3, 5.45f, 7.9},
{2.85f}, {2, 2.85f, 4.4},
{2.45f}, {1, 2.45f, 6.9},
{0.8f} {0.1, 0.8f, 2.5}
}; };
vector<int> depths = { 2, 2, 2, 2 }; vector<int> depths = { 2, 2, 2, 2 };
test_dataset(test, "iris", expected, depths); test_dataset(test, "iris", expected, depths);
@@ -304,10 +306,10 @@ namespace mdlp {
// Set min_length to 75 // Set min_length to 75
auto test = CPPFImdlp(75, 2, 0.2f); auto test = CPPFImdlp(75, 2, 0.2f);
vector<cutPoints_t> expected = { vector<cutPoints_t> expected = {
{5.45f, 5.75f}, {4.3, 5.45f, 5.75f, 7.9},
{2.85f, 3.35f}, {2, 2.85f, 3.35f, 4.4},
{2.45f, 4.75f}, {1, 2.45f, 4.75f, 6.9},
{0.8f, 1.75f} {0.1, 0.8f, 1.75f, 2.5}
}; };
vector<int> depths = { 2, 2, 2, 2 }; vector<int> depths = { 2, 2, 2, 2 };
test_dataset(test, "iris", expected, depths); test_dataset(test, "iris", expected, depths);
@@ -327,7 +329,6 @@ namespace mdlp {
computed = compute_max_num_cut_points(); computed = compute_max_num_cut_points();
ASSERT_EQ(expected, computed); ASSERT_EQ(expected, computed);
} }
} }
TEST_F(TestFImdlp, TransformTest) TEST_F(TestFImdlp, TransformTest)
{ {
@@ -350,5 +351,10 @@ namespace mdlp {
for (unsigned long i = 0; i < computed.size(); i++) { for (unsigned long i = 0; i < computed.size(); i++) {
EXPECT_EQ(computed[i], expected[i]); EXPECT_EQ(computed[i], expected[i]);
} }
auto computed_ft = fit_transform(X[1], y);
EXPECT_EQ(computed_ft.size(), expected.size());
for (unsigned long i = 0; i < computed_ft.size(); i++) {
EXPECT_EQ(computed_ft[i], expected[i]);
}
} }
} }

View File

@@ -2,13 +2,13 @@
#include "../Metrics.h" #include "../Metrics.h"
namespace mdlp { namespace mdlp {
class TestMetrics: public Metrics, public testing::Test { class TestMetrics : public Metrics, public testing::Test {
public: public:
labels_t y_ = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; labels_t y_ = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
indices_t indices_ = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; indices_t indices_ = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
precision_t precision = 0.000001f; precision_t precision = 1e-6;
TestMetrics(): Metrics(y_, indices_) {}; TestMetrics() : Metrics(y_, indices_) {};
void SetUp() override void SetUp() override
{ {

149
tests/datasets/tests.txt Normal file
View File

@@ -0,0 +1,149 @@
#
# from, to, step, #bins, Q/U
# discretized data
# cut points
#
RANGE
0, 100, 1, 4, Q
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
0.0, 24.75, 49.5, 74.25, 99.0
RANGE
0, 50, 1, 4, Q
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
0.0, 12.25, 24.5, 36.75, 49.0
RANGE
0, 100, 1, 3, Q
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
0.0, 33.0, 66.0, 99.0
RANGE
0, 50, 1, 3, Q
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
0.0, 16.33333, 32.66667, 49.0
RANGE
0, 10, 1, 3, Q
0, 0, 0, 0, 1, 1, 1, 2, 2, 2
0.0, 3.0, 6.0, 9.0
RANGE
0, 100, 1, 4, U
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
0.0, 24.75, 49.5, 74.25, 99.0
RANGE
0, 50, 1, 4, U
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
0.0, 12.25, 24.5, 36.75, 49.0
RANGE
0, 100, 1, 3, U
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
0.0, 33.0, 66.0, 99.0
RANGE
0, 50, 1, 3, U
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
0.0, 16.33333, 32.66667, 49.0
RANGE
0, 10, 1, 3, U
0, 0, 0, 1, 1, 1, 2, 2, 2, 2
0.0, 3.0, 6.0, 9.0
RANGE
1, 10, 1, 3, Q
0, 0, 0, 1, 1, 1, 2, 2, 2
1.0, 3.66667, 6.33333, 9.0
RANGE
1, 10, 1, 3, U
0, 0, 0, 1, 1, 1, 2, 2, 2
1.0, 3.66667, 6.33333, 9.0
RANGE
1, 11, 1, 3, Q
0, 0, 0, 1, 1, 1, 1, 2, 2, 2
1.0, 4.0, 7.0, 10.0
RANGE
1, 11, 1, 3, U
0, 0, 0, 1, 1, 1, 2, 2, 2, 2
1.0, 4.0, 7.0, 10.0
RANGE
1, 12, 1, 3, Q
0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2
1.0, 4.33333, 7.66667, 11.0
RANGE
1, 12, 1, 3, U
0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2
1.0, 4.33333, 7.66667, 11.0
RANGE
1, 13, 1, 3, Q
0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2
1.0, 4.66667, 8.33333, 12.0
RANGE
1, 13, 1, 3, U
0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2
1.0, 4.66667, 8.33333, 12.0
RANGE
1, 14, 1, 3, Q
0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2
1.0, 5.0, 9.0, 13.0
RANGE
1, 14, 1, 3, U
0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2
1.0, 5.0, 9.0, 13.0
RANGE
1, 15, 1, 3, Q
0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2
1.0, 5.33333, 9.66667, 14.0
RANGE
1, 15, 1, 3, U
0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2
1.0, 5.33333, 9.66667, 14.0
VECTOR
Q3[3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0]
1, 0, 0, 1, 0, 0, 1, 0, 0
1.0, 1.66667, 3.0
VECTOR
U3[3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0]
2, 0, 0, 2, 0, 0, 2, 0, 0
1.0, 1.66667, 2.33333, 3.0
VECTOR
Q3[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]
0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2
1.0, 4.66667, 8.33333, 12.0
VECTOR
U3[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]
0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2
1.0, 4.66667, 8.33333, 12.0
VECTOR
Q3[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0]
0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2
1.0, 5.0, 9.0, 13.0
VECTOR
U3[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0]
0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2
1.0, 5.0, 9.0, 13.0
VECTOR
Q3[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0]
0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2
1.0, 5.33333, 9.66667, 14.0
VECTOR
U3[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0]
0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2
1.0, 5.33333, 9.66667, 14.0
VECTOR
Q3[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0]
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2
1.0, 5.66667, 10.33333, 15.0
VECTOR
U3[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0]
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2
1.0, 5.66667, 10.33333, 15.0
VECTOR
Q3[15.0, 8.0, 12.0, 14.0, 6.0, 1.0, 13.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0]
2, 1, 2, 2, 1, 0, 2, 2, 1, 1, 1, 0, 0, 0, 0
1.0, 5.66667, 10.33333, 15.0
VECTOR
U3[15.0, 8.0, 12.0, 14.0, 6.0, 1.0, 13.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0]
2, 1, 2, 2, 1, 0, 2, 2, 1, 1, 1, 0, 0, 0, 0
1.0, 5.66667, 10.33333, 15.0
VECTOR
Q3[0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0]
0, 0, 0, 0, 1, 1, 2, 2, 2, 2
0.0, 1.0, 3.0, 4.0
VECTOR
U3[0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0]
0, 0, 0, 0, 1, 1, 2, 2, 2, 2
0.0, 1.33333, 2.66667, 4.0

BIN
tests/k Executable file

Binary file not shown.

32
tests/k.cpp Normal file
View File

@@ -0,0 +1,32 @@
#include <iostream>
#include <vector>
#include <algorithm> // For std::lower_bound
// For every value in `data`, returns the index of the first element of `cuts`
// that is not less than the value (cuts.size() when there is none).
// std::lower_bound is a binary search, so `cuts` is expected to be sorted
// in ascending order.
std::vector<int> searchsorted(const std::vector<float>& cuts, const std::vector<float>& data) {
    std::vector<int> positions(data.size());
    std::transform(data.begin(), data.end(), positions.begin(), [&cuts](const float value) {
        const auto first_not_less = std::lower_bound(cuts.begin(), cuts.end(), value);
        return static_cast<int>(first_not_less - cuts.begin());
    });
    return positions;
}
// Demo driver: looks up the values 1..11 against the single cut point 10 and
// prints each resulting index followed by a space.
int main() {
    const std::vector<float> boundaries = { 10.0 };
    const std::vector<float> samples = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 };
    const std::vector<int> positions = searchsorted(boundaries, samples);
    for (auto it = positions.begin(); it != positions.end(); ++it) {
        std::cout << *it << " ";
    }
    return 0;
}

BIN
tests/t Executable file

Binary file not shown.

102
tests/t.cpp Normal file
View File

@@ -0,0 +1,102 @@
#include <iostream>
#include <algorithm>
#include <cmath>
#include <vector>
#include <string>
typedef float precision_t;
// Maps every item of `data` to the index of the first cut point that is not
// less than the item (cutPoints.size() when every cut point is smaller).
// std::lower_bound is a binary search, so `cutPoints` is expected to be sorted
// in ascending order.
std::vector<int> transform(const std::vector<float> cutPoints, const std::vector<float>& data)
{
    std::vector<int> bins(data.size());
    std::transform(data.begin(), data.end(), bins.begin(), [&cutPoints](const float item) {
        const auto first_not_less = std::lower_bound(cutPoints.begin(), cutPoints.end(), item);
        return static_cast<int>(first_not_less - cutPoints.begin());
    });
    return bins;
}
// Prints `title`, a colon and a space, then the elements of `data` separated
// by ", ", and finally ends the line with std::endl (which flushes std::cout).
template <typename T>
void show_vector(const std::vector<T>& data, std::string title)
{
    std::cout << title << ": ";
    bool first = true;
    for (const auto& element : data) {
        if (!first) {
            std::cout << ", ";
        }
        std::cout << element;
        first = false;
    }
    std::cout << std::endl;
}
// Returns evenly spaced values starting at `start` with a step of
// (end - start) / (num - 1).
//
// Only the first num - 1 values are produced, so `end` itself is NOT part of
// the result (NOTE(review): this differs from numpy.linspace, which the
// comment at the bottom of this file uses as the reference - confirm it is
// intended). Special cases:
//   - start == end : returns { start, end } regardless of num.
//   - num < 2      : returns an empty vector (no step can be computed).
std::vector<precision_t> linspace(precision_t start, precision_t end, int num)
{
    if (start == end) {
        return { start, end };
    }
    std::vector<precision_t> linspc;
    if (num < 2) {
        // Guard: with an unsigned loop counter, num <= 0 made the bound wrap
        // around to a huge value; num == 1 divided by zero.
        return linspc;
    }
    const precision_t delta = (end - start) / static_cast<precision_t>(num - 1);
    linspc.reserve(static_cast<size_t>(num - 1));
    // Signed counter: compared against the signed bound without conversion.
    for (int i = 0; i < num - 1; ++i) {
        linspc.push_back(start + delta * static_cast<precision_t>(i));
    }
    return linspc;
}
// Limits n to the range [lower, upper]: values above `upper` become `upper`,
// then values below `lower` become `lower` (so `lower` wins if lower > upper).
size_t clip(const size_t n, size_t lower, size_t upper)
{
    const size_t capped = (upper < n) ? upper : n;
    return (capped < lower) ? lower : capped;
}
std::vector<precision_t> percentile(std::vector<precision_t>& data, std::vector<precision_t>& percentiles)
{
// Implementation taken from https://dpilger26.github.io/NumCpp/doxygen/html/percentile_8hpp_source.html
std::vector<precision_t> results;
results.reserve(percentiles.size());
for (auto percentile : percentiles) {
const size_t i = static_cast<size_t>(std::floor(static_cast<double>(data.size() - 1) * percentile / 100.));
const auto indexLower = clip(i, 0, data.size() - 2);
const double percentI = static_cast<double>(indexLower) / static_cast<double>(data.size() - 1);
const double fraction =
(percentile / 100.0 - percentI) /
(static_cast<double>(indexLower + 1) / static_cast<double>(data.size() - 1) - percentI);
const auto value = data[indexLower] + (data[indexLower + 1] - data[indexLower]) * fraction;
if (value != results.back())
results.push_back(value);
}
return results;
}
// Scratch driver: builds a data vector and a cut-point vector step by step,
// discretizes the data with transform() and prints the data, the resulting
// bin indices and the cut points. The statements are order-dependent, see the
// notes below.
int main()
{
// Earlier experiments, kept for reference (disabled).
// std::vector<float> test;
// std::vector<float> cuts = { 0, 24.75, 49.5, 74.25, 10000 };
// for (int i = 0; i < 100; ++i) {
// test.push_back(i);
// }
// auto Xt = transform(cuts, test);
// show_vector(Xt, "Discretized data:");
// std::vector<float> test2 = { 0,1,2,3,4,5,6,7,8,9,10,11 };
// std::vector<float> cuts2 = { 0,1,2,3,4,5,6,7,8,9 };
// auto Xt2 = transform(cuts2, test2);
// show_vector(Xt2, "discretized data2: ");
// Percentile positions for 3 bins (3 + 1 requested points on the 0..100 scale).
auto quantiles = linspace(0.0, 100.0, 3 + 1);
std::vector<float> data = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
std::vector<float> cutPoints;
// Sorted before percentile(), which indexes the samples by position.
std::sort(data.begin(), data.end());
cutPoints = percentile(data, quantiles);
// NOTE(review): std::numeric_limits is declared in <limits>, which is not
// among this file's #include lines - confirm it compiles on all toolchains.
// This sentinel is removed again by the pop_back() below.
cutPoints.push_back(std::numeric_limits<precision_t>::max());
// Appended after the sort: one value above and one below the original
// samples; `data` is no longer sorted from here on.
data.push_back(15);
data.push_back(0);
cutPoints.pop_back();
cutPoints.erase(cutPoints.begin());
// Everything computed so far for cutPoints is discarded here; the run below
// uses the single cut point 9.0.
cutPoints.clear();
cutPoints.push_back(9.0);
auto Xt = transform(cutPoints, data);
show_vector(data, "Original data");
show_vector(Xt, "Discretized data");
show_vector(cutPoints, "Cutpoints");
return 0;
}
/*
n_bins = 3
data = [1,2,3,4,5,6,7,8,9,10]
quantiles = np.linspace(0, 100, n_bins + 1)
bin_edges = np.percentile(data, quantiles)
*/

View File

@@ -1,18 +1,15 @@
#!/bin/bash #!/bin/bash
if [ -d build ] ; then if [ -d build ] && [ "$1" != "run" ]; then
rm -fr build rm -fr build
fi fi
if [ -d gcovr-report ] ; then if [ -d gcovr-report ] ; then
rm -fr gcovr-report rm -fr gcovr-report
fi fi
cmake -S . -B build -Wno-dev cmake -S . -B build -Wno-dev -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS="--coverage" -DCMAKE_C_FLAGS="--coverage"
cmake --build build cmake --build build
cd build cd build
ctest --output-on-failure ctest --output-on-failure
cd .. cd ..
mkdir gcovr-report mkdir gcovr-report
#lcov --capture --directory ./ --output-file lcoverage/main_coverage.info
#lcov --remove lcoverage/main_coverage.info 'v1/*' '/Applications/*' '*/tests/*' --output-file lcoverage/main_coverage.info -q
#lcov --list lcoverage/main_coverage.info
cd .. cd ..
gcovr --gcov-filter "CPPFImdlp.cpp" --gcov-filter "Metrics.cpp" --gcov-filter "BinDisc.cpp" --txt --sonarqube=tests/gcovr-report/coverage.xml --exclude-noncode-lines gcovr --gcov-filter "CPPFImdlp.cpp" --gcov-filter "Metrics.cpp" --gcov-filter "BinDisc.cpp" --gcov-filter "Discretizer.cpp" --txt --sonarqube=tests/gcovr-report/coverage.xml --exclude-noncode-lines

View File

@@ -89,6 +89,7 @@ print(f"Quaintile {clf4q.bin_edges_=}")
print("-" * 80) print("-" * 80)
# #
data, meta = loadarff("tests/datasets/iris.arff") data, meta = loadarff("tests/datasets/iris.arff")
labelsu = [ labelsu = [
0, 0,
0, 0,
@@ -117,12 +118,12 @@ labelsu = [
0, 0,
0, 0,
0, 0,
1, 0,
1, 0,
0, 0,
0, 0,
1, 1,
1, 0,
1, 1,
0, 0,
0, 0,
@@ -149,11 +150,11 @@ labelsu = [
2, 2,
0, 0,
2, 2,
1, 0,
0, 0,
1, 1,
1, 1,
2, 1,
1, 1,
2, 2,
1, 1,
@@ -161,9 +162,9 @@ labelsu = [
2, 2,
1, 1,
1, 1,
1,
2, 2,
2, 1,
2,
2, 2,
2, 2,
2, 2,
@@ -181,7 +182,7 @@ labelsu = [
1, 1,
1, 1,
1, 1,
2, 1,
1, 1,
0, 0,
1, 1,
@@ -217,14 +218,14 @@ labelsu = [
2, 2,
3, 3,
2, 2,
2, 1,
2, 2,
3, 3,
3, 3,
3, 3,
2, 2,
2, 2,
2, 1,
3, 3,
2, 2,
2, 2,
@@ -393,12 +394,19 @@ labelsq = [
2, 2,
2, 2,
] ]
test(clf4u, data["sepallength"], labelsu, title="IrisUniform") # test(clf4u, data["sepallength"], labelsu, title="IrisUniform")
test(clf4q, data["sepallength"], labelsq, title="IrisQuantile") # test(clf4q, data["sepallength"], labelsq, title="IrisQuantile")
# print("Labels") sepallength = [[x] for x in data["sepallength"]]
# print(labels) clf4u.fit(sepallength)
# print("Expected") clf4q.fit(sepallength)
# print(expected) computedu = clf4u.transform(sepallength)
# for i in range(len(labels)): computedq = clf4q.transform(sepallength)
# if labels[i] != expected[i]: wrongu = 0
# print(f"Error at {i} {labels[i]} != {expected[i]}") wrongq = 0
for i in range(len(labelsu)):
if labelsu[i] != computedu[i]:
wrongu += 1
if labelsq[i] != computedq[i]:
wrongq += 1
print(f"Iris sepallength diff. between BinDisc & sklearn::KBins Uniform ={wrongu:3d}")
print(f"Iris sepallength diff. between BinDisc & sklearn::KBins Quantile ={wrongq:3d}")

50
tests/tests_do.py Normal file
View File

@@ -0,0 +1,50 @@
import json
from sklearn.preprocessing import KBinsDiscretizer
# Cross-check of datasets/tests.txt against scikit-learn's KBinsDiscretizer.
#
# The file holds records of four lines each:
#   1. record type: "RANGE" or "VECTOR"
#   2. header: "from, to, step, n_bins, Q|U" for RANGE,
#              "<Q|U><n_bins>[v1, v2, ...]" for VECTOR
#   3. discretized data (comma separated integers)
#   4. cut points (comma separated floats, rounded to 5 decimals)
# Lines starting with "#" are comments. Every mismatch is printed; length
# mismatches stop the script through the assert statements.
with open("datasets/tests.txt") as f:
    data = [x.strip() for x in f]
# Drop comments and blank lines so the records stay aligned in groups of four.
data = [x for x in data if x and not x.startswith("#")]
for i in range(0, len(data), 4):
    experiment_type = data[i]
    header = data[i + 1]
    print("Experiment:", header)
    if experiment_type == "RANGE":
        from_, to_, step_, n_bins_, strategy_ = header.split(",")
        X = [[float(x)] for x in range(int(from_), int(to_), int(step_))]
    else:
        # The number of bins is everything between the strategy letter and
        # the opening bracket, so values with more than one digit are read
        # correctly.
        bracket = header.index("[")
        strategy_ = header[0]
        n_bins_ = header[1:bracket]
        X = [[float(x)] for x in json.loads(header[bracket:])]
    strategy = "quantile" if strategy_.strip() == "Q" else "uniform"
    disc = KBinsDiscretizer(
        n_bins=int(n_bins_),
        encode="ordinal",
        strategy=strategy,
    )
    expected_data = data[i + 2]
    cuts_data = data[i + 3]
    disc.fit(X)
    result = [int(x) for x in disc.transform(X).flatten()]
    expected = [int(x) for x in expected_data.split(",")]
    assert len(result) == len(expected)
    for j, (got, wanted) in enumerate(zip(result, expected)):
        if got != wanted:
            print("Error at", j, "Expected=", wanted, "Result=", got)
    # Here "expected" is what scikit-learn computes and "computed" is what the
    # file stores.
    expected_cuts = disc.bin_edges_[0]
    computed_cuts = [float(x) for x in cuts_data.split(",")]
    assert len(expected_cuts) == len(computed_cuts)
    for j, (edge, stored) in enumerate(zip(expected_cuts, computed_cuts)):
        # The file stores the edges rounded to 5 decimals.
        if round(edge, 5) != stored:
            print(
                "Error at",
                j,
                "Expected=",
                edge,
                "Result=",
                stored,
            )

133
tests/tests_generate.ipynb Normal file
View File

@@ -0,0 +1,133 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import KBinsDiscretizer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"experiments_range = [\n",
" [0, 100, 1, 4, \"Q\"],\n",
" [0, 50, 1, 4, \"Q\"],\n",
" [0, 100, 1, 3, \"Q\"],\n",
" [0, 50, 1, 3, \"Q\"],\n",
" [0, 10, 1, 3, \"Q\"],\n",
" [0, 100, 1, 4, \"U\"],\n",
" [0, 50, 1, 4, \"U\"],\n",
" [0, 100, 1, 3, \"U\"],\n",
" [0, 50, 1, 3, \"U\"],\n",
"# \n",
" [0, 10, 1, 3, \"U\"],\n",
" [1, 10, 1, 3, \"Q\"],\n",
" [1, 10, 1, 3, \"U\"],\n",
" [1, 11, 1, 3, \"Q\"],\n",
" [1, 11, 1, 3, \"U\"],\n",
" [1, 12, 1, 3, \"Q\"],\n",
" [1, 12, 1, 3, \"U\"],\n",
" [1, 13, 1, 3, \"Q\"],\n",
" [1, 13, 1, 3, \"U\"],\n",
" [1, 14, 1, 3, \"Q\"],\n",
" [1, 14, 1, 3, \"U\"],\n",
" [1, 15, 1, 3, \"Q\"],\n",
" [1, 15, 1, 3, \"U\"]\n",
"]\n",
"experiments_vectors = [\n",
" (3, [3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0]),\n",
" (3, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]),\n",
" (3, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0]),\n",
" (3, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0]),\n",
" (3, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0]),\n",
" (3, [15.0, 8.0, 12.0, 14.0, 6.0, 1.0, 13.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0]),\n",
" (3, [0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0])\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/rmontanana/miniconda3/lib/python3.11/site-packages/sklearn/preprocessing/_discretization.py:307: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 0 are removed. Consider decreasing the number of bins.\n",
" warnings.warn(\n"
]
}
],
"source": [
"def write_lists(file, data, cuts):\n",
" sep = \"\"\n",
" for res in data:\n",
" file.write(f\"{sep}{int(res):d}\")\n",
" sep= \", \"\n",
" file.write(\"\\n\")\n",
" sep = \"\"\n",
" for res in cuts:\n",
" file.write(sep + str(round(res,5)))\n",
" sep = \", \"\n",
" file.write(\"\\n\")\n",
"\n",
"with open(\"datasets/tests.txt\", \"w\") as file:\n",
" file.write(\"#\\n\")\n",
" file.write(\"# from, to, step, #bins, Q/U\\n\")\n",
" file.write(\"# discretized data\\n\")\n",
" file.write(\"# cut points\\n\")\n",
" file.write(\"#\\n\")\n",
" for experiment in experiments_range:\n",
" file.write(\"RANGE\\n\")\n",
" (from_, to_, step_, bins_, strategy) = experiment\n",
" disc = KBinsDiscretizer(n_bins=bins_, encode='ordinal', strategy='quantile' if strategy.strip() == \"Q\" else 'uniform')\n",
" data = [[x] for x in range(from_, to_, step_)]\n",
" disc.fit(data)\n",
" result = disc.transform(data)\n",
" file.write(f\"{from_}, {to_}, {step_}, {bins_}, {strategy}\\n\")\n",
" write_lists(file, result, disc.bin_edges_[0])\n",
" for n_bins, experiment in experiments_vectors:\n",
" for strategy in [\"Q\", \"U\"]:\n",
" file.write(\"VECTOR\\n\")\n",
" file.write(f\"{strategy}{n_bins}{experiment}\\n\")\n",
" disc = KBinsDiscretizer(\n",
" n_bins=n_bins,\n",
" encode=\"ordinal\",\n",
" \n",
" strategy=\"quantile\" if strategy.strip() == \"Q\" else \"uniform\",\n",
" )\n",
" data = [[x] for x in experiment]\n",
" result = disc.fit_transform(data)\n",
" write_lists(file, result, disc.bin_edges_[0])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}