28 Commits

Author SHA1 Message Date
2775698063 test: 2022-12-21 19:05:24 +01:00
9db16d9d3c feat: Add version method to cppfimdlp 2022-12-20 01:11:39 +01:00
edd464311f fix: 🐛 Fix Tests and sample mistake 2022-12-15 12:18:10 +01:00
fe32ed4b2a Update algorithm type to compute cut points 2022-12-15 12:12:44 +01:00
1d95311a7d fix: 🐛 Fix a bug when pip install tries to build the package of File not Found
#4
2022-12-14 12:23:07 +01:00
d8066ea274 Update branch name from master to main in CI 2022-12-13 18:46:15 +01:00
a2c1b07525 Update Badges and README 2022-12-13 18:40:53 +01:00
05c12561ac Add submodule to ci 2022-12-13 18:18:12 +01:00
8f4bdd262a Update ci 2022-12-13 18:09:26 +01:00
0740d1f515 Update submodule command 2022-12-13 17:57:44 +01:00
eb7f3dc092 Command to update submodule and update it 2022-12-13 17:56:12 +01:00
cfc18adf06 Fix c++ sample 2022-12-13 17:18:38 +01:00
3ae0d67884 Fix tests because stable_sort in c++ 2022-12-13 17:16:23 +01:00
0ca507c692 Add submodule 2022-12-13 17:05:11 +01:00
Ricardo Montañana Gómez
70b3af94cc Merge pull request #3 from Doctorado-ML/ci
Ci
2022-12-13 17:01:08 +01:00
9d66bd6fd0 Remove some testing files 2022-12-13 16:59:14 +01:00
9039139a32 Remove submodule to fix it 2022-12-13 16:56:15 +01:00
a5dc2d7162 Remove submodule to fix it 2022-12-13 16:54:37 +01:00
67726bf219 Added submodule fimdlp/cppmdlp 2022-12-13 15:57:52 +01:00
2cb15cadbc Added submodule fimdlp/cppmdlp 2022-12-13 14:12:03 +01:00
630ea1dfdb Samplecpp 2022-12-13 14:02:35 +01:00
74d420dbce Added scikit-learn to dependencies 2022-12-13 13:43:39 +01:00
707432cc28 Fix parameter in test 2022-12-13 13:42:11 +01:00
6bddb3ac43 second try 2022-12-13 13:39:54 +01:00
d29a5c6caf Update CI 2022-12-13 13:37:49 +01:00
e5b09f1610 Update CI codeql 2022-12-13 13:31:20 +01:00
Ricardo Montañana Gómez
75c5a095c5 Merge pull request #2 from Doctorado-ML/submodule
Submodule
2022-12-13 11:55:20 +01:00
Ricardo Montañana Gómez
4d1ce5d743 Create codeql.yml 2022-12-13 11:52:51 +01:00
43 changed files with 225 additions and 34094 deletions

82
.github/workflows/codeql.yml vendored Normal file
View File

@@ -0,0 +1,82 @@
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL"
on:
push:
branches: ["main"]
pull_request:
# The branches below must be a subset of the branches above
branches: ["main"]
schedule:
- cron: "16 22 * * 0"
jobs:
analyze:
name: Analyze
runs-on: ubuntu-latest
permissions:
actions: read
contents: read
security-events: write
strategy:
fail-fast: false
matrix:
language: ["python"]
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
# Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
steps:
- name: Checkout repository
uses: actions/checkout@v3
with:
submodules: recursive
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v2
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
# By default, queries listed here will override any specified in a config file.
# Prefix the list here with "+" to use these queries and those in the config file.
# Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
# queries: security-extended,security-and-quality
# Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- if: matrix.language == 'python'
name: Autobuild
uses: github/codeql-action/autobuild@v2
- if: matrix.language == 'cpp'
name: Build CPP
run: |
pip install -q --upgrade pip
pip install -q scikit-learn cython
make install
# Command-line programs to run using the OS shell.
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
# If the Autobuild fails above, remove it and uncomment the following three lines.
# modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.
# - run: |
# echo "Run, Build Application using script"
# ./location_of_script_within_repo/buildscript.sh
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v2
with:
category: "/language:${{matrix.language}}"

View File

@@ -2,9 +2,9 @@ name: CI
on:
push:
branches: [master]
branches: [main]
pull_request:
branches: [master]
branches: [main]
workflow_dispatch:
jobs:
@@ -12,11 +12,13 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [macos-latest, ubuntu-latest, windows-latest]
python: [3.9, "3.10"]
os: [ubuntu-latest]
python: ["3.10"]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
with:
submodules: recursive
- name: Set up Python ${{ matrix.python }}
uses: actions/setup-python@v2
with:
@@ -24,10 +26,10 @@ jobs:
- name: Install dependencies
run: |
pip install -q --upgrade pip
pip install -q scikit-learn cython
pip install -q --upgrade codecov coverage black flake8 codacy-coverage
- name: Build and install
run: |
cd FImdlp
make install
- name: Lint
run: |
@@ -35,7 +37,7 @@ jobs:
flake8 --count --per-file-ignores="__init__.py:F401" src
- name: Tests
run: |
coverage run -m unittest discover -v - s src
coverage run -m unittest discover -v -s src
coverage xml
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1

9
.gitignore vendored
View File

@@ -33,8 +33,8 @@ MANIFEST
*.manifest
*.spec
# Installer log2s
pip-log2.txt
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
@@ -56,7 +56,7 @@ coverage.xml
*.pot
# Django stuff:
*.log2
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
@@ -134,4 +134,5 @@ cmake-build-debug
cmake-build-debug/**
**/lcoverage/**
**/x/*
**/*.so
**/*.so
**/CMakeFiles

6
.gitmodules vendored
View File

@@ -1,3 +1,3 @@
[submodule "fimdlp/cppmdlp"]
path = src/cppfimdlp
url = https://github.com/rmontanana/mdlp
[submodule "src/cppmdlp"]
path = src/cppmdlp
url = https://github.com/rmontanana/mdlp.git

Binary file not shown.

1
MANIFEST.in Normal file
View File

@@ -0,0 +1 @@
include src/cppmdlp/CPPFImdlp.h

View File

@@ -15,6 +15,10 @@ coverage:
make test
coverage report -m
submodule:
git submodule update --remote src/cppmdlp
git submodule update --merge
lint: ## Lint and static-check
black src
flake8 --per-file-ignores="__init__.py:F401" src

View File

@@ -1,11 +1,21 @@
# FImdlp
[![CI](https://github.com/Doctorado-ML/FImdlp/actions/workflows/main.yml/badge.svg)](https://github.com/Doctorado-ML/FImdlp/actions/workflows/main.yml)
[![CodeQL](https://github.com/Doctorado-ML/FImdlp/actions/workflows/codeql.yml/badge.svg)](https://github.com/Doctorado-ML/FImdlp/actions/workflows/codeql.yml)
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/8b4d784fee13401588aa8c06532a2f6d)](https://www.codacy.com/gh/Doctorado-ML/FImdlp/dashboard?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/FImdlp&utm_campaign=Badge_Grade)
[![codecov](https://codecov.io/gh/Doctorado-ML/FImdlp/branch/main/graph/badge.svg?token=W8I45B5Z3J)](https://codecov.io/gh/Doctorado-ML/FImdlp)
[![pypy](https://img.shields.io/pypi/v/FImdlp?color=g)](https://img.shields.io/pypi/v/FImdlp?color=g)
![https://img.shields.io/badge/python-3.9%2B-blue](https://img.shields.io/badge/python-3.9%2B-brightgreen)
Discretization algorithm based on the paper by Usama M. Fayyad and Keki B. Irani
```
Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning. In Proceedings of the 13th International Joint Conference on Artificial Intelligence (IJCAI-95), pages 1022-1027, Montreal, Canada, August 1995.
## Installation
```bash
git clone --recurse-submodules https://github.com/doctorado-ml/FImdlp.git
```
## Build and usage sample
@@ -14,8 +24,8 @@ Multi-Interval Discretization of Continuous-Valued Attributes for Classification
```bash
pip install -e .
python samples/sample.py iris --original
python samples/sample.py iris --proposal
python samples/sample.py iris
python samples/sample.py iris --alternative
python samples/sample.py -h # for more options
```

View File

@@ -1,152 +0,0 @@
+++++++++++++++++++++++
( 0, 13) -> (4.3, 0)
( 1, 8) -> (4.4, 0)
( 2, 38) -> (4.4, 0)
( 3, 42) -> (4.4, 0)
( 4, 41) -> (4.5, 0)
( 5, 3) -> (4.6, 0)
( 6, 6) -> (4.6, 0)
( 7, 22) -> (4.6, 0)
( 8, 47) -> (4.6, 0)
( 9, 2) -> (4.7, 0)
( 10, 29) -> (4.7, 0)
( 11, 11) -> (4.8, 0)
( 12, 12) -> (4.8, 0)
( 13, 24) -> (4.8, 0)
( 14, 30) -> (4.8, 0)
( 15, 45) -> (4.8, 0)
( 16, 1) -> (4.9, 0)
( 17, 9) -> (4.9, 0)
( 18, 34) -> (4.9, 0)
( 19, 37) -> (4.9, 0)
( 20, 57) -> (4.9, 1)
( 21, 106) -> (4.9, 2)
( 22, 4) -> (5.0, 0)
( 23, 7) -> (5.0, 0)
( 24, 25) -> (5.0, 0)
( 25, 26) -> (5.0, 0)
( 26, 35) -> (5.0, 0)
( 27, 40) -> (5.0, 0)
( 28, 43) -> (5.0, 0)
( 29, 49) -> (5.0, 0)
( 30, 60) -> (5.0, 1)
( 31, 93) -> (5.0, 1)
( 32, 0) -> (5.1, 0)
( 33, 17) -> (5.1, 0)
( 34, 19) -> (5.1, 0)
( 35, 21) -> (5.1, 0)
( 36, 23) -> (5.1, 0)
( 37, 39) -> (5.1, 0)
( 38, 44) -> (5.1, 0)
( 39, 46) -> (5.1, 0)
( 40, 98) -> (5.1, 1)
( 41, 27) -> (5.2, 0)
( 42, 28) -> (5.2, 0)
( 43, 32) -> (5.2, 0)
( 44, 59) -> (5.2, 1)
( 45, 48) -> (5.3, 0)
( 46, 5) -> (5.4, 0)
( 47, 10) -> (5.4, 0)
( 48, 16) -> (5.4, 0)
( 49, 20) -> (5.4, 0)
( 50, 31) -> (5.4, 0)
( 51, 84) -> (5.4, 1)
( 52, 33) -> (5.5, 0)
( 53, 36) -> (5.5, 0)
( 54, 53) -> (5.5, 1)
( 55, 80) -> (5.5, 1)
( 56, 81) -> (5.5, 1)
( 57, 89) -> (5.5, 1)
( 58, 90) -> (5.5, 1)
( 59, 64) -> (5.6, 1)
( 60, 66) -> (5.6, 1)
( 61, 69) -> (5.6, 1)
( 62, 88) -> (5.6, 1)
( 63, 94) -> (5.6, 1)
( 64, 121) -> (5.6, 2)
( 65, 15) -> (5.7, 0)
( 66, 18) -> (5.7, 0)
( 67, 55) -> (5.7, 1)
( 68, 79) -> (5.7, 1)
( 69, 95) -> (5.7, 1)
( 70, 96) -> (5.7, 1)
( 71, 99) -> (5.7, 1)
( 72, 113) -> (5.7, 2)
( 73, 14) -> (5.8, 0)
( 74, 67) -> (5.8, 1)
( 75, 82) -> (5.8, 1)
( 76, 92) -> (5.8, 1)
( 77, 101) -> (5.8, 2)
( 78, 114) -> (5.8, 2)
( 79, 142) -> (5.8, 2)
( 80, 61) -> (5.9, 1)
( 81, 70) -> (5.9, 1)
( 82, 149) -> (5.9, 2)
( 83, 62) -> (6.0, 1)
( 84, 78) -> (6.0, 1)
( 85, 83) -> (6.0, 1)
( 86, 85) -> (6.0, 1)
( 87, 119) -> (6.0, 2)
( 88, 138) -> (6.0, 2)
( 89, 63) -> (6.1, 1)
( 90, 71) -> (6.1, 1)
( 91, 73) -> (6.1, 1)
( 92, 91) -> (6.1, 1)
( 93, 127) -> (6.1, 2)
( 94, 134) -> (6.1, 2)
( 95, 68) -> (6.2, 1)
( 96, 97) -> (6.2, 1)
( 97, 126) -> (6.2, 2)
( 98, 148) -> (6.2, 2)
( 99, 56) -> (6.3, 1)
(100, 72) -> (6.3, 1)
(101, 87) -> (6.3, 1)
(102, 100) -> (6.3, 2)
(103, 103) -> (6.3, 2)
(104, 123) -> (6.3, 2)
(105, 133) -> (6.3, 2)
(106, 136) -> (6.3, 2)
(107, 146) -> (6.3, 2)
(108, 51) -> (6.4, 1)
(109, 74) -> (6.4, 1)
(110, 111) -> (6.4, 2)
(111, 115) -> (6.4, 2)
(112, 128) -> (6.4, 2)
(113, 132) -> (6.4, 2)
(114, 137) -> (6.4, 2)
(115, 54) -> (6.5, 1)
(116, 104) -> (6.5, 2)
(117, 110) -> (6.5, 2)
(118, 116) -> (6.5, 2)
(119, 147) -> (6.5, 2)
(120, 58) -> (6.6, 1)
(121, 75) -> (6.6, 1)
(122, 65) -> (6.7, 1)
(123, 77) -> (6.7, 1)
(124, 86) -> (6.7, 1)
(125, 108) -> (6.7, 2)
(126, 124) -> (6.7, 2)
(127, 140) -> (6.7, 2)
(128, 144) -> (6.7, 2)
(129, 145) -> (6.7, 2)
(130, 76) -> (6.8, 1)
(131, 112) -> (6.8, 2)
(132, 143) -> (6.8, 2)
(133, 52) -> (6.9, 1)
(134, 120) -> (6.9, 2)
(135, 139) -> (6.9, 2)
(136, 141) -> (6.9, 2)
(137, 50) -> (7.0, 1)
(138, 102) -> (7.1, 2)
(139, 109) -> (7.2, 2)
(140, 125) -> (7.2, 2)
(141, 129) -> (7.2, 2)
(142, 107) -> (7.3, 2)
(143, 130) -> (7.4, 2)
(144, 105) -> (7.6, 2)
(145, 117) -> (7.7, 2)
(146, 118) -> (7.7, 2)
(147, 122) -> (7.7, 2)
(148, 135) -> (7.7, 2)
(149, 131) -> (7.9, 2)
+++++++++++++++++++++++

View File

@@ -1,117 +0,0 @@
#include "ArffFiles.h"
#include <fstream>
#include <sstream>
#include <map>
#include <iostream>
using namespace std;
ArffFiles::ArffFiles()
{
}
vector<string> ArffFiles::getLines()
{
return lines;
}
unsigned long int ArffFiles::getSize()
{
return lines.size();
}
vector<tuple<string, string>> ArffFiles::getAttributes()
{
return attributes;
}
string ArffFiles::getClassName()
{
return className;
}
string ArffFiles::getClassType()
{
return classType;
}
vector<vector<float>>& ArffFiles::getX()
{
return X;
}
vector<int>& ArffFiles::getY()
{
return y;
}
void ArffFiles::load(string fileName, bool classLast)
{
ifstream file(fileName);
string keyword, attribute, type;
if (file.is_open()) {
string line;
while (getline(file, line)) {
if (line[0] == '%' || line.empty() || line == "\r" || line == " ") {
continue;
}
if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
stringstream ss(line);
ss >> keyword >> attribute >> type;
attributes.push_back(make_tuple(attribute, type));
continue;
}
if (line[0] == '@') {
continue;
}
lines.push_back(line);
}
file.close();
if (attributes.empty())
throw invalid_argument("No attributes found");
if (classLast) {
className = get<0>(attributes.back());
classType = get<1>(attributes.back());
attributes.pop_back();
} else {
className = get<0>(attributes.front());
classType = get<1>(attributes.front());
attributes.erase(attributes.begin());
}
generateDataset(classLast);
} else
throw invalid_argument("Unable to open file");
}
void ArffFiles::generateDataset(bool classLast)
{
X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
vector<string> yy = vector<string>(lines.size(), "");
int labelIndex = classLast ? attributes.size() : 0;
for (int i = 0; i < lines.size(); i++) {
stringstream ss(lines[i]);
string value;
int pos = 0, xIndex = 0;
while (getline(ss, value, ',')) {
if (pos++ == labelIndex) {
yy[i] = value;
} else {
X[xIndex++][i] = stof(value);
}
}
}
y = factorize(yy);
}
string ArffFiles::trim(const string& source)
{
string s(source);
s.erase(0, s.find_first_not_of(" \n\r\t"));
s.erase(s.find_last_not_of(" \n\r\t") + 1);
return s;
}
vector<int> ArffFiles::factorize(const vector<string>& labels_t)
{
vector<int> yy;
yy.reserve(labels_t.size());
map<string, int> labelMap;
int i = 0;
for (string label : labels_t) {
if (labelMap.find(label) == labelMap.end()) {
labelMap[label] = i++;
}
yy.push_back(labelMap[label]);
}
return yy;
}

View File

@@ -1,28 +0,0 @@
#ifndef ARFFFILES_H
#define ARFFFILES_H
#include <string>
#include <vector>
#include <tuple>
using namespace std;
class ArffFiles {
private:
vector<string> lines;
vector<tuple<string, string>> attributes;
string className, classType;
vector<vector<float>> X;
vector<int> y;
void generateDataset(bool);
public:
ArffFiles();
void load(string, bool = true);
vector<string> getLines();
unsigned long int getSize();
string getClassName();
string getClassType();
string trim(const string&);
vector<vector<float>>& getX();
vector<int>& getY();
vector<tuple<string, string>> getAttributes();
vector<int> factorize(const vector<string>& labels_t);
};
#endif

View File

@@ -3,4 +3,4 @@ project(main)
set(CMAKE_CXX_STANDARD 14)
add_executable(sample sample.cpp ArffFiles.cpp ../src/fimdlp/cppmdlp/Metrics.cpp ../src/fimdlp/cppmdlp/CPPFImdlp.cpp)
add_executable(sample sample.cpp ../src/cppmdlp/tests/ArffFiles.cpp ../src/cppmdlp/Metrics.cpp ../src/cppmdlp/CPPFImdlp.cpp)

View File

@@ -1,4 +1,4 @@
#include "ArffFiles.h"
#include "../src/cppmdlp/tests/ArffFiles.h"
#include <iostream>
#include <vector>
#include <iomanip>
@@ -41,7 +41,7 @@ int main(int argc, char** argv)
}
cout << y[i] << endl;
}
mdlp::CPPFImdlp test = mdlp::CPPFImdlp(false);
mdlp::CPPFImdlp test = mdlp::CPPFImdlp(0);
for (auto i = 0; i < attributes.size(); i++) {
cout << "Cut points for " << get<0>(attributes[i]) << endl;
cout << "--------------------------" << setprecision(3) << endl;

View File

@@ -14,8 +14,9 @@ datasets = {
}
ap = argparse.ArgumentParser()
ap.add_argument("--proposal", action="store_true")
ap.add_argument("--original", dest="proposal", action="store_false")
ap.add_argument(
"--alternative", dest="proposal", action="store_const", const=1
)
ap.add_argument("dataset", type=str, choices=datasets.keys())
args = ap.parse_args()
relative = "" if os.path.isdir("src") else ".."
@@ -29,7 +30,7 @@ class_name = df.columns.to_list()[class_column]
X = df.drop(class_name, axis=1)
y, _ = pd.factorize(df[class_name])
X = X.to_numpy()
test = FImdlp(proposal=args.proposal)
test = FImdlp(algorithm=args.proposal if args.proposal is not None else 0)
now = time.time()
test.fit(X, y)
fit_time = time.time()

1
src/cppmdlp Submodule

Submodule src/cppmdlp added at 35c532bf1d

View File

@@ -1,36 +0,0 @@
# Prerequisites
*.d
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
**/build
**/lcoverage
.idea
cmake-*

View File

@@ -1,7 +0,0 @@
cmake_minimum_required(VERSION 3.24)
project(mdlp)
set(CMAKE_CXX_STANDARD 17)
add_library(mdlp CPPFImdlp.cpp Metrics.cpp)

View File

@@ -1,160 +0,0 @@
#include <numeric>
#include <algorithm>
#include <set>
#include <cmath>
#include "CPPFImdlp.h"
#include "Metrics.h"
namespace mdlp {
CPPFImdlp::CPPFImdlp(bool proposal):proposal(proposal), indices(indices_t()), X(samples_t()), y(labels_t()), metrics(Metrics(y, indices))
{
}
CPPFImdlp::~CPPFImdlp()
= default;
CPPFImdlp& CPPFImdlp::fit(samples_t& X_, labels_t& y_)
{
X = X_;
y = y_;
cutPoints.clear();
if (X.size() != y.size()) {
throw invalid_argument("X and y must have the same size");
}
if (X.size() == 0 || y.size() == 0) {
throw invalid_argument("X and y must have at least one element");
}
indices = sortIndices(X_);
metrics.setData(y, indices);
if (proposal)
computeCutPointsProposal();
else
computeCutPoints(0, X.size());
return *this;
}
void CPPFImdlp::computeCutPoints(size_t start, size_t end)
{
int cut;
if (end - start < 2)
return;
cut = getCandidate(start, end);
if (cut == -1 || !mdlp(start, cut, end)) {
// cut.value == -1 means that there is no candidate in the interval
// No boundary found, so we add both ends of the interval as cutpoints
// because they were selected by the algorithm before
if (start != 0)
cutPoints.push_back((X[indices[start]] + X[indices[start - 1]]) / 2);
if (end != X.size())
cutPoints.push_back((X[indices[end]] + X[indices[end - 1]]) / 2);
return;
}
computeCutPoints(start, cut);
computeCutPoints(cut, end);
}
void CPPFImdlp::computeCutPointsOriginal(size_t start, size_t end)
{
precision_t cut;
if (end - start < 2)
return;
cut = getCandidate(start, end);
if (cut == -1)
return;
if (mdlp(start, cut, end)) {
cutPoints.push_back((X[indices[cut]] + X[indices[cut - 1]]) / 2);
}
computeCutPointsOriginal(start, cut);
computeCutPointsOriginal(cut, end);
}
void CPPFImdlp::computeCutPointsProposal()
{
precision_t xPrev, xCur, xPivot, cutPoint;
int yPrev, yCur, yPivot;
size_t idx, numElements, start;
xCur = xPrev = X[indices[0]];
yCur = yPrev = y[indices[0]];
numElements = indices.size() - 1;
idx = start = 0;
while (idx < numElements) {
xPivot = xCur;
yPivot = yCur;
// Read the same values and check class changes
do {
idx++;
xCur = X[indices[idx]];
yCur = y[indices[idx]];
if (yCur != yPivot && xCur == xPivot) {
yPivot = -1;
}
}
while (idx < numElements && xCur == xPivot);
// Check if the class changed and there are more than 1 element
if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur) && mdlp(start, idx, indices.size())) {
start = idx;
cutPoint = (xPrev + xCur) / 2;
cutPoints.push_back(cutPoint);
}
yPrev = yPivot;
xPrev = xPivot;
}
}
long int CPPFImdlp::getCandidate(size_t start, size_t end)
{
long int candidate = -1, elements = end - start;
precision_t entropy_left, entropy_right, minEntropy = numeric_limits<precision_t>::max();
for (auto idx = start + 1; idx < end; idx++) {
// Cutpoints are always on boudndaries
if (y[indices[idx]] == y[indices[idx - 1]])
continue;
entropy_left = precision_t(idx - start) / elements * metrics.entropy(start, idx);
entropy_right = precision_t(end - idx) / elements * metrics.entropy(idx, end);
if (entropy_left + entropy_right < minEntropy) {
minEntropy = entropy_left + entropy_right;
candidate = idx;
}
}
return candidate;
}
bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end)
{
int k, k1, k2;
precision_t ig, delta;
precision_t ent, ent1, ent2;
auto N = precision_t(end - start);
if (N < 2) {
return false;
}
k = metrics.computeNumClasses(start, end);
k1 = metrics.computeNumClasses(start, cut);
k2 = metrics.computeNumClasses(cut, end);
ent = metrics.entropy(start, end);
ent1 = metrics.entropy(start, cut);
ent2 = metrics.entropy(cut, end);
ig = metrics.informationGain(start, cut, end);
delta = log2(pow(3, precision_t(k)) - 2) -
(precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2);
precision_t term = 1 / N * (log2(N - 1) + delta);
return ig > term;
}
cutPoints_t CPPFImdlp::getCutPoints()
{
// Remove duplicates and sort
cutPoints_t output(cutPoints.size());
set<precision_t> s;
unsigned size = cutPoints.size();
for (unsigned i = 0; i < size; i++)
s.insert(cutPoints[i]);
output.assign(s.begin(), s.end());
sort(output.begin(), output.end());
return output;
}
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
indices_t CPPFImdlp::sortIndices(samples_t& X_)
{
indices_t idx(X_.size());
iota(idx.begin(), idx.end(), 0);
for (size_t i = 0; i < X_.size(); i++)
sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2)
{ return X_[i1] < X_[i2]; });
return idx;
}
}

View File

@@ -1,33 +0,0 @@
#ifndef CPPFIMDLP_H
#define CPPFIMDLP_H
#include "typesFImdlp.h"
#include "Metrics.h"
#include <utility>
namespace mdlp {
class CPPFImdlp {
protected:
bool proposal;
indices_t indices; // sorted indices to use with X and y
samples_t X;
labels_t y;
Metrics metrics;
cutPoints_t cutPoints;
static indices_t sortIndices(samples_t&);
void computeCutPoints(size_t, size_t);
long int getCandidate(size_t, size_t);
bool mdlp(size_t, size_t, size_t);
// Original algorithm
void computeCutPointsOriginal(size_t, size_t);
bool goodCut(size_t, size_t, size_t);
void computeCutPointsProposal();
public:
CPPFImdlp(bool);
~CPPFImdlp();
CPPFImdlp& fit(samples_t&, labels_t&);
samples_t getCutPoints();
};
}
#endif

View File

@@ -1,21 +0,0 @@
MIT License
Copyright (c) 2022 Ricardo Montañana Gómez
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -1,65 +0,0 @@
#include "Metrics.h"
#include <set>
#include <cmath>
using namespace std;
namespace mdlp {
Metrics::Metrics(labels_t& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t())
{
}
int Metrics::computeNumClasses(size_t start, size_t end)
{
set<int> nClasses;
for (auto i = start; i < end; ++i) {
nClasses.insert(y[indices[i]]);
}
return nClasses.size();
}
void Metrics::setData(labels_t& y_, indices_t& indices_)
{
indices = indices_;
y = y_;
numClasses = computeNumClasses(0, indices.size());
entropyCache.clear();
igCache.clear();
}
precision_t Metrics::entropy(size_t start, size_t end)
{
precision_t p, ventropy = 0;
int nElements = 0;
labels_t counts(numClasses + 1, 0);
if (end - start < 2)
return 0;
if (entropyCache.find(make_tuple(start, end)) != entropyCache.end()) {
return entropyCache[make_tuple(start, end)];
}
for (auto i = &indices[start]; i != &indices[end]; ++i) {
counts[y[*i]]++;
nElements++;
}
for (auto count : counts) {
if (count > 0) {
p = (precision_t)count / nElements;
ventropy -= p * log2(p);
}
}
entropyCache[make_tuple(start, end)] = ventropy;
return ventropy;
}
precision_t Metrics::informationGain(size_t start, size_t cut, size_t end)
{
precision_t iGain;
precision_t entropyInterval, entropyLeft, entropyRight;
int nElementsLeft = cut - start, nElementsRight = end - cut;
int nElements = end - start;
if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) {
return igCache[make_tuple(start, cut, end)];
}
entropyInterval = entropy(start, end);
entropyLeft = entropy(start, cut);
entropyRight = entropy(cut, end);
iGain = entropyInterval - ((precision_t)nElementsLeft * entropyLeft + (precision_t)nElementsRight * entropyRight) / nElements;
igCache[make_tuple(start, cut, end)] = iGain;
return iGain;
}
}

View File

@@ -1,20 +0,0 @@
#ifndef CCMETRICS_H
#define CCMETRICS_H
#include "typesFImdlp.h"
namespace mdlp {
class Metrics {
protected:
labels_t& y;
indices_t& indices;
int numClasses;
cacheEnt_t entropyCache;
cacheIg_t igCache;
public:
Metrics(labels_t&, indices_t&);
void setData(labels_t&, indices_t&);
int computeNumClasses(size_t, size_t);
precision_t entropy(size_t, size_t);
precision_t informationGain(size_t, size_t, size_t);
};
}
#endif

View File

@@ -1,2 +0,0 @@
# mdlp
Discretization algorithm based on the paper by Fayyad &amp; Irani Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning

View File

@@ -1,117 +0,0 @@
#include "ArffFiles.h"
#include <fstream>
#include <sstream>
#include <map>
#include <iostream>
using namespace std;
ArffFiles::ArffFiles()
{
}
vector<string> ArffFiles::getLines()
{
return lines;
}
unsigned long int ArffFiles::getSize()
{
return lines.size();
}
vector<tuple<string, string>> ArffFiles::getAttributes()
{
return attributes;
}
string ArffFiles::getClassName()
{
return className;
}
string ArffFiles::getClassType()
{
return classType;
}
vector<vector<float>>& ArffFiles::getX()
{
return X;
}
vector<int>& ArffFiles::getY()
{
return y;
}
void ArffFiles::load(string fileName, bool classLast)
{
ifstream file(fileName);
string keyword, attribute, type;
if (file.is_open()) {
string line;
while (getline(file, line)) {
if (line[0] == '%' || line.empty() || line == "\r" || line == " ") {
continue;
}
if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
stringstream ss(line);
ss >> keyword >> attribute >> type;
attributes.push_back(make_tuple(attribute, type));
continue;
}
if (line[0] == '@') {
continue;
}
lines.push_back(line);
}
file.close();
if (attributes.empty())
throw invalid_argument("No attributes found");
if (classLast) {
className = get<0>(attributes.back());
classType = get<1>(attributes.back());
attributes.pop_back();
} else {
className = get<0>(attributes.front());
classType = get<1>(attributes.front());
attributes.erase(attributes.begin());
}
generateDataset(classLast);
} else
throw invalid_argument("Unable to open file");
}
void ArffFiles::generateDataset(bool classLast)
{
X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
vector<string> yy = vector<string>(lines.size(), "");
int labelIndex = classLast ? attributes.size() : 0;
for (int i = 0; i < lines.size(); i++) {
stringstream ss(lines[i]);
string value;
int pos = 0, xIndex = 0;
while (getline(ss, value, ',')) {
if (pos++ == labelIndex) {
yy[i] = value;
} else {
X[xIndex++][i] = stof(value);
}
}
}
y = factorize(yy);
}
string ArffFiles::trim(const string& source)
{
string s(source);
s.erase(0, s.find_first_not_of(" \n\r\t"));
s.erase(s.find_last_not_of(" \n\r\t") + 1);
return s;
}
vector<int> ArffFiles::factorize(const vector<string>& labels_t)
{
vector<int> yy;
yy.reserve(labels_t.size());
map<string, int> labelMap;
int i = 0;
for (string label : labels_t) {
if (labelMap.find(label) == labelMap.end()) {
labelMap[label] = i++;
}
yy.push_back(labelMap[label]);
}
return yy;
}

View File

@@ -1,28 +0,0 @@
#ifndef ARFFFILES_H
#define ARFFFILES_H
#include <string>
#include <vector>
#include <tuple>
using namespace std;
class ArffFiles {
private:
vector<string> lines;
vector<tuple<string, string>> attributes;
string className, classType;
vector<vector<float>> X;
vector<int> y;
void generateDataset(bool);
public:
ArffFiles();
void load(string, bool = true);
vector<string> getLines();
unsigned long int getSize();
string getClassName();
string getClassType();
string trim(const string&);
vector<vector<float>>& getX();
vector<int>& getY();
vector<tuple<string, string>> getAttributes();
vector<int> factorize(const vector<string>& labels_t);
};
#endif

View File

@@ -1,6 +0,0 @@
cmake_minimum_required(VERSION 3.24)
project(main)
set(CMAKE_CXX_STANDARD 17)
add_executable(sample sample.cpp ArffFiles.cpp ../Metrics.cpp ../CPPFImdlp.cpp)

View File

@@ -1,54 +0,0 @@
#include "ArffFiles.h"
#include <iostream>
#include <vector>
#include <iomanip>
#include "../CPPFImdlp.h"
using namespace std;
int main(int argc, char** argv)
{
ArffFiles file;
vector<string> lines;
string path = "../../tests/datasets/";
map<string, bool > datasets = {
{"mfeat-factors", true},
{"iris", true},
{"letter", true},
{"kdd_JapaneseVowels", false}
};
if (argc != 2 || datasets.find(argv[1]) == datasets.end()) {
cout << "Usage: " << argv[0] << " {mfeat-factors, iris, letter, kdd_JapaneseVowels}" << endl;
return 1;
}
file.load(path + argv[1] + ".arff", datasets[argv[1]]);
auto attributes = file.getAttributes();
int items = file.getSize();
cout << "Number of lines: " << items << endl;
cout << "Attributes: " << endl;
for (auto attribute : attributes) {
cout << "Name: " << get<0>(attribute) << " Type: " << get<1>(attribute) << endl;
}
cout << "Class name: " << file.getClassName() << endl;
cout << "Class type: " << file.getClassType() << endl;
cout << "Data: " << endl;
vector<vector<float>>& X = file.getX();
vector<int>& y = file.getY();
for (int i = 0; i < 50; i++) {
for (auto feature : X) {
cout << fixed << setprecision(1) << feature[i] << " ";
}
cout << y[i] << endl;
}
mdlp::CPPFImdlp test = mdlp::CPPFImdlp(false);
for (auto i = 0; i < attributes.size(); i++) {
cout << "Cut points for " << get<0>(attributes[i]) << endl;
cout << "--------------------------" << setprecision(3) << endl;
test.fit(X[i], y);
for (auto item : test.getCutPoints()) {
cout << item << endl;
}
}
return 0;
}

View File

@@ -1,2 +0,0 @@
build
build/*

View File

@@ -1,32 +0,0 @@
cmake_minimum_required(VERSION 3.14)
project(FImdlp)
# GoogleTest requires at least C++14
set(CMAKE_CXX_STANDARD 14)
include(FetchContent)
include_directories(${GTEST_INCLUDE_DIRS})
FetchContent_Declare(
googletest
URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip
)
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)
enable_testing()
add_executable(Metrics_unittest ../Metrics.cpp Metrics_unittest.cpp)
add_executable(FImdlp_unittest ../CPPFImdlp.cpp ../Metrics.cpp FImdlp_unittest.cpp)
target_link_libraries(Metrics_unittest GTest::gtest_main)
target_link_libraries(FImdlp_unittest GTest::gtest_main)
target_compile_options(Metrics_unittest PRIVATE --coverage)
target_compile_options(FImdlp_unittest PRIVATE --coverage)
target_link_options(Metrics_unittest PRIVATE --coverage)
target_link_options(FImdlp_unittest PRIVATE --coverage)
include(GoogleTest)
gtest_discover_tests(Metrics_unittest)
gtest_discover_tests(FImdlp_unittest)

View File

@@ -1,141 +0,0 @@
#include "gtest/gtest.h"
#include "../Metrics.h"
#include "../CPPFImdlp.h"
#include <iostream>
namespace mdlp {
// Test fixture: inherits from CPPFImdlp so protected members (X, y, indices,
// cutPoints, proposal, computeCutPoints*) are directly accessible to tests.
class TestFImdlp : public CPPFImdlp, public testing::Test {
public:
// Tolerance used for all floating-point comparisons in these tests.
precision_t precision = 0.000001;
TestFImdlp() : CPPFImdlp(false) {}
// Fit a fixed 10-sample dataset before every test.
void SetUp() {
// 5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0]
//(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2)
X = {5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9};
y = {1, 1, 1, 1, 1, 2, 2, 2, 2, 2};
proposal = false;
fit(X, y);
}
// Switch between the original (false) and proposal (true) algorithm.
void setProposal(bool value) {
proposal = value;
}
// void initIndices()
// {
//     indices = indices_t();
// }
// Verify sortIndices(X) matches the expected `indices` member and that the
// values it orders are non-decreasing.
void checkSortedVector() {
indices_t testSortedIndices = sortIndices(X);
precision_t prev = X[testSortedIndices[0]];
for (auto i = 0; i < X.size(); ++i) {
EXPECT_EQ(testSortedIndices[i], indices[i]);
EXPECT_LE(prev, X[testSortedIndices[i]]);
prev = X[testSortedIndices[i]];
}
}
// Compare the fixture's computed cutPoints against `expected`, element-wise
// within `precision`.
void checkCutPoints(cutPoints_t &expected) {
int expectedSize = expected.size();
EXPECT_EQ(cutPoints.size(), expectedSize);
for (auto i = 0; i < cutPoints.size(); i++) {
EXPECT_NEAR(cutPoints[i], expected[i], precision);
}
}
// Generic element-wise near-equality check for two vectors of the same type.
template<typename T, typename A>
void checkVectors(std::vector<T, A> const &expected, std::vector<T, A> const &computed) {
EXPECT_EQ(expected.size(), computed.size());
ASSERT_EQ(expected.size(), computed.size());
for (auto i = 0; i < expected.size(); i++) {
EXPECT_NEAR(expected[i], computed[i],precision);
}
}
};
// fit() must reject an empty dataset.
TEST_F(TestFImdlp, FitErrorEmptyDataset) {
X = samples_t();
y = labels_t();
EXPECT_THROW(fit(X, y), std::invalid_argument);
}
// fit() must reject X and y of different lengths.
TEST_F(TestFImdlp, FitErrorDifferentSize) {
X = {1, 2, 3};
y = {1, 2};
EXPECT_THROW(fit(X, y), std::invalid_argument);
}
// sortIndices must produce a stable ascending ordering of X.
TEST_F(TestFImdlp, SortIndices) {
X = {5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9};
indices = {4, 3, 6, 8, 2, 1, 5, 0, 9, 7};
checkSortedVector();
X = {5.77, 5.88, 5.99};
indices = {0, 1, 2};
checkSortedVector();
X = {5.33, 5.22, 5.11};
indices = {2, 1, 0};
checkSortedVector();
}
// End-to-end: original algorithm on the fixture dataset yields one cut point.
TEST_F(TestFImdlp, TestDataset) {
proposal = false;
fit(X, y);
computeCutPointsOriginal(0, 10);
cutPoints_t expected = {5.6499996185302734};
vector<precision_t> computed = getCutPoints();
computed = getCutPoints();
int expectedSize = expected.size();
EXPECT_EQ(computed.size(), expected.size());
for (auto i = 0; i < expectedSize; i++) {
EXPECT_NEAR(computed[i], expected[i], precision);
}
}
// Original algorithm: single cut near 5.65 for the fixture dataset.
TEST_F(TestFImdlp, ComputeCutPointsOriginal) {
cutPoints_t expected = {5.65};
proposal = false;
computeCutPointsOriginal(0, 10);
checkCutPoints(expected);
}
// Original algorithm: duplicated boundary value case (two samples at 2).
TEST_F(TestFImdlp, ComputeCutPointsOriginalGCase) {
cutPoints_t expected;
proposal = false;
expected = {2};
samples_t X_ = {0, 1, 2, 2};
labels_t y_ = {1, 1, 1, 2};
fit(X_, y_);
checkCutPoints(expected);
}
// Proposal algorithm: no cut points for the fixture dataset.
TEST_F(TestFImdlp, ComputeCutPointsProposal) {
proposal = true;
cutPoints_t expected;
expected = {};
fit(X, y);
computeCutPointsProposal();
checkCutPoints(expected);
}
// Proposal algorithm: duplicated boundary value case cuts at the midpoint 1.5.
TEST_F(TestFImdlp, ComputeCutPointsProposalGCase) {
cutPoints_t expected;
expected = {1.5};
proposal = true;
samples_t X_ = {0, 1, 2, 2};
labels_t y_ = {1, 1, 1, 2};
fit(X_, y_);
checkCutPoints(expected);
}
// getCutPoints must return the internally computed cut points.
TEST_F(TestFImdlp, GetCutPoints) {
samples_t computed, expected = {5.65};
proposal = false;
computeCutPointsOriginal(0, 10);
computed = getCutPoints();
for (auto item: cutPoints)
cout << setprecision(6) << item << endl;
checkVectors(expected, computed);
}
}

View File

@@ -1,43 +0,0 @@
#include "gtest/gtest.h"
#include "../Metrics.h"
namespace mdlp {
// Test fixture: inherits from Metrics so protected members and methods
// (computeNumClasses, entropy, informationGain, setData) are reachable.
class TestMetrics: public Metrics, public testing::Test {
public:
labels_t y;          // class labels under test
samples_t X;         // unused by these tests but part of the fixture
indices_t indices;   // identity ordering over y
// Tolerance for floating-point comparisons.
precision_t precision = 0.000001;
// Metrics is constructed on the (still empty) members; setData refreshes it.
TestMetrics(): Metrics(y, indices) {}
void SetUp()
{
y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
setData(y, indices);
}
};
// computeNumClasses counts distinct labels in the half-open-ish range given.
TEST_F(TestMetrics, NumClasses)
{
y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
EXPECT_EQ(1, computeNumClasses(4, 8));
EXPECT_EQ(2, computeNumClasses(0, 10));
EXPECT_EQ(2, computeNumClasses(8, 10));
}
// entropy: 1 bit for a 50/50 split, 0 for a pure interval.
TEST_F(TestMetrics, Entropy)
{
EXPECT_EQ(1, entropy(0, 10));
EXPECT_EQ(0, entropy(0, 5));
y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
setData(y, indices);
ASSERT_NEAR(0.468996, entropy(0, 10), precision);
}
// informationGain(start, cut, end) of a perfect split is 1.
TEST_F(TestMetrics, InformationGain)
{
ASSERT_NEAR(1, informationGain(0, 5, 10), precision);
y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
setData(y, indices);
ASSERT_NEAR(0.108032, informationGain(0, 5, 10), precision);
}
}

View File

@@ -1,4 +0,0 @@
#!/usr/bin/env bash
# Generate a browsable HTML coverage report from the gcov counters produced
# by the test binaries (built with --coverage). Stops on the first failure.
set -e
# Ensure the output directory exists (lcov fails if it is missing) and is
# empty so stale reports do not linger.
mkdir -p lcoverage
rm -fr lcoverage/*
# Collect coverage data from the current directory tree.
lcov --capture --directory ./ --output-file lcoverage/main_coverage.info
# Render the collected data as HTML.
genhtml lcoverage/main_coverage.info --output-directory lcoverage
# NOTE(review): `open` is macOS-specific; use xdg-open on Linux.
open lcoverage/index.html

View File

@@ -1,225 +0,0 @@
% 1. Title: Iris Plants Database
%
% 2. Sources:
% (a) Creator: R.A. Fisher
% (b) Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
% (c) Date: July, 1988
%
% 3. Past Usage:
% - Publications: too many to mention!!! Here are a few.
% 1. Fisher,R.A. "The use of multiple measurements in taxonomic problems"
% Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions
% to Mathematical Statistics" (John Wiley, NY, 1950).
% 2. Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.
% (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.
% 3. Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
% Structure and Classification Rule for Recognition in Partially Exposed
% Environments". IEEE Transactions on Pattern Analysis and Machine
% Intelligence, Vol. PAMI-2, No. 1, 67-71.
% -- Results:
% -- very low misclassification rates (0% for the setosa class)
% 4. Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE
% Transactions on Information Theory, May 1972, 431-433.
% -- Results:
% -- very low misclassification rates again
% 5. See also: 1988 MLC Proceedings, 54-64. Cheeseman et al's AUTOCLASS II
% conceptual clustering system finds 3 classes in the data.
%
% 4. Relevant Information:
% --- This is perhaps the best known database to be found in the pattern
% recognition literature. Fisher's paper is a classic in the field
% and is referenced frequently to this day. (See Duda & Hart, for
% example.) The data set contains 3 classes of 50 instances each,
% where each class refers to a type of iris plant. One class is
% linearly separable from the other 2; the latter are NOT linearly
% separable from each other.
% --- Predicted attribute: class of iris plant.
% --- This is an exceedingly simple domain.
%
% 5. Number of Instances: 150 (50 in each of three classes)
%
% 6. Number of Attributes: 4 numeric, predictive attributes and the class
%
% 7. Attribute Information:
% 1. sepal length in cm
% 2. sepal width in cm
% 3. petal length in cm
% 4. petal width in cm
% 5. class:
% -- Iris Setosa
% -- Iris Versicolour
% -- Iris Virginica
%
% 8. Missing Attribute Values: None
%
% Summary Statistics:
% Min Max Mean SD Class Correlation
% sepal length: 4.3 7.9 5.84 0.83 0.7826
% sepal width: 2.0 4.4 3.05 0.43 -0.4194
% petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)
% petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)
%
% 9. Class Distribution: 33.3% for each of 3 classes.
@RELATION iris
@ATTRIBUTE sepallength REAL
@ATTRIBUTE sepalwidth REAL
@ATTRIBUTE petallength REAL
@ATTRIBUTE petalwidth REAL
@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
@DATA
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
5.4,3.9,1.7,0.4,Iris-setosa
4.6,3.4,1.4,0.3,Iris-setosa
5.0,3.4,1.5,0.2,Iris-setosa
4.4,2.9,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.4,3.7,1.5,0.2,Iris-setosa
4.8,3.4,1.6,0.2,Iris-setosa
4.8,3.0,1.4,0.1,Iris-setosa
4.3,3.0,1.1,0.1,Iris-setosa
5.8,4.0,1.2,0.2,Iris-setosa
5.7,4.4,1.5,0.4,Iris-setosa
5.4,3.9,1.3,0.4,Iris-setosa
5.1,3.5,1.4,0.3,Iris-setosa
5.7,3.8,1.7,0.3,Iris-setosa
5.1,3.8,1.5,0.3,Iris-setosa
5.4,3.4,1.7,0.2,Iris-setosa
5.1,3.7,1.5,0.4,Iris-setosa
4.6,3.6,1.0,0.2,Iris-setosa
5.1,3.3,1.7,0.5,Iris-setosa
4.8,3.4,1.9,0.2,Iris-setosa
5.0,3.0,1.6,0.2,Iris-setosa
5.0,3.4,1.6,0.4,Iris-setosa
5.2,3.5,1.5,0.2,Iris-setosa
5.2,3.4,1.4,0.2,Iris-setosa
4.7,3.2,1.6,0.2,Iris-setosa
4.8,3.1,1.6,0.2,Iris-setosa
5.4,3.4,1.5,0.4,Iris-setosa
5.2,4.1,1.5,0.1,Iris-setosa
5.5,4.2,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.0,3.2,1.2,0.2,Iris-setosa
5.5,3.5,1.3,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
4.4,3.0,1.3,0.2,Iris-setosa
5.1,3.4,1.5,0.2,Iris-setosa
5.0,3.5,1.3,0.3,Iris-setosa
4.5,2.3,1.3,0.3,Iris-setosa
4.4,3.2,1.3,0.2,Iris-setosa
5.0,3.5,1.6,0.6,Iris-setosa
5.1,3.8,1.9,0.4,Iris-setosa
4.8,3.0,1.4,0.3,Iris-setosa
5.1,3.8,1.6,0.2,Iris-setosa
4.6,3.2,1.4,0.2,Iris-setosa
5.3,3.7,1.5,0.2,Iris-setosa
5.0,3.3,1.4,0.2,Iris-setosa
7.0,3.2,4.7,1.4,Iris-versicolor
6.4,3.2,4.5,1.5,Iris-versicolor
6.9,3.1,4.9,1.5,Iris-versicolor
5.5,2.3,4.0,1.3,Iris-versicolor
6.5,2.8,4.6,1.5,Iris-versicolor
5.7,2.8,4.5,1.3,Iris-versicolor
6.3,3.3,4.7,1.6,Iris-versicolor
4.9,2.4,3.3,1.0,Iris-versicolor
6.6,2.9,4.6,1.3,Iris-versicolor
5.2,2.7,3.9,1.4,Iris-versicolor
5.0,2.0,3.5,1.0,Iris-versicolor
5.9,3.0,4.2,1.5,Iris-versicolor
6.0,2.2,4.0,1.0,Iris-versicolor
6.1,2.9,4.7,1.4,Iris-versicolor
5.6,2.9,3.6,1.3,Iris-versicolor
6.7,3.1,4.4,1.4,Iris-versicolor
5.6,3.0,4.5,1.5,Iris-versicolor
5.8,2.7,4.1,1.0,Iris-versicolor
6.2,2.2,4.5,1.5,Iris-versicolor
5.6,2.5,3.9,1.1,Iris-versicolor
5.9,3.2,4.8,1.8,Iris-versicolor
6.1,2.8,4.0,1.3,Iris-versicolor
6.3,2.5,4.9,1.5,Iris-versicolor
6.1,2.8,4.7,1.2,Iris-versicolor
6.4,2.9,4.3,1.3,Iris-versicolor
6.6,3.0,4.4,1.4,Iris-versicolor
6.8,2.8,4.8,1.4,Iris-versicolor
6.7,3.0,5.0,1.7,Iris-versicolor
6.0,2.9,4.5,1.5,Iris-versicolor
5.7,2.6,3.5,1.0,Iris-versicolor
5.5,2.4,3.8,1.1,Iris-versicolor
5.5,2.4,3.7,1.0,Iris-versicolor
5.8,2.7,3.9,1.2,Iris-versicolor
6.0,2.7,5.1,1.6,Iris-versicolor
5.4,3.0,4.5,1.5,Iris-versicolor
6.0,3.4,4.5,1.6,Iris-versicolor
6.7,3.1,4.7,1.5,Iris-versicolor
6.3,2.3,4.4,1.3,Iris-versicolor
5.6,3.0,4.1,1.3,Iris-versicolor
5.5,2.5,4.0,1.3,Iris-versicolor
5.5,2.6,4.4,1.2,Iris-versicolor
6.1,3.0,4.6,1.4,Iris-versicolor
5.8,2.6,4.0,1.2,Iris-versicolor
5.0,2.3,3.3,1.0,Iris-versicolor
5.6,2.7,4.2,1.3,Iris-versicolor
5.7,3.0,4.2,1.2,Iris-versicolor
5.7,2.9,4.2,1.3,Iris-versicolor
6.2,2.9,4.3,1.3,Iris-versicolor
5.1,2.5,3.0,1.1,Iris-versicolor
5.7,2.8,4.1,1.3,Iris-versicolor
6.3,3.3,6.0,2.5,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
7.1,3.0,5.9,2.1,Iris-virginica
6.3,2.9,5.6,1.8,Iris-virginica
6.5,3.0,5.8,2.2,Iris-virginica
7.6,3.0,6.6,2.1,Iris-virginica
4.9,2.5,4.5,1.7,Iris-virginica
7.3,2.9,6.3,1.8,Iris-virginica
6.7,2.5,5.8,1.8,Iris-virginica
7.2,3.6,6.1,2.5,Iris-virginica
6.5,3.2,5.1,2.0,Iris-virginica
6.4,2.7,5.3,1.9,Iris-virginica
6.8,3.0,5.5,2.1,Iris-virginica
5.7,2.5,5.0,2.0,Iris-virginica
5.8,2.8,5.1,2.4,Iris-virginica
6.4,3.2,5.3,2.3,Iris-virginica
6.5,3.0,5.5,1.8,Iris-virginica
7.7,3.8,6.7,2.2,Iris-virginica
7.7,2.6,6.9,2.3,Iris-virginica
6.0,2.2,5.0,1.5,Iris-virginica
6.9,3.2,5.7,2.3,Iris-virginica
5.6,2.8,4.9,2.0,Iris-virginica
7.7,2.8,6.7,2.0,Iris-virginica
6.3,2.7,4.9,1.8,Iris-virginica
6.7,3.3,5.7,2.1,Iris-virginica
7.2,3.2,6.0,1.8,Iris-virginica
6.2,2.8,4.8,1.8,Iris-virginica
6.1,3.0,4.9,1.8,Iris-virginica
6.4,2.8,5.6,2.1,Iris-virginica
7.2,3.0,5.8,1.6,Iris-virginica
7.4,2.8,6.1,1.9,Iris-virginica
7.9,3.8,6.4,2.0,Iris-virginica
6.4,2.8,5.6,2.2,Iris-virginica
6.3,2.8,5.1,1.5,Iris-virginica
6.1,2.6,5.6,1.4,Iris-virginica
7.7,3.0,6.1,2.3,Iris-virginica
6.3,3.4,5.6,2.4,Iris-virginica
6.4,3.1,5.5,1.8,Iris-virginica
6.0,3.0,4.8,1.8,Iris-virginica
6.9,3.1,5.4,2.1,Iris-virginica
6.7,3.1,5.6,2.4,Iris-virginica
6.9,3.1,5.1,2.3,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
6.8,3.2,5.9,2.3,Iris-virginica
6.7,3.3,5.7,2.5,Iris-virginica
6.7,3.0,5.2,2.3,Iris-virginica
6.3,2.5,5.0,1.9,Iris-virginica
6.5,3.0,5.2,2.0,Iris-virginica
6.2,3.4,5.4,2.3,Iris-virginica
5.9,3.0,5.1,1.8,Iris-virginica
%
%
%

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,12 +0,0 @@
#!/usr/bin/env bash
# Configure the project, build it, and run the test suite with CTest.
# Each stage aborts with a message if the previous one failed.
cmake -S . -B build -Wno-dev
if test $? -ne 0; then
    echo "Error in creating build commands."
    exit 1
fi
cmake --build build
if test $? -ne 0; then
    echo "Error in build command."
    exit 1
fi
# Guard the cd so ctest can never run from the wrong directory.
cd build || exit 1
ctest --output-on-failure

View File

@@ -1,16 +0,0 @@
#ifndef TYPES_H
#define TYPES_H
// Shared type aliases for the MDLP discretizer.
#include <vector>
#include <map>
// <tuple> was missing: std::tuple is used below for the cache keys and was
// only available through transitive includes, which is not guaranteed.
#include <tuple>
using namespace std;
namespace mdlp {
    // Floating-point type used for sample values and computed metrics.
    typedef float precision_t;
    // One continuous feature column (the values being discretized).
    typedef vector<precision_t> samples_t;
    // Class labels, aligned with samples_t by position.
    typedef vector<int> labels_t;
    // Positions into samples_t (e.g. the result of an index sort).
    typedef vector<size_t> indices_t;
    // Cut points produced by the discretizer.
    typedef vector<precision_t> cutPoints_t;
    // Cache keyed by a (start, end) interval — presumably entropy results.
    typedef map<tuple<int, int>, precision_t> cacheEnt_t;
    // Cache keyed by (start, cut, end) — presumably information gain results.
    typedef map<tuple<int, int, int>, precision_t> cacheIg_t;
}
#endif

View File

@@ -1 +1 @@
__version__ = "0.9.1"
__version__ = "0.9.2"

View File

@@ -1,20 +1,21 @@
# distutils: language = c++
# cython: language_level = 3
from libcpp.vector cimport vector
from libcpp cimport bool
from libcpp.string cimport string
cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp":
ctypedef float precision_t
cdef cppclass CPPFImdlp:
CPPFImdlp(bool) except +
CPPFImdlp(int) except +
CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
vector[precision_t] getCutPoints()
string version()
cdef class CFImdlp:
cdef CPPFImdlp *thisptr
def __cinit__(self, proposal):
self.thisptr = new CPPFImdlp(proposal)
def __cinit__(self, algorithm):
self.thisptr = new CPPFImdlp(algorithm)
def __dealloc__(self):
del self.thisptr
def fit(self, X, y):
@@ -22,4 +23,6 @@ cdef class CFImdlp:
return self
def get_cut_points(self):
return self.thisptr.getCutPoints()
def get_version(self):
return self.thisptr.version()

View File

@@ -7,14 +7,18 @@ from joblib import Parallel, delayed
class FImdlp(TransformerMixin, BaseEstimator):
def __init__(self, n_jobs=-1, proposal=False):
def __init__(self, algorithm=0, n_jobs=-1):
self.algorithm = algorithm
self.n_jobs = n_jobs
self.proposal = proposal
"""Fayyad - Irani MDLP discretization algorithm based implementation.
Parameters
----------
algorithm : int, default=0
The type of algorithm to use computing the cut points.
0 - Definitive implementation
1 - Alternative proposal
n_jobs : int, default=-1
The number of jobs to run in parallel. :meth:`fit` and
:meth:`transform`, are parallelized over the features. ``-1`` means
@@ -94,9 +98,15 @@ class FImdlp(TransformerMixin, BaseEstimator):
return self
def _fit_discretizer(self, feature):
self.discretizer_[feature] = CFImdlp(proposal=self.proposal)
self.discretizer_[feature].fit(self.X_[:, feature], self.y_)
self.cut_points_[feature] = self.discretizer_[feature].get_cut_points()
if feature in self.features_:
self.discretizer_[feature] = CFImdlp(algorithm=self.algorithm)
self.discretizer_[feature].fit(self.X_[:, feature], self.y_)
self.cut_points_[feature] = self.discretizer_[
feature
].get_cut_points()
else:
self.discretizer_[feature] = None
self.cut_points_[feature] = []
def _discretize_feature(self, feature, X, result):
if feature in self.features_:
@@ -125,7 +135,10 @@ class FImdlp(TransformerMixin, BaseEstimator):
raise ValueError(
"Shape of input is different from what was seen in `fit`"
)
result = np.zeros_like(X, dtype=np.int32) - 1
if len(self.features_) == self.n_features_:
result = np.zeros_like(X, dtype=np.int32) - 1
else:
result = np.zeros_like(X) - 1
Parallel(n_jobs=self.n_jobs, prefer="threads")(
delayed(self._discretize_feature)(feature, X[:, feature], result)
for feature in range(self.n_features_)

View File

@@ -14,57 +14,52 @@ class FImdlpTest(unittest.TestCase):
def test_init(self):
clf = FImdlp()
self.assertEqual(-1, clf.n_jobs)
self.assertFalse(clf.proposal)
clf = FImdlp(proposal=True, n_jobs=7)
self.assertTrue(clf.proposal)
self.assertEqual(0, clf.algorithm)
clf = FImdlp(algorithm=1, n_jobs=7)
self.assertEqual(1, clf.algorithm)
self.assertEqual(7, clf.n_jobs)
def test_fit_proposal(self):
clf = FImdlp(proposal=True)
def test_fit_definitive(self):
clf = FImdlp(algorithm=0)
clf.fit([[1, 2], [3, 4]], [1, 2])
self.assertEqual(clf.n_features_, 2)
self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
self.assertListEqual(clf.y_.tolist(), [1, 2])
self.assertListEqual([[], []], clf.get_cut_points())
self.assertListEqual([[2.0], [3.0]], clf.get_cut_points())
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
self.assertEqual(clf.n_features_, 4)
self.assertTrue(np.array_equal(X, clf.X_))
self.assertTrue(np.array_equal(y, clf.y_))
expected = [
[
4.900000095367432,
5.0,
5.099999904632568,
5.400000095367432,
5.699999809265137,
],
[2.6999998092651367, 2.9000000953674316, 3.1999998092651367],
[2.3499999046325684, 4.5, 4.800000190734863],
[0.75, 1.399999976158142, 1.5, 1.7000000476837158],
[5.449999809265137, 6.25],
[2.8499999046325684, 3.0, 3.049999952316284, 3.3499999046325684],
[2.450000047683716, 4.75, 5.050000190734863],
[0.800000011920929, 1.4500000476837158, 1.75],
]
self.assertListEqual(expected, clf.get_cut_points())
self.assertListEqual([0, 1, 2, 3], clf.features_)
clf.fit(X, y, features=[0, 2, 3])
self.assertListEqual([0, 2, 3], clf.features_)
def test_fit_original(self):
clf = FImdlp(proposal=False)
def test_fit_alternative(self):
clf = FImdlp(algorithm=1)
clf.fit([[1, 2], [3, 4]], [1, 2])
self.assertEqual(clf.n_features_, 2)
self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
self.assertListEqual(clf.y_.tolist(), [1, 2])
self.assertListEqual([[], []], clf.get_cut_points())
self.assertListEqual([[2], [3]], clf.get_cut_points())
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
self.assertEqual(clf.n_features_, 4)
self.assertTrue(np.array_equal(X, clf.X_))
self.assertTrue(np.array_equal(y, clf.y_))
expected = [
[5.5, 5.800000190734863],
[3.0999999046325684],
[2.450000047683716, 4.800000190734863, 5.099999904632568],
[0.800000011920929, 1.7000000476837158],
[5.449999809265137, 5.75],
[2.8499999046325684, 3.3499999046325684],
[2.450000047683716, 4.75],
[0.800000011920929, 1.75],
]
self.assertListEqual(expected, clf.get_cut_points())
self.assertListEqual([0, 1, 2, 3], clf.features_)
@@ -88,15 +83,58 @@ class FImdlpTest(unittest.TestCase):
def test_fit_features(self):
clf = FImdlp()
clf.fit([[1, 2], [3, 4]], [1, 2], features=[0])
res = clf.transform([[1, 2], [3, 4]])
self.assertListEqual(res.tolist(), [[0, 2], [0, 4]])
clf.fit([[1, -2], [3, 4]], [1, 2], features=[0])
res = clf.transform([[1, -2], [3, 4]])
self.assertListEqual(res.tolist(), [[0, -2], [1, 4]])
X, y = load_iris(return_X_y=True)
X_expected = X[:, [0, 2]].copy()
clf.fit(X, y, features=[1, 3])
X_computed = clf.transform(X)
self.assertListEqual(
X_expected[:, 0].tolist(), X_computed[:, 0].tolist()
)
self.assertListEqual(
X_expected[:, 1].tolist(), X_computed[:, 2].tolist()
)
self.assertEqual(X_computed.dtype, np.float64)
def test_transform_original(self):
clf = FImdlp(proposal=False)
def test_transform_definitive(self):
clf = FImdlp(algorithm=0)
clf.fit([[1, 2], [3, 4]], [1, 2])
self.assertEqual(
clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [0, 0]]
clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [1, 1]]
)
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
self.assertEqual(clf.n_features_, 4)
self.assertTrue(np.array_equal(X, clf.X_))
self.assertTrue(np.array_equal(y, clf.y_))
X_transformed = clf.transform(X)
self.assertListEqual(
X_transformed.tolist(), clf.fit(X, y).transform(X).tolist()
)
self.assertEqual(X_transformed.dtype, np.int32)
expected = [
[1, 0, 1, 1],
[1, 1, 1, 1],
[1, 0, 1, 1],
[0, 0, 1, 1],
[1, 0, 1, 1],
[1, 1, 1, 1],
[1, 1, 1, 1],
]
self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
with self.assertRaises(ValueError):
clf.transform([[1, 2, 3], [4, 5, 6]])
with self.assertRaises(sklearn.exceptions.NotFittedError):
clf = FImdlp(algorithm=0)
clf.transform([[1, 2], [3, 4]])
def test_transform_alternative(self):
clf = FImdlp(algorithm=1)
clf.fit([[1, 2], [3, 4]], [1, 2])
self.assertEqual(
clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [1, 1]]
)
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
@@ -107,47 +145,17 @@ class FImdlpTest(unittest.TestCase):
clf.transform(X).tolist(), clf.fit(X, y).transform(X).tolist()
)
expected = [
[0, 0, 1, 1],
[1, 0, 1, 1],
[2, 1, 1, 1],
[2, 0, 1, 1],
[1, 0, 1, 1],
[0, 0, 1, 1],
[1, 0, 1, 1],
[1, 0, 1, 1],
[1, 0, 1, 1],
[1, 1, 1, 1],
[1, 1, 1, 1],
]
self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
with self.assertRaises(ValueError):
clf.transform([[1, 2, 3], [4, 5, 6]])
with self.assertRaises(sklearn.exceptions.NotFittedError):
clf = FImdlp(proposal=False)
clf.transform([[1, 2], [3, 4]])
def test_transform_proposal(self):
clf = FImdlp(proposal=True)
clf.fit([[1, 2], [3, 4]], [1, 2])
self.assertEqual(
clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [0, 0]]
)
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
self.assertEqual(clf.n_features_, 4)
self.assertTrue(np.array_equal(X, clf.X_))
self.assertTrue(np.array_equal(y, clf.y_))
self.assertListEqual(
clf.transform(X).tolist(), clf.fit(X, y).transform(X).tolist()
)
expected = [
[4, 0, 1, 1],
[5, 2, 2, 2],
[5, 0, 1, 1],
[1, 0, 1, 1],
[4, 1, 1, 1],
[5, 2, 1, 1],
[5, 1, 1, 1],
]
self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
with self.assertRaises(ValueError):
clf.transform([[1, 2, 3], [4, 5, 6]])
with self.assertRaises(sklearn.exceptions.NotFittedError):
clf = FImdlp(proposal=True)
clf = FImdlp(algorithm=1)
clf.transform([[1, 2], [3, 4]])

Binary file not shown.