47 Commits
v0.9.2 ... main

Author SHA1 Message Date
3bae1fe390 Fix attribute name extraction in ArffFiles 2023-07-06 16:57:25 +02:00
Ricardo Montañana Gómez
a7098a907e Merge pull request #7 from Doctorado-ML/hiperparameters
Add hyperparameters to discretizer

Hyperparameters added:

- min_length: int, default=3: The minimum length of an interval for it to be considered for discretization.
- max_depth: int, default=1e6: The maximum depth of the discretization process.
- max_cuts: float, default=0: The maximum number of cut points to be computed for each feature; all cut points are computed and the ones that produce the least entropy are selected (0 means no limit).
2023-04-25 18:05:12 +02:00
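A minimal sketch of how these hyperparameters are passed to the FImdlp transformer added in this changeset (the iris dataset and the parameter values are only illustrative):

```python
from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp

X, y = load_iris(return_X_y=True)

# min_length values <= 1 are treated as a fraction of the number of
# samples; values > 1 are an absolute interval length.
clf = FImdlp(min_length=3, max_depth=2, max_cuts=0)
Xt = clf.fit(X, y).transform(X)   # discretized copy of X
print(clf.get_cut_points())       # one list of cut points per feature
print(clf.get_depths())           # recursion depth reached per feature
```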
6e17548563 Add url to pyproject and comment to mdlp 2023-04-25 17:53:36 +02:00
dd42e186d5 Reformat ArffFiles.h 2023-04-25 17:16:04 +02:00
3a100bbba7 Add mdlp version to Makefile
Refactor sample.py
2023-04-25 17:11:40 +02:00
17a66858f8 Update version number to 0.9.4 2023-04-25 16:58:23 +02:00
3ed491cd34 Update mdlp version
Add minimum mdlp version test
Update sample.cpp
2023-04-25 12:05:52 +02:00
878cd379ee Change arff library to sample.py 2023-04-14 11:20:48 +02:00
25d341aee5 Update samples and Readme 2023-04-12 17:40:25 +02:00
fa8c4a221d Remove duplicated lines 2023-04-11 19:45:37 +02:00
947d54202d Update hyperparams info 2023-04-11 19:35:39 +02:00
d04cb389c0 Update tests and module mdlp version 2023-04-11 19:33:57 +02:00
0768d68a36 add join_fit target info 2023-04-08 12:22:03 +02:00
e44bca0420 Move limits include to CPPFImdlp header 2023-03-22 18:21:52 +01:00
c2294613df Move limits include to CPPFImdlp header 2023-03-22 18:19:01 +01:00
1069fc8ff4 Add last mdlp version and update sample.cpp 2023-03-21 10:18:51 +01:00
95bc29c7f2 Remove trailing space in attribute type of Arff 2023-03-20 20:27:47 +01:00
da9db322da Fix sklearn requirement 2023-03-20 18:58:55 +01:00
e3c329b2e5 Add min_length as percentage of # samples 2023-03-20 18:57:26 +01:00
7368dd9ff4 Refactor ArffFiles in main project 2023-03-20 17:45:58 +01:00
b5c6a49e19 Add last version of mdlp 2023-03-19 19:22:07 +01:00
c2a0d33604 Add last mdlp version 2023-03-19 19:14:32 +01:00
e6a56e3140 Update samples 2023-03-14 11:47:30 +01:00
ccce9725b3 Add max_cuts hyperparameter as in mdlp 2023-03-13 18:14:56 +01:00
aa55d3a340 New version of library and tests 2023-02-26 17:59:08 +01:00
900cccf76b Update discretizer to new library 2023-02-25 18:52:21 +01:00
f65efe3dfd Update the c++ sources with new version 2023-02-24 11:04:06 +01:00
e9d19d41da Add changed submodule 2023-02-22 11:56:39 +01:00
6450ccb9bd Add changed submodule 2023-02-22 11:34:27 +01:00
5d2f32bb0e Add needed header file to MANIFEST 2023-02-22 11:33:26 +01:00
Ricardo Montañana Gómez
8249e55b0c Merge pull request #6 from Doctorado-ML/joinfeatures
- Add a join_fit feature that can update a fitted discretizer, making it possible to discretize a variable taking into account the label and a list of other features of the dataset. Used in local discretization with Bayesian estimators.
- Add a factorize method to simulate the pandas factorize method.
- Remove the algorithm hyperparameter, as it is no longer needed.
- Add a get_states_feature method to obtain the list of states of any feature, based on the number of cut points computed while fitting the discretizer.
2023-02-22 10:44:43 +01:00
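A short sketch of the join_fit workflow described above, mirroring the unit tests included in this pull request (the data is illustrative):

```python
import numpy as np
from fimdlp.mdlp import FImdlp
from fimdlp.cppfimdlp import factorize

X = np.array(
    [[0, 1, 2], [0, 2, 3], [1, 3, 4], [2, 3, 5], [3, 4, 6]], dtype=float
)
y = factorize([b"a", b"a", b"b", b"b", b"c"])  # labels must be bytes

clf = FImdlp().fit(X, y)
# Rediscretize feature 1, taking into account the class label joined
# with features 0 and 2.
new_col = clf.join_fit(features=[0, 2], target=1, data=X)
print(clf.get_states_feature(1))  # states induced by the new cut points
```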
40871f128d Add 1.1.0 version of mdlp 2023-02-22 10:15:33 +01:00
718c9d0e63 make static methods factorize and test_sklrn_trans 2023-02-20 20:12:36 +01:00
e0b7cae9a0 Remove algorithm hyperparameter in discretizer 2023-02-20 18:26:51 +01:00
31d79a77fa Add get_states_feature method 2023-02-13 17:34:50 +01:00
2d495293bb Add range_features method 2023-02-13 16:15:50 +01:00
9899781640 Complete join_fit and remove MultiDiscretizer 2023-02-05 00:30:03 +01:00
f20496203e refactor Multidiscretizer to use one per column 2023-02-04 19:23:15 +01:00
cf09d92ccc add MultiDiscretizer 2023-02-04 17:45:36 +01:00
1186e4ad53 chore: 🔖 Upgrade version number to 0.9.3 2023-01-28 19:15:26 +01:00
7913f5151e Add version command to Makefile 2023-01-28 19:14:32 +01:00
050b923631 feat: Add factorize method to transformer 2023-01-28 10:35:07 +01:00
29fc88cecc test: Add scikit learn compatibility check_estimator test 2023-01-26 23:20:51 +01:00
16b31ec293 test: Complete join_transform test 2023-01-26 11:17:10 +01:00
ca7d158ac8 feat: ⚗️ Add join_transform method and cpp factorize 2023-01-26 10:47:27 +01:00
Ricardo Montañana Gómez
34cd54f77e feat: ♻️ Add Classic algorithm as number 2 to compare performance 2023-01-13 11:47:01 +01:00
70bf03155c Add scikit-learn as requirement 2022-12-23 14:07:36 +01:00
19 changed files with 927 additions and 176 deletions

CI workflow (.github/workflows)

@@ -20,14 +20,14 @@ jobs:
with:
submodules: recursive
- name: Set up Python ${{ matrix.python }}
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python }}
- name: Install dependencies
run: |
pip install -q --upgrade pip
pip install -q scikit-learn cython
pip install -q --upgrade codecov coverage black flake8 codacy-coverage
pip install -q coverage black flake8 codacy-coverage
- name: Build and install
run: |
make install
@@ -40,7 +40,7 @@ jobs:
coverage run -m unittest discover -v -s src
coverage xml
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
uses: codecov/codecov-action@v3
with:
token: ${{ secrets.CODECOV_TOKEN }}
files: ./coverage.xml

MANIFEST.in

@@ -1,3 +1,5 @@
include src/cppmdlp/CPPFImdlp.h
include src/cppmdlp/typesFImdlp.h
include src/cppmdlp/Metrics.h
include src/fimdlp/Factorize.h
include src/fimdlp/ArffFiles.h

Makefile

@@ -37,6 +37,12 @@ install: ## Build extension
audit: ## Audit pip
pip-audit
version:
@echo "Current Python version .: $(shell python --version)"
@echo "Current FImdlp version .: $(shell python -c "from fimdlp import _version; print(_version.__version__)")"
@echo "Current mdlp version ...: $(shell python -c "from fimdlp.cppfimdlp import CFImdlp; print(CFImdlp().get_version().decode())")"
@echo "Installed FImdlp version: $(shell pip show fimdlp | grep Version | cut -d' ' -f2)"
help: ## Show help message
@IFS=$$'\n' ; \
help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \

README.md

@@ -25,7 +25,7 @@ git clone --recurse-submodules https://github.com/doctorado-ml/FImdlp.git
```bash
pip install -e .
python samples/sample.py iris
python samples/sample.py iris --alternative
python samples/sample.py iris -c 2
python samples/sample.py -h # for more options
```
@@ -33,9 +33,12 @@ python samples/sample.py -h # for more options
```bash
cd samples
mkdir build
cmake -B build
cd build
cmake ..
make
./sample iris
./sample -f iris -c 2
./sample -h
```
### Based on
[https://github.com/rmontanana/mdlp](https://github.com/rmontanana/mdlp)

pyproject.toml

@@ -18,10 +18,10 @@ authors = [
{ name = "Ricardo Montañana", email = "ricardo.montanana@alu.uclm.es" },
]
dynamic = ['version']
dependencies = ["numpy", "joblib"]
dependencies = ["numpy", "joblib", "scikit-learn"]
requires-python = ">=3.9"
classifiers = [
"Development Status :: 3 - Alpha",
"Development Status :: 4 - Beta",
"Intended Audience :: Science/Research",
"Intended Audience :: Developers",
"Topic :: Software Development",
@@ -33,14 +33,16 @@ classifiers = [
"Programming Language :: Python",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
]
[project.urls]
Home = "https://github.com/doctorado-ml/FImdlp"
Base = "https://github.com/rmontanana/mdlp"
[tool.black]
line-length = 79
target_version = ['py39', 'py310']
target_version = ['py39', 'py310', 'py311']
include = '\.pyi?$'
exclude = '''
/(

samples/CMakeLists.txt

@@ -1,6 +1,7 @@
cmake_minimum_required(VERSION 3.20)
project(main)
project(sample)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)
add_executable(sample sample.cpp ../src/cppmdlp/tests/ArffFiles.cpp ../src/cppmdlp/Metrics.cpp ../src/cppmdlp/CPPFImdlp.cpp)

samples/sample.cpp

@@ -1,30 +1,101 @@
#include "../src/cppmdlp/tests/ArffFiles.h"
#include <iostream>
#include <vector>
#include <iomanip>
#include <chrono>
#include <algorithm>
#include <cstring>
#include <getopt.h>
#include "../src/cppmdlp/CPPFImdlp.h"
#include "../src/cppmdlp/tests/ArffFiles.h"
using namespace std;
using namespace mdlp;
int main(int argc, char** argv)
const string PATH = "../../src/cppmdlp/tests/datasets/";
/* print a description of all supported options */
void usage(const char* path)
{
ArffFiles file;
vector<string> lines;
string path = "../../src/cppmdlp/tests/datasets/";
map<string, bool > datasets = {
{"mfeat-factors", true},
{"iris", true},
{"letter", true},
{"kdd_JapaneseVowels", false}
};
if (argc != 2 || datasets.find(argv[1]) == datasets.end()) {
cout << "Usage: " << argv[0] << " {mfeat-factors, iris, letter, kdd_JapaneseVowels}" << endl;
return 1;
/* take only the last portion of the path */
const char* basename = strrchr(path, '/');
basename = basename ? basename + 1 : path;
cout << "usage: " << basename << "[OPTION]" << endl;
cout << " -h, --help\t\t Print this help and exit." << endl;
cout
<< " -f, --file[=FILENAME]\t {all, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}."
<< endl;
cout << " -p, --path[=FILENAME]\t folder where the arff dataset is located, default " << PATH << endl;
cout << " -m, --max_depth=INT\t max_depth pased to discretizer. Default = MAX_INT" << endl;
cout
<< " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 -> any"
<< endl;
cout << " -n, --min_length=INT\t interval min_length pased to discretizer. Default = 3" << endl;
}
file.load(path + argv[1] + ".arff", datasets[argv[1]]);
auto attributes = file.getAttributes();
int items = file.getSize();
tuple<string, string, int, int, float> parse_arguments(int argc, char** argv)
{
string file_name;
string path = PATH;
int max_depth = numeric_limits<int>::max();
int min_length = 3;
float max_cutpoints = 0;
const vector<struct option> long_options = {
{"help", no_argument, nullptr, 'h'},
{"file", required_argument, nullptr, 'f'},
{"path", required_argument, nullptr, 'p'},
{"max_depth", required_argument, nullptr, 'm'},
{"max_cutpoints", required_argument, nullptr, 'c'},
{"min_length", required_argument, nullptr, 'n'},
{nullptr, no_argument, nullptr, 0}
};
while (true) {
const auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options.data(), nullptr);
if (c == -1)
break;
switch (c) {
case 'h':
usage(argv[0]);
exit(0);
case 'f':
file_name = string(optarg);
break;
case 'm':
max_depth = stoi(optarg);
break;
case 'n':
min_length = stoi(optarg);
break;
case 'c':
max_cutpoints = stof(optarg);
break;
case 'p':
path = optarg;
if (path.back() != '/')
path += '/';
break;
case '?':
usage(argv[0]);
exit(1);
default:
abort();
}
}
if (file_name.empty()) {
usage(argv[0]);
exit(1);
}
return make_tuple(file_name, path, max_depth, min_length, max_cutpoints);
}
void process_file(const string& path, const string& file_name, bool class_last, int max_depth, int min_length,
float max_cutpoints)
{
ArffFiles file;
file.load(path + file_name + ".arff", class_last);
const auto attributes = file.getAttributes();
const auto items = file.getSize();
cout << "Number of lines: " << items << endl;
cout << "Attributes: " << endl;
for (auto attribute : attributes) {
@@ -33,22 +104,93 @@ int main(int argc, char** argv)
cout << "Class name: " << file.getClassName() << endl;
cout << "Class type: " << file.getClassType() << endl;
cout << "Data: " << endl;
vector<vector<float>>& X = file.getX();
vector<int>& y = file.getY();
for (int i = 0; i < 50; i++) {
vector<samples_t>& X = file.getX();
labels_t& y = file.getY();
for (int i = 0; i < 5; i++) {
for (auto feature : X) {
cout << fixed << setprecision(1) << feature[i] << " ";
}
cout << y[i] << endl;
}
mdlp::CPPFImdlp test = mdlp::CPPFImdlp(0);
auto test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints);
size_t total = 0;
for (auto i = 0; i < attributes.size(); i++) {
cout << "Cut points for " << get<0>(attributes[i]) << endl;
cout << "--------------------------" << setprecision(3) << endl;
auto min_max = minmax_element(X[i].begin(), X[i].end());
cout << "Cut points for feature " << get<0>(attributes[i]) << ": [" << setprecision(3);
test.fit(X[i], y);
for (auto item : test.getCutPoints()) {
cout << item << endl;
auto cut_points = test.getCutPoints();
for (auto item : cut_points) {
cout << item;
if (item != cut_points.back())
cout << ", ";
}
total += test.getCutPoints().size();
cout << "]" << endl;
cout << "Min: " << *min_max.first << " Max: " << *min_max.second << endl;
cout << "--------------------------" << endl;
}
cout << "Total cut points ...: " << total << endl;
cout << "Total feature states: " << total + attributes.size() << endl;
}
void process_all_files(const map<string, bool>& datasets, const string& path, int max_depth, int min_length,
float max_cutpoints)
{
cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << " Max_cutpoints: "
<< max_cutpoints << endl << endl;
printf("%-20s %4s %4s\n", "Dataset", "Feat", "Cuts Time(ms)");
printf("==================== ==== ==== ========\n");
for (const auto& dataset : datasets) {
ArffFiles file;
file.load(path + dataset.first + ".arff", dataset.second);
auto attributes = file.getAttributes();
vector<samples_t>& X = file.getX();
labels_t& y = file.getY();
size_t timing = 0;
size_t cut_points = 0;
for (auto i = 0; i < attributes.size(); i++) {
auto test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints);
std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
test.fit(X[i], y);
std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
timing += std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
cut_points += test.getCutPoints().size();
}
printf("%-20s %4lu %4zu %8zu\n", dataset.first.c_str(), attributes.size(), cut_points, timing);
}
}
int main(int argc, char** argv)
{
map<string, bool> datasets = {
{"glass", true},
{"iris", true},
{"kdd_JapaneseVowels", false},
{"letter", true},
{"liver-disorders", true},
{"mfeat-factors", true},
{"test", true}
};
string file_name;
string path;
int max_depth;
int min_length;
float max_cutpoints;
tie(file_name, path, max_depth, min_length, max_cutpoints) = parse_arguments(argc, argv);
if (datasets.find(file_name) == datasets.end() && file_name != "all") {
cout << "Invalid file name: " << file_name << endl;
usage(argv[0]);
exit(1);
}
if (file_name == "all")
process_all_files(datasets, path, max_depth, min_length, max_cutpoints);
else {
process_file(path, file_name, datasets[file_name], max_depth, min_length, max_cutpoints);
cout << "File name ....: " << file_name << endl;
cout << "Max depth ....: " << max_depth << endl;
cout << "Min length ...: " << min_length << endl;
cout << "Max cutpoints : " << max_cutpoints << endl;
}
return 0;
}

samples/sample.py

@@ -1,21 +1,37 @@
import time
import argparse
import os
from scipy.io import arff
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from fimdlp.mdlp import FImdlp
from fimdlp.cppfimdlp import CArffFiles
datasets = {
"mfeat-factors": True,
"iris": True,
"glass": True,
"liver-disorders": True,
"letter": True,
"kdd_JapaneseVowels": False,
}
ap = argparse.ArgumentParser()
ap.add_argument(
"--alternative", dest="proposal", action="store_const", const=1
"-n",
"--min_length",
type=int,
default=3,
help="Minimum length of interval",
)
ap.add_argument(
"-m", "--max_depth", type=int, default=9999, help="Maximum depth"
)
ap.add_argument(
"-c",
"--max_cuts",
type=float,
default=0,
help="Maximum number of cut points",
)
ap.add_argument("dataset", type=str, choices=datasets.keys())
args = ap.parse_args()
@@ -23,22 +39,34 @@ relative = "" if os.path.isdir("src") else ".."
file_name = os.path.join(
relative, "src", "cppmdlp", "tests", "datasets", args.dataset
)
data = arff.loadarff(file_name + ".arff")
df = pd.DataFrame(data[0])
class_column = -1 if datasets[args.dataset] else 0
class_name = df.columns.to_list()[class_column]
X = df.drop(class_name, axis=1)
y, _ = pd.factorize(df[class_name])
X = X.to_numpy()
test = FImdlp(algorithm=args.proposal if args.proposal is not None else 0)
arff = CArffFiles()
arff.load(bytes(f"{file_name}.arff", "utf-8"))
X = arff.get_X()
y = arff.get_y()
attributes = arff.get_attributes()
attributes = [x[0].decode() for x in attributes]
df = pd.DataFrame(X, columns=attributes)
class_name = arff.get_class_name().decode()
df[class_name] = y
test = FImdlp(
min_length=args.min_length,
max_depth=args.max_depth,
max_cuts=args.max_cuts,
)
now = time.time()
test.fit(X, y)
fit_time = time.time()
print("Fitting: ", fit_time - now)
print(f"Fitting ....: {fit_time - now:7.5f} seconds")
now = time.time()
Xt = test.transform(X)
print("Transforming: ", time.time() - now)
print(test.get_cut_points())
print(f"Transforming: {time.time() - now:7.5f} seconds")
cut_points = test.get_cut_points()
for i, cuts in enumerate(cut_points):
print(f"Cut points for feature {attributes[i]}: {cuts}")
print(f"Min: {min(X[:, i]):6.4f} Max: {max(X[:, i]):6.4f}")
num_cuts = sum([len(x) for x in cut_points])
print(f"Total cut points ...: {num_cuts}")
print(f"Total feature states: {num_cuts + len(attributes)}")
clf = RandomForestClassifier(random_state=0)
print(
"Random Forest score with discretized data: ", clf.fit(Xt, y).score(Xt, y)

setup.py

@@ -14,10 +14,14 @@ setup(
"src/fimdlp/cfimdlp.pyx",
"src/cppmdlp/CPPFImdlp.cpp",
"src/cppmdlp/Metrics.cpp",
"src/fimdlp/Factorize.cpp",
"src/fimdlp/ArffFiles.cpp",
],
language="c++",
include_dirs=["fimdlp"],
extra_compile_args=["-std=c++2a"],
extra_compile_args=[
"-std=c++11",
],
),
]
)

src/fimdlp/ArffFiles.cpp (new file, 132 lines)

@@ -0,0 +1,132 @@
#include "ArffFiles.h"
#include <fstream>
#include <sstream>
#include <map>
using namespace std;
ArffFiles::ArffFiles() = default;
vector<string> ArffFiles::getLines() const
{
return lines;
}
unsigned long int ArffFiles::getSize() const
{
return lines.size();
}
vector<pair<string, string>> ArffFiles::getAttributes() const
{
return attributes;
}
string ArffFiles::getClassName() const
{
return className;
}
string ArffFiles::getClassType() const
{
return classType;
}
vector<vector<float>>& ArffFiles::getX()
{
return X;
}
vector<int>& ArffFiles::getY()
{
return y;
}
void ArffFiles::load(const string& fileName, bool classLast)
{
ifstream file(fileName);
if (!file.is_open()) {
throw invalid_argument("Unable to open file");
}
string line;
string keyword;
string attribute;
string type;
string type_w;
while (getline(file, line)) {
if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
continue;
}
if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
stringstream ss(line);
ss >> keyword >> attribute;
type = "";
while (ss >> type_w)
type += type_w + " ";
attributes.emplace_back(trim(attribute), trim(type));
continue;
}
if (line[0] == '@') {
continue;
}
lines.push_back(line);
}
file.close();
if (attributes.empty())
throw invalid_argument("No attributes found");
if (classLast) {
className = get<0>(attributes.back());
classType = get<1>(attributes.back());
attributes.pop_back();
} else {
className = get<0>(attributes.front());
classType = get<1>(attributes.front());
attributes.erase(attributes.begin());
}
generateDataset(classLast);
}
void ArffFiles::generateDataset(bool classLast)
{
X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
auto yy = vector<string>(lines.size(), "");
int labelIndex = classLast ? static_cast<int>(attributes.size()) : 0;
for (size_t i = 0; i < lines.size(); i++) {
stringstream ss(lines[i]);
string value;
int pos = 0;
int xIndex = 0;
while (getline(ss, value, ',')) {
if (pos++ == labelIndex) {
yy[i] = value;
} else {
X[xIndex++][i] = stof(value);
}
}
}
y = factorize(yy);
}
string ArffFiles::trim(const string& source)
{
string s(source);
s.erase(0, s.find_first_not_of(" '\n\r\t"));
s.erase(s.find_last_not_of(" '\n\r\t") + 1);
return s;
}
vector<int> ArffFiles::factorize(const vector<string>& labels_t)
{
vector<int> yy;
yy.reserve(labels_t.size());
map<string, int> labelMap;
int i = 0;
for (const string& label : labels_t) {
if (labelMap.find(label) == labelMap.end()) {
labelMap[label] = i++;
}
yy.push_back(labelMap[label]);
}
return yy;
}

src/fimdlp/ArffFiles.h (new file, 34 lines)

@@ -0,0 +1,34 @@
#ifndef ARFFFILES_H
#define ARFFFILES_H
#include <string>
#include <vector>
using namespace std;
class ArffFiles {
private:
vector<string> lines;
vector<pair<string, string>> attributes;
string className;
string classType;
vector<vector<float>> X;
vector<int> y;
void generateDataset(bool);
public:
ArffFiles();
void load(const string&, bool = true);
vector<string> getLines() const;
unsigned long int getSize() const;
string getClassName() const;
string getClassType() const;
static string trim(const string&);
vector<vector<float>>& getX();
vector<int>& getY();
vector<pair<string, string>> getAttributes() const;
static vector<int> factorize(const vector<string>& labels_t);
};
#endif

src/fimdlp/Factorize.cpp (new file, 18 lines)

@@ -0,0 +1,18 @@
#include "Factorize.h"
namespace utils {
vector<int> cppFactorize(const vector<string>& labels_t)
{
vector<int> yy;
yy.reserve(labels_t.size());
map<string, int> labelMap;
int i = 0;
for (const string& label : labels_t) {
if (labelMap.find(label) == labelMap.end()) {
labelMap[label] = i++;
}
yy.push_back(labelMap[label]);
}
return yy;
}
}

src/fimdlp/Factorize.h (new file, 10 lines)

@@ -0,0 +1,10 @@
#ifndef FACTORIZE_H
#define FACTORIZE_H
#include <vector>
#include <map>
#include <string>
namespace utils {
using namespace std;
vector<int> cppFactorize(const vector<string>&);
}
#endif
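cppFactorize is exposed to Python as factorize through the Cython bindings (see cfimdlp.pyx below); a quick sketch of the binding in use, with illustrative labels:

```python
from fimdlp.cppfimdlp import factorize

# Labels must be bytes; integer codes are assigned in order of first
# appearance, as in pandas.factorize.
print(factorize([b"cat", b"dog", b"cat", b"bird"]))  # -> [0, 1, 0, 2]
```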

src/fimdlp/__init__.py

@@ -1,8 +1,4 @@
from ._version import __version__
def version():
return __version__
all = ["FImdlp", "__version__"]

src/fimdlp/_version.py

@@ -1 +1 @@
__version__ = "0.9.2"
__version__ = "0.9.4"

src/fimdlp/cfimdlp.pyx

@@ -1,20 +1,27 @@
# distutils: language = c++
# cython: language_level = 3
from libcpp.vector cimport vector
from libcpp.pair cimport pair
from libcpp.string cimport string
from libcpp cimport bool
import numpy as np
cdef extern from "limits.h":
cdef int INT_MAX
cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp":
ctypedef float precision_t
cdef cppclass CPPFImdlp:
CPPFImdlp(int) except +
CPPFImdlp() except +
CPPFImdlp(size_t, int, float) except +
CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
int get_depth()
vector[precision_t] getCutPoints()
string version()
cdef class CFImdlp:
cdef CPPFImdlp *thisptr
def __cinit__(self, algorithm):
self.thisptr = new CPPFImdlp(algorithm)
def __cinit__(self, size_t min_length=3, int max_depth=INT_MAX, float max_cuts=0):
self.thisptr = new CPPFImdlp(min_length, max_depth, max_cuts)
def __dealloc__(self):
del self.thisptr
def fit(self, X, y):
@@ -24,3 +31,51 @@ cdef class CFImdlp:
return self.thisptr.getCutPoints()
def get_version(self):
return self.thisptr.version()
def get_depth(self):
return self.thisptr.get_depth()
def __reduce__(self):
return (CFImdlp, ())
cdef extern from "Factorize.h" namespace "utils":
vector[int] cppFactorize(vector[string] &input_vector)
def factorize(input_vector):
return cppFactorize(input_vector)
cdef extern from "ArffFiles.h":
cdef cppclass ArffFiles:
ArffFiles() except +
void load(string, bool)
unsigned long int getSize()
string getClassName()
string getClassType()
string trim(const string&)
vector[vector[float]]& getX()
vector[int]& getY()
vector[string] getLines()
vector[pair[string, string]] getAttributes()
cdef class CArffFiles:
cdef ArffFiles *thisptr
def __cinit__(self):
self.thisptr = new ArffFiles()
def __dealloc__(self):
del self.thisptr
def load(self, string filename, bool verbose = True):
self.thisptr.load(filename, verbose)
def get_size(self):
return self.thisptr.getSize()
def get_class_name(self):
return self.thisptr.getClassName()
def get_class_type(self):
return self.thisptr.getClassType()
def get_X(self):
return np.array(self.thisptr.getX()).T
def get_y(self):
return self.thisptr.getY()
def get_lines(self):
return self.thisptr.getLines()
def get_attributes(self):
return self.thisptr.getAttributes()
def __reduce__(self):
return (CArffFiles, ())
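A short sketch of the new CArffFiles wrapper in use, along the lines of sample.py above (the dataset path is the one used by the tests):

```python
from fimdlp.cppfimdlp import CArffFiles

loader = CArffFiles()
# The second argument tells the loader whether the class attribute
# comes last in the ARFF file (default True).
loader.load(b"src/cppmdlp/tests/datasets/iris.arff", True)
X = loader.get_X()  # shape (n_samples, n_features)
y = loader.get_y()  # factorized labels
names = [name.decode() for name, _ in loader.get_attributes()]
print(names, loader.get_class_name().decode())
```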

src/fimdlp/mdlp.py

@@ -1,52 +1,56 @@
import numpy as np
from .cppfimdlp import CFImdlp
from .cppfimdlp import CFImdlp, factorize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from joblib import Parallel, delayed
from ._version import __version__
class FImdlp(TransformerMixin, BaseEstimator):
def __init__(self, algorithm=0, n_jobs=-1):
self.algorithm = algorithm
def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6, max_cuts=0):
self.n_jobs = n_jobs
self.min_length = min_length
self.max_depth = max_depth
self.max_cuts = max_cuts
"""Fayyad - Irani MDLP discretization algorithm based implementation.
Parameters
----------
algorithm : int, default=0
The type of algorithm to use computing the cut points.
0 - Definitive implementation
1 - Alternative proposal
n_jobs : int, default=-1
The number of jobs to run in parallel. :meth:`fit` and
:meth:`transform`, are parallelized over the features. ``-1`` means
using all cores available.
min_length: int, default=3
The minimum length of an interval for it to be considered for discretization.
max_depth: int, default=1e6
The maximum depth of the discretization process.
max_cuts: float, default=0
The maximum number of cut points to be computed for each feature.
Attributes
----------
n_features_ : int
n_features_in_ : int
The number of features of the data passed to :meth:`fit`.
discretizer_ : list
The list of discretizers, one for each feature.
cut_points_ : list
The list of cut points for each feature.
X_ : array
the samples used to fit, shape (n_samples, n_features)
y_ : array
the labels used to fit, shape (n_samples,)
X_ : array, shape (n_samples, n_features)
the samples used to fit
y_ : array, shape(n_samples,)
the labels used to fit
features_ : list
the list of features to be discretized
"""
def _check_params_fit(self, X, y, expected_args, kwargs):
"""Check the common parameters passed to fit"""
def _more_tags(self):
return {"preserves_dtype": [np.int32], "requires_y": True}
def _check_args(self, X, y, expected_args, kwargs):
# Check that X and y have correct shape
X, y = check_X_y(X, y)
# Store the classes seen during fit
self.classes_ = unique_labels(y)
self.n_classes_ = self.classes_.shape[0]
# Default values
self.features_ = [i for i in range(X.shape[1])]
for key, value in kwargs.items():
@@ -67,15 +71,24 @@ class FImdlp(TransformerMixin, BaseEstimator):
raise ValueError("Feature index out of range")
return X, y
def _update_params(self, X, y):
# Store the classes seen during fit
self.classes_ = unique_labels(y)
self.n_classes_ = self.classes_.shape[0]
self.n_features_in_ = X.shape[1]
@staticmethod
def get_version():
return f"{__version__}({CFImdlp().get_version().decode()})"
def fit(self, X, y, **kwargs):
"""A reference implementation of a fitting function for a transformer.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
X : array, shape (n_samples, n_features)
The training input samples.
y : None
There is no need of a target in a transformer, yet the pipeline API
requires this parameter.
y : array, shape (n_samples,)
the labels used to fit
features : list, default=[i for i in range(n_features)]
The list of features to be discretized.
Returns
@@ -83,23 +96,34 @@ class FImdlp(TransformerMixin, BaseEstimator):
self : object
Returns self.
"""
X, y = self._check_params_fit(
X, y = self._check_args(
X, y, expected_args=["features"], kwargs=kwargs
)
self.n_features_ = X.shape[1]
self._update_params(X, y)
self.X_ = X
self.y_ = y
self.discretizer_ = [None] * self.n_features_
self.cut_points_ = [None] * self.n_features_
self.efective_min_length_ = (
self.min_length
if self.min_length > 1
else int(self.min_length * X.shape[0])
)
self.discretizer_ = [None] * self.n_features_in_
self.cut_points_ = [None] * self.n_features_in_
Parallel(n_jobs=self.n_jobs, prefer="threads")(
delayed(self._fit_discretizer)(feature)
for feature in range(self.n_features_)
for feature in range(self.n_features_in_)
)
# target of every feature. Start with -1 => y (see join_fit)
self.target_ = [-1] * self.n_features_in_
return self
def _fit_discretizer(self, feature):
if feature in self.features_:
self.discretizer_[feature] = CFImdlp(algorithm=self.algorithm)
self.discretizer_[feature] = CFImdlp(
min_length=self.efective_min_length_,
max_depth=self.max_depth,
max_cuts=self.max_cuts,
)
self.discretizer_[feature].fit(self.X_[:, feature], self.y_)
self.cut_points_[feature] = self.discretizer_[
feature
@@ -118,7 +142,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
"""Discretize X values.
Parameters
----------
X : {array-like}, shape (n_samples, n_features)
X : array, shape (n_samples, n_features)
The input samples.
Returns
-------
@@ -126,25 +150,41 @@ class FImdlp(TransformerMixin, BaseEstimator):
The array containing the discretized values of ``X``.
"""
# Check if fit has been called
check_is_fitted(self, "n_features_")
check_is_fitted(self, "n_features_in_")
# Input validation
X = check_array(X)
# Check that the input is of the same shape as the one passed
# during fit.
if X.shape[1] != self.n_features_:
if X.shape[1] != self.n_features_in_:
raise ValueError(
"Shape of input is different from what was seen in `fit`"
)
if len(self.features_) == self.n_features_:
if len(self.features_) == self.n_features_in_:
result = np.zeros_like(X, dtype=np.int32) - 1
else:
result = np.zeros_like(X) - 1
Parallel(n_jobs=self.n_jobs, prefer="threads")(
delayed(self._discretize_feature)(feature, X[:, feature], result)
for feature in range(self.n_features_)
for feature in range(self.n_features_in_)
)
return result
@staticmethod
def factorize(yy):
"""Factorize the input labels
Parameters
----------
yy : array, shape (n_samples,)
Labels to be factorized, MUST be bytes, i.e. b"0", b"1", ...
Returns
-------
array, shape (n_samples,)
Factorized labels
"""
return factorize(yy)
def get_cut_points(self):
"""Get the cut points for each feature.
Returns
@@ -153,6 +193,78 @@ class FImdlp(TransformerMixin, BaseEstimator):
The list of cut points for each feature.
"""
result = []
for feature in range(self.n_features_):
for feature in range(self.n_features_in_):
result.append(self.cut_points_[feature])
return result
def get_states_feature(self, feature):
"""Return the states a feature can take
Parameters
----------
feature : int
feature to get the states
Returns
-------
list
states of the feature
"""
if feature in self.features_:
return list(range(len(self.cut_points_[feature]) + 1))
return None
def join_fit(self, features, target, data):
"""Join the selected features with the labels and fit the discretizer
of the target variable
join - fit - transform
Parameters
----------
features : [list]
index of the features to join with the labels
target : [int]
index of the target variable to discretize
data: [array] shape (n_samples, n_features)
dataset that contains the features to join
Returns
-------
result: np.array
The target variable newly discretized
"""
check_is_fitted(self, "n_features_in_")
if len(features) < 1 or len(features) > self.n_features_in_:
raise ValueError(
"Number of features must be in range [1, "
f"{self.n_features_in_}]"
)
for feature in features:
if feature < 0 or feature >= self.n_features_in_:
raise ValueError(
f"Feature {feature} not in range [0, "
f"{self.n_features_in_})"
)
if target < 0 or target >= self.n_features_in_:
raise ValueError(
f"Target {target} not in range [0, {self.n_features_in_})"
)
if target in features:
raise ValueError("Target cannot be in features to join")
y_join = [
f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode()
for item_y, items_x in zip(self.y_, data[:, features])
]
# Store in target_ the features used with class to discretize target
self.target_[target] = features + [-1]
self.y_join_ = y_join
self.discretizer_[target].fit(self.X_[:, target], factorize(y_join))
self.cut_points_[target] = self.discretizer_[target].get_cut_points()
# return the discretized target variable with the new cut points
return np.searchsorted(self.cut_points_[target], self.X_[:, target])
def get_depths(self):
res = [0] * self.n_features_in_
for feature in self.features_:
res[feature] = self.discretizer_[feature].get_depth()
return res
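Since FImdlp is implemented as a scikit-learn transformer (TransformerMixin, BaseEstimator) and passes check_estimator in the tests below, it composes with other estimators; a minimal sketch using a Pipeline (the RandomForest choice is illustrative, mirroring sample.py):

```python
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from fimdlp.mdlp import FImdlp

X, y = load_iris(return_X_y=True)
model = Pipeline([
    ("discretize", FImdlp()),  # MDLP discretization step
    ("classify", RandomForestClassifier(random_state=0)),
])
print(model.fit(X, y).score(X, y))  # train-set score, as in sample.py
```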

src/fimdlp/tests (unit tests)

@@ -1,67 +1,57 @@
import unittest
import sklearn
from sklearn.datasets import load_iris
import numpy as np
from sklearn.datasets import load_iris
from sklearn.utils.estimator_checks import check_estimator
from ..cppfimdlp import CFImdlp, factorize, CArffFiles
from ..mdlp import FImdlp
from .. import version
from .._version import __version__
from .. import __version__
class FImdlpTest(unittest.TestCase):
delta = 1e-6 # same tolerance as in C++ code
def test_version(self):
self.assertEqual(version(), __version__)
clf = FImdlp()
self.assertEqual(
clf.get_version(),
f"{__version__}({CFImdlp().get_version().decode()})",
)
def test_minimum_mdlp_version(self):
mdlp_version = tuple(
int(c) for c in CFImdlp().get_version().decode().split(".")[0:3]
)
minimum_mdlp_version = (1, 1, 2)
self.assertTrue(mdlp_version >= minimum_mdlp_version)
def test_init(self):
clf = FImdlp()
self.assertEqual(-1, clf.n_jobs)
self.assertEqual(0, clf.algorithm)
clf = FImdlp(algorithm=1, n_jobs=7)
self.assertEqual(1, clf.algorithm)
self.assertEqual(3, clf.min_length)
self.assertEqual(1e6, clf.max_depth)
clf = FImdlp(n_jobs=7, min_length=24, max_depth=17)
self.assertEqual(7, clf.n_jobs)
self.assertEqual(24, clf.min_length)
self.assertEqual(17, clf.max_depth)
def test_fit_definitive(self):
clf = FImdlp(algorithm=0)
clf.fit([[1, 2], [3, 4]], [1, 2])
self.assertEqual(clf.n_features_, 2)
self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
self.assertListEqual(clf.y_.tolist(), [1, 2])
self.assertListEqual([[2.0], [3.0]], clf.get_cut_points())
clf = FImdlp()
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
self.assertEqual(clf.n_features_, 4)
self.assertEqual(clf.n_features_in_, 4)
self.assertTrue(np.array_equal(X, clf.X_))
self.assertTrue(np.array_equal(y, clf.y_))
expected = [
[5.449999809265137, 6.25],
[2.8499999046325684, 3.0, 3.049999952316284, 3.3499999046325684],
[2.450000047683716, 4.75, 5.050000190734863],
[0.800000011920929, 1.4500000476837158, 1.75],
[5.45, 5.75],
[2.75, 2.85, 2.95, 3.05, 3.35],
[2.45, 4.75, 5.05],
[0.8, 1.75],
]
self.assertListEqual(expected, clf.get_cut_points())
self.assertListEqual([0, 1, 2, 3], clf.features_)
clf.fit(X, y, features=[0, 2, 3])
self.assertListEqual([0, 2, 3], clf.features_)
def test_fit_alternative(self):
clf = FImdlp(algorithm=1)
clf.fit([[1, 2], [3, 4]], [1, 2])
self.assertEqual(clf.n_features_, 2)
self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
self.assertListEqual(clf.y_.tolist(), [1, 2])
self.assertListEqual([[2], [3]], clf.get_cut_points())
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
self.assertEqual(clf.n_features_, 4)
self.assertTrue(np.array_equal(X, clf.X_))
self.assertTrue(np.array_equal(y, clf.y_))
expected = [
[5.449999809265137, 5.75],
[2.8499999046325684, 3.3499999046325684],
[2.450000047683716, 4.75],
[0.800000011920929, 1.75],
]
self.assertListEqual(expected, clf.get_cut_points())
computed = clf.get_cut_points()
for item_computed, item_expected in zip(computed, expected):
for x_, y_ in zip(item_computed, item_expected):
self.assertAlmostEqual(x_, y_, delta=self.delta)
self.assertListEqual([0, 1, 2, 3], clf.features_)
clf.fit(X, y, features=[0, 2, 3])
self.assertListEqual([0, 2, 3], clf.features_)
@@ -82,8 +72,12 @@ class FImdlpTest(unittest.TestCase):
clf.fit([[1, 2], [3, 4]], [1, 2], features=[0, 2])
def test_fit_features(self):
clf = FImdlp()
clf = FImdlp(n_jobs=-1)
# Two samples doesn't have enough information to split
clf.fit([[1, -2], [3, 4]], [1, 2], features=[0])
self.assertListEqual(clf.get_cut_points(), [[], []])
clf.fit([[1, -2], [3, 4], [5, 6]], [1, 2, 2], features=[0])
self.assertListEqual(clf.get_cut_points(), [[2], []])
res = clf.transform([[1, -2], [3, 4]])
self.assertListEqual(res.tolist(), [[0, -2], [1, 4]])
X, y = load_iris(return_X_y=True)
@@ -98,15 +92,15 @@ class FImdlpTest(unittest.TestCase):
)
self.assertEqual(X_computed.dtype, np.float64)
def test_transform_definitive(self):
clf = FImdlp(algorithm=0)
clf.fit([[1, 2], [3, 4]], [1, 2])
def test_transform(self):
clf = FImdlp()
clf.fit([[1, 2], [3, 4], [5, 6]], [1, 2, 2])
self.assertEqual(
clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [1, 1]]
)
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
self.assertEqual(clf.n_features_, 4)
self.assertEqual(clf.n_features_in_, 4)
self.assertTrue(np.array_equal(X, clf.X_))
self.assertTrue(np.array_equal(y, clf.y_))
X_transformed = clf.transform(X)
@@ -116,46 +110,258 @@ class FImdlpTest(unittest.TestCase):
self.assertEqual(X_transformed.dtype, np.int32)
expected = [
[1, 0, 1, 1],
[1, 1, 1, 1],
[1, 0, 1, 1],
[0, 0, 1, 1],
[1, 0, 1, 1],
[1, 1, 1, 1],
[1, 1, 1, 1],
]
self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
with self.assertRaises(ValueError):
clf.transform([[1, 2, 3], [4, 5, 6]])
with self.assertRaises(sklearn.exceptions.NotFittedError):
clf = FImdlp(algorithm=0)
clf.transform([[1, 2], [3, 4]])
def test_transform_alternative(self):
clf = FImdlp(algorithm=1)
clf.fit([[1, 2], [3, 4]], [1, 2])
self.assertEqual(
clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [1, 1]]
)
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
self.assertEqual(clf.n_features_, 4)
self.assertTrue(np.array_equal(X, clf.X_))
self.assertTrue(np.array_equal(y, clf.y_))
self.assertListEqual(
clf.transform(X).tolist(), clf.fit(X, y).transform(X).tolist()
)
expected = [
[1, 0, 1, 1],
[2, 1, 1, 1],
[2, 3, 1, 1],
[2, 0, 1, 1],
[0, 0, 1, 1],
[1, 0, 1, 1],
[1, 1, 1, 1],
[1, 1, 1, 1],
[1, 3, 1, 1],
[1, 2, 1, 1],
]
self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
with self.assertRaises(ValueError):
clf.transform([[1, 2, 3], [4, 5, 6]])
with self.assertRaises(sklearn.exceptions.NotFittedError):
clf = FImdlp(algorithm=1)
clf = FImdlp()
clf.transform([[1, 2], [3, 4]])
def test_cppfactorize(self):
source = [
b"f0",
b"f1",
b"f2",
b"f3",
b"f4",
b"f5",
b"f6",
b"f1",
b"f1",
b"f7",
b"f8",
]
expected = [0, 1, 2, 3, 4, 5, 6, 1, 1, 7, 8]
computed = factorize(source)
self.assertListEqual(expected, computed)
def test_join_fit(self):
y = np.array([b"f0", b"f0", b"f2", b"f3", b"f3", b"f4", b"f4"])
x = np.array(
[
[0, 1, 2, 3, 4, 5],
[0, 2, 2, 3, 4, 5],
[1, 2, 3, 4, 5, 5],
[2, 3, 4, 5, 6, 6],
[3, 4, 5, 6, 7, 7],
[1, 2, 2, 3, 5, 7],
[1, 3, 4, 4, 4, 7],
]
)
expected = [0, 1, 1, 2, 2, 1, 2]
clf = FImdlp()
clf.fit(x, factorize(y))
computed = clf.join_fit([0, 2, 3, 4], 1, x)
self.assertListEqual(computed.tolist(), expected)
expected_y = [
b"00234",
b"00234",
b"11345",
b"22456",
b"23567",
b"31235",
b"31444",
]
self.assertListEqual(expected_y, clf.y_join_)
def test_join_fit_error(self):
y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"])
x = np.array(
[
[0, 1, 2, 3, 4],
[0, 1, 2, 3, 4],
[1, 2, 3, 4, 5],
[2, 3, 4, 5, 6],
[3, 4, 5, 6, 7],
]
)
clf = FImdlp()
clf.fit(x, factorize(y))
with self.assertRaises(ValueError) as exception:
clf.join_fit([], 1, x)
self.assertEqual(
str(exception.exception),
"Number of features must be in range [1, 5]",
)
with self.assertRaises(ValueError) as exception:
FImdlp().join_fit([0, 4], 1, x)
self.assertTrue(
str(exception.exception).startswith(
"This FImdlp instance is not fitted yet."
)
)
with self.assertRaises(ValueError) as exception:
clf.join_fit([0, 5], 1, x)
self.assertEqual(
str(exception.exception),
"Feature 5 not in range [0, 5)",
)
with self.assertRaises(ValueError) as exception:
clf.join_fit([0, 2], 5, x)
self.assertEqual(
str(exception.exception),
"Target 5 not in range [0, 5)",
)
with self.assertRaises(ValueError) as exception:
clf.join_fit([0, 2], 2, x)
self.assertEqual(
str(exception.exception),
"Target cannot be in features to join",
)
def test_factorize(self):
y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"])
clf = FImdlp()
computed = clf.factorize(y)
self.assertListEqual([0, 0, 1, 2, 3], computed)
y = [b"f4", b"f0", b"f0", b"f2", b"f3"]
clf = FImdlp()
computed = clf.factorize(y)
self.assertListEqual([0, 1, 1, 2, 3], computed)
def test_join_fit_info(self):
clf = FImdlp()
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
clf.join_fit([0, 2], 1, X)
clf.join_fit([0, 3], 2, X)
clf.join_fit([1, 2], 3, X)
expected = [-1, [0, 2, -1], [0, 3, -1], [1, 2, -1]]
self.assertListEqual(expected, clf.target_)
@staticmethod
def test_sklearn_transformer():
for check, test in check_estimator(FImdlp(), generate_only=True):
test(check)
def test_states_feature(self):
clf = FImdlp()
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
expected = []
for i in [3, 6, 4, 3]:
expected.append(list(range(i)))
for feature in range(X.shape[1]):
self.assertListEqual(
expected[feature], clf.get_states_feature(feature)
)
def test_states_no_feature(self):
clf = FImdlp()
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
self.assertIsNone(clf.get_states_feature(4))
def test_MaxDepth(self):
clf = FImdlp(max_depth=1)
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
expected_cutpoints = [
[5.45],
[3.35],
[2.45],
[0.8],
]
expected_depths = [1] * 4
self.assertListEqual(expected_depths, clf.get_depths())
for expected, computed in zip(
expected_cutpoints, clf.get_cut_points()
):
for e, c in zip(expected, computed):
self.assertAlmostEqual(e, c, delta=self.delta)
def test_MinLength(self):
clf = FImdlp(min_length=75)
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
expected_cutpoints = [
[5.45, 5.75],
[2.85, 3.35],
[2.45, 4.75],
[0.8, 1.75],
]
expected_depths = [3, 2, 2, 2]
self.assertListEqual(expected_depths, clf.get_depths())
for expected, computed in zip(
expected_cutpoints, clf.get_cut_points()
):
for e, c in zip(expected, computed):
self.assertAlmostEqual(e, c, delta=self.delta)
def test_MinLengthMaxDepth(self):
clf = FImdlp(min_length=75, max_depth=2)
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
expected_cutpoints = [
[5.45, 5.75],
[2.85, 3.35],
[2.45, 4.75],
[0.8, 1.75],
]
expected_depths = [2, 2, 2, 2]
self.assertListEqual(expected_depths, clf.get_depths())
for expected, computed in zip(
expected_cutpoints, clf.get_cut_points()
):
for e, c in zip(expected, computed):
self.assertAlmostEqual(e, c, delta=self.delta)
def test_max_cuts(self):
clf = FImdlp(max_cuts=1)
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
expected_cutpoints = [
[5.45],
[2.85],
[2.45],
[0.8],
]
expected_depths = [3, 5, 4, 3]
self.assertListEqual(expected_depths, clf.get_depths())
for expected, computed in zip(
expected_cutpoints, clf.get_cut_points()
):
for e, c in zip(expected, computed):
self.assertAlmostEqual(e, c, delta=self.delta)
def test_ArffFiles(self):
loader = CArffFiles()
loader.load(b"src/cppmdlp/tests/datasets/iris.arff")
X = loader.get_X()
y = loader.get_y()
expected = [
(b"sepallength", b"REAL"),
(b"sepalwidth", b"REAL"),
(b"petallength", b"REAL"),
(b"petalwidth", b"REAL"),
]
self.assertListEqual(loader.get_attributes(), expected)
self.assertListEqual(y[:10], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
expected = [
b"5.1,3.5,1.4,0.2,Iris-setosa",
b"4.9,3.0,1.4,0.2,Iris-setosa",
b"4.7,3.2,1.3,0.2,Iris-setosa",
b"4.6,3.1,1.5,0.2,Iris-setosa",
b"5.0,3.6,1.4,0.2,Iris-setosa",
b"5.4,3.9,1.7,0.4,Iris-setosa",
b"4.6,3.4,1.4,0.3,Iris-setosa",
b"5.0,3.4,1.5,0.2,Iris-setosa",
b"4.4,2.9,1.4,0.2,Iris-setosa",
b"4.9,3.1,1.5,0.1,Iris-setosa",
]
self.assertListEqual(loader.get_lines()[:10], expected)
expected_X = [
[5.0999999, 3.5, 1.39999998, 0.2],
[4.9000001, 3, 1.39999998, 0.2],
[4.69999981, 3.20000005, 1.29999995, 0.2],
]
for computed, expected in zip(X[:3].tolist(), expected_X):
for c, e in zip(computed, expected):
self.assertAlmostEqual(c, e, delta=self.delta)