26 Commits
v0.9.3 ... main

Author SHA1 Message Date
3bae1fe390 Fix attribute name extraction in ArffFiles 2023-07-06 16:57:25 +02:00
Ricardo Montañana Gómez
a7098a907e Merge pull request #7 from Doctorado-ML/hiperparameters
Add hyperparameters to discretizer

Hyperparameters added:

- min_length: int, default=3: The minimum length of an interval for it to be considered for discretization.
- max_depth: int, default=1e6: The maximum depth of the discretization process.
- max_cuts: float, default=0: The maximum number of cut points to compute for each feature; all cut points are computed and the ones that produce the least entropy are kept (see the sketch after this entry).
2023-04-25 18:05:12 +02:00
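
For context, this is how the three hyperparameters described above look from Python (a minimal sketch, assuming scikit-learn's iris dataset; the parameter names are the ones documented in the pull request):

```python
# Minimal sketch of the three hyperparameters added in this PR.
from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp

X, y = load_iris(return_X_y=True)
# min_length: smallest interval still eligible for further cuts
# max_depth: maximum recursion depth of the discretization
# max_cuts: cap on the number of cut points kept per feature (0 = no cap)
trans = FImdlp(min_length=3, max_depth=2, max_cuts=0)
Xt = trans.fit(X, y).transform(X)
print(trans.get_cut_points())  # one list of cut points per feature
```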
6e17548563 Add url to pyproject and comment to mdlp 2023-04-25 17:53:36 +02:00
dd42e186d5 Reformat Arfffiles.h 2023-04-25 17:16:04 +02:00
3a100bbba7 Add mdlp version to Makefile
Refactor sample.py
2023-04-25 17:11:40 +02:00
17a66858f8 Update version number to 0.9.4 2023-04-25 16:58:23 +02:00
3ed491cd34 Update mdlp version
Add minimum mdlp version test
Update sample.cpp
2023-04-25 12:05:52 +02:00
878cd379ee Change arff library to sample.py 2023-04-14 11:20:48 +02:00
25d341aee5 Update samples and Readme 2023-04-12 17:40:25 +02:00
fa8c4a221d Remove duplicated lines 2023-04-11 19:45:37 +02:00
947d54202d Update hyperparams info 2023-04-11 19:35:39 +02:00
d04cb389c0 Update tests and module mdlp version 2023-04-11 19:33:57 +02:00
0768d68a36 add join_fit target info 2023-04-08 12:22:03 +02:00
e44bca0420 Move limits include to CPPFImldp header 2023-03-22 18:21:52 +01:00
c2294613df Move limits include to CPPFImldp header 2023-03-22 18:19:01 +01:00
1069fc8ff4 Add last mdlp version and update sample.cpp 2023-03-21 10:18:51 +01:00
95bc29c7f2 Remove trailing space in attribute type of Arff 2023-03-20 20:27:47 +01:00
da9db322da Fix sklearn requirement 2023-03-20 18:58:55 +01:00
e3c329b2e5 Add min_length as percentage of # samples 2023-03-20 18:57:26 +01:00
7368dd9ff4 Refactor ArffFiles in main project 2023-03-20 17:45:58 +01:00
b5c6a49e19 Add last version of mdlp 2023-03-19 19:22:07 +01:00
c2a0d33604 Add last mdlp version 2023-03-19 19:14:32 +01:00
e6a56e3140 Update samples 2023-03-14 11:47:30 +01:00
ccce9725b3 Add max_cuts hyperparameter as in mdlp 2023-03-13 18:14:56 +02:00
aa55d3a340 New version of library and tests 2023-02-26 17:59:08 +01:00
900cccf76b Update discretizer to new library 2023-02-25 18:52:21 +01:00
18 changed files with 637 additions and 92 deletions

GitHub Actions CI workflow

@@ -20,14 +20,14 @@ jobs:
       with:
         submodules: recursive
     - name: Set up Python ${{ matrix.python }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v4
       with:
         python-version: ${{ matrix.python }}
     - name: Install dependencies
       run: |
         pip install -q --upgrade pip
         pip install -q scikit-learn cython
-        pip install -q --upgrade codecov coverage black flake8 codacy-coverage
+        pip install -q coverage black flake8 codacy-coverage
     - name: Build and install
       run: |
         make install
@@ -40,7 +40,7 @@ jobs:
         coverage run -m unittest discover -v -s src
         coverage xml
     - name: Upload coverage to Codecov
-      uses: codecov/codecov-action@v1
+      uses: codecov/codecov-action@v3
       with:
         token: ${{ secrets.CODECOV_TOKEN }}
         files: ./coverage.xml

MANIFEST.in

@@ -2,3 +2,4 @@ include src/cppmdlp/CPPFImdlp.h
 include src/cppmdlp/typesFImdlp.h
 include src/cppmdlp/Metrics.h
 include src/fimdlp/Factorize.h
+include src/fimdlp/ArffFiles.h

Makefile

@@ -40,6 +40,7 @@ audit: ## Audit pip
 version:
 	@echo "Current Python version .: $(shell python --version)"
 	@echo "Current FImdlp version .: $(shell python -c "from fimdlp import _version; print(_version.__version__)")"
+	@echo "Current mdlp version ...: $(shell python -c "from fimdlp.cppfimdlp import CFImdlp; print(CFImdlp().get_version().decode())")"
 	@echo "Installed FImdlp version: $(shell pip show fimdlp | grep Version | cut -d' ' -f2)"
 help: ## Show help message

README.md

@@ -25,7 +25,7 @@ git clone --recurse-submodules https://github.com/doctorado-ml/FImdlp.git
 ```bash
 pip install -e .
 python samples/sample.py iris
-python samples/sample.py iris --alternative
+python samples/sample.py iris -c 2
 python samples/sample.py -h # for more options
 ```
@@ -33,9 +33,12 @@ python samples/sample.py -h # for more options
 ```bash
 cd samples
-mkdir build
+cmake -B build
 cd build
-cmake ..
 make
-./sample iris
+./sample -f iris -c 2
+./sample -h
 ```
+### Based on
+[https://github.com/rmontanana/mdlp](https://github.com/rmontanana/mdlp)

k.py (deleted, 12 lines)

@@ -1,12 +0,0 @@
-from sklearn.datasets import load_wine
-from fimdlp.mdlp import FImdlp
-X, y = load_wine(return_X_y=True)
-trans = FImdlp()
-Xt = trans.join_transform(X, y, 12)
-print("X shape = ", X.shape)
-print("Xt.shape=", Xt.shape)
-print("Xt ", Xt[:10])
-print("trans.X_ shape = ", trans.X_.shape)
-print("trans.y_ ", trans.y_[:10])
-print("y_join ", trans.y_join_[:10])

pyproject.toml

@@ -21,7 +21,7 @@ dynamic = ['version']
 dependencies = ["numpy", "joblib", "scikit-learn"]
 requires-python = ">=3.9"
 classifiers = [
-    "Development Status :: 3 - Alpha",
+    "Development Status :: 4 - Beta",
     "Intended Audience :: Science/Research",
     "Intended Audience :: Developers",
     "Topic :: Software Development",
@@ -33,14 +33,16 @@ classifiers = [
     "Programming Language :: Python",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
 ]
 [project.urls]
 Home = "https://github.com/doctorado-ml/FImdlp"
+Base = "https://github.com/rmontanana/mdlp"
 [tool.black]
 line-length = 79
-target_version = ['py39', 'py310']
+target_version = ['py39', 'py310', 'py311']
 include = '\.pyi?$'
 exclude = '''
 /(

samples/CMakeLists.txt

@@ -1,6 +1,7 @@
 cmake_minimum_required(VERSION 3.20)
-project(main)
-set(CMAKE_CXX_STANDARD 14)
+project(sample)
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_BUILD_TYPE Debug)
 add_executable(sample sample.cpp ../src/cppmdlp/tests/ArffFiles.cpp ../src/cppmdlp/Metrics.cpp ../src/cppmdlp/CPPFImdlp.cpp)

samples/sample.cpp

@@ -1,30 +1,101 @@
-#include "../src/cppmdlp/tests/ArffFiles.h"
 #include <iostream>
 #include <vector>
 #include <iomanip>
+#include <chrono>
+#include <algorithm>
+#include <cstring>
+#include <getopt.h>
 #include "../src/cppmdlp/CPPFImdlp.h"
+#include "../src/cppmdlp/tests/ArffFiles.h"
 using namespace std;
-int main(int argc, char** argv)
+using namespace mdlp;
+const string PATH = "../../src/cppmdlp/tests/datasets/";
+/* print a description of all supported options */
+void usage(const char* path)
+{
+    /* take only the last portion of the path */
+    const char* basename = strrchr(path, '/');
+    basename = basename ? basename + 1 : path;
+    cout << "usage: " << basename << "[OPTION]" << endl;
+    cout << " -h, --help\t\t Print this help and exit." << endl;
+    cout
+        << " -f, --file[=FILENAME]\t {all, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}."
+        << endl;
+    cout << " -p, --path[=FILENAME]\t folder where the arff dataset is located, default " << PATH << endl;
+    cout << " -m, --max_depth=INT\t max_depth pased to discretizer. Default = MAX_INT" << endl;
+    cout
+        << " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 -> any"
+        << endl;
+    cout << " -n, --min_length=INT\t interval min_length pased to discretizer. Default = 3" << endl;
+}
+tuple<string, string, int, int, float> parse_arguments(int argc, char** argv)
+{
+    string file_name;
+    string path = PATH;
+    int max_depth = numeric_limits<int>::max();
+    int min_length = 3;
+    float max_cutpoints = 0;
+    const vector<struct option> long_options = {
+        {"help", no_argument, nullptr, 'h'},
+        {"file", required_argument, nullptr, 'f'},
+        {"path", required_argument, nullptr, 'p'},
+        {"max_depth", required_argument, nullptr, 'm'},
+        {"max_cutpoints", required_argument, nullptr, 'c'},
+        {"min_length", required_argument, nullptr, 'n'},
+        {nullptr, no_argument, nullptr, 0}
+    };
+    while (true) {
+        const auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options.data(), nullptr);
+        if (c == -1)
+            break;
+        switch (c) {
+            case 'h':
+                usage(argv[0]);
+                exit(0);
+            case 'f':
+                file_name = string(optarg);
+                break;
+            case 'm':
+                max_depth = stoi(optarg);
+                break;
+            case 'n':
+                min_length = stoi(optarg);
+                break;
+            case 'c':
+                max_cutpoints = stof(optarg);
+                break;
+            case 'p':
+                path = optarg;
+                if (path.back() != '/')
+                    path += '/';
+                break;
+            case '?':
+                usage(argv[0]);
+                exit(1);
+            default:
+                abort();
+        }
+    }
+    if (file_name.empty()) {
+        usage(argv[0]);
+        exit(1);
+    }
+    return make_tuple(file_name, path, max_depth, min_length, max_cutpoints);
+}
+void process_file(const string& path, const string& file_name, bool class_last, int max_depth, int min_length,
+    float max_cutpoints)
 {
     ArffFiles file;
-    vector<string> lines;
-    string path = "../../src/cppmdlp/tests/datasets/";
-    map<string, bool > datasets = {
-        {"mfeat-factors", true},
-        {"iris", true},
-        {"letter", true},
-        {"kdd_JapaneseVowels", false}
-    };
-    if (argc != 2 || datasets.find(argv[1]) == datasets.end()) {
-        cout << "Usage: " << argv[0] << " {mfeat-factors, iris, letter, kdd_JapaneseVowels}" << endl;
-        return 1;
-    }
-    file.load(path + argv[1] + ".arff", datasets[argv[1]]);
-    auto attributes = file.getAttributes();
-    int items = file.getSize();
+    file.load(path + file_name + ".arff", class_last);
+    const auto attributes = file.getAttributes();
+    const auto items = file.getSize();
     cout << "Number of lines: " << items << endl;
     cout << "Attributes: " << endl;
     for (auto attribute : attributes) {
@@ -33,22 +104,93 @@ int main(int argc, char** argv)
     cout << "Class name: " << file.getClassName() << endl;
     cout << "Class type: " << file.getClassType() << endl;
     cout << "Data: " << endl;
-    vector<vector<float>>& X = file.getX();
-    vector<int>& y = file.getY();
-    for (int i = 0; i < 50; i++) {
+    vector<samples_t>& X = file.getX();
+    labels_t& y = file.getY();
+    for (int i = 0; i < 5; i++) {
         for (auto feature : X) {
             cout << fixed << setprecision(1) << feature[i] << " ";
         }
         cout << y[i] << endl;
     }
-    mdlp::CPPFImdlp test = mdlp::CPPFImdlp(0);
+    auto test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints);
+    size_t total = 0;
     for (auto i = 0; i < attributes.size(); i++) {
-        cout << "Cut points for " << get<0>(attributes[i]) << endl;
-        cout << "--------------------------" << setprecision(3) << endl;
+        auto min_max = minmax_element(X[i].begin(), X[i].end());
+        cout << "Cut points for feature " << get<0>(attributes[i]) << ": [" << setprecision(3);
         test.fit(X[i], y);
-        for (auto item : test.getCutPoints()) {
-            cout << item << endl;
+        auto cut_points = test.getCutPoints();
+        for (auto item : cut_points) {
+            cout << item;
+            if (item != cut_points.back())
+                cout << ", ";
         }
+        total += test.getCutPoints().size();
+        cout << "]" << endl;
+        cout << "Min: " << *min_max.first << " Max: " << *min_max.second << endl;
+        cout << "--------------------------" << endl;
+    }
+    cout << "Total cut points ...: " << total << endl;
+    cout << "Total feature states: " << total + attributes.size() << endl;
+}
+void process_all_files(const map<string, bool>& datasets, const string& path, int max_depth, int min_length,
+    float max_cutpoints)
+{
+    cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << " Max_cutpoints: "
+        << max_cutpoints << endl << endl;
+    printf("%-20s %4s %4s\n", "Dataset", "Feat", "Cuts Time(ms)");
+    printf("==================== ==== ==== ========\n");
+    for (const auto& dataset : datasets) {
+        ArffFiles file;
+        file.load(path + dataset.first + ".arff", dataset.second);
+        auto attributes = file.getAttributes();
+        vector<samples_t>& X = file.getX();
+        labels_t& y = file.getY();
+        size_t timing = 0;
+        size_t cut_points = 0;
+        for (auto i = 0; i < attributes.size(); i++) {
+            auto test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints);
+            std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
+            test.fit(X[i], y);
+            std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
+            timing += std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
+            cut_points += test.getCutPoints().size();
+        }
+        printf("%-20s %4lu %4zu %8zu\n", dataset.first.c_str(), attributes.size(), cut_points, timing);
+    }
+}
+int main(int argc, char** argv)
+{
+    map<string, bool> datasets = {
+        {"glass", true},
+        {"iris", true},
+        {"kdd_JapaneseVowels", false},
+        {"letter", true},
+        {"liver-disorders", true},
+        {"mfeat-factors", true},
+        {"test", true}
+    };
+    string file_name;
+    string path;
+    int max_depth;
+    int min_length;
+    float max_cutpoints;
+    tie(file_name, path, max_depth, min_length, max_cutpoints) = parse_arguments(argc, argv);
+    if (datasets.find(file_name) == datasets.end() && file_name != "all") {
+        cout << "Invalid file name: " << file_name << endl;
+        usage(argv[0]);
+        exit(1);
+    }
+    if (file_name == "all")
+        process_all_files(datasets, path, max_depth, min_length, max_cutpoints);
+    else {
+        process_file(path, file_name, datasets[file_name], max_depth, min_length, max_cutpoints);
+        cout << "File name ....: " << file_name << endl;
+        cout << "Max depth ....: " << max_depth << endl;
+        cout << "Min length ...: " << min_length << endl;
+        cout << "Max cutpoints : " << max_cutpoints << endl;
     }
     return 0;
 }

samples/sample.py

@@ -1,21 +1,37 @@
 import time
 import argparse
 import os
-from scipy.io import arff
 import pandas as pd
 from sklearn.ensemble import RandomForestClassifier
 from fimdlp.mdlp import FImdlp
+from fimdlp.cppfimdlp import CArffFiles
 datasets = {
     "mfeat-factors": True,
     "iris": True,
+    "glass": True,
+    "liver-disorders": True,
     "letter": True,
     "kdd_JapaneseVowels": False,
 }
 ap = argparse.ArgumentParser()
 ap.add_argument(
-    "--alternative", dest="proposal", action="store_const", const=1
+    "-n",
+    "--min_length",
+    type=int,
+    default=3,
+    help="Minimum length of interval",
+)
+ap.add_argument(
+    "-m", "--max_depth", type=int, default=9999, help="Maximum depth"
+)
+ap.add_argument(
+    "-c",
+    "--max_cuts",
+    type=float,
+    default=0,
+    help="Maximum number of cut points",
 )
 ap.add_argument("dataset", type=str, choices=datasets.keys())
 args = ap.parse_args()
@@ -23,22 +39,34 @@ relative = "" if os.path.isdir("src") else ".."
 file_name = os.path.join(
     relative, "src", "cppmdlp", "tests", "datasets", args.dataset
 )
-data = arff.loadarff(file_name + ".arff")
-df = pd.DataFrame(data[0])
-class_column = -1 if datasets[args.dataset] else 0
-class_name = df.columns.to_list()[class_column]
-X = df.drop(class_name, axis=1)
-y, _ = pd.factorize(df[class_name])
-X = X.to_numpy()
-test = FImdlp(algorithm=args.proposal if args.proposal is not None else 0)
+arff = CArffFiles()
+arff.load(bytes(f"{file_name}.arff", "utf-8"))
+X = arff.get_X()
+y = arff.get_y()
+attributes = arff.get_attributes()
+attributes = [x[0].decode() for x in attributes]
+df = pd.DataFrame(X, columns=attributes)
+class_name = arff.get_class_name().decode()
+df[class_name] = y
+test = FImdlp(
+    min_length=args.min_length,
+    max_depth=args.max_depth,
+    max_cuts=args.max_cuts,
+)
 now = time.time()
 test.fit(X, y)
 fit_time = time.time()
-print("Fitting: ", fit_time - now)
+print(f"Fitting ....: {fit_time - now:7.5f} seconds")
 now = time.time()
 Xt = test.transform(X)
-print("Transforming: ", time.time() - now)
-print(test.get_cut_points())
+print(f"Transforming: {time.time() - now:7.5f} seconds")
+cut_points = test.get_cut_points()
+for i, cuts in enumerate(cut_points):
+    print(f"Cut points for feature {attributes[i]}: {cuts}")
+    print(f"Min: {min(X[:, i]):6.4f} Max: {max(X[:, i]):6.4f}")
+num_cuts = sum([len(x) for x in cut_points])
+print(f"Total cut points ...: {num_cuts}")
+print(f"Total feature states: {num_cuts + len(attributes)}")
 clf = RandomForestClassifier(random_state=0)
 print(
     "Random Forest score with discretized data: ", clf.fit(Xt, y).score(Xt, y)

setup.py

@@ -15,6 +15,7 @@ setup(
             "src/cppmdlp/CPPFImdlp.cpp",
             "src/cppmdlp/Metrics.cpp",
             "src/fimdlp/Factorize.cpp",
+            "src/fimdlp/ArffFiles.cpp",
         ],
         language="c++",
         include_dirs=["fimdlp"],

src/fimdlp/ArffFiles.cpp (new file, 132 lines)

@@ -0,0 +1,132 @@
+#include "ArffFiles.h"
+#include <fstream>
+#include <sstream>
+#include <map>
+using namespace std;
+ArffFiles::ArffFiles() = default;
+vector<string> ArffFiles::getLines() const
+{
+    return lines;
+}
+unsigned long int ArffFiles::getSize() const
+{
+    return lines.size();
+}
+vector<pair<string, string>> ArffFiles::getAttributes() const
+{
+    return attributes;
+}
+string ArffFiles::getClassName() const
+{
+    return className;
+}
+string ArffFiles::getClassType() const
+{
+    return classType;
+}
+vector<vector<float>>& ArffFiles::getX()
+{
+    return X;
+}
+vector<int>& ArffFiles::getY()
+{
+    return y;
+}
+void ArffFiles::load(const string& fileName, bool classLast)
+{
+    ifstream file(fileName);
+    if (!file.is_open()) {
+        throw invalid_argument("Unable to open file");
+    }
+    string line;
+    string keyword;
+    string attribute;
+    string type;
+    string type_w;
+    while (getline(file, line)) {
+        if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
+            continue;
+        }
+        if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
+            stringstream ss(line);
+            ss >> keyword >> attribute;
+            type = "";
+            while (ss >> type_w)
+                type += type_w + " ";
+            attributes.emplace_back(trim(attribute), trim(type));
+            continue;
+        }
+        if (line[0] == '@') {
+            continue;
+        }
+        lines.push_back(line);
+    }
+    file.close();
+    if (attributes.empty())
+        throw invalid_argument("No attributes found");
+    if (classLast) {
+        className = get<0>(attributes.back());
+        classType = get<1>(attributes.back());
+        attributes.pop_back();
+    } else {
+        className = get<0>(attributes.front());
+        classType = get<1>(attributes.front());
+        attributes.erase(attributes.begin());
+    }
+    generateDataset(classLast);
+}
+void ArffFiles::generateDataset(bool classLast)
+{
+    X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
+    auto yy = vector<string>(lines.size(), "");
+    int labelIndex = classLast ? static_cast<int>(attributes.size()) : 0;
+    for (size_t i = 0; i < lines.size(); i++) {
+        stringstream ss(lines[i]);
+        string value;
+        int pos = 0;
+        int xIndex = 0;
+        while (getline(ss, value, ',')) {
+            if (pos++ == labelIndex) {
+                yy[i] = value;
+            } else {
+                X[xIndex++][i] = stof(value);
+            }
+        }
+    }
+    y = factorize(yy);
+}
+string ArffFiles::trim(const string& source)
+{
+    string s(source);
+    s.erase(0, s.find_first_not_of(" '\n\r\t"));
+    s.erase(s.find_last_not_of(" '\n\r\t") + 1);
+    return s;
+}
+vector<int> ArffFiles::factorize(const vector<string>& labels_t)
+{
+    vector<int> yy;
+    yy.reserve(labels_t.size());
+    map<string, int> labelMap;
+    int i = 0;
+    for (const string& label : labels_t) {
+        if (labelMap.find(label) == labelMap.end()) {
+            labelMap[label] = i++;
+        }
+        yy.push_back(labelMap[label]);
+    }
+    return yy;
+}

src/fimdlp/ArffFiles.h (new file, 34 lines)

@@ -0,0 +1,34 @@
+#ifndef ARFFFILES_H
+#define ARFFFILES_H
+#include <string>
+#include <vector>
+using namespace std;
+class ArffFiles {
+private:
+    vector<string> lines;
+    vector<pair<string, string>> attributes;
+    string className;
+    string classType;
+    vector<vector<float>> X;
+    vector<int> y;
+    void generateDataset(bool);
+public:
+    ArffFiles();
+    void load(const string&, bool = true);
+    vector<string> getLines() const;
+    unsigned long int getSize() const;
+    string getClassName() const;
+    string getClassType() const;
+    static string trim(const string&);
+    vector<vector<float>>& getX();
+    vector<int>& getY();
+    vector<pair<string, string>> getAttributes() const;
+    static vector<int> factorize(const vector<string>& labels_t);
+};
+#endif

src/fimdlp/Factorize.cpp

@@ -7,7 +7,7 @@ namespace utils {
     yy.reserve(labels_t.size());
     map<string, int> labelMap;
     int i = 0;
-    for (string label : labels_t) {
+    for (const string& label : labels_t) {
         if (labelMap.find(label) == labelMap.end()) {
             labelMap[label] = i++;
         }
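
As a quick illustration of the factorize binding exercised above (a minimal sketch; the label values are arbitrary):

```python
# Sketch: factorize maps labels to integer codes in order of first
# appearance, mirroring the C++ loop shown in the diff above.
from fimdlp.cppfimdlp import factorize

labels = [b"setosa", b"versicolor", b"setosa", b"virginica"]
print(factorize(labels))  # [0, 1, 0, 2]
```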

src/fimdlp/_version.py

@@ -1 +1 @@
-__version__ = "0.9.3"
+__version__ = "0.9.4"

src/fimdlp/cppfimdlp.pyx

@@ -1,20 +1,27 @@
 # distutils: language = c++
 # cython: language_level = 3
 from libcpp.vector cimport vector
+from libcpp.pair cimport pair
 from libcpp.string cimport string
+from libcpp cimport bool
+import numpy as np
+cdef extern from "limits.h":
+    cdef int INT_MAX
 cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp":
     ctypedef float precision_t
     cdef cppclass CPPFImdlp:
         CPPFImdlp() except +
+        CPPFImdlp(size_t, int, float) except +
         CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
+        int get_depth()
         vector[precision_t] getCutPoints()
         string version()
 cdef class CFImdlp:
     cdef CPPFImdlp *thisptr
-    def __cinit__(self):
-        self.thisptr = new CPPFImdlp()
+    def __cinit__(self, size_t min_length=3, int max_depth=INT_MAX, float max_cuts=0):
+        self.thisptr = new CPPFImdlp(min_length, max_depth, max_cuts)
     def __dealloc__(self):
         del self.thisptr
     def fit(self, X, y):
@@ -24,6 +31,8 @@ cdef class CFImdlp:
         return self.thisptr.getCutPoints()
     def get_version(self):
        return self.thisptr.version()
+    def get_depth(self):
+        return self.thisptr.get_depth()
     def __reduce__(self):
         return (CFImdlp, ())
@@ -31,3 +40,42 @@ cdef extern from "Factorize.h" namespace "utils":
     vector[int] cppFactorize(vector[string] &input_vector)
 def factorize(input_vector):
     return cppFactorize(input_vector)
+cdef extern from "ArffFiles.h":
+    cdef cppclass ArffFiles:
+        ArffFiles() except +
+        void load(string, bool)
+        unsigned long int getSize()
+        string getClassName()
+        string getClassType()
+        string trim(const string&)
+        vector[vector[float]]& getX()
+        vector[int]& getY()
+        vector[string] getLines()
+        vector[pair[string, string]] getAttributes()
+cdef class CArffFiles:
+    cdef ArffFiles *thisptr
+    def __cinit__(self):
+        self.thisptr = new ArffFiles()
+    def __dealloc__(self):
+        del self.thisptr
+    def load(self, string filename, bool verbose = True):
+        self.thisptr.load(filename, verbose)
+    def get_size(self):
+        return self.thisptr.getSize()
+    def get_class_name(self):
+        return self.thisptr.getClassName()
+    def get_class_type(self):
+        return self.thisptr.getClassType()
+    def get_X(self):
+        return np.array(self.thisptr.getX()).T
+    def get_y(self):
+        return self.thisptr.getY()
+    def get_lines(self):
+        return self.thisptr.getLines()
+    def get_attributes(self):
+        return self.thisptr.getAttributes()
+    def __reduce__(self):
+        return (CArffFiles, ())
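
A short usage sketch of the new bindings declared in this file (the dataset path is an assumption taken from the tests further below; the signatures are the ones shown in the diff):

```python
# Sketch: load an ARFF file with CArffFiles and discretize one feature
# with the parameterized CFImdlp constructor added in this diff.
from fimdlp.cppfimdlp import CArffFiles, CFImdlp

arff = CArffFiles()
arff.load(b"src/cppmdlp/tests/datasets/iris.arff")  # expects bytes
X = arff.get_X()  # numpy array, samples x features
y = arff.get_y()  # integer class labels

disc = CFImdlp(min_length=3, max_depth=2, max_cuts=0)
disc.fit(X[:, 0], y)
print(disc.get_cut_points())  # cut points of the first feature
print(disc.get_depth())       # depth reached while discretizing
```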

src/fimdlp/mdlp.py

@@ -6,12 +6,13 @@ from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 from joblib import Parallel, delayed
 from ._version import __version__
-# from ._version import __version__
 class FImdlp(TransformerMixin, BaseEstimator):
-    def __init__(self, n_jobs=-1):
+    def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6, max_cuts=0):
         self.n_jobs = n_jobs
+        self.min_length = min_length
+        self.max_depth = max_depth
+        self.max_cuts = max_cuts
     """Fayyad - Irani MDLP discretization algorithm based implementation.
@@ -21,6 +22,12 @@ class FImdlp(TransformerMixin, BaseEstimator):
         The number of jobs to run in parallel. :meth:`fit` and
         :meth:`transform`, are parallelized over the features. ``-1`` means
         using all cores available.
+    min_length: int, default=3
+        The minimum length of an interval to be considered to be discretized.
+    max_depth: int, default=1e6
+        The maximum depth of the discretization process.
+    max_cuts: float, default=0
+        The maximum number of cut points to be computed for each feature.
     Attributes
     ----------
@@ -95,17 +102,28 @@ class FImdlp(TransformerMixin, BaseEstimator):
         self._update_params(X, y)
         self.X_ = X
         self.y_ = y
+        self.efective_min_length_ = (
+            self.min_length
+            if self.min_length > 1
+            else int(self.min_length * X.shape[0])
+        )
         self.discretizer_ = [None] * self.n_features_in_
         self.cut_points_ = [None] * self.n_features_in_
         Parallel(n_jobs=self.n_jobs, prefer="threads")(
             delayed(self._fit_discretizer)(feature)
             for feature in range(self.n_features_in_)
         )
+        # target of every feature. Start with -1 => y (see join_fit)
+        self.target_ = [-1] * self.n_features_in_
         return self
     def _fit_discretizer(self, feature):
         if feature in self.features_:
-            self.discretizer_[feature] = CFImdlp()
+            self.discretizer_[feature] = CFImdlp(
+                min_length=self.efective_min_length_,
+                max_depth=self.max_depth,
+                max_cuts=self.max_cuts,
+            )
             self.discretizer_[feature].fit(self.X_[:, feature], self.y_)
             self.cut_points_[feature] = self.discretizer_[
                 feature
@@ -232,13 +250,21 @@ class FImdlp(TransformerMixin, BaseEstimator):
                 f"Target {target} not in range [0, {self.n_features_in_})"
             )
         if target in features:
-            raise ValueError("Target cannot in features to join")
+            raise ValueError("Target cannot be in features to join")
         y_join = [
             f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode()
             for item_y, items_x in zip(self.y_, data[:, features])
         ]
+        # Store in target_ the features used with class to discretize target
+        self.target_[target] = features + [-1]
         self.y_join_ = y_join
         self.discretizer_[target].fit(self.X_[:, target], factorize(y_join))
         self.cut_points_[target] = self.discretizer_[target].get_cut_points()
         # return the discretized target variable with the new cut points
         return np.searchsorted(self.cut_points_[target], self.X_[:, target])
+    def get_depths(self):
+        res = [0] * self.n_features_in_
+        for feature in self.features_:
+            res[feature] = self.discretizer_[feature].get_depth()
+        return res
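
To make the new bookkeeping concrete, here is a sketch of join_fit, target_ and get_depths in use (mirroring test_join_fit_info in the test suite below; iris is just a convenient dataset):

```python
# Sketch: join_fit re-discretizes feature 1 against the class label
# joined with the (already discretized) features 0 and 2; target_
# records which features were used (-1 stands for the class itself).
from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp

X, y = load_iris(return_X_y=True)
clf = FImdlp()
clf.fit(X, y)
xt1 = clf.join_fit([0, 2], 1, X)
print(xt1[:10])          # discretized values of feature 1
print(clf.target_)       # [-1, [0, 2, -1], -1, -1]
print(clf.get_depths())  # discretization depth reached per feature
```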

src/fimdlp/tests (unit test suite)

@@ -3,14 +3,14 @@ import sklearn
 import numpy as np
 from sklearn.datasets import load_iris
 from sklearn.utils.estimator_checks import check_estimator
-from ..cppfimdlp import CFImdlp, factorize
+from ..cppfimdlp import CFImdlp, factorize, CArffFiles
 from ..mdlp import FImdlp
 from .. import __version__
-# from .._version import __version__
 class FImdlpTest(unittest.TestCase):
+    delta = 1e-6  # same tolerance as in C++ code
     def test_version(self):
         clf = FImdlp()
         self.assertEqual(
@@ -18,11 +18,22 @@ class FImdlpTest(unittest.TestCase):
             f"{__version__}({CFImdlp().get_version().decode()})",
         )
+    def test_minimum_mdlp_version(self):
+        mdlp_version = tuple(
+            int(c) for c in CFImdlp().get_version().decode().split(".")[0:3]
+        )
+        minimum_mdlp_version = (1, 1, 2)
+        self.assertTrue(mdlp_version >= minimum_mdlp_version)
     def test_init(self):
         clf = FImdlp()
         self.assertEqual(-1, clf.n_jobs)
-        clf = FImdlp(n_jobs=7)
+        self.assertEqual(3, clf.min_length)
+        self.assertEqual(1e6, clf.max_depth)
+        clf = FImdlp(n_jobs=7, min_length=24, max_depth=17)
         self.assertEqual(7, clf.n_jobs)
+        self.assertEqual(24, clf.min_length)
+        self.assertEqual(17, clf.max_depth)
     def test_fit_definitive(self):
         clf = FImdlp()
@@ -32,15 +43,15 @@ class FImdlpTest(unittest.TestCase):
         self.assertTrue(np.array_equal(X, clf.X_))
         self.assertTrue(np.array_equal(y, clf.y_))
         expected = [
-            [5.449999809265137, 5.75],
-            [2.75, 2.8499999046325684, 2.95, 3.05, 3.3499999046325684],
-            [2.45, 4.75, 5.050000190734863],
+            [5.45, 5.75],
+            [2.75, 2.85, 2.95, 3.05, 3.35],
+            [2.45, 4.75, 5.05],
             [0.8, 1.75],
         ]
         computed = clf.get_cut_points()
         for item_computed, item_expected in zip(computed, expected):
             for x_, y_ in zip(item_computed, item_expected):
-                self.assertAlmostEqual(x_, y_)
+                self.assertAlmostEqual(x_, y_, delta=self.delta)
         self.assertListEqual([0, 1, 2, 3], clf.features_)
         clf.fit(X, y, features=[0, 2, 3])
         self.assertListEqual([0, 2, 3], clf.features_)
@@ -132,22 +143,32 @@ class FImdlpTest(unittest.TestCase):
         self.assertListEqual(expected, computed)
     def test_join_fit(self):
-        y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"])
+        y = np.array([b"f0", b"f0", b"f2", b"f3", b"f3", b"f4", b"f4"])
         x = np.array(
             [
-                [0, 1, 2, 3, 4],
-                [0, 1, 2, 3, 4],
-                [1, 2, 3, 4, 5],
-                [2, 3, 4, 5, 6],
-                [3, 4, 5, 6, 7],
+                [0, 1, 2, 3, 4, 5],
+                [0, 2, 2, 3, 4, 5],
+                [1, 2, 3, 4, 5, 5],
+                [2, 3, 4, 5, 6, 6],
+                [3, 4, 5, 6, 7, 7],
+                [1, 2, 2, 3, 5, 7],
+                [1, 3, 4, 4, 4, 7],
             ]
         )
-        expected = [0, 0, 1, 2, 2]
+        expected = [0, 1, 1, 2, 2, 1, 2]
         clf = FImdlp()
         clf.fit(x, factorize(y))
-        computed = clf.join_fit([0, 2], 1, x)
+        computed = clf.join_fit([0, 2, 3, 4], 1, x)
         self.assertListEqual(computed.tolist(), expected)
-        expected_y = [b"002", b"002", b"113", b"224", b"335"]
+        expected_y = [
+            b"00234",
+            b"00234",
+            b"11345",
+            b"22456",
+            b"23567",
+            b"31235",
+            b"31444",
+        ]
         self.assertListEqual(expected_y, clf.y_join_)
     def test_join_fit_error(self):
@@ -192,7 +213,7 @@ class FImdlpTest(unittest.TestCase):
             clf.join_fit([0, 2], 2, x)
         self.assertEqual(
             str(exception.exception),
-            "Target cannot in features to join",
+            "Target cannot be in features to join",
         )
     def test_factorize(self):
@@ -205,6 +226,16 @@ class FImdlpTest(unittest.TestCase):
         computed = clf.factorize(y)
         self.assertListEqual([0, 1, 1, 2, 3], computed)
+    def test_join_fit_info(self):
+        clf = FImdlp()
+        X, y = load_iris(return_X_y=True)
+        clf.fit(X, y)
+        clf.join_fit([0, 2], 1, X)
+        clf.join_fit([0, 3], 2, X)
+        clf.join_fit([1, 2], 3, X)
+        expected = [-1, [0, 2, -1], [0, 3, -1], [1, 2, -1]]
+        self.assertListEqual(expected, clf.target_)
     @staticmethod
     def test_sklearn_transformer():
         for check, test in check_estimator(FImdlp(), generate_only=True):
@@ -227,3 +258,110 @@ class FImdlpTest(unittest.TestCase):
         X, y = load_iris(return_X_y=True)
         clf.fit(X, y)
         self.assertIsNone(clf.get_states_feature(4))
+    def test_MaxDepth(self):
+        clf = FImdlp(max_depth=1)
+        X, y = load_iris(return_X_y=True)
+        clf.fit(X, y)
+        expected_cutpoints = [
+            [5.45],
+            [3.35],
+            [2.45],
+            [0.8],
+        ]
+        expected_depths = [1] * 4
+        self.assertListEqual(expected_depths, clf.get_depths())
+        for expected, computed in zip(
+            expected_cutpoints, clf.get_cut_points()
+        ):
+            for e, c in zip(expected, computed):
+                self.assertAlmostEqual(e, c, delta=self.delta)
+    def test_MinLength(self):
+        clf = FImdlp(min_length=75)
+        X, y = load_iris(return_X_y=True)
+        clf.fit(X, y)
+        expected_cutpoints = [
+            [5.45, 5.75],
+            [2.85, 3.35],
+            [2.45, 4.75],
+            [0.8, 1.75],
+        ]
+        expected_depths = [3, 2, 2, 2]
+        self.assertListEqual(expected_depths, clf.get_depths())
+        for expected, computed in zip(
+            expected_cutpoints, clf.get_cut_points()
+        ):
+            for e, c in zip(expected, computed):
+                self.assertAlmostEqual(e, c, delta=self.delta)
+    def test_MinLengthMaxDepth(self):
+        clf = FImdlp(min_length=75, max_depth=2)
+        X, y = load_iris(return_X_y=True)
+        clf.fit(X, y)
+        expected_cutpoints = [
+            [5.45, 5.75],
+            [2.85, 3.35],
+            [2.45, 4.75],
+            [0.8, 1.75],
+        ]
+        expected_depths = [2, 2, 2, 2]
+        self.assertListEqual(expected_depths, clf.get_depths())
+        for expected, computed in zip(
+            expected_cutpoints, clf.get_cut_points()
+        ):
+            for e, c in zip(expected, computed):
+                self.assertAlmostEqual(e, c, delta=self.delta)
+    def test_max_cuts(self):
+        clf = FImdlp(max_cuts=1)
+        X, y = load_iris(return_X_y=True)
+        clf.fit(X, y)
+        expected_cutpoints = [
+            [5.45],
+            [2.85],
+            [2.45],
+            [0.8],
+        ]
+        expected_depths = [3, 5, 4, 3]
+        self.assertListEqual(expected_depths, clf.get_depths())
+        for expected, computed in zip(
+            expected_cutpoints, clf.get_cut_points()
+        ):
+            for e, c in zip(expected, computed):
+                self.assertAlmostEqual(e, c, delta=self.delta)
+    def test_ArffFiles(self):
+        loader = CArffFiles()
+        loader.load(b"src/cppmdlp/tests/datasets/iris.arff")
+        X = loader.get_X()
+        y = loader.get_y()
+        expected = [
+            (b"sepallength", b"REAL"),
+            (b"sepalwidth", b"REAL"),
+            (b"petallength", b"REAL"),
+            (b"petalwidth", b"REAL"),
+        ]
+        self.assertListEqual(loader.get_attributes(), expected)
+        self.assertListEqual(y[:10], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        expected = [
+            b"5.1,3.5,1.4,0.2,Iris-setosa",
+            b"4.9,3.0,1.4,0.2,Iris-setosa",
+            b"4.7,3.2,1.3,0.2,Iris-setosa",
+            b"4.6,3.1,1.5,0.2,Iris-setosa",
+            b"5.0,3.6,1.4,0.2,Iris-setosa",
+            b"5.4,3.9,1.7,0.4,Iris-setosa",
+            b"4.6,3.4,1.4,0.3,Iris-setosa",
+            b"5.0,3.4,1.5,0.2,Iris-setosa",
+            b"4.4,2.9,1.4,0.2,Iris-setosa",
+            b"4.9,3.1,1.5,0.1,Iris-setosa",
+        ]
+        self.assertListEqual(loader.get_lines()[:10], expected)
+        expected_X = [
+            [5.0999999, 3.5, 1.39999998, 0.2],
+            [4.9000001, 3, 1.39999998, 0.2],
+            [4.69999981, 3.20000005, 1.29999995, 0.2],
+        ]
+        for computed, expected in zip(X[:3].tolist(), expected_X):
+            for c, e in zip(computed, expected):
+                self.assertAlmostEqual(c, e, delta=self.delta)