Merge pull request #7 from Doctorado-ML/hiperparameters

Add hyperparameters to the discretizer

Hyperparameters added:

- min_length: int, default=3: the minimum number of samples an interval must contain to be considered for further discretization. Values of 1 or below are interpreted as a fraction of the number of samples.
- max_depth: int, default=1e6: the maximum depth of the recursive discretization process.
- max_cuts: float, default=0: the maximum number of cut points computed for each feature. All candidate cut points are computed and the ones that produce the lowest entropy are selected; 0 means no limit.
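A minimal usage sketch of the new hyperparameters (the values passed here are illustrative; the API is the one added in this PR):

```python
from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp

X, y = load_iris(return_X_y=True)
# Cap the recursion at depth 2 and keep only the lowest-entropy cut per feature
trans = FImdlp(min_length=3, max_depth=2, max_cuts=1)
trans.fit(X, y)
Xt = trans.transform(X)
print(trans.get_cut_points())  # per-feature cut points
print(trans.get_depths())      # recursion depth reached per feature
```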
Ricardo Montañana Gómez, 2023-04-25 18:05:12 +02:00 (committed via GitHub)
18 changed files with 637 additions and 92 deletions


@@ -20,14 +20,14 @@ jobs:
      with:
        submodules: recursive
    - name: Set up Python ${{ matrix.python }}
-     uses: actions/setup-python@v2
+     uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python }}
    - name: Install dependencies
      run: |
        pip install -q --upgrade pip
        pip install -q scikit-learn cython
-       pip install -q --upgrade codecov coverage black flake8 codacy-coverage
+       pip install -q coverage black flake8 codacy-coverage
    - name: Build and install
      run: |
        make install
@@ -40,7 +40,7 @@ jobs:
        coverage run -m unittest discover -v -s src
        coverage xml
    - name: Upload coverage to Codecov
-     uses: codecov/codecov-action@v1
+     uses: codecov/codecov-action@v3
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        files: ./coverage.xml


@@ -1,4 +1,5 @@
include src/cppmdlp/CPPFImdlp.h
include src/cppmdlp/typesFImdlp.h
include src/cppmdlp/Metrics.h
include src/fimdlp/Factorize.h
+include src/fimdlp/ArffFiles.h


@@ -40,6 +40,7 @@ audit: ## Audit pip
version:
	@echo "Current Python version .: $(shell python --version)"
	@echo "Current FImdlp version .: $(shell python -c "from fimdlp import _version; print(_version.__version__)")"
+	@echo "Current mdlp version ...: $(shell python -c "from fimdlp.cppfimdlp import CFImdlp; print(CFImdlp().get_version().decode())")"
	@echo "Installed FImdlp version: $(shell pip show fimdlp | grep Version | cut -d' ' -f2)"
help: ## Show help message


@@ -25,7 +25,7 @@ git clone --recurse-submodules https://github.com/doctorado-ml/FImdlp.git
```bash
pip install -e .
python samples/sample.py iris
-python samples/sample.py iris --alternative
+python samples/sample.py iris -c 2
python samples/sample.py -h # for more options
```
@@ -33,9 +33,12 @@ python samples/sample.py -h # for more options
```bash
cd samples
-mkdir build
+cmake -B build
cd build
-cmake ..
make
-./sample iris
+./sample -f iris -c 2
+./sample -h
```
+### Based on
+[https://github.com/rmontanana/mdlp](https://github.com/rmontanana/mdlp)

k.py (deleted file, 12 lines)

@@ -1,12 +0,0 @@
from sklearn.datasets import load_wine
from fimdlp.mdlp import FImdlp
X, y = load_wine(return_X_y=True)
trans = FImdlp()
Xt = trans.join_transform(X, y, 12)
print("X shape = ", X.shape)
print("Xt.shape=", Xt.shape)
print("Xt ", Xt[:10])
print("trans.X_ shape = ", trans.X_.shape)
print("trans.y_ ", trans.y_[:10])
print("y_join ", trans.y_join_[:10])


@@ -21,7 +21,7 @@ dynamic = ['version']
dependencies = ["numpy", "joblib", "scikit-learn"]
requires-python = ">=3.9"
classifiers = [
-    "Development Status :: 3 - Alpha",
+    "Development Status :: 4 - Beta",
    "Intended Audience :: Science/Research",
    "Intended Audience :: Developers",
    "Topic :: Software Development",
@@ -33,14 +33,16 @@ classifiers = [
    "Programming Language :: Python",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
]
[project.urls]
Home = "https://github.com/doctorado-ml/FImdlp"
+Base = "https://github.com/rmontanana/mdlp"
[tool.black]
line-length = 79
-target_version = ['py39', 'py310']
+target_version = ['py39', 'py310', 'py311']
include = '\.pyi?$'
exclude = '''
/(


@@ -1,6 +1,7 @@
@@ -1,6 +1,7 @@
cmake_minimum_required(VERSION 3.20)
-project(main)
+project(sample)
-set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_BUILD_TYPE Debug)
add_executable(sample sample.cpp ../src/cppmdlp/tests/ArffFiles.cpp ../src/cppmdlp/Metrics.cpp ../src/cppmdlp/CPPFImdlp.cpp)


@@ -1,30 +1,101 @@
-#include "../src/cppmdlp/tests/ArffFiles.h"
#include <iostream>
#include <vector>
#include <iomanip>
+#include <chrono>
+#include <algorithm>
+#include <cstring>
+#include <getopt.h>
#include "../src/cppmdlp/CPPFImdlp.h"
+#include "../src/cppmdlp/tests/ArffFiles.h"
using namespace std;
+using namespace mdlp;
-int main(int argc, char** argv)
+const string PATH = "../../src/cppmdlp/tests/datasets/";
+/* print a description of all supported options */
+void usage(const char* path)
+{
+    /* take only the last portion of the path */
+    const char* basename = strrchr(path, '/');
+    basename = basename ? basename + 1 : path;
+    cout << "usage: " << basename << " [OPTION]" << endl;
+    cout << "  -h, --help\t\t Print this help and exit." << endl;
+    cout
+        << "  -f, --file[=FILENAME]\t {all, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}."
+        << endl;
+    cout << "  -p, --path[=FILENAME]\t folder where the arff dataset is located, default " << PATH << endl;
+    cout << "  -m, --max_depth=INT\t max_depth passed to discretizer. Default = MAX_INT" << endl;
+    cout
+        << "  -c, --max_cutpoints=FLOAT\t percentage of lines expressed as a decimal, or an integer number of cut points. Default = 0 -> any"
+        << endl;
+    cout << "  -n, --min_length=INT\t interval min_length passed to discretizer. Default = 3" << endl;
+}
+tuple<string, string, int, int, float> parse_arguments(int argc, char** argv)
+{
+    string file_name;
+    string path = PATH;
+    int max_depth = numeric_limits<int>::max();
+    int min_length = 3;
+    float max_cutpoints = 0;
+    const vector<struct option> long_options = {
+        {"help", no_argument, nullptr, 'h'},
+        {"file", required_argument, nullptr, 'f'},
+        {"path", required_argument, nullptr, 'p'},
+        {"max_depth", required_argument, nullptr, 'm'},
+        {"max_cutpoints", required_argument, nullptr, 'c'},
+        {"min_length", required_argument, nullptr, 'n'},
+        {nullptr, no_argument, nullptr, 0}
+    };
+    while (true) {
+        const auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options.data(), nullptr);
+        if (c == -1)
+            break;
+        switch (c) {
+            case 'h':
+                usage(argv[0]);
+                exit(0);
+            case 'f':
+                file_name = string(optarg);
+                break;
+            case 'm':
+                max_depth = stoi(optarg);
+                break;
+            case 'n':
+                min_length = stoi(optarg);
+                break;
+            case 'c':
+                max_cutpoints = stof(optarg);
+                break;
+            case 'p':
+                path = optarg;
+                if (path.back() != '/')
+                    path += '/';
+                break;
+            case '?':
+                usage(argv[0]);
+                exit(1);
+            default:
+                abort();
+        }
+    }
+    if (file_name.empty()) {
+        usage(argv[0]);
+        exit(1);
+    }
+    return make_tuple(file_name, path, max_depth, min_length, max_cutpoints);
+}
+void process_file(const string& path, const string& file_name, bool class_last, int max_depth, int min_length,
+    float max_cutpoints)
{
    ArffFiles file;
-    vector<string> lines;
-    string path = "../../src/cppmdlp/tests/datasets/";
-    map<string, bool> datasets = {
-        {"mfeat-factors", true},
-        {"iris", true},
-        {"letter", true},
-        {"kdd_JapaneseVowels", false}
-    };
-    if (argc != 2 || datasets.find(argv[1]) == datasets.end()) {
-        cout << "Usage: " << argv[0] << " {mfeat-factors, iris, letter, kdd_JapaneseVowels}" << endl;
-        return 1;
-    }
-    file.load(path + argv[1] + ".arff", datasets[argv[1]]);
+    file.load(path + file_name + ".arff", class_last);
-    auto attributes = file.getAttributes();
+    const auto attributes = file.getAttributes();
-    int items = file.getSize();
+    const auto items = file.getSize();
    cout << "Number of lines: " << items << endl;
    cout << "Attributes: " << endl;
    for (auto attribute : attributes) {
@@ -33,22 +104,93 @@ int main(int argc, char** argv)
    cout << "Class name: " << file.getClassName() << endl;
    cout << "Class type: " << file.getClassType() << endl;
    cout << "Data: " << endl;
-    vector<vector<float>>& X = file.getX();
+    vector<samples_t>& X = file.getX();
-    vector<int>& y = file.getY();
+    labels_t& y = file.getY();
-    for (int i = 0; i < 50; i++) {
+    for (int i = 0; i < 5; i++) {
        for (auto feature : X) {
            cout << fixed << setprecision(1) << feature[i] << " ";
        }
        cout << y[i] << endl;
    }
-    mdlp::CPPFImdlp test = mdlp::CPPFImdlp(0);
+    auto test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints);
+    size_t total = 0;
    for (auto i = 0; i < attributes.size(); i++) {
-        cout << "Cut points for " << get<0>(attributes[i]) << endl;
-        cout << "--------------------------" << setprecision(3) << endl;
+        auto min_max = minmax_element(X[i].begin(), X[i].end());
+        cout << "Cut points for feature " << get<0>(attributes[i]) << ": [" << setprecision(3);
        test.fit(X[i], y);
-        for (auto item : test.getCutPoints()) {
-            cout << item << endl;
+        auto cut_points = test.getCutPoints();
+        for (auto item : cut_points) {
+            cout << item;
+            if (item != cut_points.back())
+                cout << ", ";
        }
+        total += test.getCutPoints().size();
+        cout << "]" << endl;
+        cout << "Min: " << *min_max.first << " Max: " << *min_max.second << endl;
+        cout << "--------------------------" << endl;
    }
+    cout << "Total cut points ...: " << total << endl;
+    cout << "Total feature states: " << total + attributes.size() << endl;
+}
+void process_all_files(const map<string, bool>& datasets, const string& path, int max_depth, int min_length,
+    float max_cutpoints)
+{
+    cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << " Max_cutpoints: "
+        << max_cutpoints << endl << endl;
+    printf("%-20s %4s %4s %8s\n", "Dataset", "Feat", "Cuts", "Time(ms)");
+    printf("==================== ==== ==== ========\n");
+    for (const auto& dataset : datasets) {
+        ArffFiles file;
+        file.load(path + dataset.first + ".arff", dataset.second);
+        auto attributes = file.getAttributes();
+        vector<samples_t>& X = file.getX();
+        labels_t& y = file.getY();
+        size_t timing = 0;
+        size_t cut_points = 0;
+        for (auto i = 0; i < attributes.size(); i++) {
+            auto test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints);
+            std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
+            test.fit(X[i], y);
+            std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
+            timing += std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
+            cut_points += test.getCutPoints().size();
+        }
+        printf("%-20s %4lu %4zu %8zu\n", dataset.first.c_str(), attributes.size(), cut_points, timing);
+    }
+}
+int main(int argc, char** argv)
+{
+    map<string, bool> datasets = {
+        {"glass", true},
+        {"iris", true},
+        {"kdd_JapaneseVowels", false},
+        {"letter", true},
+        {"liver-disorders", true},
+        {"mfeat-factors", true},
+        {"test", true}
+    };
+    string file_name;
+    string path;
+    int max_depth;
+    int min_length;
+    float max_cutpoints;
+    tie(file_name, path, max_depth, min_length, max_cutpoints) = parse_arguments(argc, argv);
+    if (datasets.find(file_name) == datasets.end() && file_name != "all") {
+        cout << "Invalid file name: " << file_name << endl;
+        usage(argv[0]);
+        exit(1);
+    }
+    if (file_name == "all")
+        process_all_files(datasets, path, max_depth, min_length, max_cutpoints);
+    else {
+        process_file(path, file_name, datasets[file_name], max_depth, min_length, max_cutpoints);
+        cout << "File name ....: " << file_name << endl;
+        cout << "Max depth ....: " << max_depth << endl;
+        cout << "Min length ...: " << min_length << endl;
+        cout << "Max cutpoints : " << max_cutpoints << endl;
    }
    return 0;
}


@@ -1,21 +1,37 @@
import time
import argparse
import os
-from scipy.io import arff
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from fimdlp.mdlp import FImdlp
+from fimdlp.cppfimdlp import CArffFiles
datasets = {
    "mfeat-factors": True,
    "iris": True,
+    "glass": True,
+    "liver-disorders": True,
    "letter": True,
    "kdd_JapaneseVowels": False,
}
ap = argparse.ArgumentParser()
ap.add_argument(
-    "--alternative", dest="proposal", action="store_const", const=1
+    "-n",
+    "--min_length",
+    type=int,
+    default=3,
+    help="Minimum length of interval",
+)
+ap.add_argument(
+    "-m", "--max_depth", type=int, default=9999, help="Maximum depth"
+)
+ap.add_argument(
+    "-c",
+    "--max_cuts",
+    type=float,
+    default=0,
+    help="Maximum number of cut points",
)
ap.add_argument("dataset", type=str, choices=datasets.keys())
args = ap.parse_args()
@@ -23,22 +39,34 @@ relative = "" if os.path.isdir("src") else ".."
file_name = os.path.join(
    relative, "src", "cppmdlp", "tests", "datasets", args.dataset
)
-data = arff.loadarff(file_name + ".arff")
-df = pd.DataFrame(data[0])
-class_column = -1 if datasets[args.dataset] else 0
-class_name = df.columns.to_list()[class_column]
-X = df.drop(class_name, axis=1)
-y, _ = pd.factorize(df[class_name])
-X = X.to_numpy()
-test = FImdlp(algorithm=args.proposal if args.proposal is not None else 0)
+arff = CArffFiles()
+arff.load(bytes(f"{file_name}.arff", "utf-8"))
+X = arff.get_X()
+y = arff.get_y()
+attributes = arff.get_attributes()
+attributes = [x[0].decode() for x in attributes]
+df = pd.DataFrame(X, columns=attributes)
+class_name = arff.get_class_name().decode()
+df[class_name] = y
+test = FImdlp(
+    min_length=args.min_length,
+    max_depth=args.max_depth,
+    max_cuts=args.max_cuts,
+)
now = time.time()
test.fit(X, y)
fit_time = time.time()
-print("Fitting: ", fit_time - now)
+print(f"Fitting ....: {fit_time - now:7.5f} seconds")
now = time.time()
Xt = test.transform(X)
-print("Transforming: ", time.time() - now)
+print(f"Transforming: {time.time() - now:7.5f} seconds")
-print(test.get_cut_points())
+cut_points = test.get_cut_points()
+for i, cuts in enumerate(cut_points):
+    print(f"Cut points for feature {attributes[i]}: {cuts}")
+    print(f"Min: {min(X[:, i]):6.4f} Max: {max(X[:, i]):6.4f}")
+num_cuts = sum([len(x) for x in cut_points])
+print(f"Total cut points ...: {num_cuts}")
+print(f"Total feature states: {num_cuts + len(attributes)}")
clf = RandomForestClassifier(random_state=0)
print(
    "Random Forest score with discretized data: ", clf.fit(Xt, y).score(Xt, y)


@@ -15,6 +15,7 @@ setup(
            "src/cppmdlp/CPPFImdlp.cpp",
            "src/cppmdlp/Metrics.cpp",
            "src/fimdlp/Factorize.cpp",
+            "src/fimdlp/ArffFiles.cpp",
        ],
        language="c++",
        include_dirs=["fimdlp"],

src/fimdlp/ArffFiles.cpp (new file, 132 lines)

@@ -0,0 +1,132 @@
#include "ArffFiles.h"
#include <fstream>
#include <sstream>
#include <map>
using namespace std;
ArffFiles::ArffFiles() = default;
vector<string> ArffFiles::getLines() const
{
return lines;
}
unsigned long int ArffFiles::getSize() const
{
return lines.size();
}
vector<pair<string, string>> ArffFiles::getAttributes() const
{
return attributes;
}
string ArffFiles::getClassName() const
{
return className;
}
string ArffFiles::getClassType() const
{
return classType;
}
vector<vector<float>>& ArffFiles::getX()
{
return X;
}
vector<int>& ArffFiles::getY()
{
return y;
}
void ArffFiles::load(const string& fileName, bool classLast)
{
ifstream file(fileName);
if (!file.is_open()) {
throw invalid_argument("Unable to open file");
}
string line;
string keyword;
string attribute;
string type;
string type_w;
while (getline(file, line)) {
if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
continue;
}
if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
stringstream ss(line);
ss >> keyword >> attribute;
type = "";
while (ss >> type_w)
type += type_w + " ";
attributes.emplace_back(attribute, trim(type));
continue;
}
if (line[0] == '@') {
continue;
}
lines.push_back(line);
}
file.close();
if (attributes.empty())
throw invalid_argument("No attributes found");
if (classLast) {
className = get<0>(attributes.back());
classType = get<1>(attributes.back());
attributes.pop_back();
} else {
className = get<0>(attributes.front());
classType = get<1>(attributes.front());
attributes.erase(attributes.begin());
}
generateDataset(classLast);
}
void ArffFiles::generateDataset(bool classLast)
{
X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
auto yy = vector<string>(lines.size(), "");
int labelIndex = classLast ? static_cast<int>(attributes.size()) : 0;
for (size_t i = 0; i < lines.size(); i++) {
stringstream ss(lines[i]);
string value;
int pos = 0;
int xIndex = 0;
while (getline(ss, value, ',')) {
if (pos++ == labelIndex) {
yy[i] = value;
} else {
X[xIndex++][i] = stof(value);
}
}
}
y = factorize(yy);
}
string ArffFiles::trim(const string& source)
{
string s(source);
s.erase(0, s.find_first_not_of(" \n\r\t"));
s.erase(s.find_last_not_of(" \n\r\t") + 1);
return s;
}
vector<int> ArffFiles::factorize(const vector<string>& labels_t)
{
vector<int> yy;
yy.reserve(labels_t.size());
map<string, int> labelMap;
int i = 0;
for (const string& label : labels_t) {
if (labelMap.find(label) == labelMap.end()) {
labelMap[label] = i++;
}
yy.push_back(labelMap[label]);
}
return yy;
}

src/fimdlp/ArffFiles.h (new file, 34 lines)

@@ -0,0 +1,34 @@
#ifndef ARFFFILES_H
#define ARFFFILES_H
#include <string>
#include <vector>
using namespace std;
class ArffFiles {
private:
vector<string> lines;
vector<pair<string, string>> attributes;
string className;
string classType;
vector<vector<float>> X;
vector<int> y;
void generateDataset(bool);
public:
ArffFiles();
void load(const string&, bool = true);
vector<string> getLines() const;
unsigned long int getSize() const;
string getClassName() const;
string getClassType() const;
static string trim(const string&);
vector<vector<float>>& getX();
vector<int>& getY();
vector<pair<string, string>> getAttributes() const;
static vector<int> factorize(const vector<string>& labels_t);
};
#endif


@@ -7,7 +7,7 @@ namespace utils {
    yy.reserve(labels_t.size());
    map<string, int> labelMap;
    int i = 0;
-    for (string label : labels_t) {
+    for (const string& label : labels_t) {
        if (labelMap.find(label) == labelMap.end()) {
            labelMap[label] = i++;
        }


@@ -1 +1 @@
-__version__ = "0.9.3"
+__version__ = "0.9.4"


@@ -1,20 +1,27 @@
# distutils: language = c++
# cython: language_level = 3
from libcpp.vector cimport vector
+from libcpp.pair cimport pair
from libcpp.string cimport string
+from libcpp cimport bool
+import numpy as np
+cdef extern from "limits.h":
+    cdef int INT_MAX
cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp":
    ctypedef float precision_t
    cdef cppclass CPPFImdlp:
        CPPFImdlp() except +
+        CPPFImdlp(size_t, int, float) except +
        CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
+        int get_depth()
        vector[precision_t] getCutPoints()
        string version()
cdef class CFImdlp:
    cdef CPPFImdlp *thisptr
-    def __cinit__(self):
-        self.thisptr = new CPPFImdlp()
+    def __cinit__(self, size_t min_length=3, int max_depth=INT_MAX, float max_cuts=0):
+        self.thisptr = new CPPFImdlp(min_length, max_depth, max_cuts)
    def __dealloc__(self):
        del self.thisptr
    def fit(self, X, y):
@@ -24,10 +31,51 @@ cdef class CFImdlp:
        return self.thisptr.getCutPoints()
    def get_version(self):
        return self.thisptr.version()
+    def get_depth(self):
+        return self.thisptr.get_depth()
    def __reduce__(self):
        return (CFImdlp, ())
cdef extern from "Factorize.h" namespace "utils":
    vector[int] cppFactorize(vector[string] &input_vector)
def factorize(input_vector):
    return cppFactorize(input_vector)
+cdef extern from "ArffFiles.h":
+    cdef cppclass ArffFiles:
+        ArffFiles() except +
+        void load(string, bool)
+        unsigned long int getSize()
+        string getClassName()
+        string getClassType()
+        string trim(const string&)
+        vector[vector[float]]& getX()
+        vector[int]& getY()
+        vector[string] getLines()
+        vector[pair[string, string]] getAttributes()
+cdef class CArffFiles:
+    cdef ArffFiles *thisptr
+    def __cinit__(self):
+        self.thisptr = new ArffFiles()
+    def __dealloc__(self):
+        del self.thisptr
+    def load(self, string filename, bool verbose = True):
+        self.thisptr.load(filename, verbose)
+    def get_size(self):
+        return self.thisptr.getSize()
+    def get_class_name(self):
+        return self.thisptr.getClassName()
+    def get_class_type(self):
+        return self.thisptr.getClassType()
+    def get_X(self):
+        return np.array(self.thisptr.getX()).T
+    def get_y(self):
+        return self.thisptr.getY()
+    def get_lines(self):
+        return self.thisptr.getLines()
+    def get_attributes(self):
+        return self.thisptr.getAttributes()
+    def __reduce__(self):
+        return (CArffFiles, ())
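The new CArffFiles wrapper can be driven straight from Python; a minimal sketch, assuming it runs from the repository root so the dataset path resolves (the calls are the ones exercised by the tests below):

```python
from fimdlp.cppfimdlp import CArffFiles

loader = CArffFiles()
# load() takes a bytes path; the optional second argument tells whether
# the class is the last attribute (default True)
loader.load(b"src/cppmdlp/tests/datasets/iris.arff")
X = loader.get_X()  # samples x features numpy array
y = loader.get_y()  # integer-factorized labels
attributes = [name.decode() for name, _ in loader.get_attributes()]
print(attributes, X.shape, y[:10])
```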


@@ -6,12 +6,13 @@ from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from joblib import Parallel, delayed
from ._version import __version__
-# from ._version import __version__
class FImdlp(TransformerMixin, BaseEstimator):
-    def __init__(self, n_jobs=-1):
+    def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6, max_cuts=0):
        self.n_jobs = n_jobs
+        self.min_length = min_length
+        self.max_depth = max_depth
+        self.max_cuts = max_cuts
    """Fayyad - Irani MDLP discretization algorithm based implementation.
@@ -21,6 +22,12 @@ class FImdlp(TransformerMixin, BaseEstimator):
        The number of jobs to run in parallel. :meth:`fit` and
        :meth:`transform` are parallelized over the features. ``-1`` means
        using all cores available.
+    min_length: int, default=3
+        The minimum length an interval must have to be considered for
+        discretization.
+    max_depth: int, default=1e6
+        The maximum depth of the discretization process.
+    max_cuts: float, default=0
+        The maximum number of cut points to be computed for each feature.
    Attributes
    ----------
@@ -95,17 +102,28 @@ class FImdlp(TransformerMixin, BaseEstimator):
        self._update_params(X, y)
        self.X_ = X
        self.y_ = y
+        self.efective_min_length_ = (
+            self.min_length
+            if self.min_length > 1
+            else int(self.min_length * X.shape[0])
+        )
        self.discretizer_ = [None] * self.n_features_in_
        self.cut_points_ = [None] * self.n_features_in_
        Parallel(n_jobs=self.n_jobs, prefer="threads")(
            delayed(self._fit_discretizer)(feature)
            for feature in range(self.n_features_in_)
        )
+        # target of every feature. Start with -1 => y (see join_fit)
+        self.target_ = [-1] * self.n_features_in_
        return self
    def _fit_discretizer(self, feature):
        if feature in self.features_:
-            self.discretizer_[feature] = CFImdlp()
+            self.discretizer_[feature] = CFImdlp(
+                min_length=self.efective_min_length_,
+                max_depth=self.max_depth,
+                max_cuts=self.max_cuts,
+            )
            self.discretizer_[feature].fit(self.X_[:, feature], self.y_)
            self.cut_points_[feature] = self.discretizer_[
                feature
@@ -232,13 +250,21 @@
                f"Target {target} not in range [0, {self.n_features_in_})"
            )
        if target in features:
-            raise ValueError("Target cannot in features to join")
+            raise ValueError("Target cannot be in features to join")
        y_join = [
            f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode()
            for item_y, items_x in zip(self.y_, data[:, features])
        ]
+        # Store in target_ the features used with class to discretize target
+        self.target_[target] = features + [-1]
        self.y_join_ = y_join
        self.discretizer_[target].fit(self.X_[:, target], factorize(y_join))
        self.cut_points_[target] = self.discretizer_[target].get_cut_points()
        # return the discretized target variable with the new cut points
        return np.searchsorted(self.cut_points_[target], self.X_[:, target])
+    def get_depths(self):
+        res = [0] * self.n_features_in_
+        for feature in self.features_:
+            res[feature] = self.discretizer_[feature].get_depth()
+        return res
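One detail worth noting in fit(): min_length is used as an absolute interval length when it is greater than 1, and as a fraction of the number of samples otherwise (stored as efective_min_length_). A standalone sketch of just that rule, with illustrative values:

```python
# Sketch of the effective min_length rule applied in fit()
def effective_min_length(min_length, n_samples):
    # > 1: absolute number of samples; <= 1: fraction of the dataset
    return min_length if min_length > 1 else int(min_length * n_samples)

print(effective_min_length(3, 150))    # 3, used as-is
print(effective_min_length(0.1, 150))  # 15, i.e. 10% of 150 samples
```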


@@ -3,14 +3,14 @@ import sklearn
import numpy as np
from sklearn.datasets import load_iris
from sklearn.utils.estimator_checks import check_estimator
-from ..cppfimdlp import CFImdlp, factorize
+from ..cppfimdlp import CFImdlp, factorize, CArffFiles
from ..mdlp import FImdlp
from .. import __version__
-# from .._version import __version__
class FImdlpTest(unittest.TestCase):
+    delta = 1e-6  # same tolerance as in C++ code
    def test_version(self):
        clf = FImdlp()
        self.assertEqual(
@@ -18,11 +18,22 @@ class FImdlpTest(unittest.TestCase):
            f"{__version__}({CFImdlp().get_version().decode()})",
        )
+    def test_minimum_mdlp_version(self):
+        mdlp_version = tuple(
+            int(c) for c in CFImdlp().get_version().decode().split(".")[0:3]
+        )
+        minimum_mdlp_version = (1, 1, 2)
+        self.assertTrue(mdlp_version >= minimum_mdlp_version)
    def test_init(self):
        clf = FImdlp()
        self.assertEqual(-1, clf.n_jobs)
-        clf = FImdlp(n_jobs=7)
+        self.assertEqual(3, clf.min_length)
+        self.assertEqual(1e6, clf.max_depth)
+        clf = FImdlp(n_jobs=7, min_length=24, max_depth=17)
        self.assertEqual(7, clf.n_jobs)
+        self.assertEqual(24, clf.min_length)
+        self.assertEqual(17, clf.max_depth)
    def test_fit_definitive(self):
        clf = FImdlp()
@@ -32,15 +43,15 @@ class FImdlpTest(unittest.TestCase):
        self.assertTrue(np.array_equal(X, clf.X_))
        self.assertTrue(np.array_equal(y, clf.y_))
        expected = [
-            [5.449999809265137, 5.75],
-            [2.75, 2.8499999046325684, 2.95, 3.05, 3.3499999046325684],
-            [2.45, 4.75, 5.050000190734863],
+            [5.45, 5.75],
+            [2.75, 2.85, 2.95, 3.05, 3.35],
+            [2.45, 4.75, 5.05],
            [0.8, 1.75],
        ]
        computed = clf.get_cut_points()
        for item_computed, item_expected in zip(computed, expected):
            for x_, y_ in zip(item_computed, item_expected):
-                self.assertAlmostEqual(x_, y_)
+                self.assertAlmostEqual(x_, y_, delta=self.delta)
        self.assertListEqual([0, 1, 2, 3], clf.features_)
        clf.fit(X, y, features=[0, 2, 3])
        self.assertListEqual([0, 2, 3], clf.features_)
@@ -132,22 +143,32 @@ class FImdlpTest(unittest.TestCase):
        self.assertListEqual(expected, computed)
    def test_join_fit(self):
-        y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"])
+        y = np.array([b"f0", b"f0", b"f2", b"f3", b"f3", b"f4", b"f4"])
        x = np.array(
            [
-                [0, 1, 2, 3, 4],
-                [0, 1, 2, 3, 4],
-                [1, 2, 3, 4, 5],
-                [2, 3, 4, 5, 6],
-                [3, 4, 5, 6, 7],
+                [0, 1, 2, 3, 4, 5],
+                [0, 2, 2, 3, 4, 5],
+                [1, 2, 3, 4, 5, 5],
+                [2, 3, 4, 5, 6, 6],
+                [3, 4, 5, 6, 7, 7],
+                [1, 2, 2, 3, 5, 7],
+                [1, 3, 4, 4, 4, 7],
            ]
        )
-        expected = [0, 0, 1, 2, 2]
+        expected = [0, 1, 1, 2, 2, 1, 2]
        clf = FImdlp()
        clf.fit(x, factorize(y))
-        computed = clf.join_fit([0, 2], 1, x)
+        computed = clf.join_fit([0, 2, 3, 4], 1, x)
        self.assertListEqual(computed.tolist(), expected)
-        expected_y = [b"002", b"002", b"113", b"224", b"335"]
+        expected_y = [
+            b"00234",
+            b"00234",
+            b"11345",
+            b"22456",
+            b"23567",
+            b"31235",
+            b"31444",
+        ]
        self.assertListEqual(expected_y, clf.y_join_)
    def test_join_fit_error(self):
@@ -192,7 +213,7 @@ class FImdlpTest(unittest.TestCase):
            clf.join_fit([0, 2], 2, x)
        self.assertEqual(
            str(exception.exception),
-            "Target cannot in features to join",
+            "Target cannot be in features to join",
        )
    def test_factorize(self):
@@ -205,6 +226,16 @@ class FImdlpTest(unittest.TestCase):
        computed = clf.factorize(y)
        self.assertListEqual([0, 1, 1, 2, 3], computed)
+    def test_join_fit_info(self):
+        clf = FImdlp()
+        X, y = load_iris(return_X_y=True)
+        clf.fit(X, y)
+        clf.join_fit([0, 2], 1, X)
+        clf.join_fit([0, 3], 2, X)
+        clf.join_fit([1, 2], 3, X)
+        expected = [-1, [0, 2, -1], [0, 3, -1], [1, 2, -1]]
+        self.assertListEqual(expected, clf.target_)
    @staticmethod
    def test_sklearn_transformer():
        for check, test in check_estimator(FImdlp(), generate_only=True):
@@ -227,3 +258,110 @@ class FImdlpTest(unittest.TestCase):
        X, y = load_iris(return_X_y=True)
        clf.fit(X, y)
        self.assertIsNone(clf.get_states_feature(4))
+    def test_MaxDepth(self):
+        clf = FImdlp(max_depth=1)
+        X, y = load_iris(return_X_y=True)
+        clf.fit(X, y)
+        expected_cutpoints = [
+            [5.45],
+            [3.35],
+            [2.45],
+            [0.8],
+        ]
+        expected_depths = [1] * 4
+        self.assertListEqual(expected_depths, clf.get_depths())
+        for expected, computed in zip(
+            expected_cutpoints, clf.get_cut_points()
+        ):
+            for e, c in zip(expected, computed):
+                self.assertAlmostEqual(e, c, delta=self.delta)
+    def test_MinLength(self):
+        clf = FImdlp(min_length=75)
+        X, y = load_iris(return_X_y=True)
+        clf.fit(X, y)
+        expected_cutpoints = [
+            [5.45, 5.75],
+            [2.85, 3.35],
+            [2.45, 4.75],
+            [0.8, 1.75],
+        ]
+        expected_depths = [3, 2, 2, 2]
+        self.assertListEqual(expected_depths, clf.get_depths())
+        for expected, computed in zip(
+            expected_cutpoints, clf.get_cut_points()
+        ):
+            for e, c in zip(expected, computed):
+                self.assertAlmostEqual(e, c, delta=self.delta)
+    def test_MinLengthMaxDepth(self):
+        clf = FImdlp(min_length=75, max_depth=2)
+        X, y = load_iris(return_X_y=True)
+        clf.fit(X, y)
+        expected_cutpoints = [
+            [5.45, 5.75],
+            [2.85, 3.35],
+            [2.45, 4.75],
+            [0.8, 1.75],
+        ]
+        expected_depths = [2, 2, 2, 2]
+        self.assertListEqual(expected_depths, clf.get_depths())
+        for expected, computed in zip(
+            expected_cutpoints, clf.get_cut_points()
+        ):
+            for e, c in zip(expected, computed):
+                self.assertAlmostEqual(e, c, delta=self.delta)
+    def test_max_cuts(self):
+        clf = FImdlp(max_cuts=1)
+        X, y = load_iris(return_X_y=True)
+        clf.fit(X, y)
+        expected_cutpoints = [
+            [5.45],
+            [2.85],
+            [2.45],
+            [0.8],
+        ]
+        expected_depths = [3, 5, 4, 3]
+        self.assertListEqual(expected_depths, clf.get_depths())
+        for expected, computed in zip(
+            expected_cutpoints, clf.get_cut_points()
+        ):
+            for e, c in zip(expected, computed):
+                self.assertAlmostEqual(e, c, delta=self.delta)
+    def test_ArffFiles(self):
+        loader = CArffFiles()
+        loader.load(b"src/cppmdlp/tests/datasets/iris.arff")
+        X = loader.get_X()
+        y = loader.get_y()
+        expected = [
+            (b"sepallength", b"REAL"),
+            (b"sepalwidth", b"REAL"),
+            (b"petallength", b"REAL"),
+            (b"petalwidth", b"REAL"),
+        ]
+        self.assertListEqual(loader.get_attributes(), expected)
+        self.assertListEqual(y[:10], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        expected = [
+            b"5.1,3.5,1.4,0.2,Iris-setosa",
+            b"4.9,3.0,1.4,0.2,Iris-setosa",
+            b"4.7,3.2,1.3,0.2,Iris-setosa",
+            b"4.6,3.1,1.5,0.2,Iris-setosa",
+            b"5.0,3.6,1.4,0.2,Iris-setosa",
+            b"5.4,3.9,1.7,0.4,Iris-setosa",
+            b"4.6,3.4,1.4,0.3,Iris-setosa",
+            b"5.0,3.4,1.5,0.2,Iris-setosa",
+            b"4.4,2.9,1.4,0.2,Iris-setosa",
+            b"4.9,3.1,1.5,0.1,Iris-setosa",
+        ]
+        self.assertListEqual(loader.get_lines()[:10], expected)
+        expected_X = [
+            [5.0999999, 3.5, 1.39999998, 0.2],
+            [4.9000001, 3, 1.39999998, 0.2],
+            [4.69999981, 3.20000005, 1.29999995, 0.2],
+        ]
+        for computed, expected in zip(X[:3].tolist(), expected_X):
+            for c, e in zip(computed, expected):
+                self.assertAlmostEqual(c, e, delta=self.delta)