New version of library and tests

This commit is contained in:
2023-02-26 17:59:08 +01:00
parent 900cccf76b
commit aa55d3a340
8 changed files with 305 additions and 13 deletions

View File

@@ -1,4 +1,5 @@
include src/cppmdlp/CPPFImdlp.h include src/cppmdlp/CPPFImdlp.h
include src/cppmdlp/typesFImdlp.h include src/cppmdlp/typesFImdlp.h
include src/cppmdlp/Metrics.h include src/cppmdlp/Metrics.h
include src/fimdlp/Factorize.h include src/fimdlp/Factorize.h
include src/fimdlp/ArffFiles.h

View File

@@ -15,6 +15,7 @@ setup(
"src/cppmdlp/CPPFImdlp.cpp", "src/cppmdlp/CPPFImdlp.cpp",
"src/cppmdlp/Metrics.cpp", "src/cppmdlp/Metrics.cpp",
"src/fimdlp/Factorize.cpp", "src/fimdlp/Factorize.cpp",
"src/fimdlp/ArffFiles.cpp",
], ],
language="c++", language="c++",
include_dirs=["fimdlp"], include_dirs=["fimdlp"],

116
src/fimdlp/ArffFiles.cpp Normal file
View File

@@ -0,0 +1,116 @@
#include "ArffFiles.h"
#include <cctype>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
using namespace std;
/// Creates a loader with empty containers; nothing is read until load() is called.
ArffFiles::ArffFiles() = default;
vector<string> ArffFiles::getLines()
{
return lines;
}
/// Returns how many data lines are currently stored.
unsigned long int ArffFiles::getSize()
{
    const unsigned long int count = lines.size();
    return count;
}
vector<pair<string, string>> ArffFiles::getAttributes()
{
return attributes;
}
string ArffFiles::getClassName()
{
return className;
}
string ArffFiles::getClassType()
{
return classType;
}
vector<vector<float>>& ArffFiles::getX()
{
return X;
}
vector<int>& ArffFiles::getY()
{
return y;
}
void ArffFiles::load(string fileName, bool classLast)
{
ifstream file(fileName);
string keyword, attribute, type;
if (file.is_open()) {
string line;
while (getline(file, line)) {
if (line[0] == '%' || line.empty() || line == "\r" || line == " ") {
continue;
}
if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
stringstream ss(line);
ss >> keyword >> attribute >> type;
attributes.push_back({ attribute, type });
continue;
}
if (line[0] == '@') {
continue;
}
lines.push_back(line);
}
file.close();
if (attributes.empty())
throw invalid_argument("No attributes found");
if (classLast) {
className = get<0>(attributes.back());
classType = get<1>(attributes.back());
attributes.pop_back();
} else {
className = get<0>(attributes.front());
classType = get<1>(attributes.front());
attributes.erase(attributes.begin());
}
generateDataset(classLast);
} else
throw invalid_argument("Unable to open file");
}
void ArffFiles::generateDataset(bool classLast)
{
X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
vector<string> yy = vector<string>(lines.size(), "");
int labelIndex = classLast ? attributes.size() : 0;
for (int i = 0; i < lines.size(); i++) {
stringstream ss(lines[i]);
string value;
int pos = 0, xIndex = 0;
while (getline(ss, value, ',')) {
if (pos++ == labelIndex) {
yy[i] = value;
} else {
X[xIndex++][i] = stof(value);
}
}
}
y = factorize(yy);
}
/**
 * Returns a copy of source without leading and trailing blanks
 * (space, newline, carriage return, tab).
 *
 * @param source text to clean; it is not modified
 * @return the trimmed text, empty when source holds only blanks
 */
string ArffFiles::trim(const string& source)
{
    const char* const blanks = " \n\r\t";
    const auto head = source.find_first_not_of(blanks);
    if (head == string::npos)
        return string();
    const auto tail = source.find_last_not_of(blanks);
    return source.substr(head, tail - head + 1);
}
/**
 * Encodes string labels as integers in order of first appearance.
 *
 * The first distinct label maps to 0, the next new one to 1, and so on;
 * repeated labels reuse the code they were given.
 *
 * @param labels_t labels to encode
 * @return one code per input label, in the same order
 */
vector<int> ArffFiles::factorize(const vector<string>& labels_t)
{
    vector<int> yy;
    yy.reserve(labels_t.size());
    map<string, int> labelMap;
    // Iterate by reference: copying every label string is pure overhead.
    for (const string& label : labels_t) {
        auto it = labelMap.find(label);
        if (it == labelMap.end()) {
            // The number of labels seen so far is the next free code.
            it = labelMap.emplace(label, static_cast<int>(labelMap.size())).first;
        }
        yy.push_back(it->second);
    }
    return yy;
}

27
src/fimdlp/ArffFiles.h Normal file
View File

@@ -0,0 +1,27 @@
#ifndef ARFFFILES_H
#define ARFFFILES_H
#include <string>
#include <vector>
using namespace std;
/**
 * Small ARFF reader: keeps the raw data lines, the declared attributes and a
 * numeric dataset (float feature matrix plus integer-encoded labels).
 */
class ArffFiles {
public:
    /// Builds an empty loader.
    ArffFiles();
    /// Reads fileName; classLast selects the last (true) or first (false)
    /// declared attribute as the class. Throws std::invalid_argument when the
    /// file cannot be opened or declares no attribute.
    void load(std::string fileName, bool classLast = true);
    /// Copy of the stored data lines.
    std::vector<std::string> getLines();
    /// Number of stored data lines.
    unsigned long int getSize();
    /// Name of the class attribute.
    std::string getClassName();
    /// Declared type of the class attribute.
    std::string getClassType();
    /// Copy of source with leading/trailing blanks removed.
    std::string trim(const std::string& source);
    /// Feature matrix, indexed as X[attribute][sample].
    std::vector<std::vector<float>>& getX();
    /// Integer-encoded labels, one per data line.
    std::vector<int>& getY();
    /// Copy of the (name, type) pairs of the non-class attributes.
    std::vector<std::pair<std::string, std::string>> getAttributes();
    /// Maps each label to an integer assigned in order of first appearance.
    std::vector<int> factorize(const std::vector<std::string>& labels_t);

private:
    std::vector<std::string> lines;
    std::vector<std::pair<std::string, std::string>> attributes;
    std::string className, classType;
    std::vector<std::vector<float>> X;
    std::vector<int> y;
    /// Fills X and y from the stored lines.
    void generateDataset(bool classLast);
};
#endif

View File

@@ -1,7 +1,10 @@
# distutils: language = c++ # distutils: language = c++
# cython: language_level = 3 # cython: language_level = 3
from libcpp.vector cimport vector from libcpp.vector cimport vector
from libcpp.pair cimport pair
from libcpp.string cimport string from libcpp.string cimport string
from libcpp cimport bool
import numpy as np
cdef extern from "limits.h": cdef extern from "limits.h":
cdef int INT_MAX cdef int INT_MAX
@@ -9,7 +12,7 @@ cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp":
ctypedef float precision_t ctypedef float precision_t
cdef cppclass CPPFImdlp: cdef cppclass CPPFImdlp:
CPPFImdlp() except + CPPFImdlp() except +
CPPFImdlp(int, int) except + CPPFImdlp(size_t, int) except +
CPPFImdlp& fit(vector[precision_t]&, vector[int]&) CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
int get_depth() int get_depth()
vector[precision_t] getCutPoints() vector[precision_t] getCutPoints()
@@ -17,7 +20,7 @@ cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp":
cdef class CFImdlp: cdef class CFImdlp:
cdef CPPFImdlp *thisptr cdef CPPFImdlp *thisptr
def __cinit__(self, int min_length=3, int max_depth=INT_MAX): def __cinit__(self, size_t min_length=3, int max_depth=INT_MAX):
self.thisptr = new CPPFImdlp(min_length, max_depth) self.thisptr = new CPPFImdlp(min_length, max_depth)
def __dealloc__(self): def __dealloc__(self):
del self.thisptr del self.thisptr
@@ -36,4 +39,43 @@ cdef class CFImdlp:
cdef extern from "Factorize.h" namespace "utils": cdef extern from "Factorize.h" namespace "utils":
vector[int] cppFactorize(vector[string] &input_vector) vector[int] cppFactorize(vector[string] &input_vector)
def factorize(input_vector): def factorize(input_vector):
return cppFactorize(input_vector) return cppFactorize(input_vector)
# C++ ARFF loader exposed to Cython.
cdef extern from "ArffFiles.h":
    cdef cppclass ArffFiles:
        ArffFiles() except +
        # load() throws std::invalid_argument (file cannot be opened, no
        # attributes found). `except +` makes Cython translate the C++
        # exception into a Python one instead of letting it escape.
        void load(string, bool) except +
        unsigned long int getSize()
        string getClassName()
        string getClassType()
        string trim(const string&)
        vector[vector[float]]& getX()
        vector[int]& getY()
        vector[string] getLines()
        vector[pair[string, string]] getAttributes()
cdef class CArffFiles:
    """Python wrapper that owns one C++ ``ArffFiles`` loader."""

    cdef ArffFiles *thisptr

    def __cinit__(self):
        # Allocated here, released in __dealloc__.
        self.thisptr = new ArffFiles()

    def __dealloc__(self):
        del self.thisptr

    def load(self, string filename, bool verbose = True):
        """Read ``filename`` into the loader.

        ``verbose`` is passed as the second argument of ``ArffFiles::load``,
        which the C++ side uses as its ``classLast`` flag (class attribute is
        the last one when true, the first one otherwise).
        """
        self.thisptr.load(filename, verbose)

    def get_size(self):
        """Number of data lines held by the loader."""
        return self.thisptr.getSize()

    def get_class_name(self):
        """Name of the class attribute."""
        return self.thisptr.getClassName()

    def get_class_type(self):
        """Declared type of the class attribute."""
        return self.thisptr.getClassType()

    def get_X(self):
        """Features as a numpy array, transposed from the C++ layout."""
        by_attribute = np.array(self.thisptr.getX())
        return by_attribute.T

    def get_y(self):
        """Integer-encoded class labels."""
        return self.thisptr.getY()

    def get_lines(self):
        """Raw data lines as stored by the loader."""
        return self.thisptr.getLines()

    def get_attributes(self):
        """(name, type) pairs of the non-class attributes."""
        return self.thisptr.getAttributes()

    def __reduce__(self):
        # Pickling recreates an empty loader; loaded data is not carried over.
        return CArffFiles, ()

View File

@@ -10,8 +10,10 @@ from ._version import __version__
class FImdlp(TransformerMixin, BaseEstimator): class FImdlp(TransformerMixin, BaseEstimator):
def __init__(self, n_jobs=-1): def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6):
self.n_jobs = n_jobs self.n_jobs = n_jobs
self.min_length = min_length
self.max_depth = max_depth
"""Fayyad - Irani MDLP discretization algorithm based implementation. """Fayyad - Irani MDLP discretization algorithm based implementation.
@@ -105,7 +107,9 @@ class FImdlp(TransformerMixin, BaseEstimator):
def _fit_discretizer(self, feature): def _fit_discretizer(self, feature):
if feature in self.features_: if feature in self.features_:
self.discretizer_[feature] = CFImdlp() self.discretizer_[feature] = CFImdlp(
min_length=self.min_length, max_depth=self.max_depth
)
self.discretizer_[feature].fit(self.X_[:, feature], self.y_) self.discretizer_[feature].fit(self.X_[:, feature], self.y_)
self.cut_points_[feature] = self.discretizer_[ self.cut_points_[feature] = self.discretizer_[
feature feature
@@ -242,3 +246,9 @@ class FImdlp(TransformerMixin, BaseEstimator):
self.cut_points_[target] = self.discretizer_[target].get_cut_points() self.cut_points_[target] = self.discretizer_[target].get_cut_points()
# return the discretized target variable with the new cut points # return the discretized target variable with the new cut points
return np.searchsorted(self.cut_points_[target], self.X_[:, target]) return np.searchsorted(self.cut_points_[target], self.X_[:, target])
def get_depths(self):
    """Return one depth per input feature.

    Positions listed in ``self.features_`` hold the value reported by that
    feature's discretizer; every other position stays 0.
    """
    depths = [0 for _ in range(self.n_features_in_)]
    for index in self.features_:
        fitted = self.discretizer_[index]
        depths[index] = fitted.get_depth()
    return depths

View File

@@ -3,7 +3,7 @@ import sklearn
import numpy as np import numpy as np
from sklearn.datasets import load_iris from sklearn.datasets import load_iris
from sklearn.utils.estimator_checks import check_estimator from sklearn.utils.estimator_checks import check_estimator
from ..cppfimdlp import CFImdlp, factorize from ..cppfimdlp import CFImdlp, factorize, CArffFiles
from ..mdlp import FImdlp from ..mdlp import FImdlp
from .. import __version__ from .. import __version__
@@ -11,6 +11,8 @@ from .. import __version__
class FImdlpTest(unittest.TestCase): class FImdlpTest(unittest.TestCase):
delta = 1e-6 # same tolerance as in C++ code
def test_version(self): def test_version(self):
clf = FImdlp() clf = FImdlp()
self.assertEqual( self.assertEqual(
@@ -21,8 +23,12 @@ class FImdlpTest(unittest.TestCase):
def test_init(self): def test_init(self):
clf = FImdlp() clf = FImdlp()
self.assertEqual(-1, clf.n_jobs) self.assertEqual(-1, clf.n_jobs)
clf = FImdlp(n_jobs=7) self.assertEqual(3, clf.min_length)
self.assertEqual(1e6, clf.max_depth)
clf = FImdlp(n_jobs=7, min_length=24, max_depth=17)
self.assertEqual(7, clf.n_jobs) self.assertEqual(7, clf.n_jobs)
self.assertEqual(24, clf.min_length)
self.assertEqual(17, clf.max_depth)
def test_fit_definitive(self): def test_fit_definitive(self):
clf = FImdlp() clf = FImdlp()
@@ -32,15 +38,15 @@ class FImdlpTest(unittest.TestCase):
self.assertTrue(np.array_equal(X, clf.X_)) self.assertTrue(np.array_equal(X, clf.X_))
self.assertTrue(np.array_equal(y, clf.y_)) self.assertTrue(np.array_equal(y, clf.y_))
expected = [ expected = [
[5.449999809265137, 5.75], [5.45, 5.75],
[2.75, 2.8499999046325684, 2.95, 3.05, 3.3499999046325684], [2.75, 2.85, 2.95, 3.05, 3.35],
[2.45, 4.75, 5.050000190734863], [2.45, 4.75, 5.05],
[0.8, 1.75], [0.8, 1.75],
] ]
computed = clf.get_cut_points() computed = clf.get_cut_points()
for item_computed, item_expected in zip(computed, expected): for item_computed, item_expected in zip(computed, expected):
for x_, y_ in zip(item_computed, item_expected): for x_, y_ in zip(item_computed, item_expected):
self.assertAlmostEqual(x_, y_) self.assertAlmostEqual(x_, y_, delta=self.delta)
self.assertListEqual([0, 1, 2, 3], clf.features_) self.assertListEqual([0, 1, 2, 3], clf.features_)
clf.fit(X, y, features=[0, 2, 3]) clf.fit(X, y, features=[0, 2, 3])
self.assertListEqual([0, 2, 3], clf.features_) self.assertListEqual([0, 2, 3], clf.features_)
@@ -227,3 +233,92 @@ class FImdlpTest(unittest.TestCase):
X, y = load_iris(return_X_y=True) X, y = load_iris(return_X_y=True)
clf.fit(X, y) clf.fit(X, y)
self.assertIsNone(clf.get_states_feature(4)) self.assertIsNone(clf.get_states_feature(4))
def test_MaxDepth(self):
    """max_depth=1 on iris: depth 1 and the listed first cut point per feature."""
    X, y = load_iris(return_X_y=True)
    discretizer = FImdlp(max_depth=1)
    discretizer.fit(X, y)
    self.assertListEqual([1, 1, 1, 1], discretizer.get_depths())
    reference = [[5.45], [3.35], [2.45], [0.8]]
    # zip stops at the shorter sequence, exactly as the pairwise comparison needs
    for wanted_row, actual_row in zip(reference, discretizer.get_cut_points()):
        for wanted, actual in zip(wanted_row, actual_row):
            self.assertAlmostEqual(wanted, actual, delta=self.delta)
def test_MinLength(self):
    """min_length=75 on iris: check the reported depths and cut points."""
    X, y = load_iris(return_X_y=True)
    discretizer = FImdlp(min_length=75)
    discretizer.fit(X, y)
    self.assertListEqual([3, 2, 2, 2], discretizer.get_depths())
    reference = [
        [5.45, 5.75],
        [2.85, 3.35],
        [2.45, 4.75],
        [0.8, 1.75],
    ]
    for wanted_row, actual_row in zip(reference, discretizer.get_cut_points()):
        for wanted, actual in zip(wanted_row, actual_row):
            self.assertAlmostEqual(wanted, actual, delta=self.delta)
def test_MinLengthMaxDepth(self):
    """min_length=75 with max_depth=2 on iris: depths are 2 for every feature."""
    X, y = load_iris(return_X_y=True)
    discretizer = FImdlp(min_length=75, max_depth=2)
    discretizer.fit(X, y)
    self.assertListEqual([2, 2, 2, 2], discretizer.get_depths())
    reference = [
        [5.45, 5.75],
        [2.85, 3.35],
        [2.45, 4.75],
        [0.8, 1.75],
    ]
    for wanted_row, actual_row in zip(reference, discretizer.get_cut_points()):
        for wanted, actual in zip(wanted_row, actual_row):
            self.assertAlmostEqual(wanted, actual, delta=self.delta)
def test_ArffFiles(self):
    """Load iris.arff through CArffFiles and check attributes, labels, lines and X."""
    arff = CArffFiles()
    arff.load(b"src/cppmdlp/tests/datasets/iris.arff")
    X = arff.get_X()
    y = arff.get_y()
    wanted_attributes = [
        (name, b"REAL")
        for name in (
            b"sepallength",
            b"sepalwidth",
            b"petallength",
            b"petalwidth",
        )
    ]
    self.assertListEqual(arff.get_attributes(), wanted_attributes)
    self.assertListEqual(y[:10], [0] * 10)
    wanted_lines = [
        b"5.1,3.5,1.4,0.2,Iris-setosa",
        b"4.9,3.0,1.4,0.2,Iris-setosa",
        b"4.7,3.2,1.3,0.2,Iris-setosa",
        b"4.6,3.1,1.5,0.2,Iris-setosa",
        b"5.0,3.6,1.4,0.2,Iris-setosa",
        b"5.4,3.9,1.7,0.4,Iris-setosa",
        b"4.6,3.4,1.4,0.3,Iris-setosa",
        b"5.0,3.4,1.5,0.2,Iris-setosa",
        b"4.4,2.9,1.4,0.2,Iris-setosa",
        b"4.9,3.1,1.5,0.1,Iris-setosa",
    ]
    self.assertListEqual(arff.get_lines()[:10], wanted_lines)
    # float32 values as read from the file, first three samples
    wanted_X = [
        [5.0999999, 3.5, 1.39999998, 0.2],
        [4.9000001, 3, 1.39999998, 0.2],
        [4.69999981, 3.20000005, 1.29999995, 0.2],
    ]
    for actual_row, wanted_row in zip(X[:3].tolist(), wanted_X):
        for actual, wanted in zip(actual_row, wanted_row):
            self.assertAlmostEqual(actual, wanted, delta=self.delta)