mirror of
https://github.com/Doctorado-ML/FImdlp.git
synced 2025-08-16 07:55:52 +00:00
New version of library and tests
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
include src/cppmdlp/CPPFImdlp.h
|
||||
include src/cppmdlp/typesFImdlp.h
|
||||
include src/cppmdlp/Metrics.h
|
||||
include src/fimdlp/Factorize.h
|
||||
include src/fimdlp/Factorize.h
|
||||
include src/fimdlp/ArffFiles.h
|
||||
|
1
setup.py
1
setup.py
@@ -15,6 +15,7 @@ setup(
|
||||
"src/cppmdlp/CPPFImdlp.cpp",
|
||||
"src/cppmdlp/Metrics.cpp",
|
||||
"src/fimdlp/Factorize.cpp",
|
||||
"src/fimdlp/ArffFiles.cpp",
|
||||
],
|
||||
language="c++",
|
||||
include_dirs=["fimdlp"],
|
||||
|
Submodule src/cppmdlp updated: 964555de20...a7d13f602d
116
src/fimdlp/ArffFiles.cpp
Normal file
116
src/fimdlp/ArffFiles.cpp
Normal file
@@ -0,0 +1,116 @@
|
||||
#include "ArffFiles.h"
#include <cctype>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <stdexcept>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Default constructor: there is nothing to set up, every member is
// default-constructed.
ArffFiles::ArffFiles() = default;
|
||||
vector<string> ArffFiles::getLines()
|
||||
{
|
||||
return lines;
|
||||
}
|
||||
unsigned long int ArffFiles::getSize()
|
||||
{
|
||||
return lines.size();
|
||||
}
|
||||
vector<pair<string, string>> ArffFiles::getAttributes()
|
||||
{
|
||||
return attributes;
|
||||
}
|
||||
string ArffFiles::getClassName()
|
||||
{
|
||||
return className;
|
||||
}
|
||||
string ArffFiles::getClassType()
|
||||
{
|
||||
return classType;
|
||||
}
|
||||
vector<vector<float>>& ArffFiles::getX()
|
||||
{
|
||||
return X;
|
||||
}
|
||||
vector<int>& ArffFiles::getY()
|
||||
{
|
||||
return y;
|
||||
}
|
||||
void ArffFiles::load(string fileName, bool classLast)
|
||||
{
|
||||
ifstream file(fileName);
|
||||
string keyword, attribute, type;
|
||||
if (file.is_open()) {
|
||||
string line;
|
||||
while (getline(file, line)) {
|
||||
if (line[0] == '%' || line.empty() || line == "\r" || line == " ") {
|
||||
continue;
|
||||
}
|
||||
if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
|
||||
stringstream ss(line);
|
||||
ss >> keyword >> attribute >> type;
|
||||
attributes.push_back({ attribute, type });
|
||||
continue;
|
||||
}
|
||||
if (line[0] == '@') {
|
||||
continue;
|
||||
}
|
||||
lines.push_back(line);
|
||||
}
|
||||
file.close();
|
||||
if (attributes.empty())
|
||||
throw invalid_argument("No attributes found");
|
||||
if (classLast) {
|
||||
className = get<0>(attributes.back());
|
||||
classType = get<1>(attributes.back());
|
||||
attributes.pop_back();
|
||||
} else {
|
||||
className = get<0>(attributes.front());
|
||||
classType = get<1>(attributes.front());
|
||||
attributes.erase(attributes.begin());
|
||||
}
|
||||
generateDataset(classLast);
|
||||
} else
|
||||
throw invalid_argument("Unable to open file");
|
||||
}
|
||||
void ArffFiles::generateDataset(bool classLast)
|
||||
{
|
||||
X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
|
||||
vector<string> yy = vector<string>(lines.size(), "");
|
||||
int labelIndex = classLast ? attributes.size() : 0;
|
||||
for (int i = 0; i < lines.size(); i++) {
|
||||
stringstream ss(lines[i]);
|
||||
string value;
|
||||
int pos = 0, xIndex = 0;
|
||||
while (getline(ss, value, ',')) {
|
||||
if (pos++ == labelIndex) {
|
||||
yy[i] = value;
|
||||
} else {
|
||||
X[xIndex++][i] = stof(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
y = factorize(yy);
|
||||
}
|
||||
// Returns a copy of `source` without leading and trailing blanks
// (space, newline, carriage return, tab). A string made only of blanks
// yields an empty string.
string ArffFiles::trim(const string& source)
{
    static const char* const blanks = " \n\r\t";
    const string::size_type first = source.find_first_not_of(blanks);
    if (first == string::npos)
        return string();
    const string::size_type last = source.find_last_not_of(blanks);
    return source.substr(first, last - first + 1);
}
|
||||
// Maps each label to an integer code, assigned in order of first appearance
// (the first distinct label gets 0, the next one 1, ...), and returns the
// codes in the same order as the input.
vector<int> ArffFiles::factorize(const vector<string>& labels_t)
{
    vector<int> codes;
    codes.reserve(labels_t.size());
    map<string, int> labelMap;
    // Iterate by const reference: the labels are only read, never modified.
    for (const string& label : labels_t) {
        // emplace() leaves an existing entry untouched, so the candidate code
        // (the current map size) is only used for labels not seen before.
        const auto entry = labelMap.emplace(label, static_cast<int>(labelMap.size()));
        codes.push_back(entry.first->second);
    }
    return codes;
}
|
27
src/fimdlp/ArffFiles.h
Normal file
27
src/fimdlp/ArffFiles.h
Normal file
@@ -0,0 +1,27 @@
|
||||
#ifndef ARFFFILES_H
|
||||
#define ARFFFILES_H
|
||||
#include <string>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
// Loader for ARFF data files: keeps the raw data rows, the declared
// attributes, the class attribute and the numeric dataset derived from them.
class ArffFiles {
public:
    ArffFiles();

    // Reads `fileName`; `classLast` selects the last (true) or the first
    // (false) declared attribute as the class.
    void load(std::string fileName, bool classLast = true);

    // Accessors returning copies of the stored members.
    std::vector<std::string> getLines();
    unsigned long int getSize();
    std::string getClassName();
    std::string getClassType();
    std::vector<std::pair<std::string, std::string>> getAttributes();

    // Accessors returning references to the stored members.
    std::vector<std::vector<float>>& getX();
    std::vector<int>& getY();

    // Helpers.
    std::string trim(const std::string& source);
    std::vector<int> factorize(const std::vector<std::string>& labels_t);

private:
    std::vector<std::string> lines;                               // raw data rows
    std::vector<std::pair<std::string, std::string>> attributes;  // (name, type)
    std::string className, classType;
    std::vector<std::vector<float>> X;
    std::vector<int> y;

    void generateDataset(bool classLast);
};
|
||||
#endif
|
@@ -1,7 +1,10 @@
|
||||
# distutils: language = c++
|
||||
# cython: language_level = 3
|
||||
from libcpp.vector cimport vector
|
||||
from libcpp.pair cimport pair
|
||||
from libcpp.string cimport string
|
||||
from libcpp cimport bool
|
||||
import numpy as np
|
||||
|
||||
cdef extern from "limits.h":
|
||||
cdef int INT_MAX
|
||||
@@ -9,7 +12,7 @@ cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp":
|
||||
ctypedef float precision_t
|
||||
cdef cppclass CPPFImdlp:
|
||||
CPPFImdlp() except +
|
||||
CPPFImdlp(int, int) except +
|
||||
CPPFImdlp(size_t, int) except +
|
||||
CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
|
||||
int get_depth()
|
||||
vector[precision_t] getCutPoints()
|
||||
@@ -17,7 +20,7 @@ cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp":
|
||||
|
||||
cdef class CFImdlp:
|
||||
cdef CPPFImdlp *thisptr
|
||||
def __cinit__(self, int min_length=3, int max_depth=INT_MAX):
|
||||
def __cinit__(self, size_t min_length=3, int max_depth=INT_MAX):
|
||||
self.thisptr = new CPPFImdlp(min_length, max_depth)
|
||||
def __dealloc__(self):
|
||||
del self.thisptr
|
||||
@@ -36,4 +39,43 @@ cdef class CFImdlp:
|
||||
cdef extern from "Factorize.h" namespace "utils":
|
||||
vector[int] cppFactorize(vector[string] &input_vector)
|
||||
def factorize(input_vector):
|
||||
return cppFactorize(input_vector)
|
||||
return cppFactorize(input_vector)
|
||||
|
||||
cdef extern from "ArffFiles.h":
    # Cython view of the C++ ArffFiles loader.
    cdef cppclass ArffFiles:
        ArffFiles() except +
        # load() throws C++ exceptions (std::invalid_argument when the file
        # cannot be opened or has no attributes). `except +` makes Cython
        # translate them into Python exceptions instead of letting them
        # escape into the interpreter.
        void load(string, bool) except +
        unsigned long int getSize()
        string getClassName()
        string getClassType()
        string trim(const string&)
        vector[vector[float]]& getX()
        vector[int]& getY()
        vector[string] getLines()
        vector[pair[string, string]] getAttributes()
|
||||
|
||||
cdef class CArffFiles:
    """Python wrapper that owns a heap-allocated C++ ``ArffFiles`` loader."""
    cdef ArffFiles *thisptr

    def __cinit__(self):
        self.thisptr = new ArffFiles()

    def __dealloc__(self):
        del self.thisptr

    def load(self, string filename, bool class_last=True, verbose=None):
        """Load the ARFF file ``filename`` (bytes path).

        ``class_last`` is forwarded as the second argument of the C++
        ``ArffFiles::load`` (its ``classLast`` flag): True takes the last
        declared attribute as the class, False the first one.

        ``verbose`` is the former, misleading name of that flag; it is kept
        as a keyword alias for backward compatibility and, when given,
        overrides ``class_last``.
        """
        if verbose is not None:
            class_last = verbose
        self.thisptr.load(filename, class_last)

    def get_size(self):
        """Return the value of the C++ ``getSize()``."""
        return self.thisptr.getSize()

    def get_class_name(self):
        """Return the value of the C++ ``getClassName()``."""
        return self.thisptr.getClassName()

    def get_class_type(self):
        """Return the value of the C++ ``getClassType()``."""
        return self.thisptr.getClassType()

    def get_X(self):
        """Return the C++ ``getX()`` data as a transposed numpy array."""
        return np.array(self.thisptr.getX()).T

    def get_y(self):
        """Return the value of the C++ ``getY()``."""
        return self.thisptr.getY()

    def get_lines(self):
        """Return the value of the C++ ``getLines()``."""
        return self.thisptr.getLines()

    def get_attributes(self):
        """Return the value of the C++ ``getAttributes()``."""
        return self.thisptr.getAttributes()

    def __reduce__(self):
        # Rebuilds the object by calling CArffFiles() with no arguments, so
        # data loaded into this instance is not carried over.
        return (CArffFiles, ())
|
||||
|
@@ -10,8 +10,10 @@ from ._version import __version__
|
||||
|
||||
|
||||
class FImdlp(TransformerMixin, BaseEstimator):
|
||||
def __init__(self, n_jobs=-1):
|
||||
def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6):
|
||||
self.n_jobs = n_jobs
|
||||
self.min_length = min_length
|
||||
self.max_depth = max_depth
|
||||
|
||||
"""Fayyad - Irani MDLP discretization algorithm based implementation.
|
||||
|
||||
@@ -105,7 +107,9 @@ class FImdlp(TransformerMixin, BaseEstimator):
|
||||
|
||||
def _fit_discretizer(self, feature):
|
||||
if feature in self.features_:
|
||||
self.discretizer_[feature] = CFImdlp()
|
||||
self.discretizer_[feature] = CFImdlp(
|
||||
min_length=self.min_length, max_depth=self.max_depth
|
||||
)
|
||||
self.discretizer_[feature].fit(self.X_[:, feature], self.y_)
|
||||
self.cut_points_[feature] = self.discretizer_[
|
||||
feature
|
||||
@@ -242,3 +246,9 @@ class FImdlp(TransformerMixin, BaseEstimator):
|
||||
self.cut_points_[target] = self.discretizer_[target].get_cut_points()
|
||||
# return the discretized target variable with the new cut points
|
||||
return np.searchsorted(self.cut_points_[target], self.X_[:, target])
|
||||
|
||||
def get_depths(self):
    # One entry per input feature; features without a fitted discretizer
    # (those not listed in self.features_) keep a depth of 0.
    depths = [0 for _ in range(self.n_features_in_)]
    for idx in self.features_:
        depths[idx] = self.discretizer_[idx].get_depth()
    return depths
|
||||
|
@@ -3,7 +3,7 @@ import sklearn
|
||||
import numpy as np
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.utils.estimator_checks import check_estimator
|
||||
from ..cppfimdlp import CFImdlp, factorize
|
||||
from ..cppfimdlp import CFImdlp, factorize, CArffFiles
|
||||
from ..mdlp import FImdlp
|
||||
from .. import __version__
|
||||
|
||||
@@ -11,6 +11,8 @@ from .. import __version__
|
||||
|
||||
|
||||
class FImdlpTest(unittest.TestCase):
|
||||
delta = 1e-6 # same tolerance as in C++ code
|
||||
|
||||
def test_version(self):
|
||||
clf = FImdlp()
|
||||
self.assertEqual(
|
||||
@@ -21,8 +23,12 @@ class FImdlpTest(unittest.TestCase):
|
||||
def test_init(self):
|
||||
clf = FImdlp()
|
||||
self.assertEqual(-1, clf.n_jobs)
|
||||
clf = FImdlp(n_jobs=7)
|
||||
self.assertEqual(3, clf.min_length)
|
||||
self.assertEqual(1e6, clf.max_depth)
|
||||
clf = FImdlp(n_jobs=7, min_length=24, max_depth=17)
|
||||
self.assertEqual(7, clf.n_jobs)
|
||||
self.assertEqual(24, clf.min_length)
|
||||
self.assertEqual(17, clf.max_depth)
|
||||
|
||||
def test_fit_definitive(self):
|
||||
clf = FImdlp()
|
||||
@@ -32,15 +38,15 @@ class FImdlpTest(unittest.TestCase):
|
||||
self.assertTrue(np.array_equal(X, clf.X_))
|
||||
self.assertTrue(np.array_equal(y, clf.y_))
|
||||
expected = [
|
||||
[5.449999809265137, 5.75],
|
||||
[2.75, 2.8499999046325684, 2.95, 3.05, 3.3499999046325684],
|
||||
[2.45, 4.75, 5.050000190734863],
|
||||
[5.45, 5.75],
|
||||
[2.75, 2.85, 2.95, 3.05, 3.35],
|
||||
[2.45, 4.75, 5.05],
|
||||
[0.8, 1.75],
|
||||
]
|
||||
computed = clf.get_cut_points()
|
||||
for item_computed, item_expected in zip(computed, expected):
|
||||
for x_, y_ in zip(item_computed, item_expected):
|
||||
self.assertAlmostEqual(x_, y_)
|
||||
self.assertAlmostEqual(x_, y_, delta=self.delta)
|
||||
self.assertListEqual([0, 1, 2, 3], clf.features_)
|
||||
clf.fit(X, y, features=[0, 2, 3])
|
||||
self.assertListEqual([0, 2, 3], clf.features_)
|
||||
@@ -227,3 +233,92 @@ class FImdlpTest(unittest.TestCase):
|
||||
X, y = load_iris(return_X_y=True)
|
||||
clf.fit(X, y)
|
||||
self.assertIsNone(clf.get_states_feature(4))
|
||||
|
||||
def test_MaxDepth(self):
    # Fit on iris with max_depth=1 and compare the reported depths and
    # cut points against the reference values.
    X, y = load_iris(return_X_y=True)
    clf = FImdlp(max_depth=1)
    clf.fit(X, y)
    self.assertListEqual([1, 1, 1, 1], clf.get_depths())
    reference = [[5.45], [3.35], [2.45], [0.8]]
    obtained = clf.get_cut_points()
    for ref_row, got_row in zip(reference, obtained):
        for ref_value, got_value in zip(ref_row, got_row):
            self.assertAlmostEqual(ref_value, got_value, delta=self.delta)
|
||||
|
||||
def test_MinLength(self):
    # Fit on iris with min_length=75 and compare the reported depths and
    # cut points against the reference values.
    X, y = load_iris(return_X_y=True)
    clf = FImdlp(min_length=75)
    clf.fit(X, y)
    self.assertListEqual([3, 2, 2, 2], clf.get_depths())
    reference = [
        [5.45, 5.75],
        [2.85, 3.35],
        [2.45, 4.75],
        [0.8, 1.75],
    ]
    obtained = clf.get_cut_points()
    for ref_row, got_row in zip(reference, obtained):
        for ref_value, got_value in zip(ref_row, got_row):
            self.assertAlmostEqual(ref_value, got_value, delta=self.delta)
|
||||
|
||||
def test_MinLengthMaxDepth(self):
    # Fit on iris with min_length=75 and max_depth=2 and compare the
    # reported depths and cut points against the reference values.
    X, y = load_iris(return_X_y=True)
    clf = FImdlp(min_length=75, max_depth=2)
    clf.fit(X, y)
    self.assertListEqual([2, 2, 2, 2], clf.get_depths())
    reference = [
        [5.45, 5.75],
        [2.85, 3.35],
        [2.45, 4.75],
        [0.8, 1.75],
    ]
    obtained = clf.get_cut_points()
    for ref_row, got_row in zip(reference, obtained):
        for ref_value, got_value in zip(ref_row, got_row):
            self.assertAlmostEqual(ref_value, got_value, delta=self.delta)
|
||||
|
||||
def test_ArffFiles(self):
    # Load the iris ARFF file through the C++ loader and check attributes,
    # the first labels, the first raw lines and the first rows of X.
    arff = CArffFiles()
    arff.load(b"src/cppmdlp/tests/datasets/iris.arff")
    X = arff.get_X()
    y = arff.get_y()
    expected_attributes = [
        (b"sepallength", b"REAL"),
        (b"sepalwidth", b"REAL"),
        (b"petallength", b"REAL"),
        (b"petalwidth", b"REAL"),
    ]
    self.assertListEqual(arff.get_attributes(), expected_attributes)
    self.assertListEqual(y[:10], [0] * 10)
    expected_lines = [
        b"5.1,3.5,1.4,0.2,Iris-setosa",
        b"4.9,3.0,1.4,0.2,Iris-setosa",
        b"4.7,3.2,1.3,0.2,Iris-setosa",
        b"4.6,3.1,1.5,0.2,Iris-setosa",
        b"5.0,3.6,1.4,0.2,Iris-setosa",
        b"5.4,3.9,1.7,0.4,Iris-setosa",
        b"4.6,3.4,1.4,0.3,Iris-setosa",
        b"5.0,3.4,1.5,0.2,Iris-setosa",
        b"4.4,2.9,1.4,0.2,Iris-setosa",
        b"4.9,3.1,1.5,0.1,Iris-setosa",
    ]
    self.assertListEqual(arff.get_lines()[:10], expected_lines)
    expected_X = [
        [5.0999999, 3.5, 1.39999998, 0.2],
        [4.9000001, 3, 1.39999998, 0.2],
        [4.69999981, 3.20000005, 1.29999995, 0.2],
    ]
    for got_row, ref_row in zip(X[:3].tolist(), expected_X):
        for got_value, ref_value in zip(got_row, ref_row):
            self.assertAlmostEqual(got_value, ref_value, delta=self.delta)
|
||||
|
Reference in New Issue
Block a user