diff --git a/MANIFEST.in b/MANIFEST.in index 433cea4..a55eff4 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include src/cppmdlp/CPPFImdlp.h include src/cppmdlp/typesFImdlp.h include src/cppmdlp/Metrics.h -include src/fimdlp/Factorize.h \ No newline at end of file +include src/fimdlp/Factorize.h +include src/fimdlp/ArffFiles.h diff --git a/setup.py b/setup.py index 0ba294e..38917e1 100644 --- a/setup.py +++ b/setup.py @@ -15,6 +15,7 @@ setup( "src/cppmdlp/CPPFImdlp.cpp", "src/cppmdlp/Metrics.cpp", "src/fimdlp/Factorize.cpp", + "src/fimdlp/ArffFiles.cpp", ], language="c++", include_dirs=["fimdlp"], diff --git a/src/cppmdlp b/src/cppmdlp index 964555d..a7d13f6 160000 --- a/src/cppmdlp +++ b/src/cppmdlp @@ -1 +1 @@ -Subproject commit 964555de20175f4c1cd9a2d9525fa1bcca783322 +Subproject commit a7d13f602de3d347d1d5ad53bb654b6dedd4def1 diff --git a/src/fimdlp/ArffFiles.cpp b/src/fimdlp/ArffFiles.cpp new file mode 100644 index 0000000..470f5fa --- /dev/null +++ b/src/fimdlp/ArffFiles.cpp @@ -0,0 +1,116 @@ +#include "ArffFiles.h" +#include +#include +#include +#include + +using namespace std; + +ArffFiles::ArffFiles() +{ +} +vector ArffFiles::getLines() +{ + return lines; +} +unsigned long int ArffFiles::getSize() +{ + return lines.size(); +} +vector> ArffFiles::getAttributes() +{ + return attributes; +} +string ArffFiles::getClassName() +{ + return className; +} +string ArffFiles::getClassType() +{ + return classType; +} +vector>& ArffFiles::getX() +{ + return X; +} +vector& ArffFiles::getY() +{ + return y; +} +void ArffFiles::load(string fileName, bool classLast) +{ + ifstream file(fileName); + string keyword, attribute, type; + if (file.is_open()) { + string line; + while (getline(file, line)) { + if (line[0] == '%' || line.empty() || line == "\r" || line == " ") { + continue; + } + if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) { + stringstream ss(line); + ss >> keyword >> attribute >> type; + attributes.push_back({ attribute, type }); + continue; + } + if (line[0] == '@') { + continue; + } + lines.push_back(line); + } + file.close(); + if (attributes.empty()) + throw invalid_argument("No attributes found"); + if (classLast) { + className = get<0>(attributes.back()); + classType = get<1>(attributes.back()); + attributes.pop_back(); + } else { + className = get<0>(attributes.front()); + classType = get<1>(attributes.front()); + attributes.erase(attributes.begin()); + } + generateDataset(classLast); + } else + throw invalid_argument("Unable to open file"); +} +void ArffFiles::generateDataset(bool classLast) +{ + X = vector>(attributes.size(), vector(lines.size())); + vector yy = vector(lines.size(), ""); + int labelIndex = classLast ? attributes.size() : 0; + for (int i = 0; i < lines.size(); i++) { + stringstream ss(lines[i]); + string value; + int pos = 0, xIndex = 0; + while (getline(ss, value, ',')) { + if (pos++ == labelIndex) { + yy[i] = value; + } else { + X[xIndex++][i] = stof(value); + } + } + } + y = factorize(yy); +} +string ArffFiles::trim(const string& source) +{ + string s(source); + s.erase(0, s.find_first_not_of(" \n\r\t")); + s.erase(s.find_last_not_of(" \n\r\t") + 1); + return s; +} +vector ArffFiles::factorize(const vector& labels_t) +{ + vector yy; + yy.reserve(labels_t.size()); + map labelMap; + int i = 0; + for (string label : labels_t) { + if (labelMap.find(label) == labelMap.end()) { + labelMap[label] = i++; + } + yy.push_back(labelMap[label]); + } + return yy; +} \ No newline at end of file diff --git a/src/fimdlp/ArffFiles.h b/src/fimdlp/ArffFiles.h new file mode 100644 index 0000000..b56d28d --- /dev/null +++ b/src/fimdlp/ArffFiles.h @@ -0,0 +1,27 @@ +#ifndef ARFFFILES_H +#define ARFFFILES_H +#include +#include +using namespace std; +class ArffFiles { +private: + vector lines; + vector> attributes; + string className, classType; + vector> X; + vector y; + void generateDataset(bool); +public: + ArffFiles(); + void load(string, bool = true); + vector getLines(); + unsigned long int getSize(); + string getClassName(); + string getClassType(); + string trim(const string&); + vector>& getX(); + vector& getY(); + vector> getAttributes(); + vector factorize(const vector& labels_t); +}; +#endif \ No newline at end of file diff --git a/src/fimdlp/cfimdlp.pyx b/src/fimdlp/cfimdlp.pyx index 8d1a630..fce850d 100644 --- a/src/fimdlp/cfimdlp.pyx +++ b/src/fimdlp/cfimdlp.pyx @@ -1,7 +1,10 @@ # distutils: language = c++ # cython: language_level = 3 from libcpp.vector cimport vector +from libcpp.pair cimport pair from libcpp.string cimport string +from libcpp cimport bool +import numpy as np cdef extern from "limits.h": cdef int INT_MAX @@ -9,7 +12,7 @@ cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp": ctypedef float precision_t cdef cppclass CPPFImdlp: CPPFImdlp() except + - CPPFImdlp(int, int) except + + CPPFImdlp(size_t, int) except + CPPFImdlp& fit(vector[precision_t]&, vector[int]&) int get_depth() vector[precision_t] getCutPoints() @@ -17,7 +20,7 @@ cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp": cdef class CFImdlp: cdef CPPFImdlp *thisptr - def __cinit__(self, int min_length=3, int max_depth=INT_MAX): + def __cinit__(self, size_t min_length=3, int max_depth=INT_MAX): self.thisptr = new CPPFImdlp(min_length, max_depth) def __dealloc__(self): del self.thisptr @@ -36,4 +39,43 @@ cdef class CFImdlp: cdef extern from "Factorize.h" namespace "utils": vector[int] cppFactorize(vector[string] &input_vector) def factorize(input_vector): - return cppFactorize(input_vector) \ No newline at end of file + return cppFactorize(input_vector) + +cdef extern from "ArffFiles.h": + cdef cppclass ArffFiles: + ArffFiles() except + + void load(string, bool) + unsigned long int getSize() + string getClassName() + string getClassType() + string trim(const string&) + vector[vector[float]]& getX() + vector[int]& getY() + vector[string] getLines() + vector[pair[string, string]] getAttributes() + +cdef class CArffFiles: + cdef ArffFiles *thisptr + def __cinit__(self): + self.thisptr = new ArffFiles() + def __dealloc__(self): + del self.thisptr + def load(self, string filename, bool verbose = True): + self.thisptr.load(filename, verbose) + def get_size(self): + return self.thisptr.getSize() + def get_class_name(self): + return self.thisptr.getClassName() + def get_class_type(self): + return self.thisptr.getClassType() + def get_X(self): + return np.array(self.thisptr.getX()).T + def get_y(self): + return self.thisptr.getY() + def get_lines(self): + return self.thisptr.getLines() + def get_attributes(self): + return self.thisptr.getAttributes() + def __reduce__(self): + return (CArffFiles, ()) + \ No newline at end of file diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py index 2a2114a..60521ec 100644 --- a/src/fimdlp/mdlp.py +++ b/src/fimdlp/mdlp.py @@ -10,8 +10,10 @@ from ._version import __version__ class FImdlp(TransformerMixin, BaseEstimator): - def __init__(self, n_jobs=-1): + def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6): self.n_jobs = n_jobs + self.min_length = min_length + self.max_depth = max_depth """Fayyad - Irani MDLP discretization algorithm based implementation. @@ -105,7 +107,9 @@ class FImdlp(TransformerMixin, BaseEstimator): def _fit_discretizer(self, feature): if feature in self.features_: - self.discretizer_[feature] = CFImdlp() + self.discretizer_[feature] = CFImdlp( + min_length=self.min_length, max_depth=self.max_depth + ) self.discretizer_[feature].fit(self.X_[:, feature], self.y_) self.cut_points_[feature] = self.discretizer_[ feature @@ -242,3 +246,9 @@ class FImdlp(TransformerMixin, BaseEstimator): self.cut_points_[target] = self.discretizer_[target].get_cut_points() # return the discretized target variable with the new cut points return np.searchsorted(self.cut_points_[target], self.X_[:, target]) + + def get_depths(self): + res = [0] * self.n_features_in_ + for feature in self.features_: + res[feature] = self.discretizer_[feature].get_depth() + return res diff --git a/src/fimdlp/tests/FImdlp_test.py b/src/fimdlp/tests/FImdlp_test.py index 068a4e8..afa92c9 100644 --- a/src/fimdlp/tests/FImdlp_test.py +++ b/src/fimdlp/tests/FImdlp_test.py @@ -3,7 +3,7 @@ import sklearn import numpy as np from sklearn.datasets import load_iris from sklearn.utils.estimator_checks import check_estimator -from ..cppfimdlp import CFImdlp, factorize +from ..cppfimdlp import CFImdlp, factorize, CArffFiles from ..mdlp import FImdlp from .. import __version__ @@ -11,6 +11,8 @@ from .. import __version__ class FImdlpTest(unittest.TestCase): + delta = 1e-6 # same tolerance as in C++ code + def test_version(self): clf = FImdlp() self.assertEqual( @@ -21,8 +23,12 @@ class FImdlpTest(unittest.TestCase): def test_init(self): clf = FImdlp() self.assertEqual(-1, clf.n_jobs) - clf = FImdlp(n_jobs=7) + self.assertEqual(3, clf.min_length) + self.assertEqual(1e6, clf.max_depth) + clf = FImdlp(n_jobs=7, min_length=24, max_depth=17) self.assertEqual(7, clf.n_jobs) + self.assertEqual(24, clf.min_length) + self.assertEqual(17, clf.max_depth) def test_fit_definitive(self): clf = FImdlp() @@ -32,15 +38,15 @@ class FImdlpTest(unittest.TestCase): self.assertTrue(np.array_equal(X, clf.X_)) self.assertTrue(np.array_equal(y, clf.y_)) expected = [ - [5.449999809265137, 5.75], - [2.75, 2.8499999046325684, 2.95, 3.05, 3.3499999046325684], - [2.45, 4.75, 5.050000190734863], + [5.45, 5.75], + [2.75, 2.85, 2.95, 3.05, 3.35], + [2.45, 4.75, 5.05], [0.8, 1.75], ] computed = clf.get_cut_points() for item_computed, item_expected in zip(computed, expected): for x_, y_ in zip(item_computed, item_expected): - self.assertAlmostEqual(x_, y_) + self.assertAlmostEqual(x_, y_, delta=self.delta) self.assertListEqual([0, 1, 2, 3], clf.features_) clf.fit(X, y, features=[0, 2, 3]) self.assertListEqual([0, 2, 3], clf.features_) @@ -227,3 +233,92 @@ class FImdlpTest(unittest.TestCase): X, y = load_iris(return_X_y=True) clf.fit(X, y) self.assertIsNone(clf.get_states_feature(4)) + + def test_MaxDepth(self): + clf = FImdlp(max_depth=1) + X, y = load_iris(return_X_y=True) + clf.fit(X, y) + expected_cutpoints = [ + [5.45], + [3.35], + [2.45], + [0.8], + ] + expected_depths = [1] * 4 + self.assertListEqual(expected_depths, clf.get_depths()) + for expected, computed in zip( + expected_cutpoints, clf.get_cut_points() + ): + for e, c in zip(expected, computed): + self.assertAlmostEqual(e, c, delta=self.delta) + + def test_MinLength(self): + clf = FImdlp(min_length=75) + X, y = load_iris(return_X_y=True) + clf.fit(X, y) + expected_cutpoints = [ + [5.45, 5.75], + [2.85, 3.35], + [2.45, 4.75], + [0.8, 1.75], + ] + expected_depths = [3, 2, 2, 2] + self.assertListEqual(expected_depths, clf.get_depths()) + for expected, computed in zip( + expected_cutpoints, clf.get_cut_points() + ): + for e, c in zip(expected, computed): + self.assertAlmostEqual(e, c, delta=self.delta) + + def test_MinLengthMaxDepth(self): + clf = FImdlp(min_length=75, max_depth=2) + X, y = load_iris(return_X_y=True) + clf.fit(X, y) + expected_cutpoints = [ + [5.45, 5.75], + [2.85, 3.35], + [2.45, 4.75], + [0.8, 1.75], + ] + expected_depths = [2, 2, 2, 2] + self.assertListEqual(expected_depths, clf.get_depths()) + for expected, computed in zip( + expected_cutpoints, clf.get_cut_points() + ): + for e, c in zip(expected, computed): + self.assertAlmostEqual(e, c, delta=self.delta) + + def test_ArffFiles(self): + loader = CArffFiles() + loader.load(b"src/cppmdlp/tests/datasets/iris.arff") + X = loader.get_X() + y = loader.get_y() + expected = [ + (b"sepallength", b"REAL"), + (b"sepalwidth", b"REAL"), + (b"petallength", b"REAL"), + (b"petalwidth", b"REAL"), + ] + self.assertListEqual(loader.get_attributes(), expected) + self.assertListEqual(y[:10], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + expected = [ + b"5.1,3.5,1.4,0.2,Iris-setosa", + b"4.9,3.0,1.4,0.2,Iris-setosa", + b"4.7,3.2,1.3,0.2,Iris-setosa", + b"4.6,3.1,1.5,0.2,Iris-setosa", + b"5.0,3.6,1.4,0.2,Iris-setosa", + b"5.4,3.9,1.7,0.4,Iris-setosa", + b"4.6,3.4,1.4,0.3,Iris-setosa", + b"5.0,3.4,1.5,0.2,Iris-setosa", + b"4.4,2.9,1.4,0.2,Iris-setosa", + b"4.9,3.1,1.5,0.1,Iris-setosa", + ] + self.assertListEqual(loader.get_lines()[:10], expected) + expected_X = [ + [5.0999999, 3.5, 1.39999998, 0.2], + [4.9000001, 3, 1.39999998, 0.2], + [4.69999981, 3.20000005, 1.29999995, 0.2], + ] + for computed, expected in zip(X[:3].tolist(), expected_X): + for c, e in zip(computed, expected): + self.assertAlmostEqual(c, e, delta=self.delta)