From 900cccf76b47f2ad7be5b1cc12e2ad70aacf26bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 25 Feb 2023 18:52:21 +0100 Subject: [PATCH 01/24] Update discretizer to new library --- src/cppmdlp | 2 +- src/fimdlp/cfimdlp.pyx | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/cppmdlp b/src/cppmdlp index 32a6fd9..964555d 160000 --- a/src/cppmdlp +++ b/src/cppmdlp @@ -1 +1 @@ -Subproject commit 32a6fd9ba0553da691ed9f5b2011bef48cd33d6a +Subproject commit 964555de20175f4c1cd9a2d9525fa1bcca783322 diff --git a/src/fimdlp/cfimdlp.pyx b/src/fimdlp/cfimdlp.pyx index 8892e8b..8d1a630 100644 --- a/src/fimdlp/cfimdlp.pyx +++ b/src/fimdlp/cfimdlp.pyx @@ -3,18 +3,22 @@ from libcpp.vector cimport vector from libcpp.string cimport string +cdef extern from "limits.h": + cdef int INT_MAX cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp": ctypedef float precision_t cdef cppclass CPPFImdlp: CPPFImdlp() except + + CPPFImdlp(int, int) except + CPPFImdlp& fit(vector[precision_t]&, vector[int]&) + int get_depth() vector[precision_t] getCutPoints() string version() cdef class CFImdlp: cdef CPPFImdlp *thisptr - def __cinit__(self): - self.thisptr = new CPPFImdlp() + def __cinit__(self, int min_length=3, int max_depth=INT_MAX): + self.thisptr = new CPPFImdlp(min_length, max_depth) def __dealloc__(self): del self.thisptr def fit(self, X, y): @@ -24,6 +28,8 @@ cdef class CFImdlp: return self.thisptr.getCutPoints() def get_version(self): return self.thisptr.version() + def get_depth(self): + return self.thisptr.get_depth() def __reduce__(self): return (CFImdlp, ()) From aa55d3a3405e87444f362a7db5933c189ee9d035 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sun, 26 Feb 2023 17:59:08 +0100 Subject: [PATCH 02/24] New version of library and tests --- MANIFEST.in | 3 +- setup.py | 1 + src/cppmdlp | 2 +- src/fimdlp/ArffFiles.cpp | 116 ++++++++++++++++++++++++++++++++ src/fimdlp/ArffFiles.h | 27 ++++++++ src/fimdlp/cfimdlp.pyx | 48 ++++++++++++- src/fimdlp/mdlp.py | 14 +++- src/fimdlp/tests/FImdlp_test.py | 107 +++++++++++++++++++++++++++-- 8 files changed, 305 insertions(+), 13 deletions(-) create mode 100644 src/fimdlp/ArffFiles.cpp create mode 100644 src/fimdlp/ArffFiles.h diff --git a/MANIFEST.in b/MANIFEST.in index 433cea4..a55eff4 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include src/cppmdlp/CPPFImdlp.h include src/cppmdlp/typesFImdlp.h include src/cppmdlp/Metrics.h -include src/fimdlp/Factorize.h \ No newline at end of file +include src/fimdlp/Factorize.h +include src/fimdlp/ArffFiles.h diff --git a/setup.py b/setup.py index 0ba294e..38917e1 100644 --- a/setup.py +++ b/setup.py @@ -15,6 +15,7 @@ setup( "src/cppmdlp/CPPFImdlp.cpp", "src/cppmdlp/Metrics.cpp", "src/fimdlp/Factorize.cpp", + "src/fimdlp/ArffFiles.cpp", ], language="c++", include_dirs=["fimdlp"], diff --git a/src/cppmdlp b/src/cppmdlp index 964555d..a7d13f6 160000 --- a/src/cppmdlp +++ b/src/cppmdlp @@ -1 +1 @@ -Subproject commit 964555de20175f4c1cd9a2d9525fa1bcca783322 +Subproject commit a7d13f602de3d347d1d5ad53bb654b6dedd4def1 diff --git a/src/fimdlp/ArffFiles.cpp b/src/fimdlp/ArffFiles.cpp new file mode 100644 index 0000000..470f5fa --- /dev/null +++ b/src/fimdlp/ArffFiles.cpp @@ -0,0 +1,116 @@ +#include "ArffFiles.h" +#include +#include +#include +#include + +using namespace std; + +ArffFiles::ArffFiles() +{ +} +vector ArffFiles::getLines() +{ + return lines; +} +unsigned long int ArffFiles::getSize() +{ + return lines.size(); +} +vector> 
ArffFiles::getAttributes() +{ + return attributes; +} +string ArffFiles::getClassName() +{ + return className; +} +string ArffFiles::getClassType() +{ + return classType; +} +vector>& ArffFiles::getX() +{ + return X; +} +vector& ArffFiles::getY() +{ + return y; +} +void ArffFiles::load(string fileName, bool classLast) +{ + ifstream file(fileName); + string keyword, attribute, type; + if (file.is_open()) { + string line; + while (getline(file, line)) { + if (line[0] == '%' || line.empty() || line == "\r" || line == " ") { + continue; + } + if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) { + stringstream ss(line); + ss >> keyword >> attribute >> type; + attributes.push_back({ attribute, type }); + continue; + } + if (line[0] == '@') { + continue; + } + lines.push_back(line); + } + file.close(); + if (attributes.empty()) + throw invalid_argument("No attributes found"); + if (classLast) { + className = get<0>(attributes.back()); + classType = get<1>(attributes.back()); + attributes.pop_back(); + } else { + className = get<0>(attributes.front()); + classType = get<1>(attributes.front()); + attributes.erase(attributes.begin()); + } + generateDataset(classLast); + } else + throw invalid_argument("Unable to open file"); +} +void ArffFiles::generateDataset(bool classLast) +{ + X = vector>(attributes.size(), vector(lines.size())); + vector yy = vector(lines.size(), ""); + int labelIndex = classLast ? attributes.size() : 0; + for (int i = 0; i < lines.size(); i++) { + stringstream ss(lines[i]); + string value; + int pos = 0, xIndex = 0; + while (getline(ss, value, ',')) { + if (pos++ == labelIndex) { + yy[i] = value; + } else { + X[xIndex++][i] = stof(value); + } + } + } + y = factorize(yy); +} +string ArffFiles::trim(const string& source) +{ + string s(source); + s.erase(0, s.find_first_not_of(" \n\r\t")); + s.erase(s.find_last_not_of(" \n\r\t") + 1); + return s; +} +vector ArffFiles::factorize(const vector& labels_t) +{ + vector yy; + yy.reserve(labels_t.size()); + map labelMap; + int i = 0; + for (string label : labels_t) { + if (labelMap.find(label) == labelMap.end()) { + labelMap[label] = i++; + } + yy.push_back(labelMap[label]); + } + return yy; +} \ No newline at end of file diff --git a/src/fimdlp/ArffFiles.h b/src/fimdlp/ArffFiles.h new file mode 100644 index 0000000..b56d28d --- /dev/null +++ b/src/fimdlp/ArffFiles.h @@ -0,0 +1,27 @@ +#ifndef ARFFFILES_H +#define ARFFFILES_H +#include +#include +using namespace std; +class ArffFiles { +private: + vector lines; + vector> attributes; + string className, classType; + vector> X; + vector y; + void generateDataset(bool); +public: + ArffFiles(); + void load(string, bool = true); + vector getLines(); + unsigned long int getSize(); + string getClassName(); + string getClassType(); + string trim(const string&); + vector>& getX(); + vector& getY(); + vector> getAttributes(); + vector factorize(const vector& labels_t); +}; +#endif \ No newline at end of file diff --git a/src/fimdlp/cfimdlp.pyx b/src/fimdlp/cfimdlp.pyx index 8d1a630..fce850d 100644 --- a/src/fimdlp/cfimdlp.pyx +++ b/src/fimdlp/cfimdlp.pyx @@ -1,7 +1,10 @@ # distutils: language = c++ # cython: language_level = 3 from libcpp.vector cimport vector +from libcpp.pair cimport pair from libcpp.string cimport string +from libcpp cimport bool +import numpy as np cdef extern from "limits.h": cdef int INT_MAX @@ -9,7 +12,7 @@ cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp": ctypedef float precision_t cdef cppclass CPPFImdlp: CPPFImdlp() except + 
- CPPFImdlp(int, int) except + + CPPFImdlp(size_t, int) except + CPPFImdlp& fit(vector[precision_t]&, vector[int]&) int get_depth() vector[precision_t] getCutPoints() @@ -17,7 +20,7 @@ cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp": cdef class CFImdlp: cdef CPPFImdlp *thisptr - def __cinit__(self, int min_length=3, int max_depth=INT_MAX): + def __cinit__(self, size_t min_length=3, int max_depth=INT_MAX): self.thisptr = new CPPFImdlp(min_length, max_depth) def __dealloc__(self): del self.thisptr @@ -36,4 +39,43 @@ cdef class CFImdlp: cdef extern from "Factorize.h" namespace "utils": vector[int] cppFactorize(vector[string] &input_vector) def factorize(input_vector): - return cppFactorize(input_vector) \ No newline at end of file + return cppFactorize(input_vector) + +cdef extern from "ArffFiles.h": + cdef cppclass ArffFiles: + ArffFiles() except + + void load(string, bool) + unsigned long int getSize() + string getClassName() + string getClassType() + string trim(const string&) + vector[vector[float]]& getX() + vector[int]& getY() + vector[string] getLines() + vector[pair[string, string]] getAttributes() + +cdef class CArffFiles: + cdef ArffFiles *thisptr + def __cinit__(self): + self.thisptr = new ArffFiles() + def __dealloc__(self): + del self.thisptr + def load(self, string filename, bool verbose = True): + self.thisptr.load(filename, verbose) + def get_size(self): + return self.thisptr.getSize() + def get_class_name(self): + return self.thisptr.getClassName() + def get_class_type(self): + return self.thisptr.getClassType() + def get_X(self): + return np.array(self.thisptr.getX()).T + def get_y(self): + return self.thisptr.getY() + def get_lines(self): + return self.thisptr.getLines() + def get_attributes(self): + return self.thisptr.getAttributes() + def __reduce__(self): + return (CArffFiles, ()) + \ No newline at end of file diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py index 2a2114a..60521ec 100644 --- a/src/fimdlp/mdlp.py +++ b/src/fimdlp/mdlp.py @@ -10,8 +10,10 @@ from ._version import __version__ class FImdlp(TransformerMixin, BaseEstimator): - def __init__(self, n_jobs=-1): + def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6): self.n_jobs = n_jobs + self.min_length = min_length + self.max_depth = max_depth """Fayyad - Irani MDLP discretization algorithm based implementation. 
@@ -105,7 +107,9 @@ class FImdlp(TransformerMixin, BaseEstimator): def _fit_discretizer(self, feature): if feature in self.features_: - self.discretizer_[feature] = CFImdlp() + self.discretizer_[feature] = CFImdlp( + min_length=self.min_length, max_depth=self.max_depth + ) self.discretizer_[feature].fit(self.X_[:, feature], self.y_) self.cut_points_[feature] = self.discretizer_[ feature @@ -242,3 +246,9 @@ class FImdlp(TransformerMixin, BaseEstimator): self.cut_points_[target] = self.discretizer_[target].get_cut_points() # return the discretized target variable with the new cut points return np.searchsorted(self.cut_points_[target], self.X_[:, target]) + + def get_depths(self): + res = [0] * self.n_features_in_ + for feature in self.features_: + res[feature] = self.discretizer_[feature].get_depth() + return res diff --git a/src/fimdlp/tests/FImdlp_test.py b/src/fimdlp/tests/FImdlp_test.py index 068a4e8..afa92c9 100644 --- a/src/fimdlp/tests/FImdlp_test.py +++ b/src/fimdlp/tests/FImdlp_test.py @@ -3,7 +3,7 @@ import sklearn import numpy as np from sklearn.datasets import load_iris from sklearn.utils.estimator_checks import check_estimator -from ..cppfimdlp import CFImdlp, factorize +from ..cppfimdlp import CFImdlp, factorize, CArffFiles from ..mdlp import FImdlp from .. import __version__ @@ -11,6 +11,8 @@ from .. import __version__ class FImdlpTest(unittest.TestCase): + delta = 1e-6 # same tolerance as in C++ code + def test_version(self): clf = FImdlp() self.assertEqual( @@ -21,8 +23,12 @@ class FImdlpTest(unittest.TestCase): def test_init(self): clf = FImdlp() self.assertEqual(-1, clf.n_jobs) - clf = FImdlp(n_jobs=7) + self.assertEqual(3, clf.min_length) + self.assertEqual(1e6, clf.max_depth) + clf = FImdlp(n_jobs=7, min_length=24, max_depth=17) self.assertEqual(7, clf.n_jobs) + self.assertEqual(24, clf.min_length) + self.assertEqual(17, clf.max_depth) def test_fit_definitive(self): clf = FImdlp() @@ -32,15 +38,15 @@ class FImdlpTest(unittest.TestCase): self.assertTrue(np.array_equal(X, clf.X_)) self.assertTrue(np.array_equal(y, clf.y_)) expected = [ - [5.449999809265137, 5.75], - [2.75, 2.8499999046325684, 2.95, 3.05, 3.3499999046325684], - [2.45, 4.75, 5.050000190734863], + [5.45, 5.75], + [2.75, 2.85, 2.95, 3.05, 3.35], + [2.45, 4.75, 5.05], [0.8, 1.75], ] computed = clf.get_cut_points() for item_computed, item_expected in zip(computed, expected): for x_, y_ in zip(item_computed, item_expected): - self.assertAlmostEqual(x_, y_) + self.assertAlmostEqual(x_, y_, delta=self.delta) self.assertListEqual([0, 1, 2, 3], clf.features_) clf.fit(X, y, features=[0, 2, 3]) self.assertListEqual([0, 2, 3], clf.features_) @@ -227,3 +233,92 @@ class FImdlpTest(unittest.TestCase): X, y = load_iris(return_X_y=True) clf.fit(X, y) self.assertIsNone(clf.get_states_feature(4)) + + def test_MaxDepth(self): + clf = FImdlp(max_depth=1) + X, y = load_iris(return_X_y=True) + clf.fit(X, y) + expected_cutpoints = [ + [5.45], + [3.35], + [2.45], + [0.8], + ] + expected_depths = [1] * 4 + self.assertListEqual(expected_depths, clf.get_depths()) + for expected, computed in zip( + expected_cutpoints, clf.get_cut_points() + ): + for e, c in zip(expected, computed): + self.assertAlmostEqual(e, c, delta=self.delta) + + def test_MinLength(self): + clf = FImdlp(min_length=75) + X, y = load_iris(return_X_y=True) + clf.fit(X, y) + expected_cutpoints = [ + [5.45, 5.75], + [2.85, 3.35], + [2.45, 4.75], + [0.8, 1.75], + ] + expected_depths = [3, 2, 2, 2] + self.assertListEqual(expected_depths, clf.get_depths()) + for expected, 
computed in zip( + expected_cutpoints, clf.get_cut_points() + ): + for e, c in zip(expected, computed): + self.assertAlmostEqual(e, c, delta=self.delta) + + def test_MinLengthMaxDepth(self): + clf = FImdlp(min_length=75, max_depth=2) + X, y = load_iris(return_X_y=True) + clf.fit(X, y) + expected_cutpoints = [ + [5.45, 5.75], + [2.85, 3.35], + [2.45, 4.75], + [0.8, 1.75], + ] + expected_depths = [2, 2, 2, 2] + self.assertListEqual(expected_depths, clf.get_depths()) + for expected, computed in zip( + expected_cutpoints, clf.get_cut_points() + ): + for e, c in zip(expected, computed): + self.assertAlmostEqual(e, c, delta=self.delta) + + def test_ArffFiles(self): + loader = CArffFiles() + loader.load(b"src/cppmdlp/tests/datasets/iris.arff") + X = loader.get_X() + y = loader.get_y() + expected = [ + (b"sepallength", b"REAL"), + (b"sepalwidth", b"REAL"), + (b"petallength", b"REAL"), + (b"petalwidth", b"REAL"), + ] + self.assertListEqual(loader.get_attributes(), expected) + self.assertListEqual(y[:10], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + expected = [ + b"5.1,3.5,1.4,0.2,Iris-setosa", + b"4.9,3.0,1.4,0.2,Iris-setosa", + b"4.7,3.2,1.3,0.2,Iris-setosa", + b"4.6,3.1,1.5,0.2,Iris-setosa", + b"5.0,3.6,1.4,0.2,Iris-setosa", + b"5.4,3.9,1.7,0.4,Iris-setosa", + b"4.6,3.4,1.4,0.3,Iris-setosa", + b"5.0,3.4,1.5,0.2,Iris-setosa", + b"4.4,2.9,1.4,0.2,Iris-setosa", + b"4.9,3.1,1.5,0.1,Iris-setosa", + ] + self.assertListEqual(loader.get_lines()[:10], expected) + expected_X = [ + [5.0999999, 3.5, 1.39999998, 0.2], + [4.9000001, 3, 1.39999998, 0.2], + [4.69999981, 3.20000005, 1.29999995, 0.2], + ] + for computed, expected in zip(X[:3].tolist(), expected_X): + for c, e in zip(computed, expected): + self.assertAlmostEqual(c, e, delta=self.delta) From ccce9725b36aaed0e223942139bcfcf845a1f375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Mon, 13 Mar 2023 18:14:56 +0100 Subject: [PATCH 03/24] Add max_cuts hyperparamter as in mdlp --- src/cppmdlp | 2 +- src/fimdlp/ArffFiles.cpp | 7 +++---- src/fimdlp/cfimdlp.pyx | 6 +++--- src/fimdlp/mdlp.py | 7 +++++-- src/fimdlp/tests/FImdlp_test.py | 20 ++++++++++++++++++-- 5 files changed, 30 insertions(+), 12 deletions(-) diff --git a/src/cppmdlp b/src/cppmdlp index a7d13f6..ed74336 160000 --- a/src/cppmdlp +++ b/src/cppmdlp @@ -1 +1 @@ -Subproject commit a7d13f602de3d347d1d5ad53bb654b6dedd4def1 +Subproject commit ed7433672d98745115fb5f0bc49fcbd7bf035427 diff --git a/src/fimdlp/ArffFiles.cpp b/src/fimdlp/ArffFiles.cpp index 470f5fa..4fbca78 100644 --- a/src/fimdlp/ArffFiles.cpp +++ b/src/fimdlp/ArffFiles.cpp @@ -40,11 +40,10 @@ vector& ArffFiles::getY() void ArffFiles::load(string fileName, bool classLast) { ifstream file(fileName); - string keyword, attribute, type; if (file.is_open()) { - string line; + string line, keyword, attribute, type; while (getline(file, line)) { - if (line[0] == '%' || line.empty() || line == "\r" || line == " ") { + if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { continue; } if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) { @@ -79,7 +78,7 @@ void ArffFiles::generateDataset(bool classLast) X = vector>(attributes.size(), vector(lines.size())); vector yy = vector(lines.size(), ""); int labelIndex = classLast ? 
attributes.size() : 0; - for (int i = 0; i < lines.size(); i++) { + for (size_t i = 0; i < lines.size(); i++) { stringstream ss(lines[i]); string value; int pos = 0, xIndex = 0; diff --git a/src/fimdlp/cfimdlp.pyx b/src/fimdlp/cfimdlp.pyx index fce850d..560bd13 100644 --- a/src/fimdlp/cfimdlp.pyx +++ b/src/fimdlp/cfimdlp.pyx @@ -12,7 +12,7 @@ cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp": ctypedef float precision_t cdef cppclass CPPFImdlp: CPPFImdlp() except + - CPPFImdlp(size_t, int) except + + CPPFImdlp(size_t, int, float) except + CPPFImdlp& fit(vector[precision_t]&, vector[int]&) int get_depth() vector[precision_t] getCutPoints() @@ -20,8 +20,8 @@ cdef extern from "../cppmdlp/CPPFImdlp.h" namespace "mdlp": cdef class CFImdlp: cdef CPPFImdlp *thisptr - def __cinit__(self, size_t min_length=3, int max_depth=INT_MAX): - self.thisptr = new CPPFImdlp(min_length, max_depth) + def __cinit__(self, size_t min_length=3, int max_depth=INT_MAX, float max_cuts=0): + self.thisptr = new CPPFImdlp(min_length, max_depth, max_cuts) def __dealloc__(self): del self.thisptr def fit(self, X, y): diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py index 60521ec..f22be91 100644 --- a/src/fimdlp/mdlp.py +++ b/src/fimdlp/mdlp.py @@ -10,10 +10,11 @@ from ._version import __version__ class FImdlp(TransformerMixin, BaseEstimator): - def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6): + def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6, max_cuts=0): self.n_jobs = n_jobs self.min_length = min_length self.max_depth = max_depth + self.max_cuts = max_cuts """Fayyad - Irani MDLP discretization algorithm based implementation. @@ -108,7 +109,9 @@ class FImdlp(TransformerMixin, BaseEstimator): def _fit_discretizer(self, feature): if feature in self.features_: self.discretizer_[feature] = CFImdlp( - min_length=self.min_length, max_depth=self.max_depth + min_length=self.min_length, + max_depth=self.max_depth, + max_cuts=self.max_cuts, ) self.discretizer_[feature].fit(self.X_[:, feature], self.y_) self.cut_points_[feature] = self.discretizer_[ diff --git a/src/fimdlp/tests/FImdlp_test.py b/src/fimdlp/tests/FImdlp_test.py index afa92c9..111f960 100644 --- a/src/fimdlp/tests/FImdlp_test.py +++ b/src/fimdlp/tests/FImdlp_test.py @@ -7,8 +7,6 @@ from ..cppfimdlp import CFImdlp, factorize, CArffFiles from ..mdlp import FImdlp from .. 
import __version__ -# from .._version import __version__ - class FImdlpTest(unittest.TestCase): delta = 1e-6 # same tolerance as in C++ code @@ -288,6 +286,24 @@ class FImdlpTest(unittest.TestCase): for e, c in zip(expected, computed): self.assertAlmostEqual(e, c, delta=self.delta) + def test_max_cuts(self): + clf = FImdlp(max_cuts=1) + X, y = load_iris(return_X_y=True) + clf.fit(X, y) + expected_cutpoints = [ + [5.45], + [3.35], + [2.45], + [0.8], + ] + expected_depths = [1] * 4 + self.assertListEqual(expected_depths, clf.get_depths()) + for expected, computed in zip( + expected_cutpoints, clf.get_cut_points() + ): + for e, c in zip(expected, computed): + self.assertAlmostEqual(e, c, delta=self.delta) + def test_ArffFiles(self): loader = CArffFiles() loader.load(b"src/cppmdlp/tests/datasets/iris.arff") From e6a56e31403803ad331f4b1189dadc162c0b99fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Tue, 14 Mar 2023 11:47:30 +0100 Subject: [PATCH 04/24] Update samples --- samples/CMakeLists.txt | 2 +- samples/sample.cpp | 169 ++++++++++++++++++++++++++++++++++++----- samples/sample.py | 14 +++- src/cppmdlp | 2 +- 4 files changed, 163 insertions(+), 24 deletions(-) diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 3f41728..d10d2a9 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.20) project(main) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 11) add_executable(sample sample.cpp ../src/cppmdlp/tests/ArffFiles.cpp ../src/cppmdlp/Metrics.cpp ../src/cppmdlp/CPPFImdlp.cpp) diff --git a/samples/sample.cpp b/samples/sample.cpp index 7410445..61f8321 100644 --- a/samples/sample.cpp +++ b/samples/sample.cpp @@ -1,28 +1,94 @@ -#include "../src/cppmdlp/tests/ArffFiles.h" #include #include #include +#include +#include +#include +#include #include "../src/cppmdlp/CPPFImdlp.h" +#include "../src/cppmdlp/tests/ArffFiles.h" using namespace std; +using namespace mdlp; -int main(int argc, char** argv) +const string PATH = "../../src/cppmdlp/tests/datasets/"; + +/* print a description of all supported options */ +void usage(const char* path) +{ + /* take only the last portion of the path */ + const char* basename = strrchr(path, '/'); + basename = basename ? basename + 1 : path; + + cout << "usage: " << basename << "[OPTION]" << endl; + cout << " -h, --help\t\t Print this help and exit." << endl; + cout << " -f, --file[=FILENAME]\t {all, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}." << endl; + cout << " -p, --path[=FILENAME]\t folder where the arff dataset is located, default " << PATH << endl; + cout << " -m, --max_depth=INT\t max_depth pased to discretizer. Default = MAX_INT" << endl; + cout << " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 = any" << endl; + cout << " -n, --min_length=INT\t interval min_length pased to discretizer. 
Default = 3" << endl; +} + +tuple parse_arguments(int argc, char** argv) +{ + string file_name; + string path = PATH; + int max_depth = numeric_limits::max(); + int min_length = 3; + float max_cutpoints = 0; + static struct option long_options[] = { + { "help", no_argument, 0, 'h' }, + { "file", required_argument, 0, 'f' }, + { "path", required_argument, 0, 'p' }, + { "max_depth", required_argument, 0, 'm' }, + { "max_cutpoints", required_argument, 0, 'c' }, + { "min_length", required_argument, 0, 'n' }, + { 0, 0, 0, 0 } + }; + while (1) { + auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options, 0); + if (c == -1) + break; + switch (c) { + case 'h': + usage(argv[0]); + exit(0); + case 'f': + file_name = optarg; + break; + case 'm': + max_depth = atoi(optarg); + break; + case 'n': + min_length = atoi(optarg); + break; + case 'c': + max_cutpoints = atof(optarg); + break; + case 'p': + path = optarg; + if (path.back() != '/') + path += '/'; + break; + case '?': + usage(argv[0]); + exit(1); + default: + abort(); + } + } + if (file_name.empty()) { + usage(argv[0]); + exit(1); + } + return make_tuple(file_name, path, max_depth, min_length, max_cutpoints); +} + +void process_file(string path, string file_name, bool class_last, int max_depth, int min_length, float max_cutpoints) { ArffFiles file; - vector lines; - string path = "../../src/cppmdlp/tests/datasets/"; - map datasets = { - {"mfeat-factors", true}, - {"iris", true}, - {"letter", true}, - {"kdd_JapaneseVowels", false} - }; - if (argc != 2 || datasets.find(argv[1]) == datasets.end()) { - cout << "Usage: " << argv[0] << " {mfeat-factors, iris, letter, kdd_JapaneseVowels}" << endl; - return 1; - } - file.load(path + argv[1] + ".arff", datasets[argv[1]]); + file.load(path + file_name + ".arff", class_last); auto attributes = file.getAttributes(); int items = file.getSize(); cout << "Number of lines: " << items << endl; @@ -33,22 +99,85 @@ int main(int argc, char** argv) cout << "Class name: " << file.getClassName() << endl; cout << "Class type: " << file.getClassType() << endl; cout << "Data: " << endl; - vector>& X = file.getX(); - vector& y = file.getY(); - for (int i = 0; i < 50; i++) { + vector& X = file.getX(); + labels_t& y = file.getY(); + for (int i = 0; i < 5; i++) { for (auto feature : X) { cout << fixed << setprecision(1) << feature[i] << " "; } cout << y[i] << endl; } - mdlp::CPPFImdlp test = mdlp::CPPFImdlp(0); + mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints); + auto total = 0; for (auto i = 0; i < attributes.size(); i++) { + auto min_max = minmax_element(X[i].begin(), X[i].end()); cout << "Cut points for " << get<0>(attributes[i]) << endl; + cout << "Min: " << *min_max.first << " Max: " << *min_max.second << endl; cout << "--------------------------" << setprecision(3) << endl; test.fit(X[i], y); for (auto item : test.getCutPoints()) { cout << item << endl; } + total += test.getCutPoints().size(); + } + cout << "Total cut points ...: " << total << endl; + cout << "Total feature states: " << total + attributes.size() << endl; +} + +void process_all_files(map datasets, string path, int max_depth, int min_length, float max_cutpoints) +{ + cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << endl << endl; + printf("%-20s %4s %4s\n", "Dataset", "Feat", "Cuts Time(ms)"); + printf("==================== ==== ==== ========\n"); + for (auto dataset : datasets) { + ArffFiles file; + file.load(path + dataset.first + ".arff", dataset.second); + auto attributes = 
file.getAttributes(); + vector& X = file.getX(); + labels_t& y = file.getY(); + size_t timing = 0; + int cut_points = 0; + for (auto i = 0; i < attributes.size(); i++) { + mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints); + std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); + test.fit(X[i], y); + std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); + timing += std::chrono::duration_cast(end - begin).count(); + cut_points += test.getCutPoints().size(); + } + printf("%-20s %4lu %4d %8zu\n", dataset.first.c_str(), attributes.size(), cut_points, timing); + } +} + + +int main(int argc, char** argv) +{ + map datasets = { + {"glass", true}, + {"iris", true}, + {"kdd_JapaneseVowels", false}, + {"letter", true}, + {"liver-disorders", true}, + {"mfeat-factors", true}, + {"test", true} + }; + string file_name, path; + int max_depth, min_length; + float max_cutpoints; + tie(file_name, path, max_depth, min_length, max_cutpoints) = parse_arguments(argc, argv); + if (datasets.find(file_name) == datasets.end() && file_name != "all") { + cout << "Invalid file name: " << file_name << endl; + usage(argv[0]); + exit(1); + } + if (file_name == "all") + process_all_files(datasets, path, max_depth, min_length, max_cutpoints); + else { + process_file(path, file_name, datasets[file_name], max_depth, min_length, max_cutpoints); + cout << "File name ....: " << file_name << endl; + cout << "Max depth ....: " << max_depth << endl; + cout << "Min length ...: " << min_length << endl; + cout << "Max cutpoints : " << max_cutpoints << endl; } return 0; -} +} \ No newline at end of file diff --git a/samples/sample.py b/samples/sample.py index b02bb32..d671ddb 100644 --- a/samples/sample.py +++ b/samples/sample.py @@ -9,13 +9,19 @@ from fimdlp.mdlp import FImdlp datasets = { "mfeat-factors": True, "iris": True, + "glass": True, + "liver-disorders": True, "letter": True, "kdd_JapaneseVowels": False, } ap = argparse.ArgumentParser() ap.add_argument( - "--alternative", dest="proposal", action="store_const", const=1 + "--min_length", type=int, default=3, help="Minimum length of interval" +) +ap.add_argument("--max_depth", type=int, default=9999, help="Maximum depth") +ap.add_argument( + "--max_cuts", type=float, default=0, help="Maximum number of cut points" ) ap.add_argument("dataset", type=str, choices=datasets.keys()) args = ap.parse_args() @@ -30,7 +36,11 @@ class_name = df.columns.to_list()[class_column] X = df.drop(class_name, axis=1) y, _ = pd.factorize(df[class_name]) X = X.to_numpy() -test = FImdlp(algorithm=args.proposal if args.proposal is not None else 0) +test = FImdlp( + min_length=args.min_length, + max_depth=args.max_depth, + max_cuts=args.max_cuts, +) now = time.time() test.fit(X, y) fit_time = time.time() diff --git a/src/cppmdlp b/src/cppmdlp index ed74336..770502c 160000 --- a/src/cppmdlp +++ b/src/cppmdlp @@ -1 +1 @@ -Subproject commit ed7433672d98745115fb5f0bc49fcbd7bf035427 +Subproject commit 770502c8e57a3ea57a722091ce05e4eb08c444d4 From c2a0d33604d6c5f23ba7f851f22c37f60cf56668 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sun, 19 Mar 2023 19:14:32 +0100 Subject: [PATCH 05/24] Add last mdlp version --- src/cppmdlp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cppmdlp b/src/cppmdlp index 770502c..ed74336 160000 --- a/src/cppmdlp +++ b/src/cppmdlp @@ -1 +1 @@ -Subproject commit 770502c8e57a3ea57a722091ce05e4eb08c444d4 +Subproject commit ed7433672d98745115fb5f0bc49fcbd7bf035427 
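A minimal usage sketch of the hyperparameters introduced in the patches above (min_length, max_depth, max_cuts), assuming scikit-learn's iris dataset; the expected outputs mirror test_MinLengthMaxDepth in the test suite added in patch 02, and this block is an illustrative sketch rather than part of any patch.

```python
# Usage sketch (illustrative, not part of the patch series): exercises the
# min_length / max_depth hyperparameters on the iris dataset, as in FImdlp_test.py.
from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp

X, y = load_iris(return_X_y=True)
clf = FImdlp(min_length=75, max_depth=2)
clf.fit(X, y)
print(clf.get_depths())      # [2, 2, 2, 2] per test_MinLengthMaxDepth
print(clf.get_cut_points())  # two cut points per feature, e.g. [5.45, 5.75] for the first one
Xt = clf.transform(X)        # discretized copy of X
```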
From b5c6a49e196fd235edc8749ef89ae76dd6fc1bf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sun, 19 Mar 2023 19:22:07 +0100 Subject: [PATCH 06/24] Add last version of mdlp --- src/cppmdlp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cppmdlp b/src/cppmdlp index ed74336..cfade7a 160000 --- a/src/cppmdlp +++ b/src/cppmdlp @@ -1 +1 @@ -Subproject commit ed7433672d98745115fb5f0bc49fcbd7bf035427 +Subproject commit cfade7a556b7a5c8b050737e7134c517d9ddca28 From 7368dd9ff4d40d089c69c2a82cd307523da3682d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Mon, 20 Mar 2023 17:45:58 +0100 Subject: [PATCH 07/24] Refactor ArffFiles in main project --- src/fimdlp/ArffFiles.cpp | 128 ++++++++++++++++++++------------------- src/fimdlp/ArffFiles.h | 39 ++++++++---- 2 files changed, 95 insertions(+), 72 deletions(-) diff --git a/src/fimdlp/ArffFiles.cpp b/src/fimdlp/ArffFiles.cpp index 4fbca78..405a57e 100644 --- a/src/fimdlp/ArffFiles.cpp +++ b/src/fimdlp/ArffFiles.cpp @@ -2,86 +2,92 @@ #include #include #include -#include using namespace std; -ArffFiles::ArffFiles() -{ -} -vector ArffFiles::getLines() -{ +ArffFiles::ArffFiles() = default; + +vector ArffFiles::getLines() const { return lines; } -unsigned long int ArffFiles::getSize() -{ + +unsigned long int ArffFiles::getSize() const { return lines.size(); } -vector> ArffFiles::getAttributes() -{ + +vector> ArffFiles::getAttributes() const { return attributes; } -string ArffFiles::getClassName() -{ + +string ArffFiles::getClassName() const { return className; } -string ArffFiles::getClassType() -{ + +string ArffFiles::getClassType() const { return classType; } -vector>& ArffFiles::getX() -{ + +vector> &ArffFiles::getX() { return X; } -vector& ArffFiles::getY() -{ + +vector &ArffFiles::getY() { return y; } -void ArffFiles::load(string fileName, bool classLast) -{ + +void ArffFiles::load(const string &fileName, bool classLast) { ifstream file(fileName); - if (file.is_open()) { - string line, keyword, attribute, type; - while (getline(file, line)) { - if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { - continue; - } - if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) { - stringstream ss(line); - ss >> keyword >> attribute >> type; - attributes.push_back({ attribute, type }); - continue; - } - if (line[0] == '@') { - continue; - } - lines.push_back(line); - } - file.close(); - if (attributes.empty()) - throw invalid_argument("No attributes found"); - if (classLast) { - className = get<0>(attributes.back()); - classType = get<1>(attributes.back()); - attributes.pop_back(); - } else { - className = get<0>(attributes.front()); - classType = get<1>(attributes.front()); - attributes.erase(attributes.begin()); - } - generateDataset(classLast); - } else + if (!file.is_open()) { throw invalid_argument("Unable to open file"); + } + string line; + string keyword; + string attribute; + string type; + string type_w; + while (getline(file, line)) { + if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { + continue; + } + if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) { + stringstream ss(line); + ss >> keyword >> attribute; + type = ""; + while (ss >> type_w) + type += type_w + " "; + attributes.emplace_back(attribute, type); + continue; + } + if (line[0] == '@') { + continue; + } + lines.push_back(line); + } + file.close(); + if (attributes.empty()) + throw invalid_argument("No attributes 
found"); + if (classLast) { + className = get<0>(attributes.back()); + classType = get<1>(attributes.back()); + attributes.pop_back(); + } else { + className = get<0>(attributes.front()); + classType = get<1>(attributes.front()); + attributes.erase(attributes.begin()); + } + generateDataset(classLast); + } -void ArffFiles::generateDataset(bool classLast) -{ + +void ArffFiles::generateDataset(bool classLast) { X = vector>(attributes.size(), vector(lines.size())); - vector yy = vector(lines.size(), ""); - int labelIndex = classLast ? attributes.size() : 0; + auto yy = vector(lines.size(), ""); + int labelIndex = classLast ? static_cast(attributes.size()) : 0; for (size_t i = 0; i < lines.size(); i++) { stringstream ss(lines[i]); string value; - int pos = 0, xIndex = 0; + int pos = 0; + int xIndex = 0; while (getline(ss, value, ',')) { if (pos++ == labelIndex) { yy[i] = value; @@ -92,20 +98,20 @@ void ArffFiles::generateDataset(bool classLast) } y = factorize(yy); } -string ArffFiles::trim(const string& source) -{ + +string ArffFiles::trim(const string &source) { string s(source); s.erase(0, s.find_first_not_of(" \n\r\t")); s.erase(s.find_last_not_of(" \n\r\t") + 1); return s; } -vector ArffFiles::factorize(const vector& labels_t) -{ + +vector ArffFiles::factorize(const vector &labels_t) { vector yy; yy.reserve(labels_t.size()); map labelMap; int i = 0; - for (string label : labels_t) { + for (const string &label: labels_t) { if (labelMap.find(label) == labelMap.end()) { labelMap[label] = i++; } diff --git a/src/fimdlp/ArffFiles.h b/src/fimdlp/ArffFiles.h index b56d28d..38531af 100644 --- a/src/fimdlp/ArffFiles.h +++ b/src/fimdlp/ArffFiles.h @@ -1,27 +1,44 @@ #ifndef ARFFFILES_H #define ARFFFILES_H + #include #include + using namespace std; + class ArffFiles { private: vector lines; vector> attributes; - string className, classType; + string className; + string classType; vector> X; vector y; + void generateDataset(bool); + public: ArffFiles(); - void load(string, bool = true); - vector getLines(); - unsigned long int getSize(); - string getClassName(); - string getClassType(); - string trim(const string&); - vector>& getX(); - vector& getY(); - vector> getAttributes(); - vector factorize(const vector& labels_t); + + void load(const string &, bool = true); + + vector getLines() const; + + unsigned long int getSize() const; + + string getClassName() const; + + string getClassType() const; + + static string trim(const string &); + + vector> &getX(); + + vector &getY(); + + vector> getAttributes() const; + + static vector factorize(const vector &labels_t); }; + #endif \ No newline at end of file From e3c329b2e51b91c385130a0876ade4e2c9f18be3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Mon, 20 Mar 2023 18:57:26 +0100 Subject: [PATCH 08/24] Add min_length as percentage of # samples --- src/fimdlp/mdlp.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py index f22be91..3809b4b 100644 --- a/src/fimdlp/mdlp.py +++ b/src/fimdlp/mdlp.py @@ -98,6 +98,11 @@ class FImdlp(TransformerMixin, BaseEstimator): self._update_params(X, y) self.X_ = X self.y_ = y + self.efective_min_length = ( + self.min_length + if self.min_length > 1 + else int(self.min_length * X.shape[0]) + ) self.discretizer_ = [None] * self.n_features_in_ self.cut_points_ = [None] * self.n_features_in_ Parallel(n_jobs=self.n_jobs, prefer="threads")( @@ -109,7 +114,7 @@ class FImdlp(TransformerMixin, BaseEstimator): def _fit_discretizer(self, feature): 
if feature in self.features_: self.discretizer_[feature] = CFImdlp( - min_length=self.min_length, + min_length=self.efective_min_length, max_depth=self.max_depth, max_cuts=self.max_cuts, ) From da9db322dac1b77766f9a8e90600b905cda64b64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Mon, 20 Mar 2023 18:58:55 +0100 Subject: [PATCH 09/24] Fix sklearn requirement --- src/fimdlp/mdlp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py index 3809b4b..5a8ea8c 100644 --- a/src/fimdlp/mdlp.py +++ b/src/fimdlp/mdlp.py @@ -98,7 +98,7 @@ class FImdlp(TransformerMixin, BaseEstimator): self._update_params(X, y) self.X_ = X self.y_ = y - self.efective_min_length = ( + self.efective_min_length_ = ( self.min_length if self.min_length > 1 else int(self.min_length * X.shape[0]) @@ -114,7 +114,7 @@ class FImdlp(TransformerMixin, BaseEstimator): def _fit_discretizer(self, feature): if feature in self.features_: self.discretizer_[feature] = CFImdlp( - min_length=self.efective_min_length, + min_length=self.efective_min_length_, max_depth=self.max_depth, max_cuts=self.max_cuts, ) From 95bc29c7f26fbc824603090a59fb8fc66b6299a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Mon, 20 Mar 2023 20:27:47 +0100 Subject: [PATCH 10/24] Remove trailing space in attribute type of Arff --- src/cppmdlp | 2 +- src/fimdlp/ArffFiles.cpp | 37 ++++++++++++++++++++++++------------- src/fimdlp/Factorize.cpp | 2 +- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/src/cppmdlp b/src/cppmdlp index cfade7a..12222f7 160000 --- a/src/cppmdlp +++ b/src/cppmdlp @@ -1 +1 @@ -Subproject commit cfade7a556b7a5c8b050737e7134c517d9ddca28 +Subproject commit 12222f790352189ef3a31908015ed47556e7ed28 diff --git a/src/fimdlp/ArffFiles.cpp b/src/fimdlp/ArffFiles.cpp index 405a57e..b576699 100644 --- a/src/fimdlp/ArffFiles.cpp +++ b/src/fimdlp/ArffFiles.cpp @@ -7,35 +7,43 @@ using namespace std; ArffFiles::ArffFiles() = default; -vector ArffFiles::getLines() const { +vector ArffFiles::getLines() const +{ return lines; } -unsigned long int ArffFiles::getSize() const { +unsigned long int ArffFiles::getSize() const +{ return lines.size(); } -vector> ArffFiles::getAttributes() const { +vector> ArffFiles::getAttributes() const +{ return attributes; } -string ArffFiles::getClassName() const { +string ArffFiles::getClassName() const +{ return className; } -string ArffFiles::getClassType() const { +string ArffFiles::getClassType() const +{ return classType; } -vector> &ArffFiles::getX() { +vector>& ArffFiles::getX() +{ return X; } -vector &ArffFiles::getY() { +vector& ArffFiles::getY() +{ return y; } -void ArffFiles::load(const string &fileName, bool classLast) { +void ArffFiles::load(const string& fileName, bool classLast) +{ ifstream file(fileName); if (!file.is_open()) { throw invalid_argument("Unable to open file"); @@ -55,7 +63,7 @@ void ArffFiles::load(const string &fileName, bool classLast) { type = ""; while (ss >> type_w) type += type_w + " "; - attributes.emplace_back(attribute, type); + attributes.emplace_back(attribute, trim(type)); continue; } if (line[0] == '@') { @@ -79,7 +87,8 @@ void ArffFiles::load(const string &fileName, bool classLast) { } -void ArffFiles::generateDataset(bool classLast) { +void ArffFiles::generateDataset(bool classLast) +{ X = vector>(attributes.size(), vector(lines.size())); auto yy = vector(lines.size(), ""); int labelIndex = classLast ? 
static_cast(attributes.size()) : 0; @@ -99,19 +108,21 @@ void ArffFiles::generateDataset(bool classLast) { y = factorize(yy); } -string ArffFiles::trim(const string &source) { +string ArffFiles::trim(const string& source) +{ string s(source); s.erase(0, s.find_first_not_of(" \n\r\t")); s.erase(s.find_last_not_of(" \n\r\t") + 1); return s; } -vector ArffFiles::factorize(const vector &labels_t) { +vector ArffFiles::factorize(const vector& labels_t) +{ vector yy; yy.reserve(labels_t.size()); map labelMap; int i = 0; - for (const string &label: labels_t) { + for (const string& label : labels_t) { if (labelMap.find(label) == labelMap.end()) { labelMap[label] = i++; } diff --git a/src/fimdlp/Factorize.cpp b/src/fimdlp/Factorize.cpp index f814d6f..9d415bd 100644 --- a/src/fimdlp/Factorize.cpp +++ b/src/fimdlp/Factorize.cpp @@ -7,7 +7,7 @@ namespace utils { yy.reserve(labels_t.size()); map labelMap; int i = 0; - for (string label : labels_t) { + for (const string& label : labels_t) { if (labelMap.find(label) == labelMap.end()) { labelMap[label] = i++; } From 1069fc8ff4f788967e7a2fd5dee88251f772009f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Tue, 21 Mar 2023 10:18:51 +0100 Subject: [PATCH 11/24] Add last mdlp version and update sample.cpp --- samples/sample.cpp | 59 ++++++++++++++++++++++++++-------------------- src/cppmdlp | 2 +- 2 files changed, 35 insertions(+), 26 deletions(-) diff --git a/samples/sample.cpp b/samples/sample.cpp index 61f8321..9440421 100644 --- a/samples/sample.cpp +++ b/samples/sample.cpp @@ -22,10 +22,14 @@ void usage(const char* path) cout << "usage: " << basename << "[OPTION]" << endl; cout << " -h, --help\t\t Print this help and exit." << endl; - cout << " -f, --file[=FILENAME]\t {all, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}." << endl; + cout + << " -f, --file[=FILENAME]\t {all, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}." + << endl; cout << " -p, --path[=FILENAME]\t folder where the arff dataset is located, default " << PATH << endl; cout << " -m, --max_depth=INT\t max_depth pased to discretizer. Default = MAX_INT" << endl; - cout << " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 = any" << endl; + cout + << " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 = any" + << endl; cout << " -n, --min_length=INT\t interval min_length pased to discretizer. 
Default = 3" << endl; } @@ -36,17 +40,17 @@ tuple parse_arguments(int argc, char** argv) int max_depth = numeric_limits::max(); int min_length = 3; float max_cutpoints = 0; - static struct option long_options[] = { - { "help", no_argument, 0, 'h' }, - { "file", required_argument, 0, 'f' }, - { "path", required_argument, 0, 'p' }, - { "max_depth", required_argument, 0, 'm' }, - { "max_cutpoints", required_argument, 0, 'c' }, - { "min_length", required_argument, 0, 'n' }, - { 0, 0, 0, 0 } + const option long_options[] = { + {"help", no_argument, nullptr, 'h'}, + {"file", required_argument, nullptr, 'f'}, + {"path", required_argument, nullptr, 'p'}, + {"max_depth", required_argument, nullptr, 'm'}, + {"max_cutpoints", required_argument, nullptr, 'c'}, + {"min_length", required_argument, nullptr, 'n'}, + {nullptr, no_argument, nullptr, 0} }; - while (1) { - auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options, 0); + while (true) { + const auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options, nullptr); if (c == -1) break; switch (c) { @@ -54,16 +58,16 @@ tuple parse_arguments(int argc, char** argv) usage(argv[0]); exit(0); case 'f': - file_name = optarg; + file_name = string(optarg); break; case 'm': - max_depth = atoi(optarg); + max_depth = stoi(optarg); break; case 'n': - min_length = atoi(optarg); + min_length = stoi(optarg); break; case 'c': - max_cutpoints = atof(optarg); + max_cutpoints = stof(optarg); break; case 'p': path = optarg; @@ -84,13 +88,14 @@ tuple parse_arguments(int argc, char** argv) return make_tuple(file_name, path, max_depth, min_length, max_cutpoints); } -void process_file(string path, string file_name, bool class_last, int max_depth, int min_length, float max_cutpoints) +void process_file(const string& path, const string& file_name, bool class_last, int max_depth, int min_length, + float max_cutpoints) { ArffFiles file; file.load(path + file_name + ".arff", class_last); auto attributes = file.getAttributes(); - int items = file.getSize(); + auto items = file.getSize(); cout << "Number of lines: " << items << endl; cout << "Attributes: " << endl; for (auto attribute : attributes) { @@ -107,7 +112,7 @@ void process_file(string path, string file_name, bool class_last, int max_depth, } cout << y[i] << endl; } - mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints); + auto test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints); auto total = 0; for (auto i = 0; i < attributes.size(); i++) { auto min_max = minmax_element(X[i].begin(), X[i].end()); @@ -124,12 +129,14 @@ void process_file(string path, string file_name, bool class_last, int max_depth, cout << "Total feature states: " << total + attributes.size() << endl; } -void process_all_files(map datasets, string path, int max_depth, int min_length, float max_cutpoints) +void process_all_files(const map& datasets, const string& path, int max_depth, int min_length, + float max_cutpoints) { - cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << endl << endl; + cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << " Max_cutpoints: " + << max_cutpoints << endl << endl; printf("%-20s %4s %4s\n", "Dataset", "Feat", "Cuts Time(ms)"); printf("==================== ==== ==== ========\n"); - for (auto dataset : datasets) { + for (const auto& dataset : datasets) { ArffFiles file; file.load(path + dataset.first + ".arff", dataset.second); auto attributes = file.getAttributes(); @@ -138,7 +145,7 @@ void process_all_files(map 
datasets, string path, int max_depth, i size_t timing = 0; int cut_points = 0; for (auto i = 0; i < attributes.size(); i++) { - mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints); + auto test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints); std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); test.fit(X[i], y); std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); @@ -161,8 +168,10 @@ int main(int argc, char** argv) {"mfeat-factors", true}, {"test", true} }; - string file_name, path; - int max_depth, min_length; + string file_name; + string path; + int max_depth; + int min_length; float max_cutpoints; tie(file_name, path, max_depth, min_length, max_cutpoints) = parse_arguments(argc, argv); if (datasets.find(file_name) == datasets.end() && file_name != "all") { diff --git a/src/cppmdlp b/src/cppmdlp index 12222f7..7713573 160000 --- a/src/cppmdlp +++ b/src/cppmdlp @@ -1 +1 @@ -Subproject commit 12222f790352189ef3a31908015ed47556e7ed28 +Subproject commit 77135739cf72cfc02603332c681baae4dcea28f3 From c2294613dfab6b6b7ef3787029104a24e7818724 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Wed, 22 Mar 2023 18:19:01 +0100 Subject: [PATCH 12/24] Move limits include to CPPFImldp header --- src/cppmdlp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cppmdlp b/src/cppmdlp index 7713573..12222f7 160000 --- a/src/cppmdlp +++ b/src/cppmdlp @@ -1 +1 @@ -Subproject commit 77135739cf72cfc02603332c681baae4dcea28f3 +Subproject commit 12222f790352189ef3a31908015ed47556e7ed28 From e44bca0420a579c067c5d0ff0a3f214498061b65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Wed, 22 Mar 2023 18:21:52 +0100 Subject: [PATCH 13/24] Move limits include to CPPFImldp header --- src/cppmdlp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cppmdlp b/src/cppmdlp index 12222f7..42e83b3 160000 --- a/src/cppmdlp +++ b/src/cppmdlp @@ -1 +1 @@ -Subproject commit 12222f790352189ef3a31908015ed47556e7ed28 +Subproject commit 42e83b3d26b9a237b09d718a01abefc30812c9b7 From 0768d68a36d02fe4b99bd12092ac01b5d0247644 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 8 Apr 2023 12:22:03 +0200 Subject: [PATCH 14/24] add join_fit target info --- k.py | 12 ------------ src/fimdlp/mdlp.py | 13 ++++++++++--- src/fimdlp/tests/FImdlp_test.py | 12 +++++++++++- 3 files changed, 21 insertions(+), 16 deletions(-) delete mode 100644 k.py diff --git a/k.py b/k.py deleted file mode 100644 index 47e0856..0000000 --- a/k.py +++ /dev/null @@ -1,12 +0,0 @@ -from sklearn.datasets import load_wine -from fimdlp.mdlp import FImdlp - -X, y = load_wine(return_X_y=True) -trans = FImdlp() -Xt = trans.join_transform(X, y, 12) -print("X shape = ", X.shape) -print("Xt.shape=", Xt.shape) -print("Xt ", Xt[:10]) -print("trans.X_ shape = ", trans.X_.shape) -print("trans.y_ ", trans.y_[:10]) -print("y_join ", trans.y_join_[:10]) diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py index 5a8ea8c..36ce3a0 100644 --- a/src/fimdlp/mdlp.py +++ b/src/fimdlp/mdlp.py @@ -6,8 +6,6 @@ from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from joblib import Parallel, delayed from ._version import __version__ -# from ._version import __version__ - class FImdlp(TransformerMixin, BaseEstimator): def __init__(self, n_jobs=-1, min_length=3, max_depth=1e6, max_cuts=0): @@ -24,6 +22,12 @@ class FImdlp(TransformerMixin, BaseEstimator): The number of jobs to run in 
parallel. :meth:`fit` and :meth:`transform`, are parallelized over the features. ``-1`` means using all cores available. + min_length: int, default=3 + The minimum length of an interval to be considered to be discretized. + max_depth: int, default=1e6 + The maximum depth of the discretization process. + max_cuts: float, default=0 + The maximum number of cut points to be computed for each feature. Attributes ---------- @@ -109,6 +113,8 @@ class FImdlp(TransformerMixin, BaseEstimator): delayed(self._fit_discretizer)(feature) for feature in range(self.n_features_in_) ) + # target of every feature. Start with -1 => y (see join_fit) + self.target_ = [-1] * self.n_features_in_ return self def _fit_discretizer(self, feature): @@ -244,11 +250,12 @@ class FImdlp(TransformerMixin, BaseEstimator): f"Target {target} not in range [0, {self.n_features_in_})" ) if target in features: - raise ValueError("Target cannot in features to join") + raise ValueError("Target cannot be in features to join") y_join = [ f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode() for item_y, items_x in zip(self.y_, data[:, features]) ] + self.target_[target] = features + [-1] self.y_join_ = y_join self.discretizer_[target].fit(self.X_[:, target], factorize(y_join)) self.cut_points_[target] = self.discretizer_[target].get_cut_points() diff --git a/src/fimdlp/tests/FImdlp_test.py b/src/fimdlp/tests/FImdlp_test.py index 111f960..0215509 100644 --- a/src/fimdlp/tests/FImdlp_test.py +++ b/src/fimdlp/tests/FImdlp_test.py @@ -196,7 +196,7 @@ class FImdlpTest(unittest.TestCase): clf.join_fit([0, 2], 2, x) self.assertEqual( str(exception.exception), - "Target cannot in features to join", + "Target cannot be in features to join", ) def test_factorize(self): @@ -209,6 +209,16 @@ class FImdlpTest(unittest.TestCase): computed = clf.factorize(y) self.assertListEqual([0, 1, 1, 2, 3], computed) + def test_join_fit_info(self): + clf = FImdlp() + X, y = load_iris(return_X_y=True) + clf.fit(X, y) + clf.join_fit([0, 2], 1, X) + clf.join_fit([0, 3], 2, X) + clf.join_fit([1, 2], 3, X) + expected = [-1, [0, 2, -1], [0, 3, -1], [1, 2, -1]] + self.assertListEqual(expected, clf.target_) + @staticmethod def test_sklearn_transformer(): for check, test in check_estimator(FImdlp(), generate_only=True): From d04cb389c08577a378c42dbcdcc733e2206a94a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Tue, 11 Apr 2023 19:33:57 +0200 Subject: [PATCH 15/24] Update tests and module mdlp version --- src/cppmdlp | 2 +- src/fimdlp/tests/FImdlp_test.py | 28 +++++++++++++++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/cppmdlp b/src/cppmdlp index 42e83b3..d77d274 160000 --- a/src/cppmdlp +++ b/src/cppmdlp @@ -1 +1 @@ -Subproject commit 42e83b3d26b9a237b09d718a01abefc30812c9b7 +Subproject commit d77d27459ba6fbddcbc54469fab718ab4337290d diff --git a/src/fimdlp/tests/FImdlp_test.py b/src/fimdlp/tests/FImdlp_test.py index 0215509..a91a55e 100644 --- a/src/fimdlp/tests/FImdlp_test.py +++ b/src/fimdlp/tests/FImdlp_test.py @@ -136,22 +136,32 @@ class FImdlpTest(unittest.TestCase): self.assertListEqual(expected, computed) def test_join_fit(self): - y = np.array([b"f0", b"f0", b"f2", b"f3", b"f4"]) + y = np.array([b"f0", b"f0", b"f2", b"f3", b"f3", b"f4", b"f4"]) x = np.array( [ - [0, 1, 2, 3, 4], - [0, 1, 2, 3, 4], - [1, 2, 3, 4, 5], - [2, 3, 4, 5, 6], - [3, 4, 5, 6, 7], + [0, 1, 2, 3, 4, 5], + [0, 2, 2, 3, 4, 5], + [1, 2, 3, 4, 5, 5], + [2, 3, 4, 5, 6, 6], + [3, 4, 5, 6, 7, 7], + [1, 2, 2, 3, 5, 7], + [1, 
3, 4, 4, 4, 7],
             ]
         )
-        expected = [0, 0, 1, 2, 2]
+        expected = [0, 1, 1, 2, 2, 1, 2]
         clf = FImdlp()
         clf.fit(x, factorize(y))
-        computed = clf.join_fit([0, 2], 1, x)
+        computed = clf.join_fit([0, 2, 3, 4], 1, x)
         self.assertListEqual(computed.tolist(), expected)
-        expected_y = [b"002", b"002", b"113", b"224", b"335"]
+        expected_y = [
+            b"00234",
+            b"00234",
+            b"11345",
+            b"22456",
+            b"23567",
+            b"31235",
+            b"31444",
+        ]
         self.assertListEqual(expected_y, clf.y_join_)

     def test_join_fit_error(self):

From 947d54202dc37c9e66480f17674a635139adae37 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?=
Date: Tue, 11 Apr 2023 19:35:39 +0200
Subject: [PATCH 16/24] Update hyperparams info

---
 src/fimdlp/mdlp.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py
index 36ce3a0..f94225a 100644
--- a/src/fimdlp/mdlp.py
+++ b/src/fimdlp/mdlp.py
@@ -27,7 +27,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
     max_depth: int, default=1e6
         The maximum depth of the discretization process.
     max_cuts: float, default=0
-        The maximum number of cut points to be computed for each feature. 
+        The maximum number of cut points to be computed for each feature.

     Attributes
     ----------
@@ -115,6 +115,8 @@ class FImdlp(TransformerMixin, BaseEstimator):
         )
         # target of every feature. Start with -1 => y (see join_fit)
         self.target_ = [-1] * self.n_features_in_
+        # target of every feature. Start with -1 => y (see join_fit)
+        self.target_ = [-1] * self.n_features_in_
         return self

     def _fit_discretizer(self, feature):
@@ -251,11 +253,13 @@ class FImdlp(TransformerMixin, BaseEstimator):
         )
         if target in features:
             raise ValueError("Target cannot be in features to join")
+            raise ValueError("Target cannot be in features to join")
         y_join = [
             f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode()
             for item_y, items_x in zip(self.y_, data[:, features])
         ]
         self.target_[target] = features + [-1]
+        self.target_[target] = features + [-1]
         self.y_join_ = y_join
         self.discretizer_[target].fit(self.X_[:, target], factorize(y_join))
         self.cut_points_[target] = self.discretizer_[target].get_cut_points()

From fa8c4a221db926d6e8574acc2e9342de86cd603f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?=
Date: Tue, 11 Apr 2023 19:45:37 +0200
Subject: [PATCH 17/24] Remove duplicated lines

---
 src/fimdlp/mdlp.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py
index f94225a..4e5503a 100644
--- a/src/fimdlp/mdlp.py
+++ b/src/fimdlp/mdlp.py
@@ -115,8 +115,6 @@ class FImdlp(TransformerMixin, BaseEstimator):
         )
         # target of every feature. Start with -1 => y (see join_fit)
         self.target_ = [-1] * self.n_features_in_
-        # target of every feature. Start with -1 => y (see join_fit)
-        self.target_ = [-1] * self.n_features_in_
         return self

     def _fit_discretizer(self, feature):
@@ -253,13 +251,11 @@ class FImdlp(TransformerMixin, BaseEstimator):
         )
         if target in features:
             raise ValueError("Target cannot be in features to join")
-            raise ValueError("Target cannot be in features to join")
         y_join = [
             f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode()
             for item_y, items_x in zip(self.y_, data[:, features])
         ]
         self.target_[target] = features + [-1]
-        self.target_[target] = features + [-1]
         self.y_join_ = y_join
         self.discretizer_[target].fit(self.X_[:, target], factorize(y_join))
         self.cut_points_[target] = self.discretizer_[target].get_cut_points()

From 25d341aee5dc4c146593bbb25f61248b5d41e840 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?=
Date: Wed, 12 Apr 2023 17:40:25 +0200
Subject: [PATCH 18/24] Update samples and Readme

---
 .github/workflows/main.yml |  6 +++---
 README.md                  | 11 +++++++----
 pyproject.toml             |  5 +++--
 samples/CMakeLists.txt     |  2 +-
 samples/sample.py          |  4 ++--
 5 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 6adbf1b..1c2f576 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -20,14 +20,14 @@ jobs:
         with:
           submodules: recursive
       - name: Set up Python ${{ matrix.python }}
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python }}
       - name: Install dependencies
         run: |
           pip install -q --upgrade pip
           pip install -q scikit-learn cython
-          pip install -q --upgrade codecov coverage black flake8 codacy-coverage
+          pip install -q coverage black flake8 codacy-coverage
       - name: Build and install
         run: |
           make install
@@ -40,7 +40,7 @@ jobs:
           coverage run -m unittest discover -v -s src
           coverage xml
       - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v1
+        uses: codecov/codecov-action@v3
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
           files: ./coverage.xml
diff --git a/README.md b/README.md
index 5cfe03a..ef60d30 100644
--- a/README.md
+++ b/README.md
@@ -33,9 +33,12 @@ python samples/sample.py -h # for more options
 ```bash
 cd samples
-mkdir build
+cmake -B build
 cd build
-cmake ..
-make
-./sample iris
+make sample
+./sample -f iris
+./sample -h
 ```
+
+### Based on
+[https://github.com/rmontanana/mdlp](https://github.com/rmontanana/mdlp)
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index e3cc5fc..229bfd7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,7 @@ dynamic = ['version']
 dependencies = ["numpy", "joblib", "scikit-learn"]
 requires-python = ">=3.9"
 classifiers = [
-    "Development Status :: 3 - Alpha",
+    "Development Status :: 4 - Beta",
     "Intended Audience :: Science/Research",
     "Intended Audience :: Developers",
     "Topic :: Software Development",
@@ -33,6 +33,7 @@ classifiers = [
     "Programming Language :: Python",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
 ]
 [project.urls]
 Home = "https://github.com/doctorado-ml/FImdlp"
 [tool.black]
 line-length = 79
-target_version = ['py39', 'py310']
+target_version = ['py39', 'py310', 'py311']
 include = '\.pyi?$'
 exclude = '''
 /(
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index d10d2a9..ace3c51 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.20)
-project(main)
+project(sample)

 set(CMAKE_CXX_STANDARD 11)
diff --git a/samples/sample.py b/samples/sample.py
index d671ddb..781b9e8 100644
--- a/samples/sample.py
+++ b/samples/sample.py
@@ -44,10 +44,10 @@ test = FImdlp(
 now = time.time()
 test.fit(X, y)
 fit_time = time.time()
-print("Fitting: ", fit_time - now)
+print(f"Fitting ....: {fit_time - now:7.5f} seconds")
 now = time.time()
 Xt = test.transform(X)
-print("Transforming: ", time.time() - now)
+print(f"Transforming: {time.time() - now:7.5f} seconds")
 print(test.get_cut_points())
 clf = RandomForestClassifier(random_state=0)
 print(

From 878cd379ee7b11027aecfe8273dc535d8641de62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?=
Date: Fri, 14 Apr 2023 11:20:48 +0200
Subject: [PATCH 19/24] Change arff library to sample.py

---
 samples/sample.py | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/samples/sample.py b/samples/sample.py
index 781b9e8..0e12647 100644
--- a/samples/sample.py
+++ b/samples/sample.py
@@ -5,6 +5,7 @@ from scipy.io import arff
 import pandas as pd
 from sklearn.ensemble import RandomForestClassifier
 from fimdlp.mdlp import FImdlp
+from fimdlp.cppfimdlp import CArffFiles

 datasets = {
     "mfeat-factors": True,
@@ -29,13 +30,15 @@ relative = "" if os.path.isdir("src") else ".."
 file_name = os.path.join(
     relative, "src", "cppmdlp", "tests", "datasets", args.dataset
 )
-data = arff.loadarff(file_name + ".arff")
-df = pd.DataFrame(data[0])
-class_column = -1 if datasets[args.dataset] else 0
-class_name = df.columns.to_list()[class_column]
-X = df.drop(class_name, axis=1)
-y, _ = pd.factorize(df[class_name])
-X = X.to_numpy()
+arff = CArffFiles()
+arff.load(bytes(f"{file_name}.arff", "utf-8"))
+X = arff.get_X()
+y = arff.get_y()
+attributes = arff.get_attributes()
+attributes = [x[0].decode() for x in attributes]
+df = pd.DataFrame(X, columns=attributes)
+class_name = arff.get_class_name().decode()
+df[class_name] = y
 test = FImdlp(
     min_length=args.min_length,
     max_depth=args.max_depth,
@@ -48,7 +51,13 @@
 print(f"Fitting ....: {fit_time - now:7.5f} seconds")
 now = time.time()
 Xt = test.transform(X)
 print(f"Transforming: {time.time() - now:7.5f} seconds")
-print(test.get_cut_points())
+cut_points = test.get_cut_points()
+for i, cuts in enumerate(cut_points):
+    print(f"Cut points for feature {attributes[i]}: {cuts}")
+    print(f"Min: {min(X[:, i]):6.4f} Max: {max(X[:, i]):6.4f}")
+num_cuts = sum([len(x) for x in cut_points])
+print(f"Total cut points ...: {num_cuts}")
+print(f"Total feature states: {num_cuts + len(attributes)}")
 clf = RandomForestClassifier(random_state=0)
 print(
     "Random Forest score with discretized data: ", clf.fit(Xt, y).score(Xt, y)

From 3ed491cd34cc065c9b1636a48df6ae9623031cfe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?=
Date: Tue, 25 Apr 2023 12:05:52 +0200
Subject: [PATCH 20/24] Update mdlp version Add minimum mdlp version test
 Update sample.cpp

---
 samples/CMakeLists.txt          |  1 +
 samples/sample.cpp              | 30 +++++++++++++++++-------------
 src/cppmdlp                     |  2 +-
 src/fimdlp/tests/FImdlp_test.py | 11 +++++++++--
 4 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index ace3c51..6398778 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -2,5 +2,6 @@ cmake_minimum_required(VERSION 3.20)
 project(sample)

 set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_BUILD_TYPE Debug)

 add_executable(sample sample.cpp ../src/cppmdlp/tests/ArffFiles.cpp ../src/cppmdlp/Metrics.cpp ../src/cppmdlp/CPPFImdlp.cpp)
diff --git a/samples/sample.cpp b/samples/sample.cpp
index 9440421..c27ce2d 100644
--- a/samples/sample.cpp
+++ b/samples/sample.cpp
@@ -28,7 +28,7 @@ void usage(const char* path)
     cout << " -p, --path[=FILENAME]\t folder where the arff dataset is located, default " << PATH << endl;
     cout << " -m, --max_depth=INT\t max_depth pased to discretizer. Default = MAX_INT" << endl;
     cout
-        << " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 = any"
+        << " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 -> any"
        << endl;
     cout << " -n, --min_length=INT\t interval min_length pased to discretizer. Default = 3" << endl;
 }
@@ -40,7 +40,7 @@ tuple parse_arguments(int argc, char** argv)
     int max_depth = numeric_limits::max();
     int min_length = 3;
     float max_cutpoints = 0;
-    const option long_options[] = {
+    const vector long_options = {
         {"help", no_argument, nullptr, 'h'},
         {"file", required_argument, nullptr, 'f'},
         {"path", required_argument, nullptr, 'p'},
@@ -50,7 +50,7 @@ tuple parse_arguments(int argc, char** argv)
         {nullptr, no_argument, nullptr, 0}
     };
     while (true) {
-        const auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options, nullptr);
+        const auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options.data(), nullptr);
         if (c == -1)
             break;
         switch (c) {
@@ -94,8 +94,8 @@ void process_file(const string& path, const string& file_name, bool class_last,
     ArffFiles file;

     file.load(path + file_name + ".arff", class_last);
-    auto attributes = file.getAttributes();
-    auto items = file.getSize();
+    const auto attributes = file.getAttributes();
+    const auto items = file.getSize();
     cout << "Number of lines: " << items << endl;
     cout << "Attributes: " << endl;
     for (auto attribute : attributes) {
@@ -113,17 +113,21 @@ void process_file(const string& path, const string& file_name, bool class_last,
         cout << y[i] << endl;
     }
     auto test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints);
-    auto total = 0;
+    size_t total = 0;
     for (auto i = 0; i < attributes.size(); i++) {
         auto min_max = minmax_element(X[i].begin(), X[i].end());
-        cout << "Cut points for " << get<0>(attributes[i]) << endl;
-        cout << "Min: " << *min_max.first << " Max: " << *min_max.second << endl;
-        cout << "--------------------------" << setprecision(3) << endl;
+        cout << "Cut points for feature " << get<0>(attributes[i]) << ": [" << setprecision(3);
         test.fit(X[i], y);
-        for (auto item : test.getCutPoints()) {
-            cout << item << endl;
+        auto cut_points = test.getCutPoints();
+        for (auto item : cut_points) {
+            cout << item;
+            if (item != cut_points.back())
+                cout << ", ";
         }
         total += test.getCutPoints().size();
+        cout << "]" << endl;
+        cout << "Min: " << *min_max.first << " Max: " << *min_max.second << endl;
+        cout << "--------------------------" << endl;
     }
     cout << "Total cut points ...: " << total << endl;
     cout << "Total feature states: " << total + attributes.size() << endl;
@@ -143,7 +147,7 @@ void process_all_files(const map& datasets, const string& path, in
         vector& X = file.getX();
         labels_t& y = file.getY();
         size_t timing = 0;
-        int cut_points = 0;
+        size_t cut_points = 0;
         for (auto i = 0; i < attributes.size(); i++) {
             auto test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints);
             std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
@@ -152,7 +156,7 @@
             timing += std::chrono::duration_cast(end - begin).count();
             cut_points += test.getCutPoints().size();
         }
-        printf("%-20s %4lu %4d %8zu\n", dataset.first.c_str(), attributes.size(), cut_points, timing);
+        printf("%-20s %4lu %4zu %8zu\n", dataset.first.c_str(), attributes.size(), cut_points, timing);
     }
 }
diff --git a/src/cppmdlp b/src/cppmdlp
index d77d274..db76afc 160000
--- a/src/cppmdlp
+++ b/src/cppmdlp
@@ -1 +1 @@
-Subproject commit d77d27459ba6fbddcbc54469fab718ab4337290d
+Subproject commit db76afc4e2f38c4b365925b84b31c0bb5713dc84
diff --git a/src/fimdlp/tests/FImdlp_test.py b/src/fimdlp/tests/FImdlp_test.py
index a91a55e..e0a5990 100644
--- a/src/fimdlp/tests/FImdlp_test.py
+++ b/src/fimdlp/tests/FImdlp_test.py
@@ -18,6 +18,13 @@ class FImdlpTest(unittest.TestCase):
             f"{__version__}({CFImdlp().get_version().decode()})",
         )

+    def test_minimum_mdlp_version(self):
+        mdlp_version = tuple(
+            int(c) for c in CFImdlp().get_version().decode().split(".")[0:3]
+        )
+        minimum_mdlp_version = (1, 1, 2)
+        self.assertTrue(mdlp_version >= minimum_mdlp_version)
+
     def test_init(self):
         clf = FImdlp()
         self.assertEqual(-1, clf.n_jobs)
@@ -312,11 +319,11 @@ class FImdlpTest(unittest.TestCase):
         clf.fit(X, y)
         expected_cutpoints = [
             [5.45],
-            [3.35],
+            [2.85],
             [2.45],
             [0.8],
         ]
-        expected_depths = [1] * 4
+        expected_depths = [3, 5, 4, 3]
         self.assertListEqual(expected_depths, clf.get_depths())
         for expected, computed in zip(
             expected_cutpoints, clf.get_cut_points()

From 17a66858f88cb8b84446323e35f3d777a72f3ce6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?=
Date: Tue, 25 Apr 2023 16:58:23 +0200
Subject: [PATCH 21/24] Update version number to 0.9.4

---
 README.md              | 6 +++---
 src/fimdlp/_version.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index ef60d30..4ab57c1 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ git clone --recurse-submodules https://github.com/doctorado-ml/FImdlp.git
 ```bash
 pip install -e .
 python samples/sample.py iris
-python samples/sample.py iris --alternative
+python samples/sample.py iris -c 2
 python samples/sample.py -h # for more options
 ```
@@ -35,8 +35,8 @@ python samples/sample.py -h # for more options
 cd samples
 cmake -B build
 cd build
-make sample
-./sample -f iris
+make
+./sample -f iris -c 2
 ./sample -h
 ```
diff --git a/src/fimdlp/_version.py b/src/fimdlp/_version.py
index c598173..e94731c 100644
--- a/src/fimdlp/_version.py
+++ b/src/fimdlp/_version.py
@@ -1 +1 @@
-__version__ = "0.9.3"
+__version__ = "0.9.4"

From 3a100bbba74f2de5b6a89ecad16f220349166ee0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?=
Date: Tue, 25 Apr 2023 17:11:40 +0200
Subject: [PATCH 22/24] Add mdlp version to Makefile Refactor sample.py

---
 Makefile          |  1 +
 samples/sample.py | 17 +++++++++++++----
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index ffc0216..7679994 100644
--- a/Makefile
+++ b/Makefile
@@ -40,6 +40,7 @@ audit: ## Audit pip
 version:
 	@echo "Current Python version .: $(shell python --version)"
 	@echo "Current FImdlp version .: $(shell python -c "from fimdlp import _version; print(_version.__version__)")"
+	@echo "Current mdlp version ...: $(shell python -c "from fimdlp.cppfimdlp import CFImdlp; print(CFImdlp().get_version().decode())")"
 	@echo "Installed FImdlp version: $(shell pip show fimdlp | grep Version | cut -d' ' -f2)"

 help: ## Show help message
diff --git a/samples/sample.py b/samples/sample.py
index 0e12647..aa525b2 100644
--- a/samples/sample.py
+++ b/samples/sample.py
@@ -1,7 +1,6 @@
 import time
 import argparse
 import os
-from scipy.io import arff
 import pandas as pd
 from sklearn.ensemble import RandomForestClassifier
 from fimdlp.mdlp import FImdlp
@@ -18,11 +17,21 @@ datasets = {
 ap = argparse.ArgumentParser()
 ap.add_argument(
-    "--min_length", type=int, default=3, help="Minimum length of interval"
+    "-n",
+    "--min_length",
+    type=int,
+    default=3,
+    help="Minimum length of interval",
 )
-ap.add_argument("--max_depth", type=int, default=9999, help="Maximum depth")
 ap.add_argument(
+    "-m", "--max_depth", type=int, default=9999, help="Maximum depth"
+)
+ap.add_argument(
+    "-c",
+    "--max_cuts",
+    type=float,
+    default=0,
+    help="Maximum number of cut points",
 )
 ap.add_argument("dataset", type=str, choices=datasets.keys())
 args = ap.parse_args()

From dd42e186d534f195d5368753d02493f9ce32a184 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?=
Date: Tue, 25 Apr 2023 17:16:04 +0200
Subject: [PATCH 23/24] Reformat ArffFiles.h

---
 src/fimdlp/ArffFiles.h | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/src/fimdlp/ArffFiles.h b/src/fimdlp/ArffFiles.h
index 38531af..ff8bbc5 100644
--- a/src/fimdlp/ArffFiles.h
+++ b/src/fimdlp/ArffFiles.h
@@ -19,26 +19,16 @@ private:
 public:
     ArffFiles();
-
-    void load(const string &, bool = true);
-
+    void load(const string&, bool = true);
     vector getLines() const;
-
     unsigned long int getSize() const;
-
     string getClassName() const;
-
     string getClassType() const;
-
-    static string trim(const string &);
-
-    vector> &getX();
-
-    vector &getY();
-
+    static string trim(const string&);
+    vector>& getX();
+    vector& getY();
     vector> getAttributes() const;
-
-    static vector factorize(const vector &labels_t);
+    static vector factorize(const vector& labels_t);
 };
 #endif
\ No newline at end of file

From 6e1754856393742997050b403d3d38e1302bbe9f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?=
Date: Tue, 25 Apr 2023 17:53:36 +0200
Subject: [PATCH 24/24] Add url to pyproject and comment to mdlp

---
 pyproject.toml     | 1 +
 src/fimdlp/mdlp.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 229bfd7..f677c57 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,7 @@ classifiers = [
 [project.urls]
 Home = "https://github.com/doctorado-ml/FImdlp"
+Base = "https://github.com/rmontanana/mdlp"

 [tool.black]
 line-length = 79
diff --git a/src/fimdlp/mdlp.py b/src/fimdlp/mdlp.py
index 4e5503a..345a6f1 100644
--- a/src/fimdlp/mdlp.py
+++ b/src/fimdlp/mdlp.py
@@ -255,6 +255,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
             f"{str(item_y)}{''.join([str(x) for x in items_x])}".encode()
             for item_y, items_x in zip(self.y_, data[:, features])
         ]
+        # Store in target_ the features used with class to discretize target
         self.target_[target] = features + [-1]
         self.y_join_ = y_join
         self.discretizer_[target].fit(self.X_[:, target], factorize(y_join))
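For context on the API these patches exercise, a minimal usage sketch follows. The toy values, the hyperparameter settings, and passing the discretized matrix to join_fit are illustrative assumptions; only FImdlp, fit, transform, get_cut_points, get_depths and join_fit come from the tests and samples shown above:

    import numpy as np
    from fimdlp.mdlp import FImdlp

    # Toy data: three continuous features and a three-class target
    # (values are illustrative only, not taken from the test suite).
    X = np.array(
        [
            [4.7, 3.2, 1.3],
            [6.4, 2.8, 4.5],
            [6.3, 3.3, 6.0],
            [5.1, 3.5, 1.4],
            [5.9, 3.0, 5.1],
            [6.7, 3.1, 4.4],
        ]
    )
    y = np.array([0, 1, 2, 0, 2, 1])

    clf = FImdlp(min_length=3, max_depth=9999, max_cuts=0)
    Xt = clf.fit(X, y).transform(X)  # discretize every feature against y
    print(clf.get_cut_points())      # one list of cut points per feature
    print(clf.get_depths())          # depth reached by the discretizer per feature

    # Re-discretize feature 1 using the class joined with features 0 and 2
    # (argument order as in the tests above: features, target, data).
    col1 = clf.join_fit([0, 2], 1, Xt)

After join_fit, target_[1] records the features joined with the class ([0, 2, -1]) and y_join_ keeps the composite labels, exactly as the mdlp.py code in the last patch stores them.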