Refactor samples and fix Metrics tests

This commit is contained in:
2022-12-10 14:32:28 +01:00
parent 418db2bb99
commit 3d48073574
22 changed files with 301 additions and 258 deletions

View File

@@ -6,5 +6,7 @@ Fayyad - Irani MDLP discretization algorithm
```bash ```bash
python setup.py build_ext --inplace python setup.py build_ext --inplace
python sample.py python samples/sample.py iris --original
python samples/sample.py iris --proposal
python samples/sample.py -h # for more options
``` ```

View File

@@ -1,21 +1,17 @@
#include <numeric> #include <numeric>
#include <iostream>
#include <algorithm> #include <algorithm>
#include <set> #include <set>
#include "CPPFImdlp.h" #include "CPPFImdlp.h"
#include "Metrics.h" #include "Metrics.h"
namespace mdlp { namespace mdlp {
CPPFImdlp::CPPFImdlp(): proposal(true), debug(false), indices(indices_t()), y(labels()), metrics(Metrics(y, indices)) CPPFImdlp::CPPFImdlp(bool proposal):proposal(proposal), indices(indices_t()), y(labels_t()), metrics(Metrics(y, indices))
{
}
CPPFImdlp::CPPFImdlp(bool proposal, bool debug): proposal(proposal), debug(debug), indices(indices_t()), y(labels()), metrics(Metrics(y, indices))
{ {
} }
CPPFImdlp::~CPPFImdlp() CPPFImdlp::~CPPFImdlp()
= default; = default;
CPPFImdlp& CPPFImdlp::fit(samples& X_, labels& y_) CPPFImdlp& CPPFImdlp::fit(samples_t& X_, labels_t& y_)
{ {
X = X_; X = X_;
y = y_; y = y_;
@@ -28,8 +24,10 @@ namespace mdlp {
} }
indices = sortIndices(X_); indices = sortIndices(X_);
metrics.setData(y, indices); metrics.setData(y, indices);
//computeCutPoints(0, X.size()); if (proposal)
computeCutPointsProposal(); computeCutPointsProposal();
else
computeCutPoints(0, X.size());
return *this; return *this;
} }
void CPPFImdlp::computeCutPoints(size_t start, size_t end) void CPPFImdlp::computeCutPoints(size_t start, size_t end)
@@ -53,7 +51,6 @@ namespace mdlp {
} }
void CPPFImdlp::computeCutPointsOriginal(size_t start, size_t end) void CPPFImdlp::computeCutPointsOriginal(size_t start, size_t end)
{ {
size_t idx;
precision_t cut; precision_t cut;
if (end - start < 2) if (end - start < 2)
return; return;
@@ -76,14 +73,9 @@ namespace mdlp {
yCur = yPrev = y[indices[0]]; yCur = yPrev = y[indices[0]];
numElements = indices.size() - 1; numElements = indices.size() - 1;
idx = start = 0; idx = start = 0;
bool firstCutPoint = true;
if (debug)
printf("*idx=%lu -> (-1, -1) Prev(%3.1f, %d) Elementos: %lu\n", idx, xCur, yCur, numElements);
while (idx < numElements) { while (idx < numElements) {
xPivot = xCur; xPivot = xCur;
yPivot = yCur; yPivot = yCur;
if (debug)
printf("<idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
// Read the same values and check class changes // Read the same values and check class changes
do { do {
idx++; idx++;
@@ -92,17 +84,12 @@ namespace mdlp {
if (yCur != yPivot && xCur == xPivot) { if (yCur != yPivot && xCur == xPivot) {
yPivot = -1; yPivot = -1;
} }
if (debug)
printf(">idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
} }
while (idx < numElements && xCur == xPivot); while (idx < numElements && xCur == xPivot);
// Check if the class changed and there are more than 1 element // Check if the class changed and there are more than 1 element
if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur) && mdlp(start, idx, indices.size())) { if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur) && mdlp(start, idx, indices.size())) {
start = idx; start = idx;
cutPoint = (xPrev + xCur) / 2; cutPoint = (xPrev + xCur) / 2;
if (debug) {
printf("Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = %3.1g \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint);
}
cutPoints.push_back(cutPoint); cutPoints.push_back(cutPoint);
} }
yPrev = yPivot; yPrev = yPivot;
@@ -160,7 +147,7 @@ namespace mdlp {
return output; return output;
} }
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
indices_t CPPFImdlp::sortIndices(samples& X_) indices_t CPPFImdlp::sortIndices(samples_t& X_)
{ {
indices_t idx(X_.size()); indices_t idx(X_.size());
iota(idx.begin(), idx.end(), 0); iota(idx.begin(), idx.end(), 0);

View File

@@ -6,15 +6,14 @@
namespace mdlp { namespace mdlp {
class CPPFImdlp { class CPPFImdlp {
protected: protected:
bool proposal; // proposed algorithm or original algorithm bool proposal;
bool debug;
indices_t indices; // sorted indices to use with X and y indices_t indices; // sorted indices to use with X and y
samples X; samples_t X;
labels y; labels_t y;
Metrics metrics; Metrics metrics;
cutPoints_t cutPoints; cutPoints_t cutPoints;
static indices_t sortIndices(samples&); static indices_t sortIndices(samples_t&);
void computeCutPoints(size_t, size_t); void computeCutPoints(size_t, size_t);
long int getCandidate(size_t, size_t); long int getCandidate(size_t, size_t);
bool mdlp(size_t, size_t, size_t); bool mdlp(size_t, size_t, size_t);
@@ -25,11 +24,10 @@ namespace mdlp {
void computeCutPointsProposal(); void computeCutPointsProposal();
public: public:
CPPFImdlp(); CPPFImdlp(bool);
CPPFImdlp(bool, bool debug = false);
~CPPFImdlp(); ~CPPFImdlp();
CPPFImdlp& fit(samples&, labels&); CPPFImdlp& fit(samples_t&, labels_t&);
samples getCutPoints(); samples_t getCutPoints();
}; };
} }
#endif #endif

View File

@@ -1,8 +1,9 @@
#include "Metrics.h" #include "Metrics.h"
#include <set> #include <set>
#include <cmath>
using namespace std; using namespace std;
namespace mdlp { namespace mdlp {
Metrics::Metrics(labels& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t()) Metrics::Metrics(labels_t& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t())
{ {
} }
int Metrics::computeNumClasses(size_t start, size_t end) int Metrics::computeNumClasses(size_t start, size_t end)
@@ -13,7 +14,7 @@ namespace mdlp {
} }
return nClasses.size(); return nClasses.size();
} }
void Metrics::setData(labels& y_, indices_t& indices_) void Metrics::setData(labels_t& y_, indices_t& indices_)
{ {
indices = indices_; indices = indices_;
y = y_; y = y_;
@@ -25,7 +26,7 @@ namespace mdlp {
{ {
precision_t p, ventropy = 0; precision_t p, ventropy = 0;
int nElements = 0; int nElements = 0;
labels counts(numClasses + 1, 0); labels_t counts(numClasses + 1, 0);
if (end - start < 2) if (end - start < 2)
return 0; return 0;
if (entropyCache.find(make_tuple(start, end)) != entropyCache.end()) { if (entropyCache.find(make_tuple(start, end)) != entropyCache.end()) {

View File

@@ -1,18 +1,17 @@
#ifndef CCMETRICS_H #ifndef CCMETRICS_H
#define CCMETRICS_H #define CCMETRICS_H
#include "typesFImdlp.h" #include "typesFImdlp.h"
#include <cmath>
namespace mdlp { namespace mdlp {
class Metrics { class Metrics {
protected: protected:
labels& y; labels_t& y;
indices_t& indices; indices_t& indices;
int numClasses; int numClasses;
cacheEnt_t entropyCache; cacheEnt_t entropyCache;
cacheIg_t igCache; cacheIg_t igCache;
public: public:
Metrics(labels&, indices_t&); Metrics(labels_t&, indices_t&);
void setData(labels&, indices_t&); void setData(labels_t&, indices_t&);
int computeNumClasses(size_t, size_t); int computeNumClasses(size_t, size_t);
precision_t entropy(size_t, size_t); precision_t entropy(size_t, size_t);
precision_t informationGain(size_t, size_t, size_t); precision_t informationGain(size_t, size_t, size_t);

View File

@@ -1 +1 @@
__version__ = '0.1.1' __version__ = "0.9.1"

View File

@@ -6,24 +6,15 @@ from libcpp cimport bool
cdef extern from "CPPFImdlp.h" namespace "mdlp": cdef extern from "CPPFImdlp.h" namespace "mdlp":
ctypedef float precision_t ctypedef float precision_t
cdef cppclass CPPFImdlp: cdef cppclass CPPFImdlp:
CPPFImdlp() except + CPPFImdlp(bool) except +
CPPFImdlp(bool, bool) except +
CPPFImdlp& fit(vector[precision_t]&, vector[int]&) CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
vector[precision_t] getCutPoints() vector[precision_t] getCutPoints()
class PcutPoint_t:
def __init__(self, start, end, fromValue, toValue):
self.start = start
self.end = end
self.fromValue = fromValue
self.toValue = toValue
cdef class CFImdlp: cdef class CFImdlp:
cdef CPPFImdlp *thisptr cdef CPPFImdlp *thisptr
def __cinit__(self, debug=False, proposal=True): def __cinit__(self, proposal):
# Proposal or original algorithm self.thisptr = new CPPFImdlp(proposal)
self.thisptr = new CPPFImdlp(proposal, debug)
def __dealloc__(self): def __dealloc__(self):
del self.thisptr del self.thisptr
def fit(self, X, y): def fit(self, X, y):

View File

@@ -3,33 +3,35 @@ from .cppfimdlp import CFImdlp
from sklearn.base import BaseEstimator, TransformerMixin from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.multiclass import unique_labels from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from joblib import Parallel, delayed
class FImdlp(TransformerMixin, BaseEstimator): class FImdlp(TransformerMixin, BaseEstimator):
def __init__(self, proposal=True): def __init__(self, n_jobs=-1, proposal=False):
self.proposal = proposal # proposed algorithm or original algorithm self.n_jobs = n_jobs
self.proposal = proposal
"""Fayyad - Irani MDLP discretization algorithm. """Fayyad - Irani MDLP discretization algorithm based implementation.
Parameters Parameters
---------- ----------
demo_param : str, default='demo' n_jobs : int, default=-1
A parameter used for demonstation of how to pass and store paramters. The number of jobs to run in parallel. :meth:`fit` and
:meth:`transform`, are parallelized over the features. ``-1`` means
using all cores available.
Attributes Attributes
---------- ----------
n_features_ : int n_features_ : int
The number of features of the data passed to :meth:`fit`. The number of features of the data passed to :meth:`fit`.
discretizer_ : list discretizer_ : list
The list of discretizers for each feature. The list of discretizers, one for each feature.
cut_points_ : list cut_points_ : list
The list of cut points for each feature. The list of cut points for each feature.
X_ : array X_ : array
the samples used to fit, shape (n_samples, n_features) the samples used to fit, shape (n_samples, n_features)
y_ : array y_ : array
the labels used to fit, shape (n_samples,) the labels used to fit, shape (n_samples,)
discretized_X_ :
array of the discretized samples passed to fit(n_samples, n_features)
features_ : list features_ : list
the list of features to be discretized the list of features to be discretized
""" """
@@ -70,6 +72,8 @@ class FImdlp(TransformerMixin, BaseEstimator):
y : None y : None
There is no need of a target in a transformer, yet the pipeline API There is no need of a target in a transformer, yet the pipeline API
requires this parameter. requires this parameter.
features : list, default=[i for i in range(n_features)]
The list of features to be discretized.
Returns Returns
------- -------
self : object self : object
@@ -83,36 +87,22 @@ class FImdlp(TransformerMixin, BaseEstimator):
self.y_ = y self.y_ = y
self.discretizer_ = [None] * self.n_features_ self.discretizer_ = [None] * self.n_features_
self.cut_points_ = [None] * self.n_features_ self.cut_points_ = [None] * self.n_features_
# Can do it in parallel Parallel(n_jobs=self.n_jobs, prefer="threads")(
for feature in self.features_: delayed(self._fit_discretizer)(feature)
self.discretizer_[feature] = CFImdlp( for feature in range(self.n_features_)
proposal=self.proposal, debug=False
) )
self.discretizer_[feature].fit(X[:, feature], y)
self.cut_points_[feature] = self.discretizer_[
feature
].get_cut_points()
return self return self
def get_fitted(self): def _fit_discretizer(self, feature):
"""Return the discretized X computed during fit. self.discretizer_[feature] = CFImdlp(proposal=self.proposal)
self.discretizer_[feature].fit(self.X_[:, feature], self.y_)
self.cut_points_[feature] = self.discretizer_[feature].get_cut_points()
Returns def _discretize_feature(self, feature, X, result):
-------
X_transformed : array, shape (n_samples, n_features)
discretized X computed during fit.
"""
# Check is fit had been called
check_is_fitted(self, "n_features_")
result = np.zeros_like(self.X_, dtype=np.int32) - 1
for feature in range(self.n_features_):
if feature in self.features_: if feature in self.features_:
result[:, feature] = self.discretizer_[ result[:, feature] = np.searchsorted(self.cut_points_[feature], X)
feature
].get_discretized_values()
else: else:
result[:, feature] = self.X_[:, feature] result[:, feature] = X
return result
def transform(self, X): def transform(self, X):
"""Discretize X values. """Discretize X values.
@@ -127,28 +117,28 @@ class FImdlp(TransformerMixin, BaseEstimator):
""" """
# Check is fit had been called # Check is fit had been called
check_is_fitted(self, "n_features_") check_is_fitted(self, "n_features_")
# Input validation # Input validation
X = check_array(X) X = check_array(X)
# Check that the input is of the same shape as the one passed # Check that the input is of the same shape as the one passed
# during fit. # during fit.
# if X.shape[1] != self.n_features_: if X.shape[1] != self.n_features_:
# raise ValueError( raise ValueError(
# "Shape of input is different from what was seen in `fit`" "Shape of input is different from what was seen in `fit`"
# ) )
result = np.zeros_like(X, dtype=np.int32) - 1 result = np.zeros_like(X, dtype=np.int32) - 1
# Can do it in parallel Parallel(n_jobs=self.n_jobs, prefer="threads")(
for feature in range(self.n_features_): delayed(self._discretize_feature)(feature, X[:, feature], result)
if feature in self.features_: for feature in range(self.n_features_)
result[:, feature] = np.searchsorted(
self.cut_points_[feature], X[:, feature]
) )
else:
result[:, feature] = X[:, feature]
return result return result
def get_cut_points(self): def get_cut_points(self):
"""Get the cut points for each feature.
Returns
-------
result: list
The list of cut points for each feature.
"""
result = [] result = []
for feature in range(self.n_features_): for feature in range(self.n_features_):
result.append(self.cut_points_[feature]) result.append(self.cut_points_[feature])

View File

@@ -1,74 +1,63 @@
//#include "gtest/gtest.h" #include "gtest/gtest.h"
//#include "../Metrics.h" #include "../Metrics.h"
//#include "../CPPFImdlp.h" #include "../CPPFImdlp.h"
//namespace mdlp { namespace mdlp {
// class TestFImdlp : public CPPFImdlp, public testing::Test { class TestFImdlp: public CPPFImdlp, public testing::Test {
// public: public:
// TestFImdlp() : CPPFImdlp(true, true) {} TestFImdlp(): CPPFImdlp(false) {}
// void SetUp() void SetUp()
// { {
// // 5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0] // 5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0]
// //(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2) //(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2)
// X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
// y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
// fit(X, y); fit(X, y);
// } }
// void setProposal(bool value) void setProposal(bool value)
// { {
// proposal = value; proposal = value;
// } }
// void initCutPoints() void initIndices()
// { {
// setCutPoints(cutPoints_t()); indices = indices_t();
// } }
// void initIndices() void checkSortedVector(samples_t& X_, indices_t indices_)
// { {
// indices = indices_t(); X = X_;
// } indices = indices_;
// void initDiscretized() indices_t testSortedIndices = sortIndices(X);
// { precision_t prev = X[testSortedIndices[0]];
// xDiscretized = labels(); for (auto i = 0; i < X.size(); ++i) {
// } EXPECT_EQ(testSortedIndices[i], indices[i]);
// void checkSortedVector(samples& X_, indices_t indices_) EXPECT_LE(prev, X[testSortedIndices[i]]);
// { prev = X[testSortedIndices[i]];
// X = X_; }
// indices = indices_; }
// indices_t testSortedIndices = sortIndices(X); void checkCutPoints(cutPoints_t& expected)
// precision_t prev = X[testSortedIndices[0]]; {
// for (auto i = 0; i < X.size(); ++i) { int expectedSize = expected.size();
// EXPECT_EQ(testSortedIndices[i], indices[i]); EXPECT_EQ(cutPoints.size(), expectedSize);
// EXPECT_LE(prev, X[testSortedIndices[i]]); for (auto i = 0; i < expectedSize; i++) {
// prev = X[testSortedIndices[i]]; EXPECT_EQ(cutPoints[i], expected[i]);
// } }
// } }
// void checkCutPoints(cutPoints_t& expected) template<typename T, typename A>
// { void checkVectors(std::vector<T, A> const& expected, std::vector<T, A> const& computed)
// int expectedSize = expected.size(); {
// EXPECT_EQ(cutPoints.size(), expectedSize); EXPECT_EQ(expected.size(), computed.size());
// for (auto i = 0; i < expectedSize; i++) { for (auto i = 0; i < expected.size(); i++) {
// EXPECT_EQ(cutPoints[i].start, expected[i].start); EXPECT_EQ(expected[i], computed[i]);
// EXPECT_EQ(cutPoints[i].end, expected[i].end); }
// EXPECT_EQ(cutPoints[i].classNumber, expected[i].classNumber); }
// EXPECT_NEAR(cutPoints[i].fromValue, expected[i].fromValue, precision); };
// EXPECT_NEAR(cutPoints[i].toValue, expected[i].toValue, precision); TEST_F(TestFImdlp, FitErrorEmptyDataset)
// } {
// } X = samples_t();
// template<typename T, typename A> y = labels_t();
// void checkVectors(std::vector<T, A> const& expected, std::vector<T, A> const& computed) EXPECT_THROW(fit(X, y), std::invalid_argument);
// { }
// EXPECT_EQ(expected.size(), computed.size()); }
// for (auto i = 0; i < expected.size(); i++) {
// EXPECT_EQ(expected[i], computed[i]);
// }
// }
// //
// };
// TEST_F(TestFImdlp, FitErrorEmptyDataset)
// {
// X = samples();
// y = labels();
// EXPECT_THROW(fit(X, y), std::invalid_argument);
// }
// TEST_F(TestFImdlp, FitErrorDifferentSize) // TEST_F(TestFImdlp, FitErrorDifferentSize)
// { // {
// X = { 1, 2, 3 }; // X = { 1, 2, 3 };
@@ -143,7 +132,7 @@
// } // }
// TEST_F(TestFImdlp, DiscretizedValues) // TEST_F(TestFImdlp, DiscretizedValues)
// { // {
// labels computed, expected = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; // labels_t computed, expected = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
// computed = getDiscretizedValues(); // computed = getDiscretizedValues();
// checkVectors(expected, computed); // checkVectors(expected, computed);
// } // }
@@ -157,7 +146,7 @@
// TEST_F(TestFImdlp, Constructor) // TEST_F(TestFImdlp, Constructor)
// { // {
// samples X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; // samples X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
// labels y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; // labels_t y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
// setProposal(false); // setProposal(false);
// fit(X, y); // fit(X, y);
// computeCutPointsOriginal(); // computeCutPointsOriginal();

View File

@@ -1,31 +1,43 @@
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "../Metrics.h" #include "../Metrics.h"
namespace mdlp { namespace mdlp {
class TestMetrics: public Metrics, public testing::Test {
public:
labels_t y;
samples_t X;
indices_t indices;
precision_t precision = 0.000001; precision_t precision = 0.000001;
TEST(MetricTest, NumClasses)
TestMetrics(): Metrics(y, indices) {}
void SetUp()
{ {
labels y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 }; y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
indices_t indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
EXPECT_EQ(1, Metrics::numClasses(y, indices, 4, 8)); setData(y, indices);
EXPECT_EQ(2, Metrics::numClasses(y, indices, 0, 10));
EXPECT_EQ(2, Metrics::numClasses(y, indices, 8, 10));
} }
TEST(MetricTest, Entropy) };
TEST_F(TestMetrics, NumClasses)
{ {
labels y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
indices_t indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; EXPECT_EQ(1, computeNumClasses(4, 8));
EXPECT_EQ(1, Metrics::entropy(y, indices, 0, 10, 2)); EXPECT_EQ(2, computeNumClasses(0, 10));
EXPECT_EQ(0, Metrics::entropy(y, indices, 0, 5, 1)); EXPECT_EQ(2, computeNumClasses(8, 10));
labels yz = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
ASSERT_NEAR(0.468996, Metrics::entropy(yz, indices, 0, 10, 2), precision);
} }
TEST(MetricTest, InformationGain) TEST_F(TestMetrics, Entropy)
{ {
labels y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; EXPECT_EQ(1, entropy(0, 10));
indices_t indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; EXPECT_EQ(0, entropy(0, 5));
labels yz = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 }; y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
ASSERT_NEAR(1, Metrics::informationGain(y, indices, 0, 10, 5, 2), precision); setData(y, indices);
ASSERT_NEAR(0.108032, Metrics::informationGain(yz, indices, 0, 10, 5, 2), precision); ASSERT_NEAR(0.468996, entropy(0, 10), precision);
}
TEST_F(TestMetrics, InformationGain)
{
ASSERT_NEAR(1, informationGain(0, 5, 10), precision);
y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
setData(y, indices);
ASSERT_NEAR(0.108032, informationGain(0, 5, 10), precision);
} }
} }

Binary file not shown.

View File

@@ -101,13 +101,13 @@ string ArffFiles::trim(const string& source)
s.erase(s.find_last_not_of(" \n\r\t") + 1); s.erase(s.find_last_not_of(" \n\r\t") + 1);
return s; return s;
} }
vector<int> ArffFiles::factorize(const vector<string>& labels) vector<int> ArffFiles::factorize(const vector<string>& labels_t)
{ {
vector<int> yy; vector<int> yy;
yy.reserve(labels.size()); yy.reserve(labels_t.size());
map<string, int> labelMap; map<string, int> labelMap;
int i = 0; int i = 0;
for (string label : labels) { for (string label : labels_t) {
if (labelMap.find(label) == labelMap.end()) { if (labelMap.find(label) == labelMap.end()) {
labelMap[label] = i++; labelMap[label] = i++;
} }

View File

@@ -23,6 +23,6 @@ public:
vector<vector<float>>& getX(); vector<vector<float>>& getX();
vector<int>& getY(); vector<int>& getY();
vector<tuple<string, string>> getAttributes(); vector<tuple<string, string>> getAttributes();
vector<int> factorize(const vector<string>& labels); vector<int> factorize(const vector<string>& labels_t);
}; };
#endif #endif

View File

@@ -8,12 +8,14 @@ from ..mdlp import FImdlp
class FImdlpTest(unittest.TestCase): class FImdlpTest(unittest.TestCase):
def test_init(self): def test_init(self):
clf = FImdlp() clf = FImdlp()
self.assertTrue(clf.proposal) self.assertEqual(-1, clf.n_jobs)
clf = FImdlp(proposal=False)
self.assertFalse(clf.proposal) self.assertFalse(clf.proposal)
clf = FImdlp(proposal=True, n_jobs=7)
self.assertTrue(clf.proposal)
self.assertEqual(7, clf.n_jobs)
def test_fit(self): def test_fit_proposal(self):
clf = FImdlp() clf = FImdlp(proposal=True)
clf.fit([[1, 2], [3, 4]], [1, 2]) clf.fit([[1, 2], [3, 4]], [1, 2])
self.assertEqual(clf.n_features_, 2) self.assertEqual(clf.n_features_, 2)
self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]]) self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
@@ -25,10 +27,39 @@ class FImdlpTest(unittest.TestCase):
self.assertTrue(np.array_equal(X, clf.X_)) self.assertTrue(np.array_equal(X, clf.X_))
self.assertTrue(np.array_equal(y, clf.y_)) self.assertTrue(np.array_equal(y, clf.y_))
expected = [ expected = [
[4.900000095367432, 5.0, 5.099999904632568, 5.400000095367432], [
[2.6999998092651367, 2.9000000953674316], 4.900000095367432,
[2.3499999046325684, 4.5], 5.0,
[0.75, 1.399999976158142, 1.5], 5.099999904632568,
5.400000095367432,
5.699999809265137,
],
[2.6999998092651367, 2.9000000953674316, 3.1999998092651367],
[2.3499999046325684, 4.5, 4.800000190734863],
[0.75, 1.399999976158142, 1.5, 1.7000000476837158],
]
self.assertListEqual(expected, clf.get_cut_points())
self.assertListEqual([0, 1, 2, 3], clf.features_)
clf.fit(X, y, features=[0, 2, 3])
self.assertListEqual([0, 2, 3], clf.features_)
def test_fit_original(self):
clf = FImdlp(proposal=False)
clf.fit([[1, 2], [3, 4]], [1, 2])
self.assertEqual(clf.n_features_, 2)
self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
self.assertListEqual(clf.y_.tolist(), [1, 2])
self.assertListEqual([[], []], clf.get_cut_points())
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
self.assertEqual(clf.n_features_, 4)
self.assertTrue(np.array_equal(X, clf.X_))
self.assertTrue(np.array_equal(y, clf.y_))
expected = [
[5.5, 5.800000190734863],
[3.0999999046325684],
[2.450000047683716, 4.800000190734863, 5.099999904632568],
[0.800000011920929, 1.7000000476837158],
] ]
self.assertListEqual(expected, clf.get_cut_points()) self.assertListEqual(expected, clf.get_cut_points())
self.assertListEqual([0, 1, 2, 3], clf.features_) self.assertListEqual([0, 1, 2, 3], clf.features_)
@@ -44,8 +75,38 @@ class FImdlpTest(unittest.TestCase):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
clf.fit([[1, 2], [3, 4]], [1, 2], unexpected="class_name") clf.fit([[1, 2], [3, 4]], [1, 2], unexpected="class_name")
def test_transform(self): def test_transform_original(self):
clf = FImdlp() clf = FImdlp(proposal=False)
clf.fit([[1, 2], [3, 4]], [1, 2])
self.assertEqual(
clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [0, 0]]
)
X, y = load_iris(return_X_y=True)
clf.fit(X, y)
self.assertEqual(clf.n_features_, 4)
self.assertTrue(np.array_equal(X, clf.X_))
self.assertTrue(np.array_equal(y, clf.y_))
self.assertListEqual(
clf.transform(X).tolist(), clf.fit(X, y).transform(X).tolist()
)
expected = [
[0, 0, 1, 1],
[2, 0, 1, 1],
[1, 0, 1, 1],
[0, 0, 1, 1],
[1, 0, 1, 1],
[1, 0, 1, 1],
[1, 0, 1, 1],
]
self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
with self.assertRaises(ValueError):
clf.transform([[1, 2, 3], [4, 5, 6]])
with self.assertRaises(sklearn.exceptions.NotFittedError):
clf = FImdlp(proposal=False)
clf.transform([[1, 2], [3, 4]])
def test_transform_proposal(self):
clf = FImdlp(proposal=True)
clf.fit([[1, 2], [3, 4]], [1, 2]) clf.fit([[1, 2], [3, 4]], [1, 2])
self.assertEqual( self.assertEqual(
clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [0, 0]] clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [0, 0]]
@@ -60,16 +121,16 @@ class FImdlpTest(unittest.TestCase):
) )
expected = [ expected = [
[4, 0, 1, 1], [4, 0, 1, 1],
[4, 2, 2, 2], [5, 2, 2, 2],
[4, 0, 1, 1], [5, 0, 1, 1],
[1, 0, 1, 1], [1, 0, 1, 1],
[4, 1, 1, 1], [4, 1, 1, 1],
[4, 2, 1, 1], [5, 2, 1, 1],
[4, 1, 1, 1], [5, 1, 1, 1],
] ]
self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected)) self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
clf.transform([[1, 2, 3], [4, 5, 6]]) clf.transform([[1, 2, 3], [4, 5, 6]])
with self.assertRaises(sklearn.exceptions.NotFittedError): with self.assertRaises(sklearn.exceptions.NotFittedError):
clf = FImdlp() clf = FImdlp(proposal=True)
clf.transform([[1, 2], [3, 4]]) clf.transform([[1, 2], [3, 4]])

View File

@@ -6,8 +6,8 @@
using namespace std; using namespace std;
namespace mdlp { namespace mdlp {
typedef float precision_t; typedef float precision_t;
typedef vector<precision_t> samples; typedef vector<precision_t> samples_t;
typedef vector<int> labels; typedef vector<int> labels_t;
typedef vector<size_t> indices_t; typedef vector<size_t> indices_t;
typedef vector<precision_t> cutPoints_t; typedef vector<precision_t> cutPoints_t;
typedef map<tuple<int, int>, precision_t> cacheEnt_t; typedef map<tuple<int, int>, precision_t> cacheEnt_t;

View File

@@ -1,37 +0,0 @@
from fimdlp.mdlp import FImdlp
from fimdlp.cppfimdlp import CFImdlp
from sklearn.ensemble import RandomForestClassifier
import time
from scipy.io import arff
import pandas as pd
path = "fimdlp/testcpp/datasets/"
# class_name = "speaker"
# file_name = "kdd_JapaneseVowels.arff"
class_name = "class"
# file_name = "mfeat-factors.arff"
file_name = "letter.arff"
data = arff.loadarff(path + file_name)
df = pd.DataFrame(data[0])
df.dropna(axis=0, how="any", inplace=True)
dataset = df
X = df.drop(class_name, axis=1)
features = X.columns
class_name = class_name
y, _ = pd.factorize(df[class_name])
X = X.to_numpy()
test = FImdlp()
now = time.time()
# test.fit(X, y, features=[i for i in (range(3, 14))])
test.fit(X, y)
fit_time = time.time()
print("Fitting: ", fit_time - now)
now = time.time()
Xt = test.transform(X)
print("Transforming: ", time.time() - now)
print(test.get_cut_points())
clf = RandomForestClassifier(random_state=0)
print(clf.fit(Xt, y).score(Xt, y))

View File

@@ -101,13 +101,13 @@ string ArffFiles::trim(const string& source)
s.erase(s.find_last_not_of(" \n\r\t") + 1); s.erase(s.find_last_not_of(" \n\r\t") + 1);
return s; return s;
} }
vector<int> ArffFiles::factorize(const vector<string>& labels) vector<int> ArffFiles::factorize(const vector<string>& labels_t)
{ {
vector<int> yy; vector<int> yy;
yy.reserve(labels.size()); yy.reserve(labels_t.size());
map<string, int> labelMap; map<string, int> labelMap;
int i = 0; int i = 0;
for (string label : labels) { for (string label : labels_t) {
if (labelMap.find(label) == labelMap.end()) { if (labelMap.find(label) == labelMap.end()) {
labelMap[label] = i++; labelMap[label] = i++;
} }

View File

@@ -23,6 +23,6 @@ public:
vector<vector<float>>& getX(); vector<vector<float>>& getX();
vector<int>& getY(); vector<int>& getY();
vector<tuple<string, string>> getAttributes(); vector<tuple<string, string>> getAttributes();
vector<int> factorize(const vector<string>& labels); vector<int> factorize(const vector<string>& labels_t);
}; };
#endif #endif

6
samples/CMakeLists.txt Normal file
View File

@@ -0,0 +1,6 @@
# Build script for the C++ sample program.
# The discretizer sources are compiled directly from ../fimdlp, so no
# separate library target is needed.
cmake_minimum_required(VERSION 3.24)
project(main)

# Ask for C++17 and make it a hard requirement: without
# CMAKE_CXX_STANDARD_REQUIRED, CMake may silently fall back to an older
# standard when the compiler does not support the requested one.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

add_executable(sample sample.cpp ArffFiles.cpp ../fimdlp/Metrics.cpp ../fimdlp/CPPFImdlp.cpp)

View File

@@ -2,7 +2,7 @@
#include <iostream> #include <iostream>
#include <vector> #include <vector>
#include <iomanip> #include <iomanip>
#include "../CPPFImdlp.h" #include "../fimdlp/CPPFImdlp.h"
using namespace std; using namespace std;
@@ -10,7 +10,7 @@ int main(int argc, char** argv)
{ {
ArffFiles file; ArffFiles file;
vector<string> lines; vector<string> lines;
string path = "/Users/rmontanana/Code/FImdlp/fimdlp/testcpp/datasets/"; string path = "../fimdlp/testcpp/datasets/";
map<string, bool > datasets = { map<string, bool > datasets = {
{"mfeat-factors", true}, {"mfeat-factors", true},
{"iris", true}, {"iris", true},
@@ -41,7 +41,7 @@ int main(int argc, char** argv)
} }
cout << y[i] << endl; cout << y[i] << endl;
} }
mdlp::CPPFImdlp test = mdlp::CPPFImdlp(); mdlp::CPPFImdlp test = mdlp::CPPFImdlp(false);
for (auto i = 0; i < attributes.size(); i++) { for (auto i = 0; i < attributes.size(); i++) {
cout << "Cut points for " << get<0>(attributes[i]) << endl; cout << "Cut points for " << get<0>(attributes[i]) << endl;
cout << "--------------------------" << setprecision(3) << endl; cout << "--------------------------" << setprecision(3) << endl;

44
samples/sample.py Normal file
View File

@@ -0,0 +1,44 @@
"""Sample script: discretize an ARFF dataset with FImdlp and score the result.

Usage:
    python samples/sample.py <dataset> [--proposal | --original]

The script fits the discretizer, transforms the same data, prints the time
taken by both steps and the cut points found, and finally prints the
training-set score of a Random Forest fitted on the discretized data.
"""
import time
import argparse
import os
from scipy.io import arff
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from fimdlp.mdlp import FImdlp

# Available datasets. The flag tells where the class column is:
# True -> last column, False -> first column.
datasets = {
    "mfeat-factors": True,
    "iris": True,
    "letter": True,
    "kdd_JapaneseVowels": False,
}

ap = argparse.ArgumentParser()
# Both flags write to the same destination; when neither is given the
# original algorithm is used (made explicit with set_defaults below).
ap.add_argument(
    "--proposal",
    action="store_true",
    help="use the proposed cut point algorithm",
)
ap.add_argument(
    "--original",
    dest="proposal",
    action="store_false",
    help="use the original cut point algorithm (default)",
)
ap.set_defaults(proposal=False)
ap.add_argument("dataset", type=str, choices=datasets.keys())
args = ap.parse_args()

# Locate the datasets relative to this script (<repo>/samples/sample.py)
# rather than the current working directory, so the sample can be launched
# from any folder.
repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
file_name = os.path.join(
    repo_root, "fimdlp", "testcpp", "datasets", args.dataset
)
data = arff.loadarff(file_name + ".arff")
df = pd.DataFrame(data[0])

# Split the frame into samples and integer-encoded labels.
class_column = -1 if datasets[args.dataset] else 0
class_name = df.columns.to_list()[class_column]
X = df.drop(class_name, axis=1)
y, _ = pd.factorize(df[class_name])
X = X.to_numpy()

test = FImdlp(proposal=args.proposal)
# perf_counter is monotonic, so the measured intervals cannot be distorted
# by system clock adjustments as they could with time.time().
start = time.perf_counter()
test.fit(X, y)
print("Fitting: ", time.perf_counter() - start)
start = time.perf_counter()
Xt = test.transform(X)
print("Transforming: ", time.perf_counter() - start)
print(test.get_cut_points())

# Score on the same data used for fitting: this is a smoke check of the
# discretized output, not a generalization estimate.
clf = RandomForestClassifier(random_state=0)
print(
    "Random Forest score with discretized data: ", clf.fit(Xt, y).score(Xt, y)
)