mirror of
https://github.com/Doctorado-ML/FImdlp.git
synced 2025-08-18 00:45:52 +00:00
Refactor samples and fix Metrics tests
This commit is contained in:
@@ -6,5 +6,7 @@ Fayyad - Irani MDLP discretization algorithm
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
python setup.py build_ext --inplace
|
python setup.py build_ext --inplace
|
||||||
python sample.py
|
python samples/sample.py iris --original
|
||||||
|
python samples/sample.py iris --proposal
|
||||||
|
python samples/sample.py -h # for more options
|
||||||
```
|
```
|
||||||
|
@@ -1,21 +1,17 @@
|
|||||||
#include <numeric>
|
#include <numeric>
|
||||||
#include <iostream>
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <set>
|
#include <set>
|
||||||
#include "CPPFImdlp.h"
|
#include "CPPFImdlp.h"
|
||||||
#include "Metrics.h"
|
#include "Metrics.h"
|
||||||
|
|
||||||
namespace mdlp {
|
namespace mdlp {
|
||||||
CPPFImdlp::CPPFImdlp(): proposal(true), debug(false), indices(indices_t()), y(labels()), metrics(Metrics(y, indices))
|
CPPFImdlp::CPPFImdlp(bool proposal):proposal(proposal), indices(indices_t()), y(labels_t()), metrics(Metrics(y, indices))
|
||||||
{
|
|
||||||
}
|
|
||||||
CPPFImdlp::CPPFImdlp(bool proposal, bool debug): proposal(proposal), debug(debug), indices(indices_t()), y(labels()), metrics(Metrics(y, indices))
|
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
CPPFImdlp::~CPPFImdlp()
|
CPPFImdlp::~CPPFImdlp()
|
||||||
= default;
|
= default;
|
||||||
|
|
||||||
CPPFImdlp& CPPFImdlp::fit(samples& X_, labels& y_)
|
CPPFImdlp& CPPFImdlp::fit(samples_t& X_, labels_t& y_)
|
||||||
{
|
{
|
||||||
X = X_;
|
X = X_;
|
||||||
y = y_;
|
y = y_;
|
||||||
@@ -28,8 +24,10 @@ namespace mdlp {
|
|||||||
}
|
}
|
||||||
indices = sortIndices(X_);
|
indices = sortIndices(X_);
|
||||||
metrics.setData(y, indices);
|
metrics.setData(y, indices);
|
||||||
//computeCutPoints(0, X.size());
|
if (proposal)
|
||||||
computeCutPointsProposal();
|
computeCutPointsProposal();
|
||||||
|
else
|
||||||
|
computeCutPoints(0, X.size());
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
void CPPFImdlp::computeCutPoints(size_t start, size_t end)
|
void CPPFImdlp::computeCutPoints(size_t start, size_t end)
|
||||||
@@ -53,7 +51,6 @@ namespace mdlp {
|
|||||||
}
|
}
|
||||||
void CPPFImdlp::computeCutPointsOriginal(size_t start, size_t end)
|
void CPPFImdlp::computeCutPointsOriginal(size_t start, size_t end)
|
||||||
{
|
{
|
||||||
size_t idx;
|
|
||||||
precision_t cut;
|
precision_t cut;
|
||||||
if (end - start < 2)
|
if (end - start < 2)
|
||||||
return;
|
return;
|
||||||
@@ -76,14 +73,9 @@ namespace mdlp {
|
|||||||
yCur = yPrev = y[indices[0]];
|
yCur = yPrev = y[indices[0]];
|
||||||
numElements = indices.size() - 1;
|
numElements = indices.size() - 1;
|
||||||
idx = start = 0;
|
idx = start = 0;
|
||||||
bool firstCutPoint = true;
|
|
||||||
if (debug)
|
|
||||||
printf("*idx=%lu -> (-1, -1) Prev(%3.1f, %d) Elementos: %lu\n", idx, xCur, yCur, numElements);
|
|
||||||
while (idx < numElements) {
|
while (idx < numElements) {
|
||||||
xPivot = xCur;
|
xPivot = xCur;
|
||||||
yPivot = yCur;
|
yPivot = yCur;
|
||||||
if (debug)
|
|
||||||
printf("<idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
|
|
||||||
// Read the same values and check class changes
|
// Read the same values and check class changes
|
||||||
do {
|
do {
|
||||||
idx++;
|
idx++;
|
||||||
@@ -92,17 +84,12 @@ namespace mdlp {
|
|||||||
if (yCur != yPivot && xCur == xPivot) {
|
if (yCur != yPivot && xCur == xPivot) {
|
||||||
yPivot = -1;
|
yPivot = -1;
|
||||||
}
|
}
|
||||||
if (debug)
|
|
||||||
printf(">idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
|
|
||||||
}
|
}
|
||||||
while (idx < numElements && xCur == xPivot);
|
while (idx < numElements && xCur == xPivot);
|
||||||
// Check if the class changed and there are more than 1 element
|
// Check if the class changed and there are more than 1 element
|
||||||
if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur) && mdlp(start, idx, indices.size())) {
|
if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur) && mdlp(start, idx, indices.size())) {
|
||||||
start = idx;
|
start = idx;
|
||||||
cutPoint = (xPrev + xCur) / 2;
|
cutPoint = (xPrev + xCur) / 2;
|
||||||
if (debug) {
|
|
||||||
printf("Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = %3.1g \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint);
|
|
||||||
}
|
|
||||||
cutPoints.push_back(cutPoint);
|
cutPoints.push_back(cutPoint);
|
||||||
}
|
}
|
||||||
yPrev = yPivot;
|
yPrev = yPivot;
|
||||||
@@ -160,7 +147,7 @@ namespace mdlp {
|
|||||||
return output;
|
return output;
|
||||||
}
|
}
|
||||||
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
|
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
|
||||||
indices_t CPPFImdlp::sortIndices(samples& X_)
|
indices_t CPPFImdlp::sortIndices(samples_t& X_)
|
||||||
{
|
{
|
||||||
indices_t idx(X_.size());
|
indices_t idx(X_.size());
|
||||||
iota(idx.begin(), idx.end(), 0);
|
iota(idx.begin(), idx.end(), 0);
|
||||||
|
@@ -6,15 +6,14 @@
|
|||||||
namespace mdlp {
|
namespace mdlp {
|
||||||
class CPPFImdlp {
|
class CPPFImdlp {
|
||||||
protected:
|
protected:
|
||||||
bool proposal; // proposed algorithm or original algorithm
|
bool proposal;
|
||||||
bool debug;
|
|
||||||
indices_t indices; // sorted indices to use with X and y
|
indices_t indices; // sorted indices to use with X and y
|
||||||
samples X;
|
samples_t X;
|
||||||
labels y;
|
labels_t y;
|
||||||
Metrics metrics;
|
Metrics metrics;
|
||||||
cutPoints_t cutPoints;
|
cutPoints_t cutPoints;
|
||||||
|
|
||||||
static indices_t sortIndices(samples&);
|
static indices_t sortIndices(samples_t&);
|
||||||
void computeCutPoints(size_t, size_t);
|
void computeCutPoints(size_t, size_t);
|
||||||
long int getCandidate(size_t, size_t);
|
long int getCandidate(size_t, size_t);
|
||||||
bool mdlp(size_t, size_t, size_t);
|
bool mdlp(size_t, size_t, size_t);
|
||||||
@@ -25,11 +24,10 @@ namespace mdlp {
|
|||||||
void computeCutPointsProposal();
|
void computeCutPointsProposal();
|
||||||
|
|
||||||
public:
|
public:
|
||||||
CPPFImdlp();
|
CPPFImdlp(bool);
|
||||||
CPPFImdlp(bool, bool debug = false);
|
|
||||||
~CPPFImdlp();
|
~CPPFImdlp();
|
||||||
CPPFImdlp& fit(samples&, labels&);
|
CPPFImdlp& fit(samples_t&, labels_t&);
|
||||||
samples getCutPoints();
|
samples_t getCutPoints();
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
@@ -1,8 +1,9 @@
|
|||||||
#include "Metrics.h"
|
#include "Metrics.h"
|
||||||
#include <set>
|
#include <set>
|
||||||
|
#include <cmath>
|
||||||
using namespace std;
|
using namespace std;
|
||||||
namespace mdlp {
|
namespace mdlp {
|
||||||
Metrics::Metrics(labels& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t())
|
Metrics::Metrics(labels_t& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t())
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
int Metrics::computeNumClasses(size_t start, size_t end)
|
int Metrics::computeNumClasses(size_t start, size_t end)
|
||||||
@@ -13,7 +14,7 @@ namespace mdlp {
|
|||||||
}
|
}
|
||||||
return nClasses.size();
|
return nClasses.size();
|
||||||
}
|
}
|
||||||
void Metrics::setData(labels& y_, indices_t& indices_)
|
void Metrics::setData(labels_t& y_, indices_t& indices_)
|
||||||
{
|
{
|
||||||
indices = indices_;
|
indices = indices_;
|
||||||
y = y_;
|
y = y_;
|
||||||
@@ -25,7 +26,7 @@ namespace mdlp {
|
|||||||
{
|
{
|
||||||
precision_t p, ventropy = 0;
|
precision_t p, ventropy = 0;
|
||||||
int nElements = 0;
|
int nElements = 0;
|
||||||
labels counts(numClasses + 1, 0);
|
labels_t counts(numClasses + 1, 0);
|
||||||
if (end - start < 2)
|
if (end - start < 2)
|
||||||
return 0;
|
return 0;
|
||||||
if (entropyCache.find(make_tuple(start, end)) != entropyCache.end()) {
|
if (entropyCache.find(make_tuple(start, end)) != entropyCache.end()) {
|
||||||
|
@@ -1,18 +1,17 @@
|
|||||||
#ifndef CCMETRICS_H
|
#ifndef CCMETRICS_H
|
||||||
#define CCMETRICS_H
|
#define CCMETRICS_H
|
||||||
#include "typesFImdlp.h"
|
#include "typesFImdlp.h"
|
||||||
#include <cmath>
|
|
||||||
namespace mdlp {
|
namespace mdlp {
|
||||||
class Metrics {
|
class Metrics {
|
||||||
protected:
|
protected:
|
||||||
labels& y;
|
labels_t& y;
|
||||||
indices_t& indices;
|
indices_t& indices;
|
||||||
int numClasses;
|
int numClasses;
|
||||||
cacheEnt_t entropyCache;
|
cacheEnt_t entropyCache;
|
||||||
cacheIg_t igCache;
|
cacheIg_t igCache;
|
||||||
public:
|
public:
|
||||||
Metrics(labels&, indices_t&);
|
Metrics(labels_t&, indices_t&);
|
||||||
void setData(labels&, indices_t&);
|
void setData(labels_t&, indices_t&);
|
||||||
int computeNumClasses(size_t, size_t);
|
int computeNumClasses(size_t, size_t);
|
||||||
precision_t entropy(size_t, size_t);
|
precision_t entropy(size_t, size_t);
|
||||||
precision_t informationGain(size_t, size_t, size_t);
|
precision_t informationGain(size_t, size_t, size_t);
|
||||||
|
@@ -1 +1 @@
|
|||||||
__version__ = '0.1.1'
|
__version__ = "0.9.1"
|
||||||
|
@@ -6,24 +6,15 @@ from libcpp cimport bool
|
|||||||
cdef extern from "CPPFImdlp.h" namespace "mdlp":
|
cdef extern from "CPPFImdlp.h" namespace "mdlp":
|
||||||
ctypedef float precision_t
|
ctypedef float precision_t
|
||||||
cdef cppclass CPPFImdlp:
|
cdef cppclass CPPFImdlp:
|
||||||
CPPFImdlp() except +
|
CPPFImdlp(bool) except +
|
||||||
CPPFImdlp(bool, bool) except +
|
|
||||||
CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
|
CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
|
||||||
vector[precision_t] getCutPoints()
|
vector[precision_t] getCutPoints()
|
||||||
|
|
||||||
|
|
||||||
class PcutPoint_t:
|
|
||||||
def __init__(self, start, end, fromValue, toValue):
|
|
||||||
self.start = start
|
|
||||||
self.end = end
|
|
||||||
self.fromValue = fromValue
|
|
||||||
self.toValue = toValue
|
|
||||||
|
|
||||||
cdef class CFImdlp:
|
cdef class CFImdlp:
|
||||||
cdef CPPFImdlp *thisptr
|
cdef CPPFImdlp *thisptr
|
||||||
def __cinit__(self, debug=False, proposal=True):
|
def __cinit__(self, proposal):
|
||||||
# Proposal or original algorithm
|
self.thisptr = new CPPFImdlp(proposal)
|
||||||
self.thisptr = new CPPFImdlp(proposal, debug)
|
|
||||||
def __dealloc__(self):
|
def __dealloc__(self):
|
||||||
del self.thisptr
|
del self.thisptr
|
||||||
def fit(self, X, y):
|
def fit(self, X, y):
|
||||||
|
Binary file not shown.
@@ -3,33 +3,35 @@ from .cppfimdlp import CFImdlp
|
|||||||
from sklearn.base import BaseEstimator, TransformerMixin
|
from sklearn.base import BaseEstimator, TransformerMixin
|
||||||
from sklearn.utils.multiclass import unique_labels
|
from sklearn.utils.multiclass import unique_labels
|
||||||
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
|
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
|
||||||
|
from joblib import Parallel, delayed
|
||||||
|
|
||||||
|
|
||||||
class FImdlp(TransformerMixin, BaseEstimator):
|
class FImdlp(TransformerMixin, BaseEstimator):
|
||||||
def __init__(self, proposal=True):
|
def __init__(self, n_jobs=-1, proposal=False):
|
||||||
self.proposal = proposal # proposed algorithm or original algorithm
|
self.n_jobs = n_jobs
|
||||||
|
self.proposal = proposal
|
||||||
|
|
||||||
"""Fayyad - Irani MDLP discretization algorithm.
|
"""Fayyad - Irani MDLP discretization algorithm based implementation.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
demo_param : str, default='demo'
|
n_jobs : int, default=-1
|
||||||
A parameter used for demonstation of how to pass and store paramters.
|
The number of jobs to run in parallel. :meth:`fit` and
|
||||||
|
:meth:`transform`, are parallelized over the features. ``-1`` means
|
||||||
|
using all cores available.
|
||||||
|
|
||||||
Attributes
|
Attributes
|
||||||
----------
|
----------
|
||||||
n_features_ : int
|
n_features_ : int
|
||||||
The number of features of the data passed to :meth:`fit`.
|
The number of features of the data passed to :meth:`fit`.
|
||||||
discretizer_ : list
|
discretizer_ : list
|
||||||
The list of discretizers for each feature.
|
The list of discretizers, one for each feature.
|
||||||
cut_points_ : list
|
cut_points_ : list
|
||||||
The list of cut points for each feature.
|
The list of cut points for each feature.
|
||||||
X_ : array
|
X_ : array
|
||||||
the samples used to fit, shape (n_samples, n_features)
|
the samples used to fit, shape (n_samples, n_features)
|
||||||
y_ : array
|
y_ : array
|
||||||
the labels used to fit, shape (n_samples,)
|
the labels used to fit, shape (n_samples,)
|
||||||
discretized_X_ :
|
|
||||||
array of the discretized samples passed to fit(n_samples, n_features)
|
|
||||||
features_ : list
|
features_ : list
|
||||||
the list of features to be discretized
|
the list of features to be discretized
|
||||||
"""
|
"""
|
||||||
@@ -70,6 +72,8 @@ class FImdlp(TransformerMixin, BaseEstimator):
|
|||||||
y : None
|
y : None
|
||||||
There is no need of a target in a transformer, yet the pipeline API
|
There is no need of a target in a transformer, yet the pipeline API
|
||||||
requires this parameter.
|
requires this parameter.
|
||||||
|
features : list, default=[i for i in range(n_features)]
|
||||||
|
The list of features to be discretized.
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
self : object
|
self : object
|
||||||
@@ -83,36 +87,22 @@ class FImdlp(TransformerMixin, BaseEstimator):
|
|||||||
self.y_ = y
|
self.y_ = y
|
||||||
self.discretizer_ = [None] * self.n_features_
|
self.discretizer_ = [None] * self.n_features_
|
||||||
self.cut_points_ = [None] * self.n_features_
|
self.cut_points_ = [None] * self.n_features_
|
||||||
# Can do it in parallel
|
Parallel(n_jobs=self.n_jobs, prefer="threads")(
|
||||||
for feature in self.features_:
|
delayed(self._fit_discretizer)(feature)
|
||||||
self.discretizer_[feature] = CFImdlp(
|
for feature in range(self.n_features_)
|
||||||
proposal=self.proposal, debug=False
|
|
||||||
)
|
)
|
||||||
self.discretizer_[feature].fit(X[:, feature], y)
|
|
||||||
self.cut_points_[feature] = self.discretizer_[
|
|
||||||
feature
|
|
||||||
].get_cut_points()
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def get_fitted(self):
|
def _fit_discretizer(self, feature):
|
||||||
"""Return the discretized X computed during fit.
|
self.discretizer_[feature] = CFImdlp(proposal=self.proposal)
|
||||||
|
self.discretizer_[feature].fit(self.X_[:, feature], self.y_)
|
||||||
|
self.cut_points_[feature] = self.discretizer_[feature].get_cut_points()
|
||||||
|
|
||||||
Returns
|
def _discretize_feature(self, feature, X, result):
|
||||||
-------
|
|
||||||
X_transformed : array, shape (n_samples, n_features)
|
|
||||||
discretized X computed during fit.
|
|
||||||
"""
|
|
||||||
# Check is fit had been called
|
|
||||||
check_is_fitted(self, "n_features_")
|
|
||||||
result = np.zeros_like(self.X_, dtype=np.int32) - 1
|
|
||||||
for feature in range(self.n_features_):
|
|
||||||
if feature in self.features_:
|
if feature in self.features_:
|
||||||
result[:, feature] = self.discretizer_[
|
result[:, feature] = np.searchsorted(self.cut_points_[feature], X)
|
||||||
feature
|
|
||||||
].get_discretized_values()
|
|
||||||
else:
|
else:
|
||||||
result[:, feature] = self.X_[:, feature]
|
result[:, feature] = X
|
||||||
return result
|
|
||||||
|
|
||||||
def transform(self, X):
|
def transform(self, X):
|
||||||
"""Discretize X values.
|
"""Discretize X values.
|
||||||
@@ -127,28 +117,28 @@ class FImdlp(TransformerMixin, BaseEstimator):
|
|||||||
"""
|
"""
|
||||||
# Check is fit had been called
|
# Check is fit had been called
|
||||||
check_is_fitted(self, "n_features_")
|
check_is_fitted(self, "n_features_")
|
||||||
|
|
||||||
# Input validation
|
# Input validation
|
||||||
X = check_array(X)
|
X = check_array(X)
|
||||||
|
|
||||||
# Check that the input is of the same shape as the one passed
|
# Check that the input is of the same shape as the one passed
|
||||||
# during fit.
|
# during fit.
|
||||||
# if X.shape[1] != self.n_features_:
|
if X.shape[1] != self.n_features_:
|
||||||
# raise ValueError(
|
raise ValueError(
|
||||||
# "Shape of input is different from what was seen in `fit`"
|
"Shape of input is different from what was seen in `fit`"
|
||||||
# )
|
)
|
||||||
result = np.zeros_like(X, dtype=np.int32) - 1
|
result = np.zeros_like(X, dtype=np.int32) - 1
|
||||||
# Can do it in parallel
|
Parallel(n_jobs=self.n_jobs, prefer="threads")(
|
||||||
for feature in range(self.n_features_):
|
delayed(self._discretize_feature)(feature, X[:, feature], result)
|
||||||
if feature in self.features_:
|
for feature in range(self.n_features_)
|
||||||
result[:, feature] = np.searchsorted(
|
|
||||||
self.cut_points_[feature], X[:, feature]
|
|
||||||
)
|
)
|
||||||
else:
|
|
||||||
result[:, feature] = X[:, feature]
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def get_cut_points(self):
|
def get_cut_points(self):
|
||||||
|
"""Get the cut points for each feature.
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
result: list
|
||||||
|
The list of cut points for each feature.
|
||||||
|
"""
|
||||||
result = []
|
result = []
|
||||||
for feature in range(self.n_features_):
|
for feature in range(self.n_features_):
|
||||||
result.append(self.cut_points_[feature])
|
result.append(self.cut_points_[feature])
|
||||||
|
@@ -1,74 +1,63 @@
|
|||||||
//#include "gtest/gtest.h"
|
#include "gtest/gtest.h"
|
||||||
//#include "../Metrics.h"
|
#include "../Metrics.h"
|
||||||
//#include "../CPPFImdlp.h"
|
#include "../CPPFImdlp.h"
|
||||||
//namespace mdlp {
|
namespace mdlp {
|
||||||
// class TestFImdlp : public CPPFImdlp, public testing::Test {
|
class TestFImdlp: public CPPFImdlp, public testing::Test {
|
||||||
// public:
|
public:
|
||||||
// TestFImdlp() : CPPFImdlp(true, true) {}
|
TestFImdlp(): CPPFImdlp(false) {}
|
||||||
// void SetUp()
|
void SetUp()
|
||||||
// {
|
{
|
||||||
// // 5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0]
|
// 5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0]
|
||||||
// //(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2)
|
//(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2)
|
||||||
// X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
|
X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
|
||||||
// y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
|
y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
|
||||||
// fit(X, y);
|
fit(X, y);
|
||||||
// }
|
}
|
||||||
// void setProposal(bool value)
|
void setProposal(bool value)
|
||||||
// {
|
{
|
||||||
// proposal = value;
|
proposal = value;
|
||||||
// }
|
}
|
||||||
// void initCutPoints()
|
void initIndices()
|
||||||
// {
|
{
|
||||||
// setCutPoints(cutPoints_t());
|
indices = indices_t();
|
||||||
// }
|
}
|
||||||
// void initIndices()
|
void checkSortedVector(samples_t& X_, indices_t indices_)
|
||||||
// {
|
{
|
||||||
// indices = indices_t();
|
X = X_;
|
||||||
// }
|
indices = indices_;
|
||||||
// void initDiscretized()
|
indices_t testSortedIndices = sortIndices(X);
|
||||||
// {
|
precision_t prev = X[testSortedIndices[0]];
|
||||||
// xDiscretized = labels();
|
for (auto i = 0; i < X.size(); ++i) {
|
||||||
// }
|
EXPECT_EQ(testSortedIndices[i], indices[i]);
|
||||||
// void checkSortedVector(samples& X_, indices_t indices_)
|
EXPECT_LE(prev, X[testSortedIndices[i]]);
|
||||||
// {
|
prev = X[testSortedIndices[i]];
|
||||||
// X = X_;
|
}
|
||||||
// indices = indices_;
|
}
|
||||||
// indices_t testSortedIndices = sortIndices(X);
|
void checkCutPoints(cutPoints_t& expected)
|
||||||
// precision_t prev = X[testSortedIndices[0]];
|
{
|
||||||
// for (auto i = 0; i < X.size(); ++i) {
|
int expectedSize = expected.size();
|
||||||
// EXPECT_EQ(testSortedIndices[i], indices[i]);
|
EXPECT_EQ(cutPoints.size(), expectedSize);
|
||||||
// EXPECT_LE(prev, X[testSortedIndices[i]]);
|
for (auto i = 0; i < expectedSize; i++) {
|
||||||
// prev = X[testSortedIndices[i]];
|
EXPECT_EQ(cutPoints[i], expected[i]);
|
||||||
// }
|
}
|
||||||
// }
|
}
|
||||||
// void checkCutPoints(cutPoints_t& expected)
|
template<typename T, typename A>
|
||||||
// {
|
void checkVectors(std::vector<T, A> const& expected, std::vector<T, A> const& computed)
|
||||||
// int expectedSize = expected.size();
|
{
|
||||||
// EXPECT_EQ(cutPoints.size(), expectedSize);
|
EXPECT_EQ(expected.size(), computed.size());
|
||||||
// for (auto i = 0; i < expectedSize; i++) {
|
for (auto i = 0; i < expected.size(); i++) {
|
||||||
// EXPECT_EQ(cutPoints[i].start, expected[i].start);
|
EXPECT_EQ(expected[i], computed[i]);
|
||||||
// EXPECT_EQ(cutPoints[i].end, expected[i].end);
|
}
|
||||||
// EXPECT_EQ(cutPoints[i].classNumber, expected[i].classNumber);
|
}
|
||||||
// EXPECT_NEAR(cutPoints[i].fromValue, expected[i].fromValue, precision);
|
};
|
||||||
// EXPECT_NEAR(cutPoints[i].toValue, expected[i].toValue, precision);
|
TEST_F(TestFImdlp, FitErrorEmptyDataset)
|
||||||
// }
|
{
|
||||||
// }
|
X = samples_t();
|
||||||
// template<typename T, typename A>
|
y = labels_t();
|
||||||
// void checkVectors(std::vector<T, A> const& expected, std::vector<T, A> const& computed)
|
EXPECT_THROW(fit(X, y), std::invalid_argument);
|
||||||
// {
|
}
|
||||||
// EXPECT_EQ(expected.size(), computed.size());
|
}
|
||||||
// for (auto i = 0; i < expected.size(); i++) {
|
|
||||||
// EXPECT_EQ(expected[i], computed[i]);
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
//
|
//
|
||||||
// };
|
|
||||||
// TEST_F(TestFImdlp, FitErrorEmptyDataset)
|
|
||||||
// {
|
|
||||||
// X = samples();
|
|
||||||
// y = labels();
|
|
||||||
// EXPECT_THROW(fit(X, y), std::invalid_argument);
|
|
||||||
// }
|
|
||||||
// TEST_F(TestFImdlp, FitErrorDifferentSize)
|
// TEST_F(TestFImdlp, FitErrorDifferentSize)
|
||||||
// {
|
// {
|
||||||
// X = { 1, 2, 3 };
|
// X = { 1, 2, 3 };
|
||||||
@@ -143,7 +132,7 @@
|
|||||||
// }
|
// }
|
||||||
// TEST_F(TestFImdlp, DiscretizedValues)
|
// TEST_F(TestFImdlp, DiscretizedValues)
|
||||||
// {
|
// {
|
||||||
// labels computed, expected = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
|
// labels_t computed, expected = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
|
||||||
// computed = getDiscretizedValues();
|
// computed = getDiscretizedValues();
|
||||||
// checkVectors(expected, computed);
|
// checkVectors(expected, computed);
|
||||||
// }
|
// }
|
||||||
@@ -157,7 +146,7 @@
|
|||||||
// TEST_F(TestFImdlp, Constructor)
|
// TEST_F(TestFImdlp, Constructor)
|
||||||
// {
|
// {
|
||||||
// samples X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
|
// samples X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
|
||||||
// labels y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
|
// labels_t y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
|
||||||
// setProposal(false);
|
// setProposal(false);
|
||||||
// fit(X, y);
|
// fit(X, y);
|
||||||
// computeCutPointsOriginal();
|
// computeCutPointsOriginal();
|
||||||
|
@@ -1,31 +1,43 @@
|
|||||||
#include "gtest/gtest.h"
|
#include "gtest/gtest.h"
|
||||||
#include "../Metrics.h"
|
#include "../Metrics.h"
|
||||||
|
|
||||||
|
|
||||||
namespace mdlp {
|
namespace mdlp {
|
||||||
|
class TestMetrics: public Metrics, public testing::Test {
|
||||||
|
public:
|
||||||
|
labels_t y;
|
||||||
|
samples_t X;
|
||||||
|
indices_t indices;
|
||||||
precision_t precision = 0.000001;
|
precision_t precision = 0.000001;
|
||||||
TEST(MetricTest, NumClasses)
|
|
||||||
|
TestMetrics(): Metrics(y, indices) {}
|
||||||
|
void SetUp()
|
||||||
{
|
{
|
||||||
labels y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
|
y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
|
||||||
indices_t indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
|
indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
|
||||||
EXPECT_EQ(1, Metrics::numClasses(y, indices, 4, 8));
|
setData(y, indices);
|
||||||
EXPECT_EQ(2, Metrics::numClasses(y, indices, 0, 10));
|
|
||||||
EXPECT_EQ(2, Metrics::numClasses(y, indices, 8, 10));
|
|
||||||
}
|
}
|
||||||
TEST(MetricTest, Entropy)
|
};
|
||||||
|
TEST_F(TestMetrics, NumClasses)
|
||||||
{
|
{
|
||||||
labels y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
|
y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
|
||||||
indices_t indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
|
EXPECT_EQ(1, computeNumClasses(4, 8));
|
||||||
EXPECT_EQ(1, Metrics::entropy(y, indices, 0, 10, 2));
|
EXPECT_EQ(2, computeNumClasses(0, 10));
|
||||||
EXPECT_EQ(0, Metrics::entropy(y, indices, 0, 5, 1));
|
EXPECT_EQ(2, computeNumClasses(8, 10));
|
||||||
labels yz = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
|
|
||||||
ASSERT_NEAR(0.468996, Metrics::entropy(yz, indices, 0, 10, 2), precision);
|
|
||||||
}
|
}
|
||||||
TEST(MetricTest, InformationGain)
|
TEST_F(TestMetrics, Entropy)
|
||||||
{
|
{
|
||||||
labels y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
|
EXPECT_EQ(1, entropy(0, 10));
|
||||||
indices_t indices = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
|
EXPECT_EQ(0, entropy(0, 5));
|
||||||
labels yz = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
|
y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
|
||||||
ASSERT_NEAR(1, Metrics::informationGain(y, indices, 0, 10, 5, 2), precision);
|
setData(y, indices);
|
||||||
ASSERT_NEAR(0.108032, Metrics::informationGain(yz, indices, 0, 10, 5, 2), precision);
|
ASSERT_NEAR(0.468996, entropy(0, 10), precision);
|
||||||
|
}
|
||||||
|
TEST_F(TestMetrics, InformationGain)
|
||||||
|
{
|
||||||
|
ASSERT_NEAR(1, informationGain(0, 5, 10), precision);
|
||||||
|
y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
|
||||||
|
setData(y, indices);
|
||||||
|
ASSERT_NEAR(0.108032, informationGain(0, 5, 10), precision);
|
||||||
}
|
}
|
||||||
}
|
}
|
Binary file not shown.
@@ -101,13 +101,13 @@ string ArffFiles::trim(const string& source)
|
|||||||
s.erase(s.find_last_not_of(" \n\r\t") + 1);
|
s.erase(s.find_last_not_of(" \n\r\t") + 1);
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
vector<int> ArffFiles::factorize(const vector<string>& labels)
|
vector<int> ArffFiles::factorize(const vector<string>& labels_t)
|
||||||
{
|
{
|
||||||
vector<int> yy;
|
vector<int> yy;
|
||||||
yy.reserve(labels.size());
|
yy.reserve(labels_t.size());
|
||||||
map<string, int> labelMap;
|
map<string, int> labelMap;
|
||||||
int i = 0;
|
int i = 0;
|
||||||
for (string label : labels) {
|
for (string label : labels_t) {
|
||||||
if (labelMap.find(label) == labelMap.end()) {
|
if (labelMap.find(label) == labelMap.end()) {
|
||||||
labelMap[label] = i++;
|
labelMap[label] = i++;
|
||||||
}
|
}
|
||||||
|
@@ -23,6 +23,6 @@ public:
|
|||||||
vector<vector<float>>& getX();
|
vector<vector<float>>& getX();
|
||||||
vector<int>& getY();
|
vector<int>& getY();
|
||||||
vector<tuple<string, string>> getAttributes();
|
vector<tuple<string, string>> getAttributes();
|
||||||
vector<int> factorize(const vector<string>& labels);
|
vector<int> factorize(const vector<string>& labels_t);
|
||||||
};
|
};
|
||||||
#endif
|
#endif
|
@@ -8,12 +8,14 @@ from ..mdlp import FImdlp
|
|||||||
class FImdlpTest(unittest.TestCase):
|
class FImdlpTest(unittest.TestCase):
|
||||||
def test_init(self):
|
def test_init(self):
|
||||||
clf = FImdlp()
|
clf = FImdlp()
|
||||||
self.assertTrue(clf.proposal)
|
self.assertEqual(-1, clf.n_jobs)
|
||||||
clf = FImdlp(proposal=False)
|
|
||||||
self.assertFalse(clf.proposal)
|
self.assertFalse(clf.proposal)
|
||||||
|
clf = FImdlp(proposal=True, n_jobs=7)
|
||||||
|
self.assertTrue(clf.proposal)
|
||||||
|
self.assertEqual(7, clf.n_jobs)
|
||||||
|
|
||||||
def test_fit(self):
|
def test_fit_proposal(self):
|
||||||
clf = FImdlp()
|
clf = FImdlp(proposal=True)
|
||||||
clf.fit([[1, 2], [3, 4]], [1, 2])
|
clf.fit([[1, 2], [3, 4]], [1, 2])
|
||||||
self.assertEqual(clf.n_features_, 2)
|
self.assertEqual(clf.n_features_, 2)
|
||||||
self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
|
self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
|
||||||
@@ -25,10 +27,39 @@ class FImdlpTest(unittest.TestCase):
|
|||||||
self.assertTrue(np.array_equal(X, clf.X_))
|
self.assertTrue(np.array_equal(X, clf.X_))
|
||||||
self.assertTrue(np.array_equal(y, clf.y_))
|
self.assertTrue(np.array_equal(y, clf.y_))
|
||||||
expected = [
|
expected = [
|
||||||
[4.900000095367432, 5.0, 5.099999904632568, 5.400000095367432],
|
[
|
||||||
[2.6999998092651367, 2.9000000953674316],
|
4.900000095367432,
|
||||||
[2.3499999046325684, 4.5],
|
5.0,
|
||||||
[0.75, 1.399999976158142, 1.5],
|
5.099999904632568,
|
||||||
|
5.400000095367432,
|
||||||
|
5.699999809265137,
|
||||||
|
],
|
||||||
|
[2.6999998092651367, 2.9000000953674316, 3.1999998092651367],
|
||||||
|
[2.3499999046325684, 4.5, 4.800000190734863],
|
||||||
|
[0.75, 1.399999976158142, 1.5, 1.7000000476837158],
|
||||||
|
]
|
||||||
|
self.assertListEqual(expected, clf.get_cut_points())
|
||||||
|
self.assertListEqual([0, 1, 2, 3], clf.features_)
|
||||||
|
clf.fit(X, y, features=[0, 2, 3])
|
||||||
|
self.assertListEqual([0, 2, 3], clf.features_)
|
||||||
|
|
||||||
|
def test_fit_original(self):
|
||||||
|
clf = FImdlp(proposal=False)
|
||||||
|
clf.fit([[1, 2], [3, 4]], [1, 2])
|
||||||
|
self.assertEqual(clf.n_features_, 2)
|
||||||
|
self.assertListEqual(clf.X_.tolist(), [[1, 2], [3, 4]])
|
||||||
|
self.assertListEqual(clf.y_.tolist(), [1, 2])
|
||||||
|
self.assertListEqual([[], []], clf.get_cut_points())
|
||||||
|
X, y = load_iris(return_X_y=True)
|
||||||
|
clf.fit(X, y)
|
||||||
|
self.assertEqual(clf.n_features_, 4)
|
||||||
|
self.assertTrue(np.array_equal(X, clf.X_))
|
||||||
|
self.assertTrue(np.array_equal(y, clf.y_))
|
||||||
|
expected = [
|
||||||
|
[5.5, 5.800000190734863],
|
||||||
|
[3.0999999046325684],
|
||||||
|
[2.450000047683716, 4.800000190734863, 5.099999904632568],
|
||||||
|
[0.800000011920929, 1.7000000476837158],
|
||||||
]
|
]
|
||||||
self.assertListEqual(expected, clf.get_cut_points())
|
self.assertListEqual(expected, clf.get_cut_points())
|
||||||
self.assertListEqual([0, 1, 2, 3], clf.features_)
|
self.assertListEqual([0, 1, 2, 3], clf.features_)
|
||||||
@@ -44,8 +75,38 @@ class FImdlpTest(unittest.TestCase):
|
|||||||
with self.assertRaises(ValueError):
|
with self.assertRaises(ValueError):
|
||||||
clf.fit([[1, 2], [3, 4]], [1, 2], unexpected="class_name")
|
clf.fit([[1, 2], [3, 4]], [1, 2], unexpected="class_name")
|
||||||
|
|
||||||
def test_transform(self):
|
def test_transform_original(self):
|
||||||
clf = FImdlp()
|
clf = FImdlp(proposal=False)
|
||||||
|
clf.fit([[1, 2], [3, 4]], [1, 2])
|
||||||
|
self.assertEqual(
|
||||||
|
clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [0, 0]]
|
||||||
|
)
|
||||||
|
X, y = load_iris(return_X_y=True)
|
||||||
|
clf.fit(X, y)
|
||||||
|
self.assertEqual(clf.n_features_, 4)
|
||||||
|
self.assertTrue(np.array_equal(X, clf.X_))
|
||||||
|
self.assertTrue(np.array_equal(y, clf.y_))
|
||||||
|
self.assertListEqual(
|
||||||
|
clf.transform(X).tolist(), clf.fit(X, y).transform(X).tolist()
|
||||||
|
)
|
||||||
|
expected = [
|
||||||
|
[0, 0, 1, 1],
|
||||||
|
[2, 0, 1, 1],
|
||||||
|
[1, 0, 1, 1],
|
||||||
|
[0, 0, 1, 1],
|
||||||
|
[1, 0, 1, 1],
|
||||||
|
[1, 0, 1, 1],
|
||||||
|
[1, 0, 1, 1],
|
||||||
|
]
|
||||||
|
self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
clf.transform([[1, 2, 3], [4, 5, 6]])
|
||||||
|
with self.assertRaises(sklearn.exceptions.NotFittedError):
|
||||||
|
clf = FImdlp(proposal=False)
|
||||||
|
clf.transform([[1, 2], [3, 4]])
|
||||||
|
|
||||||
|
def test_transform_proposal(self):
|
||||||
|
clf = FImdlp(proposal=True)
|
||||||
clf.fit([[1, 2], [3, 4]], [1, 2])
|
clf.fit([[1, 2], [3, 4]], [1, 2])
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [0, 0]]
|
clf.transform([[1, 2], [3, 4]]).tolist(), [[0, 0], [0, 0]]
|
||||||
@@ -60,16 +121,16 @@ class FImdlpTest(unittest.TestCase):
|
|||||||
)
|
)
|
||||||
expected = [
|
expected = [
|
||||||
[4, 0, 1, 1],
|
[4, 0, 1, 1],
|
||||||
[4, 2, 2, 2],
|
[5, 2, 2, 2],
|
||||||
[4, 0, 1, 1],
|
[5, 0, 1, 1],
|
||||||
[1, 0, 1, 1],
|
[1, 0, 1, 1],
|
||||||
[4, 1, 1, 1],
|
[4, 1, 1, 1],
|
||||||
[4, 2, 1, 1],
|
[5, 2, 1, 1],
|
||||||
[4, 1, 1, 1],
|
[5, 1, 1, 1],
|
||||||
]
|
]
|
||||||
self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
|
self.assertTrue(np.array_equal(clf.transform(X[90:97]), expected))
|
||||||
with self.assertRaises(ValueError):
|
with self.assertRaises(ValueError):
|
||||||
clf.transform([[1, 2, 3], [4, 5, 6]])
|
clf.transform([[1, 2, 3], [4, 5, 6]])
|
||||||
with self.assertRaises(sklearn.exceptions.NotFittedError):
|
with self.assertRaises(sklearn.exceptions.NotFittedError):
|
||||||
clf = FImdlp()
|
clf = FImdlp(proposal=True)
|
||||||
clf.transform([[1, 2], [3, 4]])
|
clf.transform([[1, 2], [3, 4]])
|
||||||
|
@@ -6,8 +6,8 @@
|
|||||||
using namespace std;
|
using namespace std;
|
||||||
namespace mdlp {
|
namespace mdlp {
|
||||||
typedef float precision_t;
|
typedef float precision_t;
|
||||||
typedef vector<precision_t> samples;
|
typedef vector<precision_t> samples_t;
|
||||||
typedef vector<int> labels;
|
typedef vector<int> labels_t;
|
||||||
typedef vector<size_t> indices_t;
|
typedef vector<size_t> indices_t;
|
||||||
typedef vector<precision_t> cutPoints_t;
|
typedef vector<precision_t> cutPoints_t;
|
||||||
typedef map<tuple<int, int>, precision_t> cacheEnt_t;
|
typedef map<tuple<int, int>, precision_t> cacheEnt_t;
|
||||||
|
37
sample.py
37
sample.py
@@ -1,37 +0,0 @@
|
|||||||
from fimdlp.mdlp import FImdlp
|
|
||||||
from fimdlp.cppfimdlp import CFImdlp
|
|
||||||
from sklearn.ensemble import RandomForestClassifier
|
|
||||||
import time
|
|
||||||
|
|
||||||
from scipy.io import arff
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
path = "fimdlp/testcpp/datasets/"
|
|
||||||
# class_name = "speaker"
|
|
||||||
# file_name = "kdd_JapaneseVowels.arff"
|
|
||||||
class_name = "class"
|
|
||||||
# file_name = "mfeat-factors.arff"
|
|
||||||
file_name = "letter.arff"
|
|
||||||
data = arff.loadarff(path + file_name)
|
|
||||||
df = pd.DataFrame(data[0])
|
|
||||||
df.dropna(axis=0, how="any", inplace=True)
|
|
||||||
dataset = df
|
|
||||||
X = df.drop(class_name, axis=1)
|
|
||||||
features = X.columns
|
|
||||||
class_name = class_name
|
|
||||||
y, _ = pd.factorize(df[class_name])
|
|
||||||
X = X.to_numpy()
|
|
||||||
|
|
||||||
test = FImdlp()
|
|
||||||
now = time.time()
|
|
||||||
# test.fit(X, y, features=[i for i in (range(3, 14))])
|
|
||||||
test.fit(X, y)
|
|
||||||
fit_time = time.time()
|
|
||||||
print("Fitting: ", fit_time - now)
|
|
||||||
now = time.time()
|
|
||||||
Xt = test.transform(X)
|
|
||||||
print("Transforming: ", time.time() - now)
|
|
||||||
print(test.get_cut_points())
|
|
||||||
|
|
||||||
clf = RandomForestClassifier(random_state=0)
|
|
||||||
print(clf.fit(Xt, y).score(Xt, y))
|
|
@@ -101,13 +101,13 @@ string ArffFiles::trim(const string& source)
|
|||||||
s.erase(s.find_last_not_of(" \n\r\t") + 1);
|
s.erase(s.find_last_not_of(" \n\r\t") + 1);
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
vector<int> ArffFiles::factorize(const vector<string>& labels)
|
vector<int> ArffFiles::factorize(const vector<string>& labels_t)
|
||||||
{
|
{
|
||||||
vector<int> yy;
|
vector<int> yy;
|
||||||
yy.reserve(labels.size());
|
yy.reserve(labels_t.size());
|
||||||
map<string, int> labelMap;
|
map<string, int> labelMap;
|
||||||
int i = 0;
|
int i = 0;
|
||||||
for (string label : labels) {
|
for (string label : labels_t) {
|
||||||
if (labelMap.find(label) == labelMap.end()) {
|
if (labelMap.find(label) == labelMap.end()) {
|
||||||
labelMap[label] = i++;
|
labelMap[label] = i++;
|
||||||
}
|
}
|
@@ -23,6 +23,6 @@ public:
|
|||||||
vector<vector<float>>& getX();
|
vector<vector<float>>& getX();
|
||||||
vector<int>& getY();
|
vector<int>& getY();
|
||||||
vector<tuple<string, string>> getAttributes();
|
vector<tuple<string, string>> getAttributes();
|
||||||
vector<int> factorize(const vector<string>& labels);
|
vector<int> factorize(const vector<string>& labels_t);
|
||||||
};
|
};
|
||||||
#endif
|
#endif
|
6
samples/CMakeLists.txt
Normal file
6
samples/CMakeLists.txt
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
cmake_minimum_required(VERSION 3.24)
|
||||||
|
project(main)
|
||||||
|
|
||||||
|
set(CMAKE_CXX_STANDARD 17)
|
||||||
|
|
||||||
|
add_executable(sample sample.cpp ArffFiles.cpp ../fimdlp/Metrics.cpp ../fimdlp/CPPFImdlp.cpp)
|
@@ -2,7 +2,7 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <iomanip>
|
#include <iomanip>
|
||||||
#include "../CPPFImdlp.h"
|
#include "../fimdlp/CPPFImdlp.h"
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
@@ -10,7 +10,7 @@ int main(int argc, char** argv)
|
|||||||
{
|
{
|
||||||
ArffFiles file;
|
ArffFiles file;
|
||||||
vector<string> lines;
|
vector<string> lines;
|
||||||
string path = "/Users/rmontanana/Code/FImdlp/fimdlp/testcpp/datasets/";
|
string path = "../fimdlp/testcpp/datasets/";
|
||||||
map<string, bool > datasets = {
|
map<string, bool > datasets = {
|
||||||
{"mfeat-factors", true},
|
{"mfeat-factors", true},
|
||||||
{"iris", true},
|
{"iris", true},
|
||||||
@@ -41,7 +41,7 @@ int main(int argc, char** argv)
|
|||||||
}
|
}
|
||||||
cout << y[i] << endl;
|
cout << y[i] << endl;
|
||||||
}
|
}
|
||||||
mdlp::CPPFImdlp test = mdlp::CPPFImdlp();
|
mdlp::CPPFImdlp test = mdlp::CPPFImdlp(false);
|
||||||
for (auto i = 0; i < attributes.size(); i++) {
|
for (auto i = 0; i < attributes.size(); i++) {
|
||||||
cout << "Cut points for " << get<0>(attributes[i]) << endl;
|
cout << "Cut points for " << get<0>(attributes[i]) << endl;
|
||||||
cout << "--------------------------" << setprecision(3) << endl;
|
cout << "--------------------------" << setprecision(3) << endl;
|
44
samples/sample.py
Normal file
44
samples/sample.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
import time
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
from scipy.io import arff
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.ensemble import RandomForestClassifier
|
||||||
|
from fimdlp.mdlp import FImdlp
|
||||||
|
|
||||||
|
datasets = {
|
||||||
|
"mfeat-factors": True,
|
||||||
|
"iris": True,
|
||||||
|
"letter": True,
|
||||||
|
"kdd_JapaneseVowels": False,
|
||||||
|
}
|
||||||
|
|
||||||
|
ap = argparse.ArgumentParser()
|
||||||
|
ap.add_argument("--proposal", action="store_true")
|
||||||
|
ap.add_argument("--original", dest="proposal", action="store_false")
|
||||||
|
ap.add_argument("dataset", type=str, choices=datasets.keys())
|
||||||
|
args = ap.parse_args()
|
||||||
|
relative = "" if os.path.isdir("fimdlp") else ".."
|
||||||
|
file_name = os.path.join(
|
||||||
|
relative, "fimdlp", "testcpp", "datasets", args.dataset
|
||||||
|
)
|
||||||
|
data = arff.loadarff(file_name + ".arff")
|
||||||
|
df = pd.DataFrame(data[0])
|
||||||
|
class_column = -1 if datasets[args.dataset] else 0
|
||||||
|
class_name = df.columns.to_list()[class_column]
|
||||||
|
X = df.drop(class_name, axis=1)
|
||||||
|
y, _ = pd.factorize(df[class_name])
|
||||||
|
X = X.to_numpy()
|
||||||
|
test = FImdlp(proposal=args.proposal)
|
||||||
|
now = time.time()
|
||||||
|
test.fit(X, y)
|
||||||
|
fit_time = time.time()
|
||||||
|
print("Fitting: ", fit_time - now)
|
||||||
|
now = time.time()
|
||||||
|
Xt = test.transform(X)
|
||||||
|
print("Transforming: ", time.time() - now)
|
||||||
|
print(test.get_cut_points())
|
||||||
|
clf = RandomForestClassifier(random_state=0)
|
||||||
|
print(
|
||||||
|
"Random Forest score with discretized data: ", clf.fit(Xt, y).score(Xt, y)
|
||||||
|
)
|
Reference in New Issue
Block a user