Add features to discretize

This commit is contained in:
2022-12-05 23:11:45 +01:00
parent 00453f628b
commit 7c25c33409
5 changed files with 10265 additions and 8 deletions

1
.gitignore vendored
View File

@@ -133,3 +133,4 @@ cfimdlp.cpp
cmake-build-debug
cmake-build-debug/**
**/lcoverage/**
**/x/*

10177
fimdlp/kdd_JapaneseVowels.arff Executable file

File diff suppressed because it is too large Load Diff

37
fimdlp/main.cpp Normal file
View File

@@ -0,0 +1,37 @@
#include "CPPFImdlp.h"
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <sstream>
using namespace std;
using namespace mdlp;
// Echo the comma-separated rows of kdd_JapaneseVowels.arff to stdout,
// one space-separated row per output line.
//
// Returns 1 if the file cannot be opened, 0 otherwise.
int main()
{
    // Lines with zero-based index 0..215 are skipped without being parsed
    // (presumably the ARFF header -- confirm against the data file).
    const int lastSkippedLine = 215;

    std::ifstream input("kdd_JapaneseVowels.arff");
    if (!input.is_open()) {
        std::cout << "Error opening file" << std::endl;
        return 1;
    }

    // Fields of the row currently being processed; reset for every row.
    std::vector<std::string> fields;
    std::string record;
    std::string token;
    for (int lineIndex = 0; std::getline(input, record); ++lineIndex) {
        if (lineIndex <= lastSkippedLine) {
            continue;
        }
        fields.clear();
        // Split the line on commas, collecting and printing each field.
        std::stringstream splitter(record);
        while (std::getline(splitter, token, ',')) {
            fields.push_back(token);
            std::cout << token << " ";
        }
        std::cout << std::endl;
    }
    input.close();
    return 0;
}

View File

@@ -20,6 +20,18 @@ class FImdlp(TransformerMixin, BaseEstimator):
----------
n_features_ : int
The number of features of the data passed to :meth:`fit`.
discretizer_ : list
The list of discretizers for each feature.
cut_points_ : list
The list of cut points for each feature.
X_ : array
the samples used to fit, shape (n_samples, n_features)
y_ : array
the labels used to fit, shape (n_samples,)
discretized_X_ : array
the discretized samples passed to fit, shape (n_samples, n_features)
features_ : list
the list of features to be discretized
"""
def _check_params_fit(self, X, y, expected_args, kwargs):
@@ -30,17 +42,23 @@ class FImdlp(TransformerMixin, BaseEstimator):
self.classes_ = unique_labels(y)
self.n_classes_ = self.classes_.shape[0]
# Default values
self.class_name_ = "class"
self.features_ = [f"feature_{i}" for i in range(X.shape[1])]
self.features_ = [i for i in range(X.shape[1])]
for key, value in kwargs.items():
if key in expected_args:
setattr(self, f"{key}_", value)
else:
raise ValueError(f"Unexpected argument: {key}")
if len(self.features_) != X.shape[1]:
if len(self.features_) > X.shape[1]:
raise ValueError(
"Number of features does not match the number of columns in X"
)
if type(self.features_) != list:
raise ValueError("features must be a list")
self.features_.sort()
if list(set(self.features_)) != self.features_:
raise ValueError("Features must be unique")
if max(self.features_) >= X.shape[1]:
raise ValueError("Feature index out of range")
return X, y
def fit(self, X, y, **kwargs):
@@ -58,7 +76,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
Returns self.
"""
X, y = self._check_params_fit(
X, y, expected_args=["class_name", "features"], kwargs=kwargs
X, y, expected_args=["features"], kwargs=kwargs
)
self.n_features_ = X.shape[1]
self.X_ = X
@@ -66,7 +84,7 @@ class FImdlp(TransformerMixin, BaseEstimator):
self.discretizer_ = [None] * self.n_features_
self.cut_points_ = [None] * self.n_features_
# Can do it in parallel
for feature in range(self.n_features_):
for feature in self.features_:
self.discretizer_[feature] = CFImdlp(proposal=self.proposal)
self.discretizer_[feature].fit(X[:, feature], y)
self.cut_points_[feature] = self.discretizer_[
@@ -74,6 +92,26 @@ class FImdlp(TransformerMixin, BaseEstimator):
].get_cut_points()
return self
def get_fitted(self):
    """Return the discretized version of the X stored by fit.

    Columns listed in ``features_`` are replaced by the values reported
    by their fitted discretizer; every other column is copied from the
    stored ``X_`` (and therefore cast to the int32 dtype of the result).

    Returns
    -------
    X_transformed : array, shape (n_samples, n_features)
        discretized X computed during fit.
    """
    # Check that fit has been called
    check_is_fitted(self, "n_features_")
    # Start from an int32 array filled with -1, same shape as X_
    fitted = np.full_like(self.X_, -1, dtype=np.int32)
    for column in range(self.n_features_):
        if column not in self.features_:
            # Not selected for discretization: pass the column through
            fitted[:, column] = self.X_[:, column]
            continue
        fitted[:, column] = self.discretizer_[column].get_discretized_values()
    return fitted
def transform(self, X):
"""Discretize X values.
Parameters
@@ -100,9 +138,12 @@ class FImdlp(TransformerMixin, BaseEstimator):
result = np.zeros_like(X, dtype=np.int32) - 1
# Can do it in parallel
for feature in range(self.n_features_):
if feature in self.features_:
result[:, feature] = np.searchsorted(
self.cut_points_[feature], X[:, feature]
)
else:
result[:, feature] = X[:, feature]
return result
def get_cut_points(self):

View File

@@ -126,3 +126,4 @@ for proposal in [True, False]:
# indices2 = np.argsort(X)
# Xs = np.array(X)[indices2]
# ys = np.array(y)[indices2]
kdd_JapaneseVowels