mirror of
https://github.com/Doctorado-ML/bayesclass.git
synced 2025-08-15 23:55:57 +00:00
Checked mutual_info with sklearn
This commit is contained in:
@@ -1,113 +1,121 @@
|
|||||||
#include "FeatureSelect.h"
|
#include "FeatureSelect.h"
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
namespace features {
|
namespace features {
|
||||||
// SelectKBestWeighted::SelectKBestWeighted(samples_t& samples, labels_t& labels, weights_t& weights, int k)
|
SelectKBestWeighted::SelectKBestWeighted(samples_t& samples, labels_t& labels, weights_t& weights, int k, bool nat)
|
||||||
// : samples(samples), labels(labels), weights(weights), k(k)
|
: samples(samples), labels(labels), weights(weights), k(k), nat(nat)
|
||||||
// {
|
{
|
||||||
// // if (samples.size() == 0 || samples[0].size() == 0)
|
if (samples.size() == 0 || samples[0].size() == 0)
|
||||||
// // throw invalid_argument("features must be a non-empty matrix");
|
throw invalid_argument("features must be a non-empty matrix");
|
||||||
// // if (samples.size() != labels.size())
|
if (samples.size() != labels.size())
|
||||||
// // throw invalid_argument("number of samples and labels must be equal");
|
throw invalid_argument("number of samples and labels must be equal");
|
||||||
// // if (samples.size() != weights.size())
|
if (samples.size() != weights.size())
|
||||||
// // throw invalid_argument("number of samples and weights must be equal");
|
throw invalid_argument("number of samples and weights must be equal");
|
||||||
// // if (k < 1 || k > static_cast<int>(samples[0].size()))
|
if (k < 1 || k > static_cast<int>(samples[0].size()))
|
||||||
// // throw invalid_argument("k must be between 1 and number of features");
|
throw invalid_argument("k must be between 1 and number of features");
|
||||||
// numFeatures = 0;
|
numFeatures = 0;
|
||||||
// numClasses = 0;
|
numClasses = 0;
|
||||||
// numSamples = 0;
|
numSamples = 0;
|
||||||
// fitted = false;
|
fitted = false;
|
||||||
// }
|
}
|
||||||
SelectKBestWeighted::SelectKBestWeighted(samples_t& samples) : samples(samples) {}
|
|
||||||
void SelectKBestWeighted::SelectKBestWeighted::fit()
|
void SelectKBestWeighted::SelectKBestWeighted::fit()
|
||||||
{
|
{
|
||||||
// auto labelsCopy = labels;
|
auto labelsCopy = labels;
|
||||||
numFeatures = 0;//samples[0].size();
|
numFeatures = samples[0].size();
|
||||||
numSamples = samples.size();
|
numSamples = samples.size();
|
||||||
// sort(labelsCopy.begin(), labelsCopy.end());
|
sort(labelsCopy.begin(), labelsCopy.end());
|
||||||
// auto last = unique(labelsCopy.begin(), labelsCopy.end());
|
auto last = unique(labelsCopy.begin(), labelsCopy.end());
|
||||||
// labelsCopy.erase(last, labelsCopy.end());
|
labelsCopy.erase(last, labelsCopy.end());
|
||||||
// numClasses = labelsCopy.size();
|
numClasses = labelsCopy.size();
|
||||||
// score.reserve(numFeatures);
|
score.reserve(numFeatures);
|
||||||
// for (int i = 0; i < numFeatures; ++i) {
|
for (int i = 0; i < numFeatures; ++i) {
|
||||||
// score.push_back(MutualInformation(i));
|
score.push_back(MutualInformation(i));
|
||||||
// }
|
}
|
||||||
outputValues();
|
outputValues();
|
||||||
fitted = true;
|
fitted = true;
|
||||||
}
|
}
|
||||||
void SelectKBestWeighted::outputValues()
|
void SelectKBestWeighted::outputValues()
|
||||||
{
|
{
|
||||||
cout << "numFeatures: " << numFeatures << endl;
|
cout << "numFeatures: " << numFeatures << endl;
|
||||||
// cout << "numClasses: " << numClasses << endl;
|
cout << "numClasses: " << numClasses << endl;
|
||||||
cout << "numSamples: " << numSamples << endl;
|
cout << "numSamples: " << numSamples << endl;
|
||||||
// cout << "k: " << k << endl;
|
cout << "k: " << k << endl;
|
||||||
// cout << "weights: ";
|
cout << "weights: ";
|
||||||
// for (auto item : weights)
|
for (auto item : weights)
|
||||||
// cout << item << ", ";
|
|
||||||
// cout << "end." << endl;
|
|
||||||
// cout << "labels: ";
|
|
||||||
// for (auto item : labels)
|
|
||||||
// cout << item << ", ";
|
|
||||||
// cout << "end." << endl;
|
|
||||||
cout << "samples: ";
|
|
||||||
for (auto item : samples) {
|
|
||||||
// for (auto item2 : item)
|
|
||||||
// cout << item2 << ", ";
|
|
||||||
// cout << "end." << endl;
|
|
||||||
cout << item << ", ";
|
cout << item << ", ";
|
||||||
|
cout << "end." << endl;
|
||||||
|
cout << "labels: ";
|
||||||
|
for (auto item : labels)
|
||||||
|
cout << item << ", ";
|
||||||
|
cout << "end." << endl;
|
||||||
|
cout << "samples: " << endl;
|
||||||
|
for (auto item : samples) {
|
||||||
|
for (auto item2 : item)
|
||||||
|
cout << item2 << ", ";
|
||||||
|
cout << "end." << endl;
|
||||||
}
|
}
|
||||||
cout << "end." << endl;
|
cout << "end." << endl;
|
||||||
}
|
}
|
||||||
// precision_t SelectKBestWeighted::entropyLabel()
|
precision_t SelectKBestWeighted::entropyLabel()
|
||||||
// {
|
{
|
||||||
// return entropy(labels);
|
return entropy(labels);
|
||||||
// }
|
}
|
||||||
// precision_t SelectKBestWeighted::entropy(const sample_t& data)
|
precision_t SelectKBestWeighted::entropy(const sample_t& data)
|
||||||
// {
|
{
|
||||||
// precision_t p;
|
precision_t p;
|
||||||
// precision_t ventropy = 0, totalWeight = 0;
|
precision_t ventropy = 0, totalWeight = 0;
|
||||||
// score_t counts(numClasses + 1, 0);
|
score_t counts(numClasses + 1, 0);
|
||||||
// for (auto i = 0; i < data.size(); ++i) {
|
for (auto i = 0; i < data.size(); ++i) {
|
||||||
// counts[data[i]] += weights[i];
|
counts[data[i]] += weights[i];
|
||||||
// totalWeight += weights[i];
|
totalWeight += weights[i];
|
||||||
// }
|
}
|
||||||
// for (auto count : counts) {
|
for (auto count : counts) {
|
||||||
// p = count / totalWeight;
|
p = count / totalWeight;
|
||||||
// ventropy -= p * log2(p);
|
if (p > 0)
|
||||||
// }
|
if (nat)
|
||||||
// return ventropy;
|
ventropy -= p * log(p);
|
||||||
// }
|
else
|
||||||
// // H(Y|X) = sum_{x in X} p(x) H(Y|X=x)
|
ventropy -= p * log2(p);
|
||||||
// precision_t SelectKBestWeighted::conditionalEntropy(const int feature)
|
}
|
||||||
// {
|
return ventropy;
|
||||||
// unordered_map<value_t, precision_t> featureCounts;
|
}
|
||||||
// unordered_map<value_t, unordered_map<value_t, precision_t>> jointCounts;
|
// H(Y|X) = sum_{x in X} p(x) H(Y|X=x)
|
||||||
// featureCounts.clear();
|
precision_t SelectKBestWeighted::conditionalEntropy(const int feature)
|
||||||
// jointCounts.clear();
|
{
|
||||||
// auto totalWeight = 0;
|
unordered_map<value_t, precision_t> featureCounts;
|
||||||
// for (auto i = 0; i < numSamples; i++) {
|
unordered_map<value_t, unordered_map<value_t, precision_t>> jointCounts;
|
||||||
// featureCounts[samples[feature][i]] += weights[i];
|
featureCounts.clear();
|
||||||
// jointCounts[samples[feature][i]][labels[i]] += weights[i];
|
jointCounts.clear();
|
||||||
// totalWeight += weights[i];
|
precision_t totalWeight = 0;
|
||||||
// }
|
for (auto i = 0; i < numSamples; i++) {
|
||||||
// precision_t entropy = 0;
|
featureCounts[samples[i][feature]] += weights[i];
|
||||||
// for (auto& [f, count] : featureCounts) {
|
jointCounts[samples[i][feature]][labels[i]] += weights[i];
|
||||||
// auto p_f = count / totalWeight;
|
totalWeight += weights[i];
|
||||||
// precision_t entropy_f = 0;
|
}
|
||||||
// for (auto& [l, jointCount] : jointCounts[f]) {
|
if (totalWeight == 0)
|
||||||
// auto p_l_f = jointCount / totalWeight;
|
throw invalid_argument("Total weight should not be zero");
|
||||||
// entropy_f -= p_l_f * log2(p_l_f);
|
precision_t entropy = 0;
|
||||||
// }
|
for (auto& [feat, count] : featureCounts) {
|
||||||
// entropy += p_f * entropy_f;
|
auto p_f = count / totalWeight;
|
||||||
// }
|
precision_t entropy_f = 0;
|
||||||
// return entropy;
|
for (auto& [label, jointCount] : jointCounts[feat]) {
|
||||||
// }
|
auto p_l_f = jointCount / count;
|
||||||
|
if (p_l_f > 0) {
|
||||||
// // I(X;Y) = H(Y) - H(Y|X)
|
double epsilon = 1e-9;
|
||||||
// precision_t SelectKBestWeighted::MutualInformation(const int i)
|
if (nat)
|
||||||
// {
|
entropy_f -= p_l_f * log(p_l_f + epsilon);
|
||||||
// // return entropyLabel() - conditionalEntropy(i);
|
else
|
||||||
// return 25 / (i + 1);
|
entropy_f -= p_l_f * log2(p_l_f + epsilon);
|
||||||
// }
|
}
|
||||||
|
}
|
||||||
|
entropy += p_f * entropy_f;
|
||||||
|
}
|
||||||
|
return entropy;
|
||||||
|
}
|
||||||
|
// I(X;Y) = H(Y) - H(Y|X)
|
||||||
|
precision_t SelectKBestWeighted::MutualInformation(const int i)
|
||||||
|
{
|
||||||
|
return entropyLabel() - conditionalEntropy(i);
|
||||||
|
}
|
||||||
score_t SelectKBestWeighted::getScore() const
|
score_t SelectKBestWeighted::getScore() const
|
||||||
{
|
{
|
||||||
if (!fitted)
|
if (!fitted)
|
||||||
|
@@ -5,31 +5,30 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
using namespace std;
|
using namespace std;
|
||||||
namespace features {
|
namespace features {
|
||||||
typedef float precision_t;
|
typedef double precision_t;
|
||||||
typedef int value_t;
|
typedef int value_t;
|
||||||
typedef vector<value_t> sample_t;
|
typedef vector<value_t> sample_t;
|
||||||
// typedef vector<sample_t> samples_t;
|
typedef vector<sample_t> samples_t;
|
||||||
typedef vector<value_t> samples_t;
|
|
||||||
typedef vector<value_t> labels_t;
|
typedef vector<value_t> labels_t;
|
||||||
typedef vector<precision_t> score_t, weights_t;
|
typedef vector<precision_t> score_t, weights_t;
|
||||||
|
|
||||||
class SelectKBestWeighted {
|
class SelectKBestWeighted {
|
||||||
private:
|
private:
|
||||||
samples_t& samples;
|
const samples_t samples;
|
||||||
// const labels_t& labels;
|
const labels_t labels;
|
||||||
// const weights_t& weights;
|
const weights_t weights;
|
||||||
// const int k;
|
const int k;
|
||||||
|
bool nat; // use natural log or log2
|
||||||
int numFeatures, numClasses, numSamples;
|
int numFeatures, numClasses, numSamples;
|
||||||
bool fitted;
|
bool fitted;
|
||||||
score_t score;
|
score_t score;
|
||||||
// precision_t entropyLabel();
|
precision_t entropyLabel();
|
||||||
// precision_t entropy(const sample_t&);
|
precision_t entropy(const sample_t&);
|
||||||
// precision_t conditionalEntropy(const int);
|
precision_t conditionalEntropy(const int);
|
||||||
// precision_t MutualInformation(const int);
|
precision_t MutualInformation(const int);
|
||||||
void outputValues();
|
void outputValues();
|
||||||
public:
|
public:
|
||||||
// SelectKBestWeighted(samples_t&, labels_t&, weights_t&, int);
|
SelectKBestWeighted(samples_t&, labels_t&, weights_t&, int, bool);
|
||||||
SelectKBestWeighted(samples_t&);
|
|
||||||
void fit();
|
void fit();
|
||||||
score_t getScore() const;
|
score_t getScore() const;
|
||||||
static inline string version() { return "0.1.0"; };
|
static inline string version() { return "0.1.0"; };
|
||||||
|
@@ -1,37 +0,0 @@
|
|||||||
#include "FeatureTest.h"
|
|
||||||
#include <iostream>
|
|
||||||
namespace featuresTest {
|
|
||||||
SelectKBest::SelectKBest(vector<int>& samples) : samples(samples) {}
|
|
||||||
SelectKBest::SelectKBest() = default;
|
|
||||||
SelectKBest::~SelectKBest() = default;
|
|
||||||
void SelectKBest::SelectKBest::fit()
|
|
||||||
{
|
|
||||||
numFeatures = 0;
|
|
||||||
numSamples = samples.size();
|
|
||||||
outputValues();
|
|
||||||
fitted = true;
|
|
||||||
}
|
|
||||||
void SelectKBest::outputValues()
|
|
||||||
{
|
|
||||||
cout << "numFeatures: " << numFeatures << endl;
|
|
||||||
// cout << "numClasses: " << numClasses << endl;
|
|
||||||
cout << "numSamples: " << numSamples << endl;
|
|
||||||
// cout << "k: " << k << endl;
|
|
||||||
// cout << "weights: ";
|
|
||||||
// for (auto item : weights)
|
|
||||||
// cout << item << ", ";
|
|
||||||
// cout << "end." << endl;
|
|
||||||
// cout << "labels: ";
|
|
||||||
// for (auto item : labels)
|
|
||||||
// cout << item << ", ";
|
|
||||||
// cout << "end." << endl;
|
|
||||||
cout << "samples: ";
|
|
||||||
for (auto item : samples) {
|
|
||||||
// for (auto item2 : item)
|
|
||||||
// cout << item2 << ", ";
|
|
||||||
// cout << "end." << endl;
|
|
||||||
cout << item << ", ";
|
|
||||||
}
|
|
||||||
cout << "end." << endl;
|
|
||||||
}
|
|
||||||
}
|
|
@@ -1,30 +0,0 @@
|
|||||||
#ifndef SELECT_K_BEST_TEST_H
|
|
||||||
#define SELECT_K_BEST_TEST_H
|
|
||||||
#include <map>
|
|
||||||
#include <vector>
|
|
||||||
#include <string>
|
|
||||||
using namespace std;
|
|
||||||
namespace featuresTest {
|
|
||||||
typedef float precision_t;
|
|
||||||
typedef int value_t;
|
|
||||||
typedef vector<value_t> sample_t;
|
|
||||||
// typedef vector<sample_t> samples_t;
|
|
||||||
typedef vector<value_t> samples_t;
|
|
||||||
typedef vector<value_t> labels_t;
|
|
||||||
typedef vector<precision_t> score_t, weights_t;
|
|
||||||
|
|
||||||
class SelectKBest {
|
|
||||||
private:
|
|
||||||
vector<int>& samples;
|
|
||||||
int numFeatures, numClasses, numSamples;
|
|
||||||
bool fitted;
|
|
||||||
void outputValues();
|
|
||||||
public:
|
|
||||||
SelectKBest();
|
|
||||||
SelectKBest(vector<int>&);
|
|
||||||
~SelectKBest();
|
|
||||||
void fit();
|
|
||||||
static inline string version() { return "0.1.0"; };
|
|
||||||
};
|
|
||||||
}
|
|
||||||
#endif
|
|
File diff suppressed because it is too large
Load Diff
@@ -2,50 +2,29 @@
|
|||||||
# cython: language_level = 3
|
# cython: language_level = 3
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
from libcpp.string cimport string
|
from libcpp.string cimport string
|
||||||
|
from libcpp cimport bool
|
||||||
|
|
||||||
cdef extern from "FeatureTest.h" namespace "featuresTest":
|
|
||||||
ctypedef float precision_t
|
cdef extern from "FeatureSelect.h" namespace "features":
|
||||||
cdef cppclass SelectKBest:
|
ctypedef double precision_t
|
||||||
SelectKBest(vector[int]&) except +
|
cdef cppclass SelectKBestWeighted:
|
||||||
|
SelectKBestWeighted(vector[vector[int]]&, vector[int]&, vector[precision_t]&, int, bool) except +
|
||||||
void fit()
|
void fit()
|
||||||
string version()
|
string version()
|
||||||
|
vector[precision_t] getScore()
|
||||||
|
|
||||||
cdef class CSelectKBest:
|
cdef class CSelectKBestWeighted:
|
||||||
cdef SelectKBest *thisptr
|
cdef SelectKBestWeighted *thisptr
|
||||||
def __cinit__(self, X):
|
def __cinit__(self, X, y, weights, k, natural=False): # log or log2
|
||||||
self.thisptr = new SelectKBest(X)
|
self.thisptr = new SelectKBestWeighted(X, y, weights, k, natural)
|
||||||
def __dealloc__(self):
|
def __dealloc__(self):
|
||||||
del self.thisptr
|
del self.thisptr
|
||||||
def fit(self,):
|
def fit(self,):
|
||||||
self.thisptr.fit()
|
self.thisptr.fit()
|
||||||
return self
|
return self
|
||||||
|
def get_score(self):
|
||||||
|
return self.thisptr.getScore()
|
||||||
def get_version(self):
|
def get_version(self):
|
||||||
return self.thisptr.version()
|
return self.thisptr.version()
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (CSelectKBest, ())
|
return (CSelectKBestWeighted, ())
|
||||||
|
|
||||||
# cdef extern from "FeatureSelect.h" namespace "features":
|
|
||||||
# ctypedef float precision_t
|
|
||||||
# cdef cppclass SelectKBestWeighted:
|
|
||||||
# SelectKBestWeighted(vector[int]&) except +
|
|
||||||
# # SelectKBestWeighted(vector[int]&, vector[int]&, vector[precision_t]&, int) except +
|
|
||||||
# void fit()
|
|
||||||
# string version()
|
|
||||||
# vector[precision_t] getScore()
|
|
||||||
|
|
||||||
# cdef class CSelectKBestWeighted:
|
|
||||||
# cdef SelectKBestWeighted *thisptr
|
|
||||||
# def __cinit__(self, X, y, weights, k):
|
|
||||||
# # self.thisptr = new SelectKBestWeighted(X, y, weights, k)
|
|
||||||
# self.thisptr = new SelectKBestWeighted(X)
|
|
||||||
# def __dealloc__(self):
|
|
||||||
# del self.thisptr
|
|
||||||
# def fit(self,):
|
|
||||||
# self.thisptr.fit()
|
|
||||||
# return self
|
|
||||||
# def get_score(self):
|
|
||||||
# return self.thisptr.getScore()
|
|
||||||
# def get_version(self):
|
|
||||||
# return self.thisptr.version()
|
|
||||||
# def __reduce__(self):
|
|
||||||
# return (CSelectKBestWeighted, ())
|
|
||||||
|
10
test.py
Normal file
10
test.py
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
from bayesclass.cppSelectFeatures import CSelectKBestWeighted
|
||||||
|
|
||||||
|
|
||||||
|
X = [[x for x in range(i, i + 3)] for i in range(1, 30, 3)]
|
||||||
|
weights = [25 / (i + 1) for i in range(10)]
|
||||||
|
labels = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
|
||||||
|
test = CSelectKBestWeighted(X, labels, weights, 3)
|
||||||
|
test.fit()
|
||||||
|
for item in test.get_score():
|
||||||
|
print(item)
|
Reference in New Issue
Block a user