diff --git a/bayesclass/CMakeLists.txt b/bayesclass/CMakeLists.txt deleted file mode 100644 index 6af7a67..0000000 --- a/bayesclass/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -cmake_minimum_required(VERSION 3.20) -project(feature) - -set(CMAKE_CXX_STANDARD 11) -set(CMAKE_BUILD_TYPE Debug) - -add_executable(feature FeatureSelect.cpp) diff --git a/bayesclass/FeatureSelect.cpp b/bayesclass/FeatureSelect.cpp index e85875a..9eab199 100644 --- a/bayesclass/FeatureSelect.cpp +++ b/bayesclass/FeatureSelect.cpp @@ -1,5 +1,4 @@ #include "FeatureSelect.h" -#include namespace features { SelectKBestWeighted::SelectKBestWeighted(samples_t& samples, labels_t& labels, weights_t& weights, int k, bool nat) : samples(samples), labels(labels), weights(weights), k(k), nat(nat) @@ -22,59 +21,46 @@ namespace features { auto labelsCopy = labels; numFeatures = samples[0].size(); numSamples = samples.size(); + // compute number of classes sort(labelsCopy.begin(), labelsCopy.end()); auto last = unique(labelsCopy.begin(), labelsCopy.end()); labelsCopy.erase(last, labelsCopy.end()); numClasses = labelsCopy.size(); - score.reserve(numFeatures); + // compute scores + scores.reserve(numFeatures); for (int i = 0; i < numFeatures; ++i) { - score.push_back(MutualInformation(i)); + scores.push_back(MutualInformation(i)); + features.push_back(i); } - outputValues(); + // sort & reduce scores and features + sort(features.begin(), features.end(), [&](int i, int j) + { return scores[i] > scores[j]; }); + sort(scores.begin(), scores.end(), greater()); + features.resize(k); + scores.resize(k); fitted = true; } - void SelectKBestWeighted::outputValues() - { - cout << "numFeatures: " << numFeatures << endl; - cout << "numClasses: " << numClasses << endl; - cout << "numSamples: " << numSamples << endl; - cout << "k: " << k << endl; - cout << "weights: "; - for (auto item : weights) - cout << item << ", "; - cout << "end." << endl; - cout << "labels: "; - for (auto item : labels) - cout << item << ", "; - cout << "end." << endl; - cout << "samples: " << endl; - for (auto item : samples) { - for (auto item2 : item) - cout << item2 << ", "; - cout << "end." << endl; - } - cout << "end." << endl; - } precision_t SelectKBestWeighted::entropyLabel() { return entropy(labels); } precision_t SelectKBestWeighted::entropy(const sample_t& data) { - precision_t p; precision_t ventropy = 0, totalWeight = 0; score_t counts(numClasses + 1, 0); - for (auto i = 0; i < data.size(); ++i) { + for (auto i = 0; i < static_cast(data.size()); ++i) { counts[data[i]] += weights[i]; totalWeight += weights[i]; } for (auto count : counts) { - p = count / totalWeight; - if (p > 0) - if (nat) + precision_t p = count / totalWeight; + if (p > 0) { + if (nat) { ventropy -= p * log(p); - else + } else { ventropy -= p * log2(p); + } + } } return ventropy; } @@ -100,10 +86,11 @@ namespace features { for (auto& [label, jointCount] : jointCounts[feat]) { auto p_l_f = jointCount / count; if (p_l_f > 0) { - if (nat) + if (nat) { entropy_f -= p_l_f * log(p_l_f); - else + } else { entropy_f -= p_l_f * log2(p_l_f); + } } } entropy += p_f * entropy_f; @@ -115,27 +102,17 @@ namespace features { { return entropyLabel() - conditionalEntropy(i); } - score_t SelectKBestWeighted::getScore() const + score_t SelectKBestWeighted::getScores() const { if (!fitted) throw logic_error("score not fitted"); - return score; + return scores; + } + //Return the indices of the selected features + labels_t SelectKBestWeighted::getFeatures() const + { + if (!fitted) + throw logic_error("score not fitted"); + return features; } } - -// using namespace std; - -// int main() -// { -// vector> samples = { {1, 2, 3}, {4, 5, 6}, {7, 8, 9} }; -// vector labels = { 1, 2, 1 }; -// vector weights = { 0.1, 0.7, 0.2 }; -// int k = 3; -// auto metric = features::SelectKBestWeighted(samples, labels, weights, k); -// metric.fit(); -// cout << "score: "; -// for (auto item : metric.getScore()) -// cout << item << ", "; -// cout << "end." << endl; -// return 0; -// } diff --git a/bayesclass/FeatureSelect.h b/bayesclass/FeatureSelect.h index 8737065..b26fb10 100644 --- a/bayesclass/FeatureSelect.h +++ b/bayesclass/FeatureSelect.h @@ -21,16 +21,17 @@ namespace features { bool nat; // use natural log or log2 int numFeatures, numClasses, numSamples; bool fitted; - score_t score; + score_t scores; // scores of the features + labels_t features; // indices of the selected features precision_t entropyLabel(); precision_t entropy(const sample_t&); precision_t conditionalEntropy(const int); precision_t MutualInformation(const int); - void outputValues(); public: SelectKBestWeighted(samples_t&, labels_t&, weights_t&, int, bool); void fit(); - score_t getScore() const; + score_t getScores() const; + labels_t getFeatures() const; //Return the indices of the selected features static inline string version() { return "0.1.0"; }; }; } diff --git a/bayesclass/cSelectFeatures.c b/bayesclass/cSelectFeatures.c deleted file mode 100644 index 06f2230..0000000 --- a/bayesclass/cSelectFeatures.c +++ /dev/null @@ -1 +0,0 @@ -#error Do not use this file, it is the result of a failed Cython compilation. diff --git a/bayesclass/cSelectFeatures.cpp b/bayesclass/cSelectFeatures.cpp index bc173f1..877f744 100644 --- a/bayesclass/cSelectFeatures.cpp +++ b/bayesclass/cSelectFeatures.cpp @@ -983,8 +983,8 @@ static const char *__pyx_f[] = { /*--- Type declarations ---*/ struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted; -/* "bayesclass/cSelectFeatures.pyx":16 - * vector[precision_t] getScore() +/* "bayesclass/cSelectFeatures.pyx":17 + * vector[int] getFeatures() * * cdef class CSelectKBestWeighted: # <<<<<<<<<<<<<< * cdef SelectKBestWeighted *thisptr @@ -1251,6 +1251,9 @@ static void __Pyx_CppExn2PyErr() { /* CIntFromPy.proto */ static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *); +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value); + /* CIntFromPy.proto */ static CYTHON_INLINE size_t __Pyx_PyInt_As_size_t(PyObject *); @@ -1294,6 +1297,7 @@ static std::vector __pyx_convert_vector_from_py_int(PyObject *); /*proto*/ static std::vector > __pyx_convert_vector_from_py_std_3a__3a_vector_3c_int_3e___(PyObject *); /*proto*/ static std::vector __pyx_convert_vector_from_py_features_3a__3a_precision_t(PyObject *); /*proto*/ static PyObject *__pyx_convert_vector_to_py_features_3a__3a_precision_t(const std::vector &); /*proto*/ +static PyObject *__pyx_convert_vector_to_py_int(const std::vector &); /*proto*/ static CYTHON_INLINE PyObject *__pyx_convert_PyObject_string_to_py_std__in_string(std::string const &); /*proto*/ static CYTHON_INLINE PyObject *__pyx_convert_PyUnicode_string_to_py_std__in_string(std::string const &); /*proto*/ static CYTHON_INLINE PyObject *__pyx_convert_PyStr_string_to_py_std__in_string(std::string const &); /*proto*/ @@ -1330,13 +1334,14 @@ static PyObject *__pyx_n_s_y; static int __pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted___cinit__(struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *__pyx_v_self, PyObject *__pyx_v_X, PyObject *__pyx_v_y, PyObject *__pyx_v_weights, PyObject *__pyx_v_k, PyObject *__pyx_v_natural); /* proto */ static void __pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_2__dealloc__(struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_4fit(struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *__pyx_v_self); /* proto */ -static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_6get_score(struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *__pyx_v_self); /* proto */ -static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_8get_version(struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *__pyx_v_self); /* proto */ -static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_10__reduce__(CYTHON_UNUSED struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_6get_scores(struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_8get_features(struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_10get_version(struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_12__reduce__(CYTHON_UNUSED struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *__pyx_v_self); /* proto */ static PyObject *__pyx_tp_new_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/ /* Late includes */ -/* "bayesclass/cSelectFeatures.pyx":18 +/* "bayesclass/cSelectFeatures.pyx":19 * cdef class CSelectKBestWeighted: * cdef SelectKBestWeighted *thisptr * def __cinit__(self, X, y, weights, k, natural=False): # log or log2 # <<<<<<<<<<<<<< @@ -1388,19 +1393,19 @@ static int __pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_1__c case 1: if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_y)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 4, 5, 1); __PYX_ERR(0, 18, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 4, 5, 1); __PYX_ERR(0, 19, __pyx_L3_error) } CYTHON_FALLTHROUGH; case 2: if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_weights)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 4, 5, 2); __PYX_ERR(0, 18, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 4, 5, 2); __PYX_ERR(0, 19, __pyx_L3_error) } CYTHON_FALLTHROUGH; case 3: if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_k)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 4, 5, 3); __PYX_ERR(0, 18, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 4, 5, 3); __PYX_ERR(0, 19, __pyx_L3_error) } CYTHON_FALLTHROUGH; case 4: @@ -1410,7 +1415,7 @@ static int __pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_1__c } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) __PYX_ERR(0, 18, __pyx_L3_error) + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) __PYX_ERR(0, 19, __pyx_L3_error) } } else { switch (PyTuple_GET_SIZE(__pyx_args)) { @@ -1432,7 +1437,7 @@ static int __pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_1__c } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 4, 5, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 18, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 4, 5, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 19, __pyx_L3_error) __pyx_L3_error:; __Pyx_AddTraceback("bayesclass.cppSelectFeatures.CSelectKBestWeighted.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); @@ -1459,27 +1464,27 @@ static int __pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted___ci int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__cinit__", 0); - /* "bayesclass/cSelectFeatures.pyx":19 + /* "bayesclass/cSelectFeatures.pyx":20 * cdef SelectKBestWeighted *thisptr * def __cinit__(self, X, y, weights, k, natural=False): # log or log2 * self.thisptr = new SelectKBestWeighted(X, y, weights, k, natural) # <<<<<<<<<<<<<< * def __dealloc__(self): * del self.thisptr */ - __pyx_t_1 = __pyx_convert_vector_from_py_std_3a__3a_vector_3c_int_3e___(__pyx_v_X); if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 19, __pyx_L1_error) - __pyx_t_2 = __pyx_convert_vector_from_py_int(__pyx_v_y); if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 19, __pyx_L1_error) - __pyx_t_3 = __pyx_convert_vector_from_py_features_3a__3a_precision_t(__pyx_v_weights); if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 19, __pyx_L1_error) - __pyx_t_4 = __Pyx_PyInt_As_int(__pyx_v_k); if (unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 19, __pyx_L1_error) - __pyx_t_5 = __Pyx_PyObject_IsTrue(__pyx_v_natural); if (unlikely((__pyx_t_5 == ((bool)-1)) && PyErr_Occurred())) __PYX_ERR(0, 19, __pyx_L1_error) + __pyx_t_1 = __pyx_convert_vector_from_py_std_3a__3a_vector_3c_int_3e___(__pyx_v_X); if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 20, __pyx_L1_error) + __pyx_t_2 = __pyx_convert_vector_from_py_int(__pyx_v_y); if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 20, __pyx_L1_error) + __pyx_t_3 = __pyx_convert_vector_from_py_features_3a__3a_precision_t(__pyx_v_weights); if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 20, __pyx_L1_error) + __pyx_t_4 = __Pyx_PyInt_As_int(__pyx_v_k); if (unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 20, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyObject_IsTrue(__pyx_v_natural); if (unlikely((__pyx_t_5 == ((bool)-1)) && PyErr_Occurred())) __PYX_ERR(0, 20, __pyx_L1_error) try { __pyx_t_6 = new features::SelectKBestWeighted(__pyx_t_1, __pyx_t_2, __pyx_t_3, __pyx_t_4, __pyx_t_5); } catch(...) { __Pyx_CppExn2PyErr(); - __PYX_ERR(0, 19, __pyx_L1_error) + __PYX_ERR(0, 20, __pyx_L1_error) } __pyx_v_self->thisptr = __pyx_t_6; - /* "bayesclass/cSelectFeatures.pyx":18 + /* "bayesclass/cSelectFeatures.pyx":19 * cdef class CSelectKBestWeighted: * cdef SelectKBestWeighted *thisptr * def __cinit__(self, X, y, weights, k, natural=False): # log or log2 # <<<<<<<<<<<<<< @@ -1498,7 +1503,7 @@ static int __pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted___ci return __pyx_r; } -/* "bayesclass/cSelectFeatures.pyx":20 +/* "bayesclass/cSelectFeatures.pyx":21 * def __cinit__(self, X, y, weights, k, natural=False): # log or log2 * self.thisptr = new SelectKBestWeighted(X, y, weights, k, natural) * def __dealloc__(self): # <<<<<<<<<<<<<< @@ -1521,7 +1526,7 @@ static void __pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_2__ __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("__dealloc__", 0); - /* "bayesclass/cSelectFeatures.pyx":21 + /* "bayesclass/cSelectFeatures.pyx":22 * self.thisptr = new SelectKBestWeighted(X, y, weights, k, natural) * def __dealloc__(self): * del self.thisptr # <<<<<<<<<<<<<< @@ -1530,7 +1535,7 @@ static void __pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_2__ */ delete __pyx_v_self->thisptr; - /* "bayesclass/cSelectFeatures.pyx":20 + /* "bayesclass/cSelectFeatures.pyx":21 * def __cinit__(self, X, y, weights, k, natural=False): # log or log2 * self.thisptr = new SelectKBestWeighted(X, y, weights, k, natural) * def __dealloc__(self): # <<<<<<<<<<<<<< @@ -1542,7 +1547,7 @@ static void __pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_2__ __Pyx_RefNannyFinishContext(); } -/* "bayesclass/cSelectFeatures.pyx":22 +/* "bayesclass/cSelectFeatures.pyx":23 * def __dealloc__(self): * del self.thisptr * def fit(self,): # <<<<<<<<<<<<<< @@ -1568,28 +1573,28 @@ static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighte __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("fit", 0); - /* "bayesclass/cSelectFeatures.pyx":23 + /* "bayesclass/cSelectFeatures.pyx":24 * del self.thisptr * def fit(self,): * self.thisptr.fit() # <<<<<<<<<<<<<< * return self - * def get_score(self): + * def get_scores(self): */ __pyx_v_self->thisptr->fit(); - /* "bayesclass/cSelectFeatures.pyx":24 + /* "bayesclass/cSelectFeatures.pyx":25 * def fit(self,): * self.thisptr.fit() * return self # <<<<<<<<<<<<<< - * def get_score(self): - * return self.thisptr.getScore() + * def get_scores(self): + * return self.thisptr.getScores() */ __Pyx_XDECREF(__pyx_r); __Pyx_INCREF(((PyObject *)__pyx_v_self)); __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "bayesclass/cSelectFeatures.pyx":22 + /* "bayesclass/cSelectFeatures.pyx":23 * def __dealloc__(self): * del self.thisptr * def fit(self,): # <<<<<<<<<<<<<< @@ -1604,62 +1609,62 @@ static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighte return __pyx_r; } -/* "bayesclass/cSelectFeatures.pyx":25 +/* "bayesclass/cSelectFeatures.pyx":26 * self.thisptr.fit() * return self - * def get_score(self): # <<<<<<<<<<<<<< - * return self.thisptr.getScore() - * def get_version(self): + * def get_scores(self): # <<<<<<<<<<<<<< + * return self.thisptr.getScores() + * def get_features(self): */ /* Python wrapper */ -static PyObject *__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_7get_score(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ -static PyObject *__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_7get_score(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { +static PyObject *__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_7get_scores(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static PyObject *__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_7get_scores(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("get_score (wrapper)", 0); - __pyx_r = __pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_6get_score(((struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *)__pyx_v_self)); + __Pyx_RefNannySetupContext("get_scores (wrapper)", 0); + __pyx_r = __pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_6get_scores(((struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *)__pyx_v_self)); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } -static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_6get_score(struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *__pyx_v_self) { +static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_6get_scores(struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *__pyx_v_self) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations PyObject *__pyx_t_1 = NULL; int __pyx_lineno = 0; const char *__pyx_filename = NULL; int __pyx_clineno = 0; - __Pyx_RefNannySetupContext("get_score", 0); + __Pyx_RefNannySetupContext("get_scores", 0); - /* "bayesclass/cSelectFeatures.pyx":26 + /* "bayesclass/cSelectFeatures.pyx":27 * return self - * def get_score(self): - * return self.thisptr.getScore() # <<<<<<<<<<<<<< - * def get_version(self): - * return self.thisptr.version() + * def get_scores(self): + * return self.thisptr.getScores() # <<<<<<<<<<<<<< + * def get_features(self): + * return self.thisptr.getFeatures() */ __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __pyx_convert_vector_to_py_features_3a__3a_precision_t(__pyx_v_self->thisptr->getScore()); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 26, __pyx_L1_error) + __pyx_t_1 = __pyx_convert_vector_to_py_features_3a__3a_precision_t(__pyx_v_self->thisptr->getScores()); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 27, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; goto __pyx_L0; - /* "bayesclass/cSelectFeatures.pyx":25 + /* "bayesclass/cSelectFeatures.pyx":26 * self.thisptr.fit() * return self - * def get_score(self): # <<<<<<<<<<<<<< - * return self.thisptr.getScore() - * def get_version(self): + * def get_scores(self): # <<<<<<<<<<<<<< + * return self.thisptr.getScores() + * def get_features(self): */ /* function exit code */ __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_1); - __Pyx_AddTraceback("bayesclass.cppSelectFeatures.CSelectKBestWeighted.get_score", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_AddTraceback("bayesclass.cppSelectFeatures.CSelectKBestWeighted.get_scores", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; __Pyx_XGIVEREF(__pyx_r); @@ -1667,28 +1672,91 @@ static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighte return __pyx_r; } -/* "bayesclass/cSelectFeatures.pyx":27 - * def get_score(self): - * return self.thisptr.getScore() - * def get_version(self): # <<<<<<<<<<<<<< - * return self.thisptr.version() - * def __reduce__(self): +/* "bayesclass/cSelectFeatures.pyx":28 + * def get_scores(self): + * return self.thisptr.getScores() + * def get_features(self): # <<<<<<<<<<<<<< + * return self.thisptr.getFeatures() + * def get_version(self): */ /* Python wrapper */ -static PyObject *__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_9get_version(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ -static PyObject *__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_9get_version(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { +static PyObject *__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_9get_features(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static PyObject *__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_9get_features(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("get_version (wrapper)", 0); - __pyx_r = __pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_8get_version(((struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *)__pyx_v_self)); + __Pyx_RefNannySetupContext("get_features (wrapper)", 0); + __pyx_r = __pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_8get_features(((struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *)__pyx_v_self)); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } -static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_8get_version(struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *__pyx_v_self) { +static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_8get_features(struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("get_features", 0); + + /* "bayesclass/cSelectFeatures.pyx":29 + * return self.thisptr.getScores() + * def get_features(self): + * return self.thisptr.getFeatures() # <<<<<<<<<<<<<< + * def get_version(self): + * return self.thisptr.version() + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __pyx_convert_vector_to_py_int(__pyx_v_self->thisptr->getFeatures()); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 29, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "bayesclass/cSelectFeatures.pyx":28 + * def get_scores(self): + * return self.thisptr.getScores() + * def get_features(self): # <<<<<<<<<<<<<< + * return self.thisptr.getFeatures() + * def get_version(self): + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("bayesclass.cppSelectFeatures.CSelectKBestWeighted.get_features", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "bayesclass/cSelectFeatures.pyx":30 + * def get_features(self): + * return self.thisptr.getFeatures() + * def get_version(self): # <<<<<<<<<<<<<< + * return self.thisptr.version() + * def __reduce__(self): + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_11get_version(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static PyObject *__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_11get_version(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("get_version (wrapper)", 0); + __pyx_r = __pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_10get_version(((struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_10get_version(struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *__pyx_v_self) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations PyObject *__pyx_t_1 = NULL; @@ -1697,23 +1765,23 @@ static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighte int __pyx_clineno = 0; __Pyx_RefNannySetupContext("get_version", 0); - /* "bayesclass/cSelectFeatures.pyx":28 - * return self.thisptr.getScore() + /* "bayesclass/cSelectFeatures.pyx":31 + * return self.thisptr.getFeatures() * def get_version(self): * return self.thisptr.version() # <<<<<<<<<<<<<< * def __reduce__(self): * return (CSelectKBestWeighted, ()) */ __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __pyx_convert_PyBytes_string_to_py_std__in_string(__pyx_v_self->thisptr->version()); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 28, __pyx_L1_error) + __pyx_t_1 = __pyx_convert_PyBytes_string_to_py_std__in_string(__pyx_v_self->thisptr->version()); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 31, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; goto __pyx_L0; - /* "bayesclass/cSelectFeatures.pyx":27 - * def get_score(self): - * return self.thisptr.getScore() + /* "bayesclass/cSelectFeatures.pyx":30 + * def get_features(self): + * return self.thisptr.getFeatures() * def get_version(self): # <<<<<<<<<<<<<< * return self.thisptr.version() * def __reduce__(self): @@ -1730,7 +1798,7 @@ static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighte return __pyx_r; } -/* "bayesclass/cSelectFeatures.pyx":29 +/* "bayesclass/cSelectFeatures.pyx":32 * def get_version(self): * return self.thisptr.version() * def __reduce__(self): # <<<<<<<<<<<<<< @@ -1738,19 +1806,19 @@ static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighte */ /* Python wrapper */ -static PyObject *__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_11__reduce__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ -static PyObject *__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_11__reduce__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { +static PyObject *__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_13__reduce__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static PyObject *__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_13__reduce__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("__reduce__ (wrapper)", 0); - __pyx_r = __pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_10__reduce__(((struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *)__pyx_v_self)); + __pyx_r = __pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_12__reduce__(((struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *)__pyx_v_self)); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } -static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_10__reduce__(CYTHON_UNUSED struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *__pyx_v_self) { +static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_12__reduce__(CYTHON_UNUSED struct __pyx_obj_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted *__pyx_v_self) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations PyObject *__pyx_t_1 = NULL; @@ -1759,13 +1827,13 @@ static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighte int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__reduce__", 0); - /* "bayesclass/cSelectFeatures.pyx":30 + /* "bayesclass/cSelectFeatures.pyx":33 * return self.thisptr.version() * def __reduce__(self): * return (CSelectKBestWeighted, ()) # <<<<<<<<<<<<<< */ __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = PyTuple_New(2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 30, __pyx_L1_error) + __pyx_t_1 = PyTuple_New(2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 33, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_INCREF(((PyObject *)__pyx_ptype_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted)); __Pyx_GIVEREF(((PyObject *)__pyx_ptype_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted)); @@ -1777,7 +1845,7 @@ static PyObject *__pyx_pf_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighte __pyx_t_1 = 0; goto __pyx_L0; - /* "bayesclass/cSelectFeatures.pyx":29 + /* "bayesclass/cSelectFeatures.pyx":32 * def get_version(self): * return self.thisptr.version() * def __reduce__(self): # <<<<<<<<<<<<<< @@ -2102,7 +2170,7 @@ static std::vector __pyx_convert_vector_from_py_features * return v * */ - __pyx_t_5 = __pyx_PyFloat_AsDouble(__pyx_v_item); if (unlikely((__pyx_t_5 == ((features::precision_t)-1)) && PyErr_Occurred())) __PYX_ERR(1, 48, __pyx_L1_error) + __pyx_t_5 = __pyx_PyFloat_AsFloat(__pyx_v_item); if (unlikely((__pyx_t_5 == ((features::precision_t)-1)) && PyErr_Occurred())) __PYX_ERR(1, 48, __pyx_L1_error) __pyx_v_v.push_back(((features::precision_t)__pyx_t_5)); /* "vector.from_py":47 @@ -2210,6 +2278,63 @@ static PyObject *__pyx_convert_vector_to_py_features_3a__3a_precision_t(const st return __pyx_r; } +static PyObject *__pyx_convert_vector_to_py_int(const std::vector &__pyx_v_v) { + size_t __pyx_v_i; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + size_t __pyx_t_2; + size_t __pyx_t_3; + size_t __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__pyx_convert_vector_to_py_int", 0); + + /* "vector.to_py":61 + * @cname("__pyx_convert_vector_to_py_int") + * cdef object __pyx_convert_vector_to_py_int(vector[X]& v): + * return [v[i] for i in range(v.size())] # <<<<<<<<<<<<<< + * + * + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 61, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __pyx_v_v.size(); + __pyx_t_3 = __pyx_t_2; + for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) { + __pyx_v_i = __pyx_t_4; + __pyx_t_5 = __Pyx_PyInt_From_int((__pyx_v_v[__pyx_v_i])); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 61, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + if (unlikely(__Pyx_ListComp_Append(__pyx_t_1, (PyObject*)__pyx_t_5))) __PYX_ERR(1, 61, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "vector.to_py":60 + * + * @cname("__pyx_convert_vector_to_py_int") + * cdef object __pyx_convert_vector_to_py_int(vector[X]& v): # <<<<<<<<<<<<<< + * return [v[i] for i in range(v.size())] + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_AddTraceback("vector.to_py.__pyx_convert_vector_to_py_int", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + /* "string.to_py":31 * * @cname("__pyx_convert_PyObject_string_to_py_std__in_string") @@ -2493,9 +2618,10 @@ static void __pyx_tp_dealloc_10bayesclass_17cppSelectFeatures_CSelectKBestWeight static PyMethodDef __pyx_methods_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted[] = { {"fit", (PyCFunction)__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_5fit, METH_NOARGS, 0}, - {"get_score", (PyCFunction)__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_7get_score, METH_NOARGS, 0}, - {"get_version", (PyCFunction)__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_9get_version, METH_NOARGS, 0}, - {"__reduce__", (PyCFunction)__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_11__reduce__, METH_NOARGS, 0}, + {"get_scores", (PyCFunction)__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_7get_scores, METH_NOARGS, 0}, + {"get_features", (PyCFunction)__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_9get_features, METH_NOARGS, 0}, + {"get_version", (PyCFunction)__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_11get_version, METH_NOARGS, 0}, + {"__reduce__", (PyCFunction)__pyx_pw_10bayesclass_17cppSelectFeatures_20CSelectKBestWeighted_13__reduce__, METH_NOARGS, 0}, {0, 0, 0, 0} }; @@ -2690,14 +2816,14 @@ static int __Pyx_modinit_type_init_code(void) { int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__Pyx_modinit_type_init_code", 0); /*--- Type init code ---*/ - if (PyType_Ready(&__pyx_type_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted) < 0) __PYX_ERR(0, 16, __pyx_L1_error) + if (PyType_Ready(&__pyx_type_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted) < 0) __PYX_ERR(0, 17, __pyx_L1_error) #if PY_VERSION_HEX < 0x030800B1 __pyx_type_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted.tp_print = 0; #endif if ((CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP) && likely(!__pyx_type_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted.tp_dictoffset && __pyx_type_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted.tp_getattro == PyObject_GenericGetAttr)) { __pyx_type_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted.tp_getattro = __Pyx_PyObject_GenericGetAttr; } - if (PyObject_SetAttr(__pyx_m, __pyx_n_s_CSelectKBestWeighted, (PyObject *)&__pyx_type_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted) < 0) __PYX_ERR(0, 16, __pyx_L1_error) + if (PyObject_SetAttr(__pyx_m, __pyx_n_s_CSelectKBestWeighted, (PyObject *)&__pyx_type_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted) < 0) __PYX_ERR(0, 17, __pyx_L1_error) __pyx_ptype_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted = &__pyx_type_10bayesclass_17cppSelectFeatures_CSelectKBestWeighted; __Pyx_RefNannyFinishContext(); return 0; @@ -3707,6 +3833,44 @@ raise_neg_overflow: return (int) -1; } +/* CIntToPy */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value) { +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + const int neg_one = (int) -1, const_zero = (int) 0; +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic pop +#endif + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(int) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(int) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(int) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); +#endif + } + } else { + if (sizeof(int) <= sizeof(long)) { + return PyInt_FromLong((long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(int) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); +#endif + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(int), + little, !is_unsigned); + } +} + /* CIntFromPy */ static CYTHON_INLINE size_t __Pyx_PyInt_As_size_t(PyObject *x) { #ifdef __Pyx_HAS_GCC_DIAGNOSTIC diff --git a/bayesclass/cSelectFeatures.pyx b/bayesclass/cSelectFeatures.pyx index 8e1ac6f..9da325b 100644 --- a/bayesclass/cSelectFeatures.pyx +++ b/bayesclass/cSelectFeatures.pyx @@ -6,12 +6,13 @@ from libcpp cimport bool cdef extern from "FeatureSelect.h" namespace "features": - ctypedef double precision_t + ctypedef float precision_t cdef cppclass SelectKBestWeighted: SelectKBestWeighted(vector[vector[int]]&, vector[int]&, vector[precision_t]&, int, bool) except + void fit() string version() - vector[precision_t] getScore() + vector[precision_t] getScores() + vector[int] getFeatures() cdef class CSelectKBestWeighted: cdef SelectKBestWeighted *thisptr @@ -22,8 +23,10 @@ cdef class CSelectKBestWeighted: def fit(self,): self.thisptr.fit() return self - def get_score(self): - return self.thisptr.getScore() + def get_scores(self): + return self.thisptr.getScores() + def get_features(self): + return self.thisptr.getFeatures() def get_version(self): return self.thisptr.version() def __reduce__(self): diff --git a/bayesclass/chargpt.cpp b/bayesclass/chargpt.cpp deleted file mode 100644 index f599dc2..0000000 --- a/bayesclass/chargpt.cpp +++ /dev/null @@ -1,24 +0,0 @@ -double conditionalEntropy(std::vector& classVec, std::vector& featureVec, std::vector& weightsVec) -{ - std::map> classesPerFeatureValue; - std::map> weightsPerFeatureValue; - - for (int i = 0; i < featureVec.size(); i++) { - classesPerFeatureValue[featureVec[i]].push_back(classVec[i]); - weightsPerFeatureValue[featureVec[i]].push_back(weightsVec[i]); - } - - double totalEntropy = 0; - double totalWeight = 0; - for (auto& pair : classesPerFeatureValue) { - double featureValueEntropy = calculateEntropy(pair.second, weightsPerFeatureValue[pair.first]); - double featureValueWeight = 0; - for (double weight : weightsPerFeatureValue[pair.first]) { - featureValueWeight += weight; - } - totalEntropy += featureValueWeight * featureValueEntropy; - totalWeight += featureValueWeight; - } - - return totalEntropy / totalWeight; -} \ No newline at end of file diff --git a/bayesclass/clfs.py b/bayesclass/clfs.py index 4fc71eb..8fe60cd 100644 --- a/bayesclass/clfs.py +++ b/bayesclass/clfs.py @@ -15,7 +15,7 @@ from pgmpy.models import BayesianNetwork from pgmpy.base import DAG import matplotlib.pyplot as plt from fimdlp.mdlp import FImdlp -from .feature_selection import SelectKBestWeighted +from .cppSelectFeatures import CSelectKBestWeighted from ._version import __version__ @@ -869,15 +869,39 @@ class BoostAODE(ClassifierMixin, BaseEnsemble): self.nodes_leaves = self.nodes_edges return self - def mutual_info_classif_weighted(X, y, sample_weight): - # Compute the mutual information between each feature and the target - mi = mutual_info_classif(X, y) + def version(self): + if hasattr(self, "fitted_"): + return self.estimator_.version() + return SPODE(None, False).version() - # Multiply the mutual information scores with the sample weights - mi_weighted = mi * sample_weight + @property + def states_(self): + if hasattr(self, "fitted_"): + return sum( + [ + len(item) + for model in self.estimators_ + for _, item in model.model_.states.items() + ] + ) / len(self.estimators_) + return 0 - # Return the weighted mutual information scores - return mi_weighted + @property + def depth_(self): + return self.states_ + + def nodes_edges(self): + nodes = 0 + edges = 0 + if hasattr(self, "fitted_"): + nodes = sum([len(x.dag_) for x in self.estimators_]) + edges = sum([len(x.dag_.edges()) for x in self.estimators_]) + return nodes, edges + + def plot(self, title=""): + warnings.simplefilter("ignore", UserWarning) + for idx, model in enumerate(self.estimators_): + model.plot(title=f"{idx} {title}") def _train(self, kwargs): """Build boosted SPODEs""" @@ -885,14 +909,12 @@ class BoostAODE(ClassifierMixin, BaseEnsemble): # Step 0: Set the finish condition for num in range(self.n_estimators): # Step 1: Build ranking with mutual information - # OJO MAL, ESTO NO ACTUALIZA EL RANKING CON LOS PESOS - # SIEMPRE VA A SACAR LO MISMO - feature = ( - SelectKBestWeighted(k=1) - .fit(self.X_, self.y_, weights) - .get_feature_names_out(self.feature_names_in_) - .tolist() + n_feature = ( + CSelectKBestWeighted(self.X_, self.y_, weights, k=1) + .fit() + .get_features()[0] ) + feature = self.feature_names_in_[n_feature] # Step 2: Build & train spode with the first feature as sparent estimator = clone(self.estimator_) _args = kwargs.copy() @@ -910,8 +932,8 @@ class BoostAODE(ClassifierMixin, BaseEnsemble): am = np.log((1 - em) / em) + np.log(estimator.n_classes_ - 1) # Step 3.2: Update weights for next classifier weights = [ - wm * np.exp(am * (ym != y_pred)) - for wm, ym in zip(weights, self.y_) + wm * np.exp(am * (ym != yp)) + for wm, ym, yp in zip(weights, self.y_, y_pred) ] # Step 4: Add the new model self.estimators_.append(estimator) diff --git a/bayesclass/copilot.cpp b/bayesclass/copilot.cpp deleted file mode 100644 index 89902a9..0000000 --- a/bayesclass/copilot.cpp +++ /dev/null @@ -1,48 +0,0 @@ -#include -#include -#include - -using namespace std; - -using value_t = int; -using precision_t = double; -using sample_t = vector; -using score_t = vector; - -precision_t entropy(const sample_t& data, const vector& weights) -{ - precision_t p; - precision_t ventropy = 0, totalWeight = 0; - score_t counts(*max_element(data.begin(), data.end()) + 1, 0); - for (auto i = 0; i < data.size(); ++i) { - counts[data[i]] += weights[i]; - totalWeight += weights[i]; - } - for (auto count : counts) { - if (count > 0 || count < 0) { - p = (count) / totalWeight; - ventropy -= p * log2(p); - } - } - return ventropy; -} - -precision_t conditionalEntropy(const sample_t& feature, const sample_t& labels, const vector& weights) -{ - unordered_map featureCounts; - unordered_map> jointCounts; - featureCounts.clear(); - jointCounts.clear(); - auto totalWeight = 0.0; - for (auto i = 0; i < feature.size(); i++) { - featureCounts[feature[i]] += weights[i]; - jointCounts[feature[i]][labels[i]] += weights[i]; - totalWeight += weights[i]; - } - precision_t entropy = 0; - for (auto& [f, count] : featureCounts) { - auto p_f = count / totalWeight; - entropy += p_f * ::entropy(jointCounts[f], weights) / ::entropy(feature, weights); - } - return entropy; -} \ No newline at end of file diff --git a/bayesclass/feature_selection.py b/bayesclass/feature_selection.py index 92189fb..86a8d71 100644 --- a/bayesclass/feature_selection.py +++ b/bayesclass/feature_selection.py @@ -1,93 +1,93 @@ -import numpy as np -from sklearn.feature_selection import mutual_info_classif -from sklearn.utils.validation import check_X_y, check_is_fitted -from sklearn.feature_selection._univariate_selection import ( - _BaseFilter, - _clean_nans, -) +# import numpy as np +# from sklearn.feature_selection import mutual_info_classif +# from sklearn.utils.validation import check_X_y, check_is_fitted +# from sklearn.feature_selection._univariate_selection import ( +# _BaseFilter, +# _clean_nans, +# ) -""" -Compute the weighted mutual information between each feature and the -target. -Based on -Silviu Guiaşu, -Weighted entropy, -Reports on Mathematical Physics, -Volume 2, Issue 3, -1971, -Pages 165-179, -ISSN 0034-4877, -https://doi.org/10.1016/0034-4877(71)90002-4. -(https://www.sciencedirect.com/science/article/pii/0034487771900024) -Abstract: Weighted entropy is the measure of information supplied by a -probablistic experiment whose elementary events are characterized both by their -objective probabilities and by some qualitative (objective or subjective) -weights. The properties, the axiomatics and the maximum value of the weighted -entropy are given. -""" +# """ +# Compute the weighted mutual information between each feature and the +# target. +# Based on +# Silviu Guiaşu, +# Weighted entropy, +# Reports on Mathematical Physics, +# Volume 2, Issue 3, +# 1971, +# Pages 165-179, +# ISSN 0034-4877, +# https://doi.org/10.1016/0034-4877(71)90002-4. +# (https://www.sciencedirect.com/science/article/pii/0034487771900024) +# Abstract: Weighted entropy is the measure of information supplied by a +# probablistic experiment whose elementary events are characterized both by their +# objective probabilities and by some qualitative (objective or subjective) +# weights. The properties, the axiomatics and the maximum value of the weighted +# entropy are given. +# """ -class SelectKBestWeighted(_BaseFilter): - def __init__(self, *, k=10): - super().__init__(score_func=mutual_info_classif) - self.k = k +# class SelectKBestWeighted(_BaseFilter): +# def __init__(self, *, k=10): +# super().__init__(score_func=mutual_info_classif) +# self.k = k - def _check_params(self, X, y): - if self.k > X.shape[1] or self.k < 1: - raise ValueError( - f"k must be between 1 and {X.shape[1]} got {self.k}." - ) +# def _check_params(self, X, y): +# if self.k > X.shape[1] or self.k < 1: +# raise ValueError( +# f"k must be between 1 and {X.shape[1]} got {self.k}." +# ) - def _get_support_mask(self): - check_is_fitted(self) +# def _get_support_mask(self): +# check_is_fitted(self) - if self.k == "all": - return np.ones(self.scores_.shape, dtype=bool) - elif self.k == 0: - return np.zeros(self.scores_.shape, dtype=bool) - else: - scores = _clean_nans(self.scores_) - mask = np.zeros(scores.shape, dtype=bool) +# if self.k == "all": +# return np.ones(self.scores_.shape, dtype=bool) +# elif self.k == 0: +# return np.zeros(self.scores_.shape, dtype=bool) +# else: +# scores = _clean_nans(self.scores_) +# mask = np.zeros(scores.shape, dtype=bool) - # Request a stable sort. Mergesort takes more memory (~40MB per - # megafeature on x86-64). - mask[np.argsort(scores, kind="mergesort")[-self.k :]] = 1 - return mask +# # Request a stable sort. Mergesort takes more memory (~40MB per +# # megafeature on x86-64). +# mask[np.argsort(scores, kind="mergesort")[-self.k :]] = 1 +# return mask - def fit(self, X, y, sample_weight): - self.X_, self.y_ = check_X_y(X, y) - self._check_params(X, y) - self.n_features_in_ = X.shape[1] - self.sample_weight_ = sample_weight - # Compute the entropy of the target variable - entropy_y = -np.sum( - np.multiply( - np.bincount(y, weights=sample_weight), - np.log(np.bincount(y, weights=sample_weight)), - ) - ) +# def fit(self, X, y, sample_weight): +# self.X_, self.y_ = check_X_y(X, y) +# self._check_params(X, y) +# self.n_features_in_ = X.shape[1] +# self.sample_weight_ = sample_weight +# # Compute the entropy of the target variable +# entropy_y = -np.sum( +# np.multiply( +# np.bincount(y, weights=sample_weight), +# np.log(np.bincount(y, weights=sample_weight)), +# ) +# ) - # Compute the mutual information between each feature and the target - mi = self.score_func(X, y) +# # Compute the mutual information between each feature and the target +# mi = self.score_func(X, y) - # Compute the weighted entropy of each feature - entropy_weighted = [] - for i in range(X.shape[1]): - # Compute the weighted frequency of each unique value of the - # feature - freq_weighted = np.bincount(X[:, i], weights=sample_weight) - freq_weighted = freq_weighted[freq_weighted != 0] +# # Compute the weighted entropy of each feature +# entropy_weighted = [] +# for i in range(X.shape[1]): +# # Compute the weighted frequency of each unique value of the +# # feature +# freq_weighted = np.bincount(X[:, i], weights=sample_weight) +# freq_weighted = freq_weighted[freq_weighted != 0] - # Compute the weighted entropy of the feature - entropy_weighted.append( - -np.sum(np.multiply(freq_weighted, np.log(freq_weighted))) - / np.sum(sample_weight) - ) +# # Compute the weighted entropy of the feature +# entropy_weighted.append( +# -np.sum(np.multiply(freq_weighted, np.log(freq_weighted))) +# / np.sum(sample_weight) +# ) - # Compute the weighted mutual information between each feature and - # the target - mi_weighted = mi * entropy_weighted / entropy_y +# # Compute the weighted mutual information between each feature and +# # the target +# mi_weighted = mi * entropy_weighted / entropy_y - # Return the weighted mutual information scores - self.scores_ = mi_weighted - return self +# # Return the weighted mutual information scores +# self.scores_ = mi_weighted +# return self diff --git a/test.py b/test.py deleted file mode 100644 index 98c039f..0000000 --- a/test.py +++ /dev/null @@ -1,10 +0,0 @@ -from bayesclass.cppSelectFeatures import CSelectKBestWeighted - - -X = [[x for x in range(i, i + 3)] for i in range(1, 30, 3)] -weights = [25 / (i + 1) for i in range(10)] -labels = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] -test = CSelectKBestWeighted(X, labels, weights, 3) -test.fit() -for item in test.get_score(): - print(item)