21 Commits

Author SHA1 Message Date
d1cafc230b Fix some small mistakes 2023-07-13 17:11:08 +02:00
99083ceede Fix KDB algorithm argsort 2023-07-13 16:59:37 +02:00
64f1500176 Refactor cpp library methods 2023-07-12 12:59:02 +02:00
aef22306ef Complete refactor of KDB with BayesNet library 2023-07-12 12:07:01 +02:00
2ff38f73e7 refactor conditionalEdgeWeights 2023-07-12 11:20:05 +02:00
1af3edd050 Adding Metrics 2023-07-12 03:24:40 +02:00
8b6624e08a Add getStates 2023-07-11 21:28:29 +02:00
36cc875615 Refator kdb with new BayesNetwork 2023-07-08 10:40:33 +02:00
260997c872 transpose dimensions of X in BayesNetwork 2023-07-08 01:13:29 +02:00
8a9c86a22d Update BayesNetwork class 2023-07-08 00:39:10 +02:00
4bad5ccfee Complete integration with BayesNet 2023-07-07 19:19:52 +02:00
5866e19fae Add predict_proba 2023-07-07 00:36:14 +02:00
61e4c176eb First try to link with bayesnet 2023-07-07 00:23:47 +02:00
ea473fc604 First complete boostAODE 2023-06-26 10:09:28 +02:00
9d7e787f6c Finish cppSelectFeatures 2023-06-23 20:07:26 +02:00
d7425e5af0 Remove unneeded small value added to logs 2023-06-23 01:25:23 +02:00
30cc744033 Chcked mutual_info with sklearn 2023-06-23 01:21:24 +02:00
0094d500d4 Begin cython structure 2023-06-22 17:56:34 +02:00
99321043ec Complete feature_selection with weighted entropy 2023-06-21 16:40:29 +02:00
fbaa5eb7d3 continue feature selection 2023-06-21 14:42:33 +02:00
0b27d9d9b0 Begin implementation 2023-06-21 11:27:14 +02:00
24 changed files with 13789 additions and 70 deletions

7
CMakeLists.txt Normal file
View File

@@ -0,0 +1,7 @@
cmake_minimum_required(VERSION 3.20)
project(feature)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Debug)
add_executable(feature bayesclass/cpp/FeatureSelect.cpp)

View File

@@ -1 +1,5 @@
include README.md LICENSE include README.md LICENSE
include bayesclass/FeatureSelect.h
include bayesclass/Node.h
include bayesclass/Network.h
include bayesclass/Metrics.hpp

View File

@@ -16,6 +16,10 @@ lint: ## Lint and static-check
flake8 bayesclass flake8 bayesclass
mypy bayesclass mypy bayesclass
feature: ## compile FeatureSelect
cmake -B build feature
push: ## Push code with tags push: ## Push code with tags
git push && git push --tags git push && git push --tags

View File

@@ -0,0 +1,93 @@
#include "BaseClassifier.h"
namespace bayesnet {
using namespace std;
using namespace torch;
BaseClassifier::BaseClassifier(Network model) : model(model), m(0), n(0) {}
BaseClassifier& BaseClassifier::build(vector<string>& features, string className, map<string, vector<int>>& states)
{
dataset = torch::cat({ X, y.view({y.size(0), 1}) }, 1);
this->features = features;
this->className = className;
this->states = states;
checkFitParameters();
train();
return *this;
}
BaseClassifier& BaseClassifier::fit(Tensor& X, Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states)
{
this->X = X;
this->y = y;
return build(features, className, states);
}
BaseClassifier& BaseClassifier::fit(vector<vector<int>>& X, vector<int>& y, vector<string>& features, string className, map<string, vector<int>>& states)
{
this->X = torch::zeros({ static_cast<int64_t>(X[0].size()), static_cast<int64_t>(X.size()) }, kInt64);
for (int i = 0; i < X.size(); ++i) {
this->X.index_put_({ "...", i }, torch::tensor(X[i], kInt64));
}
this->y = torch::tensor(y, kInt64);
return build(features, className, states);
}
void BaseClassifier::checkFitParameters()
{
auto sizes = X.sizes();
m = sizes[0];
n = sizes[1];
if (m != y.size(0)) {
throw invalid_argument("X and y must have the same number of samples");
}
if (n != features.size()) {
throw invalid_argument("X and features must have the same number of features");
}
if (states.find(className) == states.end()) {
throw invalid_argument("className not found in states");
}
for (auto feature : features) {
if (states.find(feature) == states.end()) {
throw invalid_argument("feature [" + feature + "] not found in states");
}
}
}
vector<vector<int>> tensorToVector(const torch::Tensor& tensor)
{
// convert mxn tensor to nxm vector
vector<vector<int>> result;
auto tensor_accessor = tensor.accessor<int, 2>();
// Iterate over columns and rows of the tensor
for (int j = 0; j < tensor.size(1); ++j) {
vector<int> column;
for (int i = 0; i < tensor.size(0); ++i) {
column.push_back(tensor_accessor[i][j]);
}
result.push_back(column);
}
return result;
}
Tensor BaseClassifier::predict(Tensor& X)
{
auto m_ = X.size(0);
auto n_ = X.size(1);
vector<vector<int>> Xd(n_, vector<int>(m_, 0));
for (auto i = 0; i < n_; i++) {
auto temp = X.index({ "...", i });
Xd[i] = vector<int>(temp.data_ptr<int>(), temp.data_ptr<int>() + m_);
}
auto yp = model.predict(Xd);
auto ypred = torch::tensor(yp, torch::kInt64);
return ypred;
}
float BaseClassifier::score(Tensor& X, Tensor& y)
{
Tensor y_pred = predict(X);
return (y_pred == y).sum().item<float>() / y.size(0);
}
vector<string> BaseClassifier::show()
{
return model.show();
}
}

View File

@@ -0,0 +1,39 @@
#ifndef CLASSIFIERS_H
#include <torch/torch.h>
#include "Network.h"
using namespace std;
using namespace torch;
namespace bayesnet {
class BaseClassifier {
private:
BaseClassifier& build(vector<string>& features, string className, map<string, vector<int>>& states);
protected:
Network model;
int m, n; // m: number of samples, n: number of features
Tensor X;
Tensor y;
Tensor dataset;
vector<string> features;
string className;
map<string, vector<int>> states;
void checkFitParameters();
virtual void train() = 0;
public:
BaseClassifier(Network model);
Tensor& getX();
vector<string>& getFeatures();
string& getClassName();
BaseClassifier& fit(Tensor& X, Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states);
BaseClassifier& fit(vector<vector<int>>& X, vector<int>& y, vector<string>& features, string className, map<string, vector<int>>& states);
Tensor predict(Tensor& X);
float score(Tensor& X, Tensor& y);
vector<string> show();
};
}
#endif

7663
bayesclass/BayesNetwork.cpp Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,78 @@
# distutils: language = c++
# cython: language_level = 3
from libcpp.vector cimport vector
from libcpp.string cimport string
import numpy as np
cdef extern from "Network.h" namespace "bayesnet":
cdef cppclass Network:
Network(float, float) except +
void fit(vector[vector[int]]&, vector[int]&, vector[string]&, string)
vector[int] predict(vector[vector[int]]&)
vector[vector[double]] predict_proba(vector[vector[int]]&)
float score(const vector[vector[int]]&, const vector[int]&)
void addNode(string, int)
void addEdge(string, string) except +
vector[string] getFeatures()
int getClassNumStates()
int getStates()
string getClassName()
string version()
void show()
cdef class BayesNetwork:
cdef Network *thisptr
def __cinit__(self, maxThreads=0.8, laplaceSmooth=1.0):
self.thisptr = new Network(maxThreads, laplaceSmooth)
def __dealloc__(self):
del self.thisptr
def fit(self, X, y, features, className):
X_ = [X[:, i] for i in range(X.shape[1])]
features_bytes = [x.encode() for x in features]
self.thisptr.fit(X_, y, features_bytes, className.encode())
return self
def predict(self, X):
X_ = [X[:, i] for i in range(X.shape[1])]
return self.thisptr.predict(X_)
def predict_proba(self, X):
X_ = [X[:, i] for i in range(X.shape[1])]
return self.thisptr.predict_proba(X_)
def score(self, X, y):
X_ = [X[:, i] for i in range(X.shape[1])]
return self.thisptr.score(X_, y)
def addNode(self, name, states):
self.thisptr.addNode(str.encode(name), states)
def addEdge(self, source, destination):
self.thisptr.addEdge(str.encode(source), str.encode(destination))
def getFeatures(self):
res = self.thisptr.getFeatures()
return [x.decode() for x in res]
def getStates(self):
return self.thisptr.getStates()
def getClassName(self):
return self.thisptr.getClassName().decode()
def getClassNumStates(self):
return self.thisptr.getClassNumStates()
def show(self):
return self.thisptr.show()
def __reduce__(self):
return (BayesNetwork, ())
cdef extern from "Metrics.hpp" namespace "bayesnet":
cdef cppclass Metrics:
Metrics(vector[vector[int]], vector[int], vector[string]&, string&, int) except +
vector[float] conditionalEdgeWeights()
cdef class CMetrics:
cdef Metrics *thisptr
def __cinit__(self, X, y, features, className, classStates):
X_ = [X[:, i] for i in range(X.shape[1])]
features_bytes = [x.encode() for x in features]
self.thisptr = new Metrics(X_, y, features_bytes, className.encode(), classStates)
def __dealloc__(self):
del self.thisptr
def conditionalEdgeWeights(self, n_vars):
return np.reshape(self.thisptr.conditionalEdgeWeights(), (n_vars, n_vars))
def __reduce__(self):
return (CMetrics, ())

View File

@@ -0,0 +1,118 @@
#include "FeatureSelect.h"
namespace features {
SelectKBestWeighted::SelectKBestWeighted(samples_t& samples, labels_t& labels, weights_t& weights, int k, bool nat)
: samples(samples), labels(labels), weights(weights), k(k), nat(nat)
{
if (samples.size() == 0 || samples[0].size() == 0)
throw invalid_argument("features must be a non-empty matrix");
if (samples.size() != labels.size())
throw invalid_argument("number of samples and labels must be equal");
if (samples.size() != weights.size())
throw invalid_argument("number of samples and weights must be equal");
if (k < 1 || k > static_cast<int>(samples[0].size()))
throw invalid_argument("k must be between 1 and number of features");
numFeatures = 0;
numClasses = 0;
numSamples = 0;
fitted = false;
}
void SelectKBestWeighted::SelectKBestWeighted::fit()
{
auto labelsCopy = labels;
numFeatures = samples[0].size();
numSamples = samples.size();
// compute number of classes
sort(labelsCopy.begin(), labelsCopy.end());
auto last = unique(labelsCopy.begin(), labelsCopy.end());
labelsCopy.erase(last, labelsCopy.end());
numClasses = labelsCopy.size();
// compute scores
scores.reserve(numFeatures);
for (int i = 0; i < numFeatures; ++i) {
scores.push_back(MutualInformation(i));
features.push_back(i);
}
// sort & reduce scores and features
sort(features.begin(), features.end(), [&](int i, int j)
{ return scores[i] > scores[j]; });
sort(scores.begin(), scores.end(), greater<precision_t>());
features.resize(k);
scores.resize(k);
fitted = true;
}
precision_t SelectKBestWeighted::entropyLabel()
{
return entropy(labels);
}
precision_t SelectKBestWeighted::entropy(const sample_t& data)
{
precision_t ventropy = 0, totalWeight = 0;
score_t counts(numClasses + 1, 0);
for (auto i = 0; i < static_cast<int>(data.size()); ++i) {
counts[data[i]] += weights[i];
totalWeight += weights[i];
}
for (auto count : counts) {
precision_t p = count / totalWeight;
if (p > 0) {
if (nat) {
ventropy -= p * log(p);
} else {
ventropy -= p * log2(p);
}
}
}
return ventropy;
}
// H(Y|X) = sum_{x in X} p(x) H(Y|X=x)
precision_t SelectKBestWeighted::conditionalEntropy(const int feature)
{
unordered_map<value_t, precision_t> featureCounts;
unordered_map<value_t, unordered_map<value_t, precision_t>> jointCounts;
featureCounts.clear();
jointCounts.clear();
precision_t totalWeight = 0;
for (auto i = 0; i < numSamples; i++) {
featureCounts[samples[i][feature]] += weights[i];
jointCounts[samples[i][feature]][labels[i]] += weights[i];
totalWeight += weights[i];
}
if (totalWeight == 0)
throw invalid_argument("Total weight should not be zero");
precision_t entropy = 0;
for (auto& [feat, count] : featureCounts) {
auto p_f = count / totalWeight;
precision_t entropy_f = 0;
for (auto& [label, jointCount] : jointCounts[feat]) {
auto p_l_f = jointCount / count;
if (p_l_f > 0) {
if (nat) {
entropy_f -= p_l_f * log(p_l_f);
} else {
entropy_f -= p_l_f * log2(p_l_f);
}
}
}
entropy += p_f * entropy_f;
}
return entropy;
}
// I(X;Y) = H(Y) - H(Y|X)
precision_t SelectKBestWeighted::MutualInformation(const int i)
{
return entropyLabel() - conditionalEntropy(i);
}
score_t SelectKBestWeighted::getScores() const
{
if (!fitted)
throw logic_error("score not fitted");
return scores;
}
//Return the indices of the selected features
labels_t SelectKBestWeighted::getFeatures() const
{
if (!fitted)
throw logic_error("score not fitted");
return features;
}
}

View File

@@ -0,0 +1,38 @@
#ifndef SELECT_K_BEST_WEIGHTED_H
#define SELECT_K_BEST_WEIGHTED_H
#include <map>
#include <vector>
#include <string>
using namespace std;
namespace features {
typedef float precision_t;
typedef int value_t;
typedef vector<value_t> sample_t;
typedef vector<sample_t> samples_t;
typedef vector<value_t> labels_t;
typedef vector<precision_t> score_t, weights_t;
class SelectKBestWeighted {
private:
const samples_t samples;
const labels_t labels;
const weights_t weights;
const int k;
bool nat; // use natural log or log2
int numFeatures, numClasses, numSamples;
bool fitted;
score_t scores; // scores of the features
labels_t features; // indices of the selected features
precision_t entropyLabel();
precision_t entropy(const sample_t&);
precision_t conditionalEntropy(const int);
precision_t MutualInformation(const int);
public:
SelectKBestWeighted(samples_t&, labels_t&, weights_t&, int, bool);
void fit();
score_t getScores() const;
labels_t getFeatures() const; //Return the indices of the selected features
static inline string version() { return "0.1.0"; };
};
}
#endif

110
bayesclass/KDB.cc Normal file
View File

@@ -0,0 +1,110 @@
#include "KDB.h"
#include "Metrics.hpp"
namespace bayesnet {
using namespace std;
using namespace torch;
vector<int> argsort(vector<float>& nums)
{
int n = nums.size();
vector<int> indices(n);
iota(indices.begin(), indices.end(), 0);
sort(indices.begin(), indices.end(), [&nums](int i, int j) {return nums[i] > nums[j];});
return indices;
}
KDB::KDB(int k, float theta) : BaseClassifier(Network()), k(k), theta(theta) {}
void KDB::train()
{
/*
1. For each feature Xi, compute mutual information, I(X;C),
where C is the class.
2. Compute class conditional mutual information I(Xi;XjIC), f or each
pair of features Xi and Xj, where i#j.
3. Let the used variable list, S, be empty.
4. Let the DAG network being constructed, BN, begin with a single
class node, C.
5. Repeat until S includes all domain features
5.1. Select feature Xmax which is not in S and has the largest value
I(Xmax;C).
5.2. Add a node to BN representing Xmax.
5.3. Add an arc from C to Xmax in BN.
5.4. Add m = min(lSl,/c) arcs from m distinct features Xj in S with
the highest value for I(Xmax;X,jC).
5.5. Add Xmax to S.
Compute the conditional probabilility infered by the structure of BN by
using counts from DB, and output BN.
*/
// 1. For each feature Xi, compute mutual information, I(X;C),
// where C is the class.
cout << "Computing mutual information between features and class" << endl;
auto n_classes = states[className].size();
auto metrics = Metrics(dataset, features, className, n_classes);
vector <float> mi;
for (auto i = 0; i < features.size(); i++) {
Tensor firstFeature = X.index({ "...", i });
mi.push_back(metrics.mutualInformation(firstFeature, y));
cout << "Mutual information between " << features[i] << " and " << className << " is " << mi[i] << endl;
}
// 2. Compute class conditional mutual information I(Xi;XjIC), f or each
auto conditionalEdgeWeights = metrics.conditionalEdge();
cout << "Conditional edge weights" << endl;
cout << conditionalEdgeWeights << endl;
// 3. Let the used variable list, S, be empty.
vector<int> S;
// 4. Let the DAG network being constructed, BN, begin with a single
// class node, C.
model.addNode(className, states[className].size());
cout << "Adding node " << className << " to the network" << endl;
// 5. Repeat until S includes all domain features
// 5.1. Select feature Xmax which is not in S and has the largest value
// I(Xmax;C).
auto order = argsort(mi);
for (auto idx : order) {
cout << idx << " " << mi[idx] << endl;
// 5.2. Add a node to BN representing Xmax.
model.addNode(features[idx], states[features[idx]].size());
// 5.3. Add an arc from C to Xmax in BN.
model.addEdge(className, features[idx]);
// 5.4. Add m = min(lSl,/c) arcs from m distinct features Xj in S with
// the highest value for I(Xmax;X,jC).
add_m_edges(idx, S, conditionalEdgeWeights);
// 5.5. Add Xmax to S.
S.push_back(idx);
}
}
void KDB::add_m_edges(int idx, vector<int>& S, Tensor& weights)
{
auto n_edges = min(k, static_cast<int>(S.size()));
auto cond_w = clone(weights);
cout << "Conditional edge weights cloned for idx " << idx << endl;
cout << cond_w << endl;
bool exit_cond = k == 0;
int num = 0;
while (!exit_cond) {
auto max_minfo = argmax(cond_w.index({ idx, "..." })).item<int>();
auto belongs = find(S.begin(), S.end(), max_minfo) != S.end();
if (belongs && cond_w.index({ idx, max_minfo }).item<float>() > theta) {
try {
model.addEdge(features[max_minfo], features[idx]);
num++;
}
catch (const invalid_argument& e) {
// Loops are not allowed
}
}
cond_w.index_put_({ idx, max_minfo }, -1);
cout << "Conditional edge weights cloned for idx " << idx << " After -1" << endl;
cout << cond_w << endl;
cout << "cond_w.index({ idx, '...'})" << endl;
cout << cond_w.index({ idx, "..." }) << endl;
auto candidates_mask = cond_w.index({ idx, "..." }).gt(theta);
auto candidates = candidates_mask.nonzero();
cout << "Candidates mask" << endl;
cout << candidates_mask << endl;
cout << "Candidates: " << endl;
cout << candidates << endl;
cout << "Candidates size: " << candidates.size(0) << endl;
exit_cond = num == n_edges || candidates.size(0) == 0;
}
}
}

18
bayesclass/KDB.h Normal file
View File

@@ -0,0 +1,18 @@
#ifndef KDB_H
#define KDB_H
#include "BaseClassifier.h"
namespace bayesnet {
using namespace std;
using namespace torch;
class KDB : public BaseClassifier {
private:
int k;
float theta;
void add_m_edges(int idx, vector<int>& S, Tensor& weights);
protected:
void train() override;
public:
KDB(int k, float theta = 0.03);
};
}
#endif

119
bayesclass/Metrics.cc Normal file
View File

@@ -0,0 +1,119 @@
#include "Metrics.hpp"
using namespace std;
namespace bayesnet {
Metrics::Metrics(torch::Tensor& samples, vector<string>& features, string& className, int classNumStates)
: samples(samples)
, features(features)
, className(className)
, classNumStates(classNumStates)
{
}
Metrics::Metrics(const vector<vector<int>>& vsamples, const vector<int>& labels, const vector<string>& features, const string& className, const int classNumStates)
: features(features)
, className(className)
, classNumStates(classNumStates)
{
samples = torch::zeros({ static_cast<int64_t>(vsamples[0].size()), static_cast<int64_t>(vsamples.size() + 1) }, torch::kInt64);
for (int i = 0; i < vsamples.size(); ++i) {
samples.index_put_({ "...", i }, torch::tensor(vsamples[i], torch::kInt64));
}
samples.index_put_({ "...", -1 }, torch::tensor(labels, torch::kInt64));
}
vector<pair<string, string>> Metrics::doCombinations(const vector<string>& source)
{
vector<pair<string, string>> result;
for (int i = 0; i < source.size(); ++i) {
string temp = source[i];
for (int j = i + 1; j < source.size(); ++j) {
result.push_back({ temp, source[j] });
}
}
return result;
}
torch::Tensor Metrics::conditionalEdge()
{
auto result = vector<double>();
auto source = vector<string>(features);
source.push_back(className);
auto combinations = doCombinations(source);
// Compute class prior
auto margin = torch::zeros({ classNumStates });
for (int value = 0; value < classNumStates; ++value) {
auto mask = samples.index({ "...", -1 }) == value;
margin[value] = mask.sum().item<float>() / samples.sizes()[0];
}
for (auto [first, second] : combinations) {
int64_t index_first = find(features.begin(), features.end(), first) - features.begin();
int64_t index_second = find(features.begin(), features.end(), second) - features.begin();
double accumulated = 0;
for (int value = 0; value < classNumStates; ++value) {
auto mask = samples.index({ "...", -1 }) == value;
auto first_dataset = samples.index({ mask, index_first });
auto second_dataset = samples.index({ mask, index_second });
auto mi = mutualInformation(first_dataset, second_dataset);
auto pb = margin[value].item<float>();
accumulated += pb * mi;
}
result.push_back(accumulated);
}
long n_vars = source.size();
auto matrix = torch::zeros({ n_vars, n_vars });
auto indices = torch::triu_indices(n_vars, n_vars, 1);
for (auto i = 0; i < result.size(); ++i) {
auto x = indices[0][i];
auto y = indices[1][i];
matrix[x][y] = result[i];
matrix[y][x] = result[i];
}
return matrix;
}
vector<float> Metrics::conditionalEdgeWeights()
{
auto matrix = conditionalEdge();
std::vector<float> v(matrix.data_ptr<float>(), matrix.data_ptr<float>() + matrix.numel());
return v;
}
double Metrics::entropy(torch::Tensor& feature)
{
torch::Tensor counts = feature.bincount();
int totalWeight = counts.sum().item<int>();
torch::Tensor probs = counts.to(torch::kFloat) / totalWeight;
torch::Tensor logProbs = torch::log(probs);
torch::Tensor entropy = -probs * logProbs;
return entropy.nansum().item<double>();
}
// H(Y|X) = sum_{x in X} p(x) H(Y|X=x)
double Metrics::conditionalEntropy(torch::Tensor& firstFeature, torch::Tensor& secondFeature)
{
int numSamples = firstFeature.sizes()[0];
torch::Tensor featureCounts = secondFeature.bincount();
unordered_map<int, unordered_map<int, double>> jointCounts;
double totalWeight = 0;
for (auto i = 0; i < numSamples; i++) {
jointCounts[secondFeature[i].item<int>()][firstFeature[i].item<int>()] += 1;
totalWeight += 1;
}
if (totalWeight == 0)
throw invalid_argument("Total weight should not be zero");
double entropyValue = 0;
for (int value = 0; value < featureCounts.sizes()[0]; ++value) {
double p_f = featureCounts[value].item<double>() / totalWeight;
double entropy_f = 0;
for (auto& [label, jointCount] : jointCounts[value]) {
double p_l_f = jointCount / featureCounts[value].item<double>();
if (p_l_f > 0) {
entropy_f -= p_l_f * log(p_l_f);
} else {
entropy_f = 0;
}
}
entropyValue += p_f * entropy_f;
}
return entropyValue;
}
// I(X;Y) = H(Y) - H(Y|X)
double Metrics::mutualInformation(torch::Tensor& firstFeature, torch::Tensor& secondFeature)
{
return entropy(firstFeature) - conditionalEntropy(firstFeature, secondFeature);
}
}

25
bayesclass/Metrics.hpp Normal file
View File

@@ -0,0 +1,25 @@
#ifndef BAYESNET_METRICS_H
#define BAYESNET_METRICS_H
#include <torch/torch.h>
#include <vector>
#include <string>
using namespace std;
namespace bayesnet {
class Metrics {
private:
torch::Tensor samples;
vector<string> features;
string className;
int classNumStates;
vector<pair<string, string>> doCombinations(const vector<string>&);
double entropy(torch::Tensor&);
double conditionalEntropy(torch::Tensor&, torch::Tensor&);
public:
double mutualInformation(torch::Tensor&, torch::Tensor&);
Metrics(torch::Tensor&, vector<string>&, string&, int);
Metrics(const vector<vector<int>>&, const vector<int>&, const vector<string>&, const string&, const int);
vector<float> conditionalEdgeWeights();
torch::Tensor conditionalEdge();
};
}
#endif

262
bayesclass/Network.cc Normal file
View File

@@ -0,0 +1,262 @@
#include <thread>
#include <mutex>
#include "Network.h"
namespace bayesnet {
Network::Network() : laplaceSmoothing(1), features(vector<string>()), className(""), classNumStates(0), maxThreads(0.8) {}
Network::Network(float maxT) : laplaceSmoothing(1), features(vector<string>()), className(""), classNumStates(0), maxThreads(maxT) {}
Network::Network(float maxT, int smoothing) : laplaceSmoothing(smoothing), features(vector<string>()), className(""), classNumStates(0), maxThreads(maxT) {}
Network::Network(Network& other) : laplaceSmoothing(other.laplaceSmoothing), features(other.features), className(other.className), classNumStates(other.getClassNumStates()), maxThreads(other.getmaxThreads())
{
for (auto& pair : other.nodes) {
nodes[pair.first] = new Node(*pair.second);
}
}
Network::~Network()
{
for (auto& pair : nodes) {
delete pair.second;
}
}
float Network::getmaxThreads()
{
return maxThreads;
}
torch::Tensor& Network::getSamples()
{
return samples;
}
void Network::addNode(string name, int numStates)
{
if (nodes.find(name) != nodes.end()) {
// if node exists update its number of states
nodes[name]->setNumStates(numStates);
return;
}
nodes[name] = new Node(name, numStates);
}
vector<string> Network::getFeatures()
{
return features;
}
int Network::getClassNumStates()
{
return classNumStates;
}
int Network::getStates()
{
int result = 0;
for (auto node : nodes) {
result += node.second->getNumStates();
}
return result;
}
string Network::getClassName()
{
return className;
}
bool Network::isCyclic(const string& nodeId, unordered_set<string>& visited, unordered_set<string>& recStack)
{
if (visited.find(nodeId) == visited.end()) // if node hasn't been visited yet
{
visited.insert(nodeId);
recStack.insert(nodeId);
for (Node* child : nodes[nodeId]->getChildren()) {
if (visited.find(child->getName()) == visited.end() && isCyclic(child->getName(), visited, recStack))
return true;
else if (recStack.find(child->getName()) != recStack.end())
return true;
}
}
recStack.erase(nodeId); // remove node from recursion stack before function ends
return false;
}
void Network::addEdge(const string parent, const string child)
{
if (nodes.find(parent) == nodes.end()) {
throw invalid_argument("Parent node " + parent + " does not exist");
}
if (nodes.find(child) == nodes.end()) {
throw invalid_argument("Child node " + child + " does not exist");
}
// Temporarily add edge to check for cycles
nodes[parent]->addChild(nodes[child]);
nodes[child]->addParent(nodes[parent]);
unordered_set<string> visited;
unordered_set<string> recStack;
if (isCyclic(nodes[child]->getName(), visited, recStack)) // if adding this edge forms a cycle
{
// remove problematic edge
nodes[parent]->removeChild(nodes[child]);
nodes[child]->removeParent(nodes[parent]);
throw invalid_argument("Adding this edge forms a cycle in the graph.");
}
}
map<string, Node*>& Network::getNodes()
{
return nodes;
}
void Network::fit(const vector<vector<int>>& input_data, const vector<int>& labels, const vector<string>& featureNames, const string& className)
{
features = featureNames;
this->className = className;
dataset.clear();
// Build dataset & tensor of samples
samples = torch::zeros({ static_cast<int64_t>(input_data[0].size()), static_cast<int64_t>(input_data.size() + 1) }, torch::kInt64);
for (int i = 0; i < featureNames.size(); ++i) {
dataset[featureNames[i]] = input_data[i];
samples.index_put_({ "...", i }, torch::tensor(input_data[i], torch::kInt64));
}
dataset[className] = labels;
samples.index_put_({ "...", -1 }, torch::tensor(labels, torch::kInt64));
classNumStates = *max_element(labels.begin(), labels.end()) + 1;
int maxThreadsRunning = static_cast<int>(std::thread::hardware_concurrency() * maxThreads);
if (maxThreadsRunning < 1) {
maxThreadsRunning = 1;
}
vector<thread> threads;
mutex mtx;
condition_variable cv;
int activeThreads = 0;
int nextNodeIndex = 0;
while (nextNodeIndex < nodes.size()) {
unique_lock<mutex> lock(mtx);
cv.wait(lock, [&activeThreads, &maxThreadsRunning]() { return activeThreads < maxThreadsRunning; });
if (nextNodeIndex >= nodes.size()) {
break; // No more work remaining
}
threads.emplace_back([this, &nextNodeIndex, &mtx, &cv, &activeThreads]() {
while (true) {
unique_lock<mutex> lock(mtx);
if (nextNodeIndex >= nodes.size()) {
break; // No more work remaining
}
auto& pair = *std::next(nodes.begin(), nextNodeIndex);
++nextNodeIndex;
lock.unlock();
pair.second->computeCPT(dataset, laplaceSmoothing);
lock.lock();
nodes[pair.first] = pair.second;
lock.unlock();
}
lock_guard<mutex> lock(mtx);
--activeThreads;
cv.notify_one();
});
++activeThreads;
}
for (auto& thread : threads) {
thread.join();
}
}
vector<int> Network::predict(const vector<vector<int>>& tsamples)
{
vector<int> predictions;
vector<int> sample;
for (int row = 0; row < tsamples[0].size(); ++row) {
sample.clear();
for (int col = 0; col < tsamples.size(); ++col) {
sample.push_back(tsamples[col][row]);
}
vector<double> classProbabilities = predict_sample(sample);
// Find the class with the maximum posterior probability
auto maxElem = max_element(classProbabilities.begin(), classProbabilities.end());
int predictedClass = distance(classProbabilities.begin(), maxElem);
predictions.push_back(predictedClass);
}
return predictions;
}
vector<vector<double>> Network::predict_proba(const vector<vector<int>>& tsamples)
{
vector<vector<double>> predictions;
vector<int> sample;
for (int row = 0; row < tsamples[0].size(); ++row) {
sample.clear();
for (int col = 0; col < tsamples.size(); ++col) {
sample.push_back(tsamples[col][row]);
}
predictions.push_back(predict_sample(sample));
}
return predictions;
}
double Network::score(const vector<vector<int>>& tsamples, const vector<int>& labels)
{
vector<int> y_pred = predict(tsamples);
int correct = 0;
for (int i = 0; i < y_pred.size(); ++i) {
if (y_pred[i] == labels[i]) {
correct++;
}
}
return (double)correct / y_pred.size();
}
vector<double> Network::predict_sample(const vector<int>& sample)
{
// Ensure the sample size is equal to the number of features
if (sample.size() != features.size()) {
throw invalid_argument("Sample size (" + to_string(sample.size()) +
") does not match the number of features (" + to_string(features.size()) + ")");
}
map<string, int> evidence;
for (int i = 0; i < sample.size(); ++i) {
evidence[features[i]] = sample[i];
}
return exactInference(evidence);
}
double Network::computeFactor(map<string, int>& completeEvidence)
{
double result = 1.0;
for (auto node : getNodes()) {
result *= node.second->getFactorValue(completeEvidence);
}
return result;
}
vector<double> Network::exactInference(map<string, int>& evidence)
{
vector<double> result(classNumStates, 0.0);
vector<thread> threads;
mutex mtx;
for (int i = 0; i < classNumStates; ++i) {
threads.emplace_back([this, &result, &evidence, i, &mtx]() {
auto completeEvidence = map<string, int>(evidence);
completeEvidence[getClassName()] = i;
double factor = computeFactor(completeEvidence);
lock_guard<mutex> lock(mtx);
result[i] = factor;
});
}
for (auto& thread : threads) {
thread.join();
}
// Normalize result
double sum = accumulate(result.begin(), result.end(), 0.0);
for (double& value : result) {
value /= sum;
}
return result;
}
vector<string> Network::show()
{
vector<string> result;
// Draw the network
for (auto node : nodes) {
string line = node.first + " -> ";
for (auto child : node.second->getChildren()) {
line += child->getName() + ", ";
}
result.push_back(line);
}
return result;
}
}

51
bayesclass/Network.h Normal file
View File

@@ -0,0 +1,51 @@
#ifndef NETWORK_H
#define NETWORK_H
#include "Node.h"
#include <map>
#include <vector>
namespace bayesnet {
class Network {
private:
map<string, Node*> nodes;
map<string, vector<int>> dataset;
float maxThreads;
int classNumStates;
vector<string> features;
string className;
int laplaceSmoothing;
torch::Tensor samples;
bool isCyclic(const std::string&, std::unordered_set<std::string>&, std::unordered_set<std::string>&);
vector<double> predict_sample(const vector<int>&);
vector<double> exactInference(map<string, int>&);
double computeFactor(map<string, int>&);
double mutual_info(torch::Tensor&, torch::Tensor&);
double entropy(torch::Tensor&);
double conditionalEntropy(torch::Tensor&, torch::Tensor&);
double mutualInformation(torch::Tensor&, torch::Tensor&);
public:
Network();
Network(float, int);
Network(float);
Network(Network&);
~Network();
torch::Tensor& getSamples();
float getmaxThreads();
void addNode(string, int);
void addEdge(const string, const string);
map<string, Node*>& getNodes();
vector<string> getFeatures();
int getStates();
int getClassNumStates();
string getClassName();
void fit(const vector<vector<int>>&, const vector<int>&, const vector<string>&, const string&);
vector<int> predict(const vector<vector<int>>&);
//Computes the conditional edge weight of variable index u and v conditioned on class_node
torch::Tensor conditionalEdgeWeight();
vector<vector<double>> predict_proba(const vector<vector<int>>&);
double score(const vector<vector<int>>&, const vector<int>&);
vector<string> show();
inline string version() { return "0.1.0"; }
};
}
#endif

114
bayesclass/Node.cc Normal file
View File

@@ -0,0 +1,114 @@
#include "Node.h"
namespace bayesnet {
Node::Node(const std::string& name, int numStates)
: name(name), numStates(numStates), cpTable(torch::Tensor()), parents(vector<Node*>()), children(vector<Node*>())
{
}
string Node::getName() const
{
return name;
}
void Node::addParent(Node* parent)
{
parents.push_back(parent);
}
void Node::removeParent(Node* parent)
{
parents.erase(std::remove(parents.begin(), parents.end(), parent), parents.end());
}
void Node::removeChild(Node* child)
{
children.erase(std::remove(children.begin(), children.end(), child), children.end());
}
void Node::addChild(Node* child)
{
children.push_back(child);
}
vector<Node*>& Node::getParents()
{
return parents;
}
vector<Node*>& Node::getChildren()
{
return children;
}
int Node::getNumStates() const
{
return numStates;
}
void Node::setNumStates(int numStates)
{
this->numStates = numStates;
}
torch::Tensor& Node::getCPT()
{
return cpTable;
}
/*
The MinFill criterion is a heuristic for variable elimination.
The variable that minimizes the number of edges that need to be added to the graph to make it triangulated.
This is done by counting the number of edges that need to be added to the graph if the variable is eliminated.
The variable with the minimum number of edges is chosen.
Here this is done computing the length of the combinations of the node neighbors taken 2 by 2.
*/
unsigned Node::minFill()
{
unordered_set<string> neighbors;
for (auto child : children) {
neighbors.emplace(child->getName());
}
for (auto parent : parents) {
neighbors.emplace(parent->getName());
}
auto source = vector<string>(neighbors.begin(), neighbors.end());
return combinations(source).size();
}
vector<pair<string, string>> Node::combinations(const vector<string>& source)
{
vector<pair<string, string>> result;
for (int i = 0; i < source.size(); ++i) {
string temp = source[i];
for (int j = i + 1; j < source.size(); ++j) {
result.push_back({ temp, source[j] });
}
}
return result;
}
void Node::computeCPT(map<string, vector<int>>& dataset, const int laplaceSmoothing)
{
// Get dimensions of the CPT
dimensions.push_back(numStates);
for (auto father : getParents()) {
dimensions.push_back(father->getNumStates());
}
auto length = dimensions.size();
// Create a tensor of zeros with the dimensions of the CPT
cpTable = torch::zeros(dimensions, torch::kFloat) + laplaceSmoothing;
// Fill table with counts
for (int n_sample = 0; n_sample < dataset[name].size(); ++n_sample) {
torch::List<c10::optional<torch::Tensor>> coordinates;
coordinates.push_back(torch::tensor(dataset[name][n_sample]));
for (auto father : getParents()) {
coordinates.push_back(torch::tensor(dataset[father->getName()][n_sample]));
}
// Increment the count of the corresponding coordinate
cpTable.index_put_({ coordinates }, cpTable.index({ coordinates }) + 1);
}
// Normalize the counts
cpTable = cpTable / cpTable.sum(0);
}
float Node::getFactorValue(map<string, int>& evidence)
{
torch::List<c10::optional<torch::Tensor>> coordinates;
// following predetermined order of indices in the cpTable (see Node.h)
coordinates.push_back(torch::tensor(evidence[name]));
for (auto parent : getParents()) {
coordinates.push_back(torch::tensor(evidence[parent->getName()]));
}
return cpTable.index({ coordinates }).item<float>();
}
}

35
bayesclass/Node.h Normal file
View File

@@ -0,0 +1,35 @@
#ifndef NODE_H
#define NODE_H
#include <torch/torch.h>
#include <unordered_set>
#include <vector>
#include <string>
namespace bayesnet {
using namespace std;
class Node {
private:
string name;
vector<Node*> parents;
vector<Node*> children;
int numStates; // number of states of the variable
torch::Tensor cpTable; // Order of indices is 0-> node variable, 1-> 1st parent, 2-> 2nd parent, ...
vector<int64_t> dimensions; // dimensions of the cpTable
public:
vector<pair<string, string>> combinations(const vector<string>&);
Node(const std::string&, int);
void addParent(Node*);
void addChild(Node*);
void removeParent(Node*);
void removeChild(Node*);
string getName() const;
vector<Node*>& getParents();
vector<Node*>& getChildren();
torch::Tensor& getCPT();
void computeCPT(map<string, vector<int>>&, const int);
int getNumStates() const;
void setNumStates(int);
unsigned minFill();
float getFactorValue(map<string, int>&);
};
}
#endif

View File

@@ -1 +1 @@
__version__ = "0.1.1" __version__ = "0.2.0"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,33 @@
# distutils: language = c++
# cython: language_level = 3
from libcpp.vector cimport vector
from libcpp.string cimport string
from libcpp cimport bool
cdef extern from "FeatureSelect.h" namespace "features":
ctypedef float precision_t
cdef cppclass SelectKBestWeighted:
SelectKBestWeighted(vector[vector[int]]&, vector[int]&, vector[precision_t]&, int, bool) except +
void fit()
string version()
vector[precision_t] getScores()
vector[int] getFeatures()
cdef class CSelectKBestWeighted:
cdef SelectKBestWeighted *thisptr
def __cinit__(self, X, y, weights, k, natural=False): # log or log2
self.thisptr = new SelectKBestWeighted(X, y, weights, k, natural)
def __dealloc__(self):
del self.thisptr
def fit(self,):
self.thisptr.fit()
return self
def get_scores(self):
return self.thisptr.getScores()
def get_features(self):
return self.thisptr.getFeatures()
def get_version(self):
return self.thisptr.version()
def __reduce__(self):
return (CSelectKBestWeighted, ())

View File

@@ -4,8 +4,8 @@ import numpy as np
import pandas as pd import pandas as pd
from scipy.stats import mode from scipy.stats import mode
from sklearn.base import clone, ClassifierMixin, BaseEstimator from sklearn.base import clone, ClassifierMixin, BaseEstimator
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import BaseEnsemble from sklearn.ensemble import BaseEnsemble
from sklearn.feature_selection import mutual_info_classif
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels from sklearn.utils.multiclass import unique_labels
from sklearn.feature_selection import mutual_info_classif from sklearn.feature_selection import mutual_info_classif
@@ -15,6 +15,8 @@ from pgmpy.models import BayesianNetwork
from pgmpy.base import DAG from pgmpy.base import DAG
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from fimdlp.mdlp import FImdlp from fimdlp.mdlp import FImdlp
from .cppSelectFeatures import CSelectKBestWeighted
from .BayesNet import BayesNetwork, CMetrics
from ._version import __version__ from ._version import __version__
@@ -93,7 +95,7 @@ class BayesBase(BaseEstimator, ClassifierMixin):
@property @property
def states_(self): def states_(self):
if hasattr(self, "fitted_"): if hasattr(self, "fitted_"):
return sum([len(item) for _, item in self.model_.states.items()]) return self.states_computed_
return 0 return 0
@property @property
@@ -142,7 +144,7 @@ class BayesBase(BaseEstimator, ClassifierMixin):
# Store the information needed to build the model # Store the information needed to build the model
self.build_dataset() self.build_dataset()
# Build the DAG # Build the DAG
self._build() self._build(kwargs)
# Train the model # Train the model
self._train(kwargs) self._train(kwargs)
self.fitted_ = True self.fitted_ = True
@@ -151,11 +153,14 @@ class BayesBase(BaseEstimator, ClassifierMixin):
# Return the classifier # Return the classifier
return self return self
def _build(self): def _build(self, kwargs):
"""This method should be implemented by the subclasses to self.model_ = BayesNetwork()
build the DAG features = kwargs["features"]
""" states = kwargs["state_names"]
... for feature in features:
self.model_.addNode(feature, len(states[feature]))
class_name = kwargs["class_name"]
self.model_.addNode(class_name, max(self.y_) + 1)
def _train(self, kwargs): def _train(self, kwargs):
"""Build and train a BayesianNetwork from the DAG and the dataset """Build and train a BayesianNetwork from the DAG and the dataset
@@ -165,17 +170,24 @@ class BayesBase(BaseEstimator, ClassifierMixin):
kwargs : dict kwargs : dict
fit parameters fit parameters
""" """
self.model_ = BayesianNetwork( # self.model_ = BayesianNetwork(
self.dag_.edges(), show_progress=self.show_progress # self.dag_.edges(), show_progress=self.show_progress
) # )
states = dict(state_names=kwargs.pop("state_names", [])) # states = dict(state_names=kwargs.pop("state_names", []))
self.model_.fit( # self.model_.fit(
self.dataset_, # self.dataset_,
estimator=BayesianEstimator, # estimator=BayesianEstimator,
prior_type="K2", # prior_type="K2",
weighted=self.weighted_, # weighted=self.weighted_,
**states, # **states,
) # )
features = kwargs["features"]
class_name = kwargs["class_name"]
for source, destination in self.edges_:
self.model_.addEdge(source, destination)
self.model_.fit(self.X_, self.y_, features, class_name)
self.states_computed_ = self.model_.getStates()
def predict(self, X): def predict(self, X):
"""A reference implementation of a prediction for a classifier. """A reference implementation of a prediction for a classifier.
@@ -227,10 +239,11 @@ class BayesBase(BaseEstimator, ClassifierMixin):
check_is_fitted(self, ["X_", "y_", "fitted_"]) check_is_fitted(self, ["X_", "y_", "fitted_"])
# Input validation # Input validation
X = check_array(X) X = check_array(X)
dataset = pd.DataFrame( # dataset = pd.DataFrame(
X, columns=self.feature_names_in_, dtype=np.int32 # X, columns=self.feature_names_in_, dtype=np.int32
) # )
return self.model_.predict(dataset).values.ravel() # return self.model_.predict(dataset).values.ravel()
return self.model_.predict(X)
def plot(self, title="", node_size=800): def plot(self, title="", node_size=800):
warnings.simplefilter("ignore", UserWarning) warnings.simplefilter("ignore", UserWarning)
@@ -293,7 +306,7 @@ class TAN(BayesBase):
raise ValueError("Head index out of range") raise ValueError("Head index out of range")
return X, y return X, y
def _build(self): def _build(self, kwargs):
est = TreeSearch( est = TreeSearch(
self.dataset_, root_node=self.feature_names_in_[self.head_] self.dataset_, root_node=self.feature_names_in_[self.head_]
) )
@@ -346,7 +359,7 @@ class KDB(BayesBase):
] ]
return self._check_params_fit(X, y, expected_args, kwargs) return self._check_params_fit(X, y, expected_args, kwargs)
def _add_m_edges(self, dag, idx, S_nodes, conditional_weights): def _add_m_edges(self, idx, S_nodes, conditional_weights):
n_edges = min(self.k, len(S_nodes)) n_edges = min(self.k, len(S_nodes))
cond_w = conditional_weights.copy() cond_w = conditional_weights.copy()
exit_cond = self.k == 0 exit_cond = self.k == 0
@@ -355,7 +368,7 @@ class KDB(BayesBase):
max_minfo = np.argmax(cond_w[idx, :]) max_minfo = np.argmax(cond_w[idx, :])
if max_minfo in S_nodes and cond_w[idx, max_minfo] > self.theta: if max_minfo in S_nodes and cond_w[idx, max_minfo] > self.theta:
try: try:
dag.add_edge( self.model_.addEdge(
self.feature_names_in_[max_minfo], self.feature_names_in_[max_minfo],
self.feature_names_in_[idx], self.feature_names_in_[idx],
) )
@@ -366,9 +379,9 @@ class KDB(BayesBase):
cond_w[idx, max_minfo] = -1 cond_w[idx, max_minfo] = -1
exit_cond = num == n_edges or np.all(cond_w[idx, :] <= self.theta) exit_cond = num == n_edges or np.all(cond_w[idx, :] <= self.theta)
def _build(self): def _build(self, kwargs):
""" """
1. For each feature Xi, compute mutual information, I(X;;C), 1. For each feature Xi, compute mutual information, I(X;C),
where C is the class. where C is the class.
2. Compute class conditional mutual information I(Xi;XjIC), f or each 2. Compute class conditional mutual information I(Xi;XjIC), f or each
pair of features Xi and Xj, where i#j. pair of features Xi and Xj, where i#j.
@@ -389,29 +402,38 @@ class KDB(BayesBase):
# 1. get the mutual information between each feature and the class # 1. get the mutual information between each feature and the class
mutual = mutual_info_classif(self.X_, self.y_, discrete_features=True) mutual = mutual_info_classif(self.X_, self.y_, discrete_features=True)
# 2. symmetric matrix where each element represents I(X, Y| class_node) # 2. symmetric matrix where each element represents I(X, Y| class_node)
conditional_weights = TreeSearch( metrics = CMetrics(
self.dataset_ self.X_,
)._get_conditional_weights( self.y_,
self.dataset_, self.class_name_, show_progress=self.show_progress self.features_,
self.class_name_,
self.n_classes_,
)
conditional_weights = metrics.conditionalEdgeWeights(
self.n_features_in_ + 1
) )
# 3. Let the used variable list, S, be empty. # 3. Let the used variable list, S, be empty.
S_nodes = [] S_nodes = []
num_states = {
feature: len(states)
for feature, states in kwargs["state_names"].items()
}
# 4. Let the DAG being constructed, BN, begin with a single class node # 4. Let the DAG being constructed, BN, begin with a single class node
dag = BayesianNetwork(show_progress=self.show_progress) self.model_ = BayesNetwork()
dag.add_node(self.class_name_) # , state_names=self.classes_) self.model_.addNode(self.class_name_, self.n_classes_)
# 5. Repeat until S includes all domain features # 5. Repeat until S includes all domain features
# 5.1 Select feature Xmax which is not in S and has the largest value # 5.1 Select feature Xmax which is not in S and has the largest value
for idx in np.argsort(mutual): for idx in np.argsort(-mutual):
# 5.2 Add a node to BN representing Xmax. # 5.2 Add a node to BN representing Xmax.
feature = self.feature_names_in_[idx] feature = self.feature_names_in_[idx]
dag.add_node(feature) self.model_.addNode(feature, num_states[feature])
# 5.3 Add an arc from C to Xmax in BN. # 5.3 Add an arc from C to Xmax in BN.
dag.add_edge(self.class_name_, feature) self.model_.addEdge(self.class_name_, feature)
# 5.4 Add m = min(lSl,/c) arcs from m distinct features Xj in S # 5.4 Add m = min(lSl,/c) arcs from m distinct features Xj in S
self._add_m_edges(dag, idx, S_nodes, conditional_weights) self._add_m_edges(idx, S_nodes, conditional_weights)
# 5.5 Add Xmax to S. # 5.5 Add Xmax to S.
S_nodes.append(idx) S_nodes.append(idx)
self.dag_ = dag self.edges_ = []
def build_spodes(features, class_name): def build_spodes(features, class_name):
@@ -806,7 +828,7 @@ class BoostSPODE(BayesBase):
] ]
return self._check_params_fit(X, y, expected_args, kwargs) return self._check_params_fit(X, y, expected_args, kwargs)
def _build(self): def _build(self, _):
class_edges = [(self.class_name_, f) for f in self.feature_names_in_] class_edges = [(self.class_name_, f) for f in self.feature_names_in_]
feature_edges = [ feature_edges = [
(self.sparent_, f) (self.sparent_, f)
@@ -818,7 +840,6 @@ class BoostSPODE(BayesBase):
def _train(self, kwargs): def _train(self, kwargs):
states = dict(state_names=kwargs.get("state_names", [])) states = dict(state_names=kwargs.get("state_names", []))
breakpoint()
self.model_ = BayesianNetwork(self.dag_.edges(), show_progress=False) self.model_ = BayesianNetwork(self.dag_.edges(), show_progress=False)
self.model_.fit( self.model_.fit(
self.dataset_, self.dataset_,
@@ -835,11 +856,9 @@ class BoostAODE(ClassifierMixin, BaseEnsemble):
show_progress=False, show_progress=False,
random_state=None, random_state=None,
estimator=None, estimator=None,
n_estimators=10,
): ):
self.show_progress = show_progress self.show_progress = show_progress
self.random_state = random_state self.random_state = random_state
self.n_estimators = n_estimators
super().__init__(estimator=estimator) super().__init__(estimator=estimator)
def _validate_estimator(self) -> None: def _validate_estimator(self) -> None:
@@ -868,26 +887,67 @@ class BoostAODE(ClassifierMixin, BaseEnsemble):
self.nodes_leaves = self.nodes_edges self.nodes_leaves = self.nodes_edges
return self return self
def version(self):
if hasattr(self, "fitted_"):
return self.estimator_.version()
return SPODE(None, False).version()
@property
def states_(self):
if hasattr(self, "fitted_"):
return sum(
[
len(item)
for model in self.estimators_
for _, item in model.model_.states.items()
]
) / len(self.estimators_)
return 0
@property
def depth_(self):
return self.states_
def nodes_edges(self):
nodes = 0
edges = 0
if hasattr(self, "fitted_"):
nodes = sum([len(x.dag_) for x in self.estimators_])
edges = sum([len(x.dag_.edges()) for x in self.estimators_])
return nodes, edges
def plot(self, title=""):
warnings.simplefilter("ignore", UserWarning)
for idx, model in enumerate(self.estimators_):
model.plot(title=f"{idx} {title}")
def _train(self, kwargs): def _train(self, kwargs):
"""Build boosted SPODEs""" """Build boosted SPODEs"""
weights = [1 / self.n_samples_] * self.n_samples_ weights = [1 / self.n_samples_] * self.n_samples_
selected_features = []
# Step 0: Set the finish condition # Step 0: Set the finish condition
for num in range(self.n_estimators): for _ in range(self.n_features_in_):
# Step 1: Build ranking with mutual information # Step 1: Build ranking with mutual information
# OJO MAL, ESTO NO ACTUALIZA EL RANKING CON LOS PESOS features = (
# SIEMPRE VA A SACAR LO MISMO CSelectKBestWeighted(
feature = ( self.X_, self.y_, weights, k=self.n_features_in_
SelectKBest(k=1)
.fit(self.X_, self.y_)
.get_feature_names_out(self.feature_names_in_)
.tolist()[0]
) )
.fit()
.get_features()
)
# Step 1.1: Select the feature to become the sparent
for n_feature in features:
if n_feature not in selected_features:
selected_features.append(n_feature)
break
feature = self.feature_names_in_[n_feature]
# Step 2: Build & train spode with the first feature as sparent # Step 2: Build & train spode with the first feature as sparent
estimator = clone(self.estimator_) estimator = clone(self.estimator_)
_args = kwargs.copy() _args = kwargs.copy()
_args["sparent"] = feature _args["sparent"] = feature
_args["sample_weight"] = weights _args["sample_weight"] = weights
_args["weighted"] = True _args["weighted"] = True
# print("I'm gonna build a spode with", feature)
# Step 2.1: build dataset # Step 2.1: build dataset
# Step 2.2: Train the model # Step 2.2: Train the model
estimator.fit(self.X_, self.y_, **_args) estimator.fit(self.X_, self.y_, **_args)
@@ -898,18 +958,17 @@ class BoostAODE(ClassifierMixin, BaseEnsemble):
am = np.log((1 - em) / em) + np.log(estimator.n_classes_ - 1) am = np.log((1 - em) / em) + np.log(estimator.n_classes_ - 1)
# Step 3.2: Update weights for next classifier # Step 3.2: Update weights for next classifier
weights = [ weights = [
wm * np.exp(am * (ym != y_pred)) wm * np.exp(am * (ym != yp))
for wm, ym in zip(weights, self.y_) for wm, ym, yp in zip(weights, self.y_, y_pred)
] ]
# Step 4: Add the new model # Step 4: Add the new model
self.estimators_.append(estimator) self.estimators_.append(estimator)
""" self.weights_ = weights
class_edges = [(self.class_name_, f) for f in self.feature_names_in_]
feature_edges = [ def predict(self, X: np.ndarray) -> np.ndarray:
(sparent, f) for f in self.feature_names_in_ if f != sparent n_samples = X.shape[0]
] n_estimators = len(self.estimators_)
self.weights_ = weights.copy() if weights is not None else None result = np.empty((n_samples, n_estimators))
feature_edges.extend(class_edges) for index, estimator in enumerate(self.estimators_):
self.model_ = BayesianNetwork(feature_edges, show_progress=False) result[:, index] = estimator.predict(X)
return self.model_ return mode(result, axis=1, keepdims=False).mode.ravel()
"""

View File

@@ -0,0 +1,93 @@
# import numpy as np
# from sklearn.feature_selection import mutual_info_classif
# from sklearn.utils.validation import check_X_y, check_is_fitted
# from sklearn.feature_selection._univariate_selection import (
# _BaseFilter,
# _clean_nans,
# )
# """
# Compute the weighted mutual information between each feature and the
# target.
# Based on
# Silviu Guiaşu,
# Weighted entropy,
# Reports on Mathematical Physics,
# Volume 2, Issue 3,
# 1971,
# Pages 165-179,
# ISSN 0034-4877,
# https://doi.org/10.1016/0034-4877(71)90002-4.
# (https://www.sciencedirect.com/science/article/pii/0034487771900024)
# Abstract: Weighted entropy is the measure of information supplied by a
# probablistic experiment whose elementary events are characterized both by their
# objective probabilities and by some qualitative (objective or subjective)
# weights. The properties, the axiomatics and the maximum value of the weighted
# entropy are given.
# """
# class SelectKBestWeighted(_BaseFilter):
# def __init__(self, *, k=10):
# super().__init__(score_func=mutual_info_classif)
# self.k = k
# def _check_params(self, X, y):
# if self.k > X.shape[1] or self.k < 1:
# raise ValueError(
# f"k must be between 1 and {X.shape[1]} got {self.k}."
# )
# def _get_support_mask(self):
# check_is_fitted(self)
# if self.k == "all":
# return np.ones(self.scores_.shape, dtype=bool)
# elif self.k == 0:
# return np.zeros(self.scores_.shape, dtype=bool)
# else:
# scores = _clean_nans(self.scores_)
# mask = np.zeros(scores.shape, dtype=bool)
# # Request a stable sort. Mergesort takes more memory (~40MB per
# # megafeature on x86-64).
# mask[np.argsort(scores, kind="mergesort")[-self.k :]] = 1
# return mask
# def fit(self, X, y, sample_weight):
# self.X_, self.y_ = check_X_y(X, y)
# self._check_params(X, y)
# self.n_features_in_ = X.shape[1]
# self.sample_weight_ = sample_weight
# # Compute the entropy of the target variable
# entropy_y = -np.sum(
# np.multiply(
# np.bincount(y, weights=sample_weight),
# np.log(np.bincount(y, weights=sample_weight)),
# )
# )
# # Compute the mutual information between each feature and the target
# mi = self.score_func(X, y)
# # Compute the weighted entropy of each feature
# entropy_weighted = []
# for i in range(X.shape[1]):
# # Compute the weighted frequency of each unique value of the
# # feature
# freq_weighted = np.bincount(X[:, i], weights=sample_weight)
# freq_weighted = freq_weighted[freq_weighted != 0]
# # Compute the weighted entropy of the feature
# entropy_weighted.append(
# -np.sum(np.multiply(freq_weighted, np.log(freq_weighted)))
# / np.sum(sample_weight)
# )
# # Compute the weighted mutual information between each feature and
# # the target
# mi_weighted = mi * entropy_weighted / entropy_y
# # Return the weighted mutual information scores
# self.scores_ = mi_weighted
# return self

View File

@@ -1,5 +1,5 @@
[build-system] [build-system]
requires = ["setuptools", "setuptools-scm", "wheel"] requires = ["setuptools", "setuptools-scm", "cython", "wheel", "torch"]
build-backend = "setuptools.build_meta" build-backend = "setuptools.build_meta"
[tool.setuptools] [tool.setuptools]
@@ -39,9 +39,7 @@ classifiers = [
"Operating System :: OS Independent", "Operating System :: OS Independent",
"Programming Language :: Python", "Programming Language :: Python",
"Programming Language :: Python", "Programming Language :: Python",
"Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
] ]
[project.optional-dependencies] [project.optional-dependencies]
@@ -61,7 +59,7 @@ show_missing = true
[tool.black] [tool.black]
line-length = 79 line-length = 79
target_version = ['py38', 'py39', 'py310'] target_version = ['py311']
include = '\.pyi?$' include = '\.pyi?$'
exclude = ''' exclude = '''
/( /(

41
setup.py Normal file
View File

@@ -0,0 +1,41 @@
"""
Calling
$python setup.py build_ext --inplace
will build the extension library in the current file.
"""
from setuptools import Extension, setup
from torch.utils.cpp_extension import (
BuildExtension,
CppExtension,
include_paths,
)
setup(
ext_modules=[
Extension(
name="bayesclass.cppSelectFeatures",
sources=[
"bayesclass/cSelectFeatures.pyx",
"bayesclass/FeatureSelect.cpp",
],
language="c++",
include_dirs=["bayesclass"],
extra_compile_args=[
"-std=c++17",
],
),
CppExtension(
name="bayesclass.BayesNet",
sources=[
"bayesclass/BayesNetwork.pyx",
"bayesclass/Network.cc",
"bayesclass/Node.cc",
"bayesclass/Metrics.cc",
],
include_dirs=include_paths(),
),
],
cmdclass={"build_ext": BuildExtension},
)