mirror of
https://github.com/Doctorado-ML/bayesclass.git
synced 2025-08-18 17:15:53 +00:00
Compare commits
60 Commits
SHA1 (author and date columns were not captured in this mirror):

d1cafc230b 99083ceede 64f1500176 aef22306ef 2ff38f73e7 1af3edd050
8b6624e08a 36cc875615 260997c872 8a9c86a22d 4bad5ccfee 5866e19fae
61e4c176eb ea473fc604 9d7e787f6c d7425e5af0 30cc744033 0094d500d4
99321043ec fbaa5eb7d3 0b27d9d9b0 212f7e5584 a797381c00 3812d271e5
923a06b3be c906d6a361 f0f7c43944 f9b35f61f0 74cd8a6aa2 9843f5f8db
c6390d9da9 c9afafbf60 3af05c9511 80b1ab3699 5a772b0bca ea251aca05
7b66097728 ea8c5b805e 2ffc06b232 a5244f1c7f 42ac57eb79 63a2feef3a
3e049ac89d 2a6547c71d de45a94c9b 9019b878f0 bba9255605 41ca6fad5e
c88591dd64 8089e4fd57 6f9488f281 e837c6cef7 a4edc74e8d 4d416959ad
bdd3f483d9 8fd796155d d08aea4681 dd2e0a3b7e 65d41488cb e7300366ca
CMakeLists.txt (new file, 7 lines)
@@ -0,0 +1,7 @@
cmake_minimum_required(VERSION 3.20)
project(feature)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Debug)

add_executable(feature bayesclass/cpp/FeatureSelect.cpp)
@@ -1 +1,5 @@
 include README.md LICENSE
+include bayesclass/FeatureSelect.h
+include bayesclass/Node.h
+include bayesclass/Network.h
+include bayesclass/Metrics.hpp
Makefile (10 changes)
@@ -16,6 +16,10 @@ lint: ## Lint and static-check
 	flake8 bayesclass
 	mypy bayesclass
+
+feature: ## compile FeatureSelect
+	cmake -B build feature
+
 push: ## Push code with tags
 	git push && git push --tags

@@ -37,6 +41,12 @@ doc-clean: ## Update documentation
 audit: ## Audit pip
 	pip-audit
+
+version:
+	@echo "Current Python version .....: $(shell python --version)"
+	@echo "Current Bayesclass version .: $(shell python -c "from bayesclass import _version; print(_version.__version__)")"
+	@echo "Installed Bayesclass version: $(shell pip show bayesclass | grep Version | cut -d' ' -f2)"
+	@echo "Installed pgmpy version ....: $(shell pip show pgmpy | grep Version | cut -d' ' -f2)"
+
 help: ## Show help message
 	@IFS=$$'\n' ; \
 	help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
bayesclass/BaseClassifier.cc (new file, 93 lines)
@@ -0,0 +1,93 @@
#include "BaseClassifier.h"

namespace bayesnet {
    using namespace std;
    using namespace torch;

    BaseClassifier::BaseClassifier(Network model) : model(model), m(0), n(0) {}
    BaseClassifier& BaseClassifier::build(vector<string>& features, string className, map<string, vector<int>>& states)
    {
        dataset = torch::cat({ X, y.view({y.size(0), 1}) }, 1);
        this->features = features;
        this->className = className;
        this->states = states;
        checkFitParameters();
        train();
        return *this;
    }
    BaseClassifier& BaseClassifier::fit(Tensor& X, Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states)
    {
        this->X = X;
        this->y = y;
        return build(features, className, states);
    }
    BaseClassifier& BaseClassifier::fit(vector<vector<int>>& X, vector<int>& y, vector<string>& features, string className, map<string, vector<int>>& states)
    {
        this->X = torch::zeros({ static_cast<int64_t>(X[0].size()), static_cast<int64_t>(X.size()) }, kInt64);
        for (int i = 0; i < X.size(); ++i) {
            this->X.index_put_({ "...", i }, torch::tensor(X[i], kInt64));
        }
        this->y = torch::tensor(y, kInt64);
        return build(features, className, states);
    }
    void BaseClassifier::checkFitParameters()
    {
        auto sizes = X.sizes();
        m = sizes[0];
        n = sizes[1];
        if (m != y.size(0)) {
            throw invalid_argument("X and y must have the same number of samples");
        }
        if (n != features.size()) {
            throw invalid_argument("X and features must have the same number of features");
        }
        if (states.find(className) == states.end()) {
            throw invalid_argument("className not found in states");
        }
        for (auto feature : features) {
            if (states.find(feature) == states.end()) {
                throw invalid_argument("feature [" + feature + "] not found in states");
            }
        }
    }
    vector<vector<int>> tensorToVector(const torch::Tensor& tensor)
    {
        // convert mxn tensor to nxm vector
        vector<vector<int>> result;
        auto tensor_accessor = tensor.accessor<int, 2>();

        // Iterate over columns and rows of the tensor
        for (int j = 0; j < tensor.size(1); ++j) {
            vector<int> column;
            for (int i = 0; i < tensor.size(0); ++i) {
                column.push_back(tensor_accessor[i][j]);
            }
            result.push_back(column);
        }

        return result;
    }
    Tensor BaseClassifier::predict(Tensor& X)
    {
        auto m_ = X.size(0);
        auto n_ = X.size(1);
        vector<vector<int>> Xd(n_, vector<int>(m_, 0));
        for (auto i = 0; i < n_; i++) {
            auto temp = X.index({ "...", i });
            Xd[i] = vector<int>(temp.data_ptr<int>(), temp.data_ptr<int>() + m_);
        }
        auto yp = model.predict(Xd);
        auto ypred = torch::tensor(yp, torch::kInt64);
        return ypred;
    }
    float BaseClassifier::score(Tensor& X, Tensor& y)
    {
        Tensor y_pred = predict(X);
        return (y_pred == y).sum().item<float>() / y.size(0);
    }
    vector<string> BaseClassifier::show()
    {
        return model.show();
    }
}
bayesclass/BaseClassifier.h (new file, 39 lines)
@@ -0,0 +1,39 @@
#ifndef CLASSIFIERS_H
#define CLASSIFIERS_H
#include <torch/torch.h>
#include "Network.h"
using namespace std;
using namespace torch;

namespace bayesnet {
    class BaseClassifier {
    private:
        BaseClassifier& build(vector<string>& features, string className, map<string, vector<int>>& states);
    protected:
        Network model;
        int m, n; // m: number of samples, n: number of features
        Tensor X;
        Tensor y;
        Tensor dataset;
        vector<string> features;
        string className;
        map<string, vector<int>> states;
        void checkFitParameters();
        virtual void train() = 0;
    public:
        BaseClassifier(Network model);
        Tensor& getX();
        vector<string>& getFeatures();
        string& getClassName();
        BaseClassifier& fit(Tensor& X, Tensor& y, vector<string>& features, string className, map<string, vector<int>>& states);
        BaseClassifier& fit(vector<vector<int>>& X, vector<int>& y, vector<string>& features, string className, map<string, vector<int>>& states);
        Tensor predict(Tensor& X);
        float score(Tensor& X, Tensor& y);
        vector<string> show();
    };
}
#endif
bayesclass/BayesNetwork.cpp (new file, 7663 lines)
File diff suppressed because it is too large.
bayesclass/BayesNetwork.pyx (new file, 78 lines)
@@ -0,0 +1,78 @@
# distutils: language = c++
# cython: language_level = 3
from libcpp.vector cimport vector
from libcpp.string cimport string
import numpy as np

cdef extern from "Network.h" namespace "bayesnet":
    cdef cppclass Network:
        Network(float, float) except +
        void fit(vector[vector[int]]&, vector[int]&, vector[string]&, string)
        vector[int] predict(vector[vector[int]]&)
        vector[vector[double]] predict_proba(vector[vector[int]]&)
        float score(const vector[vector[int]]&, const vector[int]&)
        void addNode(string, int)
        void addEdge(string, string) except +
        vector[string] getFeatures()
        int getClassNumStates()
        int getStates()
        string getClassName()
        string version()
        void show()

cdef class BayesNetwork:
    cdef Network *thisptr
    def __cinit__(self, maxThreads=0.8, laplaceSmooth=1.0):
        self.thisptr = new Network(maxThreads, laplaceSmooth)
    def __dealloc__(self):
        del self.thisptr
    def fit(self, X, y, features, className):
        X_ = [X[:, i] for i in range(X.shape[1])]
        features_bytes = [x.encode() for x in features]
        self.thisptr.fit(X_, y, features_bytes, className.encode())
        return self
    def predict(self, X):
        X_ = [X[:, i] for i in range(X.shape[1])]
        return self.thisptr.predict(X_)
    def predict_proba(self, X):
        X_ = [X[:, i] for i in range(X.shape[1])]
        return self.thisptr.predict_proba(X_)
    def score(self, X, y):
        X_ = [X[:, i] for i in range(X.shape[1])]
        return self.thisptr.score(X_, y)
    def addNode(self, name, states):
        self.thisptr.addNode(str.encode(name), states)
    def addEdge(self, source, destination):
        self.thisptr.addEdge(str.encode(source), str.encode(destination))
    def getFeatures(self):
        res = self.thisptr.getFeatures()
        return [x.decode() for x in res]
    def getStates(self):
        return self.thisptr.getStates()
    def getClassName(self):
        return self.thisptr.getClassName().decode()
    def getClassNumStates(self):
        return self.thisptr.getClassNumStates()
    def show(self):
        return self.thisptr.show()
    def __reduce__(self):
        return (BayesNetwork, ())

cdef extern from "Metrics.hpp" namespace "bayesnet":
    cdef cppclass Metrics:
        Metrics(vector[vector[int]], vector[int], vector[string]&, string&, int) except +
        vector[float] conditionalEdgeWeights()

cdef class CMetrics:
    cdef Metrics *thisptr
    def __cinit__(self, X, y, features, className, classStates):
        X_ = [X[:, i] for i in range(X.shape[1])]
        features_bytes = [x.encode() for x in features]
        self.thisptr = new Metrics(X_, y, features_bytes, className.encode(), classStates)
    def __dealloc__(self):
        del self.thisptr
    def conditionalEdgeWeights(self, n_vars):
        return np.reshape(self.thisptr.conditionalEdgeWeights(), (n_vars, n_vars))
    def __reduce__(self):
        return (CMetrics, ())
bayesclass/FeatureSelect.cpp (new file, 118 lines)
@@ -0,0 +1,118 @@
#include "FeatureSelect.h"
// needed for sort/unique, log/log2, invalid_argument/logic_error and unordered_map
#include <algorithm>
#include <cmath>
#include <stdexcept>
#include <unordered_map>
namespace features {
    SelectKBestWeighted::SelectKBestWeighted(samples_t& samples, labels_t& labels, weights_t& weights, int k, bool nat)
        : samples(samples), labels(labels), weights(weights), k(k), nat(nat)
    {
        if (samples.size() == 0 || samples[0].size() == 0)
            throw invalid_argument("features must be a non-empty matrix");
        if (samples.size() != labels.size())
            throw invalid_argument("number of samples and labels must be equal");
        if (samples.size() != weights.size())
            throw invalid_argument("number of samples and weights must be equal");
        if (k < 1 || k > static_cast<int>(samples[0].size()))
            throw invalid_argument("k must be between 1 and number of features");
        numFeatures = 0;
        numClasses = 0;
        numSamples = 0;
        fitted = false;
    }
    void SelectKBestWeighted::fit()
    {
        auto labelsCopy = labels;
        numFeatures = samples[0].size();
        numSamples = samples.size();
        // compute number of classes
        sort(labelsCopy.begin(), labelsCopy.end());
        auto last = unique(labelsCopy.begin(), labelsCopy.end());
        labelsCopy.erase(last, labelsCopy.end());
        numClasses = labelsCopy.size();
        // compute scores
        scores.reserve(numFeatures);
        for (int i = 0; i < numFeatures; ++i) {
            scores.push_back(MutualInformation(i));
            features.push_back(i);
        }
        // sort & reduce scores and features
        sort(features.begin(), features.end(), [&](int i, int j)
            { return scores[i] > scores[j]; });
        sort(scores.begin(), scores.end(), greater<precision_t>());
        features.resize(k);
        scores.resize(k);
        fitted = true;
    }
    precision_t SelectKBestWeighted::entropyLabel()
    {
        return entropy(labels);
    }
    precision_t SelectKBestWeighted::entropy(const sample_t& data)
    {
        precision_t ventropy = 0, totalWeight = 0;
        score_t counts(numClasses + 1, 0);
        for (auto i = 0; i < static_cast<int>(data.size()); ++i) {
            counts[data[i]] += weights[i];
            totalWeight += weights[i];
        }
        for (auto count : counts) {
            precision_t p = count / totalWeight;
            if (p > 0) {
                if (nat) {
                    ventropy -= p * log(p);
                } else {
                    ventropy -= p * log2(p);
                }
            }
        }
        return ventropy;
    }
    // H(Y|X) = sum_{x in X} p(x) H(Y|X=x)
    precision_t SelectKBestWeighted::conditionalEntropy(const int feature)
    {
        unordered_map<value_t, precision_t> featureCounts;
        unordered_map<value_t, unordered_map<value_t, precision_t>> jointCounts;
        featureCounts.clear();
        jointCounts.clear();
        precision_t totalWeight = 0;
        for (auto i = 0; i < numSamples; i++) {
            featureCounts[samples[i][feature]] += weights[i];
            jointCounts[samples[i][feature]][labels[i]] += weights[i];
            totalWeight += weights[i];
        }
        if (totalWeight == 0)
            throw invalid_argument("Total weight should not be zero");
        precision_t entropy = 0;
        for (auto& [feat, count] : featureCounts) {
            auto p_f = count / totalWeight;
            precision_t entropy_f = 0;
            for (auto& [label, jointCount] : jointCounts[feat]) {
                auto p_l_f = jointCount / count;
                if (p_l_f > 0) {
                    if (nat) {
                        entropy_f -= p_l_f * log(p_l_f);
                    } else {
                        entropy_f -= p_l_f * log2(p_l_f);
                    }
                }
            }
            entropy += p_f * entropy_f;
        }
        return entropy;
    }
    // I(X;Y) = H(Y) - H(Y|X)
    precision_t SelectKBestWeighted::MutualInformation(const int i)
    {
        return entropyLabel() - conditionalEntropy(i);
    }
    score_t SelectKBestWeighted::getScores() const
    {
        if (!fitted)
            throw logic_error("score not fitted");
        return scores;
    }
    //Return the indices of the selected features
    labels_t SelectKBestWeighted::getFeatures() const
    {
        if (!fitted)
            throw logic_error("score not fitted");
        return features;
    }
}
bayesclass/FeatureSelect.h (new file, 38 lines)
@@ -0,0 +1,38 @@
#ifndef SELECT_K_BEST_WEIGHTED_H
#define SELECT_K_BEST_WEIGHTED_H
#include <map>
#include <vector>
#include <string>
using namespace std;
namespace features {
    typedef float precision_t;
    typedef int value_t;
    typedef vector<value_t> sample_t;
    typedef vector<sample_t> samples_t;
    typedef vector<value_t> labels_t;
    typedef vector<precision_t> score_t, weights_t;

    class SelectKBestWeighted {
    private:
        const samples_t samples;
        const labels_t labels;
        const weights_t weights;
        const int k;
        bool nat; // use natural log or log2
        int numFeatures, numClasses, numSamples;
        bool fitted;
        score_t scores; // scores of the features
        labels_t features; // indices of the selected features
        precision_t entropyLabel();
        precision_t entropy(const sample_t&);
        precision_t conditionalEntropy(const int);
        precision_t MutualInformation(const int);
    public:
        SelectKBestWeighted(samples_t&, labels_t&, weights_t&, int, bool);
        void fit();
        score_t getScores() const;
        labels_t getFeatures() const; //Return the indices of the selected features
        static inline string version() { return "0.1.0"; };
    };
}
#endif
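As a quick orientation to the API the two FeatureSelect files above define, here is a minimal usage sketch. It is not part of the diff: the `main` function, the toy matrix, and the variable names are invented for illustration, and the include path assumes compilation from the bayesclass/ source directory. Samples are row-major (one inner vector per sample), matching the `samples.size() != labels.size()` check in the constructor.

#include <iostream>
#include "FeatureSelect.h"

int main()
{
    // Four samples of three binary features; feature 0 mirrors the label.
    features::samples_t X = { {0, 0, 1}, {0, 1, 0}, {1, 0, 0}, {1, 1, 1} };
    features::labels_t y = { 0, 0, 1, 1 };
    features::weights_t w = { 1.0, 1.0, 1.0, 1.0 }; // uniform sample weights
    // keep the k = 2 best features; nat = false -> scores in bits (log2)
    features::SelectKBestWeighted selector(X, y, w, 2, false);
    selector.fit();
    for (auto idx : selector.getFeatures())
        std::cout << "selected feature index: " << idx << '\n';
    for (auto score : selector.getScores())
        std::cout << "mutual information score: " << score << '\n';
    return 0;
}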
bayesclass/KDB.cc (new file, 110 lines)
@@ -0,0 +1,110 @@
#include "KDB.h"
#include "Metrics.hpp"

namespace bayesnet {
    using namespace std;
    using namespace torch;
    vector<int> argsort(vector<float>& nums)
    {
        int n = nums.size();
        vector<int> indices(n);
        iota(indices.begin(), indices.end(), 0);
        sort(indices.begin(), indices.end(), [&nums](int i, int j) { return nums[i] > nums[j]; });
        return indices;
    }
    KDB::KDB(int k, float theta) : BaseClassifier(Network()), k(k), theta(theta) {}
    void KDB::train()
    {
        /*
        1. For each feature Xi, compute mutual information, I(Xi;C),
           where C is the class.
        2. Compute class conditional mutual information I(Xi;Xj|C), for each
           pair of features Xi and Xj, where i != j.
        3. Let the used variable list, S, be empty.
        4. Let the DAG network being constructed, BN, begin with a single
           class node, C.
        5. Repeat until S includes all domain features
           5.1. Select feature Xmax which is not in S and has the largest value
                I(Xmax;C).
           5.2. Add a node to BN representing Xmax.
           5.3. Add an arc from C to Xmax in BN.
           5.4. Add m = min(|S|, k) arcs from m distinct features Xj in S with
                the highest value for I(Xmax;Xj|C).
           5.5. Add Xmax to S.
        Compute the conditional probability inferred by the structure of BN by
        using counts from DB, and output BN.
        */
        // 1. For each feature Xi, compute mutual information, I(Xi;C),
        // where C is the class.
        cout << "Computing mutual information between features and class" << endl;
        auto n_classes = states[className].size();
        auto metrics = Metrics(dataset, features, className, n_classes);
        vector<float> mi;
        for (auto i = 0; i < features.size(); i++) {
            Tensor firstFeature = X.index({ "...", i });
            mi.push_back(metrics.mutualInformation(firstFeature, y));
            cout << "Mutual information between " << features[i] << " and " << className << " is " << mi[i] << endl;
        }
        // 2. Compute class conditional mutual information I(Xi;Xj|C) for each pair of features
        auto conditionalEdgeWeights = metrics.conditionalEdge();
        cout << "Conditional edge weights" << endl;
        cout << conditionalEdgeWeights << endl;
        // 3. Let the used variable list, S, be empty.
        vector<int> S;
        // 4. Let the DAG network being constructed, BN, begin with a single
        // class node, C.
        model.addNode(className, states[className].size());
        cout << "Adding node " << className << " to the network" << endl;
        // 5. Repeat until S includes all domain features
        // 5.1. Select feature Xmax which is not in S and has the largest value
        // I(Xmax;C).
        auto order = argsort(mi);
        for (auto idx : order) {
            cout << idx << " " << mi[idx] << endl;
            // 5.2. Add a node to BN representing Xmax.
            model.addNode(features[idx], states[features[idx]].size());
            // 5.3. Add an arc from C to Xmax in BN.
            model.addEdge(className, features[idx]);
            // 5.4. Add m = min(|S|, k) arcs from m distinct features Xj in S with
            // the highest value for I(Xmax;Xj|C).
            add_m_edges(idx, S, conditionalEdgeWeights);
            // 5.5. Add Xmax to S.
            S.push_back(idx);
        }
    }
    void KDB::add_m_edges(int idx, vector<int>& S, Tensor& weights)
    {
        auto n_edges = min(k, static_cast<int>(S.size()));
        auto cond_w = clone(weights);
        cout << "Conditional edge weights cloned for idx " << idx << endl;
        cout << cond_w << endl;
        bool exit_cond = k == 0;
        int num = 0;
        while (!exit_cond) {
            auto max_minfo = argmax(cond_w.index({ idx, "..." })).item<int>();
            auto belongs = find(S.begin(), S.end(), max_minfo) != S.end();
            if (belongs && cond_w.index({ idx, max_minfo }).item<float>() > theta) {
                try {
                    model.addEdge(features[max_minfo], features[idx]);
                    num++;
                }
                catch (const invalid_argument& e) {
                    // Loops are not allowed
                }
            }
            cond_w.index_put_({ idx, max_minfo }, -1);
            cout << "Conditional edge weights cloned for idx " << idx << " After -1" << endl;
            cout << cond_w << endl;
            cout << "cond_w.index({ idx, '...'})" << endl;
            cout << cond_w.index({ idx, "..." }) << endl;
            auto candidates_mask = cond_w.index({ idx, "..." }).gt(theta);
            auto candidates = candidates_mask.nonzero();
            cout << "Candidates mask" << endl;
            cout << candidates_mask << endl;
            cout << "Candidates: " << endl;
            cout << candidates << endl;
            cout << "Candidates size: " << candidates.size(0) << endl;
            exit_cond = num == n_edges || candidates.size(0) == 0;
        }
    }
}
bayesclass/KDB.h (new file, 18 lines)
@@ -0,0 +1,18 @@
#ifndef KDB_H
#define KDB_H
#include "BaseClassifier.h"
namespace bayesnet {
    using namespace std;
    using namespace torch;
    class KDB : public BaseClassifier {
    private:
        int k;
        float theta;
        void add_m_edges(int idx, vector<int>& S, Tensor& weights);
    protected:
        void train() override;
    public:
        KDB(int k, float theta = 0.03);
    };
}
#endif
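Putting `BaseClassifier` and `KDB` together, a fit call looks roughly like the sketch below (hypothetical `main` and toy data, not part of the diff). Note that the `vector<vector<int>>` overload of `fit` expects column-major data, one inner vector per feature, as the `torch::zeros({ X[0].size(), X.size() })` line in BaseClassifier.cc shows; `states` maps each variable name, class included, to the list of values it can take.

#include <iostream>
#include "KDB.h"

int main()
{
    using namespace std;
    // Column-major data: X[j] holds the values of feature j for all 4 samples.
    vector<vector<int>> X = { {0, 0, 1, 1}, {0, 1, 0, 1}, {1, 1, 0, 0} };
    vector<int> y = { 0, 0, 1, 1 };
    vector<string> features = { "f0", "f1", "f2" };
    string className = "class";
    map<string, vector<int>> states = {
        { "f0", {0, 1} }, { "f1", {0, 1} }, { "f2", {0, 1} }, { "class", {0, 1} }
    };
    bayesnet::KDB clf(2); // at most k = 2 feature parents per node
    clf.fit(X, y, features, className, states);
    for (const auto& line : clf.show()) // print the learned structure
        cout << line << '\n';
    return 0;
}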
bayesclass/Metrics.cc (new file, 119 lines)
@@ -0,0 +1,119 @@
#include "Metrics.hpp"
using namespace std;
namespace bayesnet {
    Metrics::Metrics(torch::Tensor& samples, vector<string>& features, string& className, int classNumStates)
        : samples(samples)
        , features(features)
        , className(className)
        , classNumStates(classNumStates)
    {
    }
    Metrics::Metrics(const vector<vector<int>>& vsamples, const vector<int>& labels, const vector<string>& features, const string& className, const int classNumStates)
        : features(features)
        , className(className)
        , classNumStates(classNumStates)
    {
        samples = torch::zeros({ static_cast<int64_t>(vsamples[0].size()), static_cast<int64_t>(vsamples.size() + 1) }, torch::kInt64);
        for (int i = 0; i < vsamples.size(); ++i) {
            samples.index_put_({ "...", i }, torch::tensor(vsamples[i], torch::kInt64));
        }
        samples.index_put_({ "...", -1 }, torch::tensor(labels, torch::kInt64));
    }
    vector<pair<string, string>> Metrics::doCombinations(const vector<string>& source)
    {
        vector<pair<string, string>> result;
        for (int i = 0; i < source.size(); ++i) {
            string temp = source[i];
            for (int j = i + 1; j < source.size(); ++j) {
                result.push_back({ temp, source[j] });
            }
        }
        return result;
    }
    torch::Tensor Metrics::conditionalEdge()
    {
        auto result = vector<double>();
        auto source = vector<string>(features);
        source.push_back(className);
        auto combinations = doCombinations(source);
        // Compute class prior
        auto margin = torch::zeros({ classNumStates });
        for (int value = 0; value < classNumStates; ++value) {
            auto mask = samples.index({ "...", -1 }) == value;
            margin[value] = mask.sum().item<float>() / samples.sizes()[0];
        }
        for (auto [first, second] : combinations) {
            int64_t index_first = find(features.begin(), features.end(), first) - features.begin();
            int64_t index_second = find(features.begin(), features.end(), second) - features.begin();
            double accumulated = 0;
            for (int value = 0; value < classNumStates; ++value) {
                auto mask = samples.index({ "...", -1 }) == value;
                auto first_dataset = samples.index({ mask, index_first });
                auto second_dataset = samples.index({ mask, index_second });
                auto mi = mutualInformation(first_dataset, second_dataset);
                auto pb = margin[value].item<float>();
                accumulated += pb * mi;
            }
            result.push_back(accumulated);
        }
        long n_vars = source.size();
        auto matrix = torch::zeros({ n_vars, n_vars });
        auto indices = torch::triu_indices(n_vars, n_vars, 1);
        for (auto i = 0; i < result.size(); ++i) {
            auto x = indices[0][i];
            auto y = indices[1][i];
            matrix[x][y] = result[i];
            matrix[y][x] = result[i];
        }
        return matrix;
    }
    vector<float> Metrics::conditionalEdgeWeights()
    {
        auto matrix = conditionalEdge();
        std::vector<float> v(matrix.data_ptr<float>(), matrix.data_ptr<float>() + matrix.numel());
        return v;
    }
    double Metrics::entropy(torch::Tensor& feature)
    {
        torch::Tensor counts = feature.bincount();
        int totalWeight = counts.sum().item<int>();
        torch::Tensor probs = counts.to(torch::kFloat) / totalWeight;
        torch::Tensor logProbs = torch::log(probs);
        torch::Tensor entropy = -probs * logProbs;
        return entropy.nansum().item<double>();
    }
    // H(Y|X) = sum_{x in X} p(x) H(Y|X=x)
    double Metrics::conditionalEntropy(torch::Tensor& firstFeature, torch::Tensor& secondFeature)
    {
        int numSamples = firstFeature.sizes()[0];
        torch::Tensor featureCounts = secondFeature.bincount();
        unordered_map<int, unordered_map<int, double>> jointCounts;
        double totalWeight = 0;
        for (auto i = 0; i < numSamples; i++) {
            jointCounts[secondFeature[i].item<int>()][firstFeature[i].item<int>()] += 1;
            totalWeight += 1;
        }
        if (totalWeight == 0)
            throw invalid_argument("Total weight should not be zero");
        double entropyValue = 0;
        for (int value = 0; value < featureCounts.sizes()[0]; ++value) {
            double p_f = featureCounts[value].item<double>() / totalWeight;
            double entropy_f = 0;
            for (auto& [label, jointCount] : jointCounts[value]) {
                double p_l_f = jointCount / featureCounts[value].item<double>();
                if (p_l_f > 0) {
                    entropy_f -= p_l_f * log(p_l_f);
                } else {
                    entropy_f = 0;
                }
            }
            entropyValue += p_f * entropy_f;
        }
        return entropyValue;
    }
    // I(X;Y) = H(Y) - H(Y|X)
    double Metrics::mutualInformation(torch::Tensor& firstFeature, torch::Tensor& secondFeature)
    {
        return entropy(firstFeature) - conditionalEntropy(firstFeature, secondFeature);
    }
}
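In LaTeX form, the quantities `Metrics` implements are the standard definitions restated from the comments above (`entropy` uses natural logarithms via `torch::log`); `conditionalEdge` fills a symmetric matrix whose entry for the pair (X_i, X_j) is a class-conditional mutual information weight:

\begin{aligned}
H(X) &= -\sum_{x} p(x)\,\log p(x) \\
H(Y \mid X) &= \sum_{x} p(x)\, H(Y \mid X = x) \\
I(X;Y) &= H(Y) - H(Y \mid X) \\
w_{ij} &= \sum_{c} p(C = c)\; I(X_i; X_j \mid C = c)
\end{aligned}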
bayesclass/Metrics.hpp (new file, 25 lines)
@@ -0,0 +1,25 @@
#ifndef BAYESNET_METRICS_H
#define BAYESNET_METRICS_H
#include <torch/torch.h>
#include <vector>
#include <string>
using namespace std;
namespace bayesnet {
    class Metrics {
    private:
        torch::Tensor samples;
        vector<string> features;
        string className;
        int classNumStates;
        vector<pair<string, string>> doCombinations(const vector<string>&);
        double entropy(torch::Tensor&);
        double conditionalEntropy(torch::Tensor&, torch::Tensor&);
    public:
        double mutualInformation(torch::Tensor&, torch::Tensor&);
        Metrics(torch::Tensor&, vector<string>&, string&, int);
        Metrics(const vector<vector<int>>&, const vector<int>&, const vector<string>&, const string&, const int);
        vector<float> conditionalEdgeWeights();
        torch::Tensor conditionalEdge();
    };
}
#endif
bayesclass/Network.cc (new file, 262 lines)
@@ -0,0 +1,262 @@
#include <thread>
#include <mutex>
#include "Network.h"
namespace bayesnet {
    Network::Network() : laplaceSmoothing(1), features(vector<string>()), className(""), classNumStates(0), maxThreads(0.8) {}
    Network::Network(float maxT) : laplaceSmoothing(1), features(vector<string>()), className(""), classNumStates(0), maxThreads(maxT) {}
    Network::Network(float maxT, int smoothing) : laplaceSmoothing(smoothing), features(vector<string>()), className(""), classNumStates(0), maxThreads(maxT) {}
    Network::Network(Network& other) : laplaceSmoothing(other.laplaceSmoothing), features(other.features), className(other.className), classNumStates(other.getClassNumStates()), maxThreads(other.getmaxThreads())
    {
        for (auto& pair : other.nodes) {
            nodes[pair.first] = new Node(*pair.second);
        }
    }
    Network::~Network()
    {
        for (auto& pair : nodes) {
            delete pair.second;
        }
    }
    float Network::getmaxThreads()
    {
        return maxThreads;
    }
    torch::Tensor& Network::getSamples()
    {
        return samples;
    }
    void Network::addNode(string name, int numStates)
    {
        if (nodes.find(name) != nodes.end()) {
            // if node exists update its number of states
            nodes[name]->setNumStates(numStates);
            return;
        }
        nodes[name] = new Node(name, numStates);
    }
    vector<string> Network::getFeatures()
    {
        return features;
    }
    int Network::getClassNumStates()
    {
        return classNumStates;
    }
    int Network::getStates()
    {
        int result = 0;
        for (auto node : nodes) {
            result += node.second->getNumStates();
        }
        return result;
    }
    string Network::getClassName()
    {
        return className;
    }
    bool Network::isCyclic(const string& nodeId, unordered_set<string>& visited, unordered_set<string>& recStack)
    {
        if (visited.find(nodeId) == visited.end()) // if node hasn't been visited yet
        {
            visited.insert(nodeId);
            recStack.insert(nodeId);
            for (Node* child : nodes[nodeId]->getChildren()) {
                if (visited.find(child->getName()) == visited.end() && isCyclic(child->getName(), visited, recStack))
                    return true;
                else if (recStack.find(child->getName()) != recStack.end())
                    return true;
            }
        }
        recStack.erase(nodeId); // remove node from recursion stack before function ends
        return false;
    }
    void Network::addEdge(const string parent, const string child)
    {
        if (nodes.find(parent) == nodes.end()) {
            throw invalid_argument("Parent node " + parent + " does not exist");
        }
        if (nodes.find(child) == nodes.end()) {
            throw invalid_argument("Child node " + child + " does not exist");
        }
        // Temporarily add edge to check for cycles
        nodes[parent]->addChild(nodes[child]);
        nodes[child]->addParent(nodes[parent]);
        unordered_set<string> visited;
        unordered_set<string> recStack;
        if (isCyclic(nodes[child]->getName(), visited, recStack)) // if adding this edge forms a cycle
        {
            // remove problematic edge
            nodes[parent]->removeChild(nodes[child]);
            nodes[child]->removeParent(nodes[parent]);
            throw invalid_argument("Adding this edge forms a cycle in the graph.");
        }
    }
    map<string, Node*>& Network::getNodes()
    {
        return nodes;
    }
    void Network::fit(const vector<vector<int>>& input_data, const vector<int>& labels, const vector<string>& featureNames, const string& className)
    {
        features = featureNames;
        this->className = className;
        dataset.clear();

        // Build dataset & tensor of samples
        samples = torch::zeros({ static_cast<int64_t>(input_data[0].size()), static_cast<int64_t>(input_data.size() + 1) }, torch::kInt64);
        for (int i = 0; i < featureNames.size(); ++i) {
            dataset[featureNames[i]] = input_data[i];
            samples.index_put_({ "...", i }, torch::tensor(input_data[i], torch::kInt64));
        }
        dataset[className] = labels;
        samples.index_put_({ "...", -1 }, torch::tensor(labels, torch::kInt64));
        classNumStates = *max_element(labels.begin(), labels.end()) + 1;
        int maxThreadsRunning = static_cast<int>(std::thread::hardware_concurrency() * maxThreads);
        if (maxThreadsRunning < 1) {
            maxThreadsRunning = 1;
        }
        vector<thread> threads;
        mutex mtx;
        condition_variable cv;
        int activeThreads = 0;
        int nextNodeIndex = 0;

        while (nextNodeIndex < nodes.size()) {
            unique_lock<mutex> lock(mtx);
            cv.wait(lock, [&activeThreads, &maxThreadsRunning]() { return activeThreads < maxThreadsRunning; });

            if (nextNodeIndex >= nodes.size()) {
                break; // No more work remaining
            }

            threads.emplace_back([this, &nextNodeIndex, &mtx, &cv, &activeThreads]() {
                while (true) {
                    unique_lock<mutex> lock(mtx);
                    if (nextNodeIndex >= nodes.size()) {
                        break; // No more work remaining
                    }
                    auto& pair = *std::next(nodes.begin(), nextNodeIndex);
                    ++nextNodeIndex;
                    lock.unlock();

                    pair.second->computeCPT(dataset, laplaceSmoothing);

                    lock.lock();
                    nodes[pair.first] = pair.second;
                    lock.unlock();
                }
                lock_guard<mutex> lock(mtx);
                --activeThreads;
                cv.notify_one();
            });

            ++activeThreads;
        }
        for (auto& thread : threads) {
            thread.join();
        }
    }

    vector<int> Network::predict(const vector<vector<int>>& tsamples)
    {
        vector<int> predictions;
        vector<int> sample;
        for (int row = 0; row < tsamples[0].size(); ++row) {
            sample.clear();
            for (int col = 0; col < tsamples.size(); ++col) {
                sample.push_back(tsamples[col][row]);
            }
            vector<double> classProbabilities = predict_sample(sample);
            // Find the class with the maximum posterior probability
            auto maxElem = max_element(classProbabilities.begin(), classProbabilities.end());
            int predictedClass = distance(classProbabilities.begin(), maxElem);
            predictions.push_back(predictedClass);
        }
        return predictions;
    }
    vector<vector<double>> Network::predict_proba(const vector<vector<int>>& tsamples)
    {
        vector<vector<double>> predictions;
        vector<int> sample;
        for (int row = 0; row < tsamples[0].size(); ++row) {
            sample.clear();
            for (int col = 0; col < tsamples.size(); ++col) {
                sample.push_back(tsamples[col][row]);
            }
            predictions.push_back(predict_sample(sample));
        }
        return predictions;
    }
    double Network::score(const vector<vector<int>>& tsamples, const vector<int>& labels)
    {
        vector<int> y_pred = predict(tsamples);
        int correct = 0;
        for (int i = 0; i < y_pred.size(); ++i) {
            if (y_pred[i] == labels[i]) {
                correct++;
            }
        }
        return (double)correct / y_pred.size();
    }
    vector<double> Network::predict_sample(const vector<int>& sample)
    {
        // Ensure the sample size is equal to the number of features
        if (sample.size() != features.size()) {
            throw invalid_argument("Sample size (" + to_string(sample.size()) +
                ") does not match the number of features (" + to_string(features.size()) + ")");
        }
        map<string, int> evidence;
        for (int i = 0; i < sample.size(); ++i) {
            evidence[features[i]] = sample[i];
        }
        return exactInference(evidence);
    }
    double Network::computeFactor(map<string, int>& completeEvidence)
    {
        double result = 1.0;
        for (auto node : getNodes()) {
            result *= node.second->getFactorValue(completeEvidence);
        }
        return result;
    }
    vector<double> Network::exactInference(map<string, int>& evidence)
    {
        vector<double> result(classNumStates, 0.0);
        vector<thread> threads;
        mutex mtx;
        for (int i = 0; i < classNumStates; ++i) {
            threads.emplace_back([this, &result, &evidence, i, &mtx]() {
                auto completeEvidence = map<string, int>(evidence);
                completeEvidence[getClassName()] = i;
                double factor = computeFactor(completeEvidence);
                lock_guard<mutex> lock(mtx);
                result[i] = factor;
            });
        }
        for (auto& thread : threads) {
            thread.join();
        }

        // Normalize result
        double sum = accumulate(result.begin(), result.end(), 0.0);
        for (double& value : result) {
            value /= sum;
        }
        return result;
    }
    vector<string> Network::show()
    {
        vector<string> result;
        // Draw the network
        for (auto node : nodes) {
            string line = node.first + " -> ";
            for (auto child : node.second->getChildren()) {
                line += child->getName() + ", ";
            }
            result.push_back(line);
        }
        return result;
    }
}
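For orientation: `predict_sample` delegates to `exactInference`, which enumerates the class states; `computeFactor` multiplies one CPT entry per node for the completed evidence, and the factors are then normalized into a posterior. Restated as a formula (a sketch of what the code computes, with e denoting the evidence x extended with C = c, respectively C = c', and the product running over every node in the network including C):

P(C = c \mid x) \;=\; \frac{\prod_{v} P\bigl(v = e_v \mid \mathrm{pa}(v)\bigr)}{\sum_{c'} \prod_{v} P\bigl(v = e_v \mid \mathrm{pa}(v)\bigr)}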
bayesclass/Network.h (new file, 51 lines)
@@ -0,0 +1,51 @@
#ifndef NETWORK_H
#define NETWORK_H
#include "Node.h"
#include <map>
#include <vector>

namespace bayesnet {
    class Network {
    private:
        map<string, Node*> nodes;
        map<string, vector<int>> dataset;
        float maxThreads;
        int classNumStates;
        vector<string> features;
        string className;
        int laplaceSmoothing;
        torch::Tensor samples;
        bool isCyclic(const std::string&, std::unordered_set<std::string>&, std::unordered_set<std::string>&);
        vector<double> predict_sample(const vector<int>&);
        vector<double> exactInference(map<string, int>&);
        double computeFactor(map<string, int>&);
        double mutual_info(torch::Tensor&, torch::Tensor&);
        double entropy(torch::Tensor&);
        double conditionalEntropy(torch::Tensor&, torch::Tensor&);
        double mutualInformation(torch::Tensor&, torch::Tensor&);
    public:
        Network();
        Network(float, int);
        Network(float);
        Network(Network&);
        ~Network();
        torch::Tensor& getSamples();
        float getmaxThreads();
        void addNode(string, int);
        void addEdge(const string, const string);
        map<string, Node*>& getNodes();
        vector<string> getFeatures();
        int getStates();
        int getClassNumStates();
        string getClassName();
        void fit(const vector<vector<int>>&, const vector<int>&, const vector<string>&, const string&);
        vector<int> predict(const vector<vector<int>>&);
        //Computes the conditional edge weight of variable index u and v conditioned on class_node
        torch::Tensor conditionalEdgeWeight();
        vector<vector<double>> predict_proba(const vector<vector<int>>&);
        double score(const vector<vector<int>>&, const vector<int>&);
        vector<string> show();
        inline string version() { return "0.1.0"; }
    };
}
#endif
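The `Network` API above can also be driven directly, without a classifier. A minimal sketch (hypothetical `main` and toy data, not part of the diff) builds a naive Bayes structure by hand, fits the CPTs, and scores the training set; as in the classifiers, the data are column-major, one inner vector per feature:

#include <iostream>
#include "Network.h"

int main()
{
    using namespace std;
    bayesnet::Network net; // defaults: laplaceSmoothing = 1, maxThreads = 0.8
    // Naive Bayes structure: class -> each feature
    net.addNode("class", 2);
    net.addNode("f0", 2);
    net.addNode("f1", 2);
    net.addEdge("class", "f0");
    net.addEdge("class", "f1");
    vector<vector<int>> X = { {0, 0, 1, 1}, {0, 1, 0, 1} };
    vector<int> y = { 0, 0, 1, 1 };
    vector<string> features = { "f0", "f1" };
    net.fit(X, y, features, "class"); // estimates one CPT per node, in parallel
    cout << "training accuracy: " << net.score(X, y) << '\n';
    for (const auto& line : net.show()) // one "parent -> children" line per node
        cout << line << '\n';
    return 0;
}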
bayesclass/Node.cc (new file, 114 lines)
@@ -0,0 +1,114 @@
#include "Node.h"

namespace bayesnet {

    Node::Node(const std::string& name, int numStates)
        : name(name), numStates(numStates), cpTable(torch::Tensor()), parents(vector<Node*>()), children(vector<Node*>())
    {
    }

    string Node::getName() const
    {
        return name;
    }

    void Node::addParent(Node* parent)
    {
        parents.push_back(parent);
    }
    void Node::removeParent(Node* parent)
    {
        parents.erase(std::remove(parents.begin(), parents.end(), parent), parents.end());
    }
    void Node::removeChild(Node* child)
    {
        children.erase(std::remove(children.begin(), children.end(), child), children.end());
    }
    void Node::addChild(Node* child)
    {
        children.push_back(child);
    }
    vector<Node*>& Node::getParents()
    {
        return parents;
    }
    vector<Node*>& Node::getChildren()
    {
        return children;
    }
    int Node::getNumStates() const
    {
        return numStates;
    }
    void Node::setNumStates(int numStates)
    {
        this->numStates = numStates;
    }
    torch::Tensor& Node::getCPT()
    {
        return cpTable;
    }
    /*
     The MinFill criterion is a heuristic for variable elimination: it selects
     the variable that minimizes the number of edges that need to be added to
     the graph to make it triangulated. This is done by counting the number of
     edges that would need to be added to the graph if the variable were
     eliminated; the variable with the minimum count is chosen. Here this is
     done by computing the number of combinations of the node's neighbors
     taken 2 by 2.
    */
    unsigned Node::minFill()
    {
        unordered_set<string> neighbors;
        for (auto child : children) {
            neighbors.emplace(child->getName());
        }
        for (auto parent : parents) {
            neighbors.emplace(parent->getName());
        }
        auto source = vector<string>(neighbors.begin(), neighbors.end());
        return combinations(source).size();
    }
    vector<pair<string, string>> Node::combinations(const vector<string>& source)
    {
        vector<pair<string, string>> result;
        for (int i = 0; i < source.size(); ++i) {
            string temp = source[i];
            for (int j = i + 1; j < source.size(); ++j) {
                result.push_back({ temp, source[j] });
            }
        }
        return result;
    }
    void Node::computeCPT(map<string, vector<int>>& dataset, const int laplaceSmoothing)
    {
        // Get dimensions of the CPT
        dimensions.push_back(numStates);
        for (auto father : getParents()) {
            dimensions.push_back(father->getNumStates());
        }
        auto length = dimensions.size();
        // Create a tensor of zeros with the dimensions of the CPT
        cpTable = torch::zeros(dimensions, torch::kFloat) + laplaceSmoothing;
        // Fill table with counts
        for (int n_sample = 0; n_sample < dataset[name].size(); ++n_sample) {
            torch::List<c10::optional<torch::Tensor>> coordinates;
            coordinates.push_back(torch::tensor(dataset[name][n_sample]));
            for (auto father : getParents()) {
                coordinates.push_back(torch::tensor(dataset[father->getName()][n_sample]));
            }
            // Increment the count of the corresponding coordinate
            cpTable.index_put_({ coordinates }, cpTable.index({ coordinates }) + 1);
        }
        // Normalize the counts
        cpTable = cpTable / cpTable.sum(0);
    }
    float Node::getFactorValue(map<string, int>& evidence)
    {
        torch::List<c10::optional<torch::Tensor>> coordinates;
        // following predetermined order of indices in the cpTable (see Node.h)
        coordinates.push_back(torch::tensor(evidence[name]));
        for (auto parent : getParents()) {
            coordinates.push_back(torch::tensor(evidence[parent->getName()]));
        }
        return cpTable.index({ coordinates }).item<float>();
    }
}
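Since `minFill` returns the number of unordered neighbor pairs, its value for a node v with neighbor set ne(v) works out to

\mathrm{minFill}(v) = \binom{|\mathrm{ne}(v)|}{2} = \frac{|\mathrm{ne}(v)|\,\bigl(|\mathrm{ne}(v)|-1\bigr)}{2},

which is an upper bound on the true fill-in, since the code counts every neighbor pair rather than only the pairs not already connected by an edge.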
bayesclass/Node.h (new file, 35 lines)
@@ -0,0 +1,35 @@
#ifndef NODE_H
#define NODE_H
#include <torch/torch.h>
#include <unordered_set>
#include <vector>
#include <string>
namespace bayesnet {
    using namespace std;
    class Node {
    private:
        string name;
        vector<Node*> parents;
        vector<Node*> children;
        int numStates; // number of states of the variable
        torch::Tensor cpTable; // Order of indices is 0-> node variable, 1-> 1st parent, 2-> 2nd parent, ...
        vector<int64_t> dimensions; // dimensions of the cpTable
    public:
        vector<pair<string, string>> combinations(const vector<string>&);
        Node(const std::string&, int);
        void addParent(Node*);
        void addChild(Node*);
        void removeParent(Node*);
        void removeChild(Node*);
        string getName() const;
        vector<Node*>& getParents();
        vector<Node*>& getChildren();
        torch::Tensor& getCPT();
        void computeCPT(map<string, vector<int>>&, const int);
        int getNumStates() const;
        void setNumStates(int);
        unsigned minFill();
        float getFactorValue(map<string, int>&);
    };
}
#endif
@@ -16,4 +16,8 @@ __all__ = [
     "TAN",
     "KDB",
     "AODE",
+    "KDBNew",
+    "AODENew",
+    "BoostAODE",
+    "BoostSPODE",
 ]
@@ -1 +1 @@
-__version__ = "0.1.0"
+__version__ = "0.2.0"
bayesclass/cSelectFeatures.cpp (new file, 4717 lines)
File diff suppressed because it is too large.
bayesclass/cSelectFeatures.pyx (new file, 33 lines)
@@ -0,0 +1,33 @@
# distutils: language = c++
# cython: language_level = 3
from libcpp.vector cimport vector
from libcpp.string cimport string
from libcpp cimport bool


cdef extern from "FeatureSelect.h" namespace "features":
    ctypedef float precision_t
    cdef cppclass SelectKBestWeighted:
        SelectKBestWeighted(vector[vector[int]]&, vector[int]&, vector[precision_t]&, int, bool) except +
        void fit()
        string version()
        vector[precision_t] getScores()
        vector[int] getFeatures()

cdef class CSelectKBestWeighted:
    cdef SelectKBestWeighted *thisptr
    def __cinit__(self, X, y, weights, k, natural=False):  # log or log2
        self.thisptr = new SelectKBestWeighted(X, y, weights, k, natural)
    def __dealloc__(self):
        del self.thisptr
    def fit(self):
        self.thisptr.fit()
        return self
    def get_scores(self):
        return self.thisptr.getScores()
    def get_features(self):
        return self.thisptr.getFeatures()
    def get_version(self):
        return self.thisptr.version()
    def __reduce__(self):
        return (CSelectKBestWeighted, ())
@@ -1,19 +1,29 @@
|
|||||||
import random
|
import random
|
||||||
|
import warnings
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from scipy.stats import mode
|
from scipy.stats import mode
|
||||||
from sklearn.base import ClassifierMixin, BaseEstimator
|
from sklearn.base import clone, ClassifierMixin, BaseEstimator
|
||||||
from sklearn.ensemble import BaseEnsemble
|
from sklearn.ensemble import BaseEnsemble
|
||||||
|
from sklearn.feature_selection import mutual_info_classif
|
||||||
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
|
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
|
||||||
from sklearn.utils.multiclass import unique_labels
|
from sklearn.utils.multiclass import unique_labels
|
||||||
from sklearn.feature_selection import mutual_info_classif
|
from sklearn.feature_selection import mutual_info_classif
|
||||||
import networkx as nx
|
import networkx as nx
|
||||||
from pgmpy.estimators import TreeSearch, BayesianEstimator
|
from pgmpy.estimators import TreeSearch, BayesianEstimator
|
||||||
from pgmpy.models import BayesianNetwork
|
from pgmpy.models import BayesianNetwork
|
||||||
|
from pgmpy.base import DAG
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
from fimdlp.mdlp import FImdlp
|
||||||
|
from .cppSelectFeatures import CSelectKBestWeighted
|
||||||
|
from .BayesNet import BayesNetwork, CMetrics
|
||||||
from ._version import __version__
|
from ._version import __version__
|
||||||
|
|
||||||
|
|
||||||
|
def default_feature_names(num_features):
|
||||||
|
return [f"feature_{i}" for i in range(num_features)]
|
||||||
|
|
||||||
|
|
||||||
class BayesBase(BaseEstimator, ClassifierMixin):
|
class BayesBase(BaseEstimator, ClassifierMixin):
|
||||||
def __init__(self, random_state, show_progress):
|
def __init__(self, random_state, show_progress):
|
||||||
self.random_state = random_state
|
self.random_state = random_state
|
||||||
@@ -23,7 +33,7 @@ class BayesBase(BaseEstimator, ClassifierMixin):
|
|||||||
return {
|
return {
|
||||||
"requires_positive_X": True,
|
"requires_positive_X": True,
|
||||||
"requires_positive_y": True,
|
"requires_positive_y": True,
|
||||||
"preserve_dtype": [np.int64, np.int32],
|
"preserve_dtype": [np.int32, np.int64],
|
||||||
"requires_y": True,
|
"requires_y": True,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -32,35 +42,68 @@ class BayesBase(BaseEstimator, ClassifierMixin):
|
|||||||
"""Return the version of the package."""
|
"""Return the version of the package."""
|
||||||
return __version__
|
return __version__
|
||||||
|
|
||||||
def nodes_leaves(self):
|
def nodes_edges(self):
|
||||||
"""To keep compatiblity with the benchmark platform"""
|
if hasattr(self, "dag_"):
|
||||||
|
return len(self.dag_), len(self.dag_.edges())
|
||||||
return 0, 0
|
return 0, 0
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def default_class_name():
|
||||||
|
return "class"
|
||||||
|
|
||||||
|
def build_dataset(self):
|
||||||
|
self.dataset_ = pd.DataFrame(
|
||||||
|
self.X_, columns=self.feature_names_in_, dtype=np.int32
|
||||||
|
)
|
||||||
|
self.dataset_[self.class_name_] = self.y_
|
||||||
|
if self.sample_weight_ is not None:
|
||||||
|
self.dataset_["_weight"] = self.sample_weight_
|
||||||
|
|
||||||
    def _check_params_fit(self, X, y, expected_args, kwargs):
        """Check the common parameters passed to fit"""
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
+       X = self._validate_data(X, reset=True)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        self.n_classes_ = self.classes_.shape[0]
        # Default values
-       self.class_name_ = "class"
-       self.features_ = [f"feature_{i}" for i in range(X.shape[1])]
+       self.weighted_ = False
+       self.sample_weight_ = None
+       self.class_name_ = self.default_class_name()
+       self.features_ = default_feature_names(X.shape[1])
        for key, value in kwargs.items():
            if key in expected_args:
                setattr(self, f"{key}_", value)
            else:
                raise ValueError(f"Unexpected argument: {key}")
+       self.feature_names_in_ = self.features_
+       # used for local discretization
+       self.indexed_features_ = {
+           feature: i for i, feature in enumerate(self.features_)
+       }
        if self.random_state is not None:
            random.seed(self.random_state)
-       if len(self.features_) != X.shape[1]:
+       if len(self.feature_names_in_) != X.shape[1]:
            raise ValueError(
                "Number of features does not match the number of columns in X"
            )
+       self.n_features_in_ = X.shape[1]
        return X, y

+   @property
+   def states_(self):
+       if hasattr(self, "fitted_"):
+           return self.states_computed_
+       return 0
+
+   @property
+   def depth_(self):
+       return self.states_
    def fit(self, X, y, **kwargs):
-       """A reference implementation of a fitting function for a classifier.
+       """Fit classifier

        Parameters
        ----------
@@ -97,29 +140,54 @@ class BayesBase(BaseEstimator, ClassifierMixin):
        >>> model.fit(train_data, train_y, features=features, class_name='E')
        TAN(random_state=17)
        """
-       X_, y_ = self._check_params(X, y, kwargs)
+       self.X_, self.y_ = self._check_params(X, y, kwargs)
        # Store the information needed to build the model
-       self.X_ = X_
-       self.y_ = y_
-       self.dataset_ = pd.DataFrame(self.X_, columns=self.features_)
-       self.dataset_[self.class_name_] = self.y_
+       self.build_dataset()
        # Build the DAG
-       self._build()
+       self._build(kwargs)
        # Train the model
-       self._train()
+       self._train(kwargs)
        self.fitted_ = True
+       # To keep compatibility with the benchmark platform
+       self.nodes_leaves = self.nodes_edges
        # Return the classifier
        return self
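Following the docstring example kept above, a hedged end-to-end sketch of the new fit flow; train_data, train_y and features are assumed inputs already discretized to integers:

from bayesclass.clfs import TAN

model = TAN(random_state=17)
# features is a list of column names; 'E' is the class column, as in the docstring
model.fit(train_data, train_y, features=features, class_name="E")
y_pred = model.predict(train_data)  # fit also aliases nodes_leaves to nodes_edges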
-   def _train(self):
-       self.model_ = BayesianNetwork(
-           self.dag_.edges(), show_progress=self.show_progress
-       )
-       self.model_.fit(
-           self.dataset_,
-           estimator=BayesianEstimator,
-           prior_type="K2",
-       )
+   def _build(self, kwargs):
+       self.model_ = BayesNetwork()
+       features = kwargs["features"]
+       states = kwargs["state_names"]
+       for feature in features:
+           self.model_.addNode(feature, len(states[feature]))
+       class_name = kwargs["class_name"]
+       self.model_.addNode(class_name, max(self.y_) + 1)
+
+   def _train(self, kwargs):
+       """Build and train a BayesianNetwork from the DAG and the dataset
+
+       Parameters
+       ----------
+       kwargs : dict
+           fit parameters
+       """
+       # self.model_ = BayesianNetwork(
+       #     self.dag_.edges(), show_progress=self.show_progress
+       # )
+       # states = dict(state_names=kwargs.pop("state_names", []))
+       # self.model_.fit(
+       #     self.dataset_,
+       #     estimator=BayesianEstimator,
+       #     prior_type="K2",
+       #     weighted=self.weighted_,
+       #     **states,
+       # )
+       features = kwargs["features"]
+       class_name = kwargs["class_name"]
+       for source, destination in self.edges_:
+           self.model_.addEdge(source, destination)
+       self.model_.fit(self.X_, self.y_, features, class_name)
+       self.states_computed_ = self.model_.getStates()
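The addNode/addEdge/fit/getStates calls above belong to the new C++ BayesNetwork binding this changeset introduces. A minimal sketch of the call sequence on a toy problem; the import path and exact binding behavior are assumptions, only the method names come from the diff:

net = BayesNetwork()                   # wrapped C++ network from this changeset
net.addNode("class", 2)                # node name, number of states
net.addNode("feature_0", 3)
net.addEdge("class", "feature_0")      # arc: source -> destination
net.fit(X, y, ["feature_0"], "class")  # learn CPTs from integer-coded data
total_states = net.getStates()         # what states_computed_ stores above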
    def predict(self, X):
        """A reference implementation of a prediction for a classifier.
@@ -169,13 +237,16 @@ class BayesBase(BaseEstimator, ClassifierMixin):
        """
        # Check if fit had been called
        check_is_fitted(self, ["X_", "y_", "fitted_"])
        # Input validation
        X = check_array(X)
-       dataset = pd.DataFrame(X, columns=self.features_, dtype="int16")
-       return self.model_.predict(dataset).values.ravel()
+       # dataset = pd.DataFrame(
+       #     X, columns=self.feature_names_in_, dtype=np.int32
+       # )
+       # return self.model_.predict(dataset).values.ravel()
+       return self.model_.predict(X)

    def plot(self, title="", node_size=800):
+       warnings.simplefilter("ignore", UserWarning)
        nx.draw_circular(
            self.model_,
            with_labels=True,
@@ -208,7 +279,7 @@ class TAN(BayesBase):
        The classes seen at :meth:`fit`.
    class_name_ : str
        The name of the class column
-   features_ : list
+   feature_names_in_ : list
        The list of feature names
    head_ : int
        The index of the node used as head for the initial DAG
@@ -227,21 +298,47 @@ class TAN(BayesBase):

    def _check_params(self, X, y, kwargs):
        self.head_ = 0
-       expected_args = ["class_name", "features", "head"]
+       expected_args = ["class_name", "features", "head", "state_names"]
        X, y = self._check_params_fit(X, y, expected_args, kwargs)
        if self.head_ == "random":
-           self.head_ = random.randint(0, len(self.features_) - 1)
-       if self.head_ is not None and self.head_ >= len(self.features_):
+           self.head_ = random.randint(0, self.n_features_in_ - 1)
+       if self.head_ is not None and self.head_ >= self.n_features_in_:
            raise ValueError("Head index out of range")
        return X, y

-   def _build(self):
-       est = TreeSearch(self.dataset_, root_node=self.features_[self.head_])
+   def _build(self, kwargs):
+       est = TreeSearch(
+           self.dataset_, root_node=self.feature_names_in_[self.head_]
+       )
        self.dag_ = est.estimate(
            estimator_type="tan",
            class_node=self.class_name_,
            show_progress=self.show_progress,
        )
+       # Code taken from pgmpy
+       # n_jobs = -1
+       # weights = TreeSearch._get_conditional_weights(
+       #     self.dataset_,
+       #     self.class_name_,
+       #     "mutual_info",
+       #     n_jobs,
+       #     self.show_progress,
+       # )
+       # # Step 4.2: Construct chow-liu DAG on {data.columns - class_node}
+       # class_node_idx = np.where(self.dataset_.columns == self.class_name_)[
+       #     0
+       # ][0]
+       # weights = np.delete(weights, class_node_idx, axis=0)
+       # weights = np.delete(weights, class_node_idx, axis=1)
+       # reduced_columns = np.delete(self.dataset_.columns, class_node_idx)
+       # D = TreeSearch._create_tree_and_dag(
+       #     weights, reduced_columns, self.feature_names_in_[self.head_]
+       # )
+       # # Step 4.3: Add edges from class_node to all other nodes.
+       # D.add_edges_from(
+       #     [(self.class_name_, node) for node in reduced_columns]
+       # )
+       # self.dag_ = D
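For context, the TAN structure search above is pgmpy's TreeSearch estimator. A hedged standalone sketch, where df is an assumed discrete DataFrame whose class column is named "class":

from pgmpy.estimators import TreeSearch

est = TreeSearch(df, root_node="feature_0")
dag = est.estimate(
    estimator_type="tan", class_node="class", show_progress=False
)
print(dag.edges())  # feature tree rooted at feature_0 plus class -> Xi arcs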
class KDB(BayesBase):
@@ -253,121 +350,625 @@ class KDB(BayesBase):
        )

    def _check_params(self, X, y, kwargs):
-       expected_args = ["class_name", "features"]
+       expected_args = [
+           "class_name",
+           "features",
+           "state_names",
+           "sample_weight",
+           "weighted",
+       ]
        return self._check_params_fit(X, y, expected_args, kwargs)
-   def _build(self):
+   def _add_m_edges(self, idx, S_nodes, conditional_weights):
+       n_edges = min(self.k, len(S_nodes))
+       cond_w = conditional_weights.copy()
+       exit_cond = self.k == 0
+       num = 0
+       while not exit_cond:
+           max_minfo = np.argmax(cond_w[idx, :])
+           if max_minfo in S_nodes and cond_w[idx, max_minfo] > self.theta:
+               try:
+                   self.model_.addEdge(
+                       self.feature_names_in_[max_minfo],
+                       self.feature_names_in_[idx],
+                   )
+                   num += 1
+               except ValueError:
+                   # Loops are not allowed
+                   pass
+           cond_w[idx, max_minfo] = -1
+           exit_cond = num == n_edges or np.all(cond_w[idx, :] <= self.theta)
+
+   def _build(self, kwargs):
        """
        1. For each feature Xi, compute mutual information, I(Xi;C),
           where C is the class.
        2. Compute class conditional mutual information I(Xi;Xj|C), for each
           pair of features Xi and Xj, where i != j.
        3. Let the used variable list, S, be empty.
-       4. Let the Bayesian network being constructed, BN, begin with a single
+       4. Let the DAG network being constructed, BN, begin with a single
           class node, C.
        5. Repeat until S includes all domain features
        5.1. Select feature Xmax which is not in S and has the largest value
             I(Xmax;C).
        5.2. Add a node to BN representing Xmax.
        5.3. Add an arc from C to Xmax in BN.
        5.4. Add m = min(|S|, k) arcs from m distinct features Xj in S with
             the highest value for I(Xmax;Xj|C).
        5.5. Add Xmax to S.
        Compute the conditional probability inferred by the structure of BN
        by using counts from DB, and output BN.
        """
-
-       def add_m_edges(dag, idx, S_nodes, conditional_weights):
-           n_edges = min(self.k, len(S_nodes))
-           cond_w = conditional_weights.copy()
-           exit_cond = self.k == 0
-           num = 0
-           while not exit_cond:
-               max_minfo = np.argmax(cond_w[idx, :])
-               if (
-                   max_minfo in S_nodes
-                   and cond_w[idx, max_minfo] > self.theta
-               ):
-                   try:
-                       dag.add_edge(
-                           self.features_[max_minfo], self.features_[idx]
-                       )
-                       num += 1
-                   except ValueError:
-                       # Loops are not allowed
-                       pass
-               cond_w[idx, max_minfo] = -1
-               exit_cond = num == n_edges or np.all(cond_w[idx, :] <= 0)
-
        # 1. get the mutual information between each feature and the class
        mutual = mutual_info_classif(self.X_, self.y_, discrete_features=True)
        # 2. symmetric matrix where each element represents I(X, Y| class_node)
-       conditional_weights = TreeSearch(
-           self.dataset_
-       )._get_conditional_weights(
-           self.dataset_, self.class_name_, show_progress=self.show_progress
-       )
-       # 3.
+       metrics = CMetrics(
+           self.X_,
+           self.y_,
+           self.features_,
+           self.class_name_,
+           self.n_classes_,
+       )
+       conditional_weights = metrics.conditionalEdgeWeights(
+           self.n_features_in_ + 1
+       )
+       # 3. Let the used variable list, S, be empty.
        S_nodes = []
-       # 4.
-       dag = BayesianNetwork()
-       dag.add_node(self.class_name_)  # , state_names=self.classes_)
-       # 5. 5.1
-       for idx in np.argsort(mutual):
-           # 5.2
-           feature = self.features_[idx]
-           dag.add_node(feature)
-           # 5.3
-           dag.add_edge(self.class_name_, feature)
-           # 5.4
-           add_m_edges(dag, idx, S_nodes, conditional_weights)
-           # 5.5
-           S_nodes.append(idx)
-       self.dag_ = dag
+       num_states = {
+           feature: len(states)
+           for feature, states in kwargs["state_names"].items()
+       }
+       # 4. Let the DAG being constructed, BN, begin with a single class node
+       self.model_ = BayesNetwork()
+       self.model_.addNode(self.class_name_, self.n_classes_)
+       # 5. Repeat until S includes all domain features
+       # 5.1 Select feature Xmax which is not in S and has the largest value
+       for idx in np.argsort(-mutual):
+           # 5.2 Add a node to BN representing Xmax.
+           feature = self.feature_names_in_[idx]
+           self.model_.addNode(feature, num_states[feature])
+           # 5.3 Add an arc from C to Xmax in BN.
+           self.model_.addEdge(self.class_name_, feature)
+           # 5.4 Add m = min(|S|, k) arcs from m distinct features Xj in S
+           self._add_m_edges(idx, S_nodes, conditional_weights)
+           # 5.5 Add Xmax to S.
+           S_nodes.append(idx)
+       self.edges_ = []
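A toy sketch of the ranking-and-edge-selection loop (steps 5.1 to 5.5) in plain numpy, independent of the C++ CMetrics/BayesNetwork bindings; every number below is invented for illustration:

import numpy as np

k = 2
mutual = np.array([0.9, 0.2, 0.7])           # I(Xi;C) per feature
cond_mi = np.array([[0.0, 0.1, 0.5],
                    [0.1, 0.0, 0.3],
                    [0.5, 0.3, 0.0]])        # I(Xi;Xj|C), symmetric
S, edges = [], []
for idx in np.argsort(-mutual):              # 5.1: best I(Xi;C) first
    # 5.4: up to k parents among already-used features, by highest I(Xi;Xj|C)
    for j in sorted(S, key=lambda s: -cond_mi[idx, s])[:k]:
        edges.append((j, idx))
    S.append(idx)                            # 5.5
print(edges)                                 # [(0, 2), (2, 1), (0, 1)]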
-class AODE(BayesBase, BaseEnsemble):
-   def __init__(self, show_progress=False, random_state=None):
+def build_spodes(features, class_name):
+    """Build SPODE estimators (Super Parent One Dependent Estimator)"""
+    class_edges = [(class_name, f) for f in features]
+    for idx in range(len(features)):
+        feature_edges = [
+            (features[idx], f) for f in features if f != features[idx]
+        ]
+        feature_edges.extend(class_edges)
+        model = BayesianNetwork(feature_edges, show_progress=False)
+        yield model
+
+
+class SPODE(BayesBase):
+    def _check_params(self, X, y, kwargs):
+        expected_args = [
+            "class_name",
+            "features",
+            "state_names",
+            "sample_weight",
+            "weighted",
+        ]
+        return self._check_params_fit(X, y, expected_args, kwargs)
+
+
+class AODE(ClassifierMixin, BaseEnsemble):
+    def __init__(
+        self,
+        show_progress=False,
+        random_state=None,
+        estimator=None,
+    ):
+        self.show_progress = show_progress
+        self.random_state = random_state
+        super().__init__(estimator=estimator)
+
+    def _validate_estimator(self) -> None:
+        """Check the estimator and set the estimator_ attribute."""
+        super()._validate_estimator(
+            default=SPODE(
+                random_state=self.random_state,
+                show_progress=self.show_progress,
+            )
+        )
+
+    def fit(self, X, y, **kwargs):
+        self.n_features_in_ = X.shape[1]
+        self.feature_names_in_ = kwargs.get(
+            "features", default_feature_names(self.n_features_in_)
+        )
+        self.class_name_ = kwargs.get("class_name", "class")
+        # build estimator
+        self._validate_estimator()
+        self.X_ = X
+        self.y_ = y
+        self.n_samples_ = X.shape[0]
+        self.estimators_ = []
+        self._train(kwargs)
+        self.fitted_ = True
+        # To keep compatibility with the benchmark platform
+        self.nodes_leaves = self.nodes_edges
+        return self
+    def _train(self, kwargs):
+        for dag in build_spodes(self.feature_names_in_, self.class_name_):
+            estimator = clone(self.estimator_)
+            estimator.dag_ = estimator.model_ = dag
+            estimator.fit(self.X_, self.y_, **kwargs)
+            self.estimators_.append(estimator)
+
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        n_samples = X.shape[0]
+        n_estimators = len(self.estimators_)
+        result = np.empty((n_samples, n_estimators))
+        for index, estimator in enumerate(self.estimators_):
+            result[:, index] = estimator.predict(X)
+        return mode(result, axis=1, keepdims=False).mode.ravel()
+
+    def version(self):
+        if hasattr(self, "fitted_"):
+            return self.estimator_.version()
+        return SPODE(None, False).version()
+
+    @property
+    def states_(self):
+        if hasattr(self, "fitted_"):
+            return sum(
+                [
+                    len(item)
+                    for model in self.estimators_
+                    for _, item in model.model_.states.items()
+                ]
+            ) / len(self.estimators_)
+        return 0
+
+    @property
+    def depth_(self):
+        return self.states_
+
+    def nodes_edges(self):
+        nodes = 0
+        edges = 0
+        if hasattr(self, "fitted_"):
+            nodes = sum([len(x.dag_) for x in self.estimators_])
+            edges = sum([len(x.dag_.edges()) for x in self.estimators_])
+        return nodes, edges
+
+    def plot(self, title=""):
+        warnings.simplefilter("ignore", UserWarning)
+        for idx, model in enumerate(self.estimators_):
+            model.plot(title=f"{idx} {title}")
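The ensemble predict above is a plain majority vote across the SPODEs via scipy.stats.mode. A standalone sketch with invented votes:

import numpy as np
from scipy.stats import mode

# Three estimators voting on four samples (votes invented for illustration).
result = np.array([[0, 0, 1],
                   [1, 1, 1],
                   [2, 0, 0],
                   [1, 2, 2]])
y_pred = mode(result, axis=1, keepdims=False).mode.ravel()
print(y_pred)  # [0 1 0 2]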
+class TANNew(TAN):
+    def __init__(
+        self,
+        show_progress=False,
+        random_state=None,
+        discretizer_depth=1e6,
+        discretizer_length=3,
+        discretizer_cuts=0,
+    ):
+        self.discretizer_depth = discretizer_depth
+        self.discretizer_length = discretizer_length
+        self.discretizer_cuts = discretizer_cuts
        super().__init__(
            show_progress=show_progress, random_state=random_state
        )

-   def _check_params(self, X, y, kwargs):
-       expected_args = ["class_name", "features"]
-       return self._check_params_fit(X, y, expected_args, kwargs)
-
-   def _build(self):
-       self.dag_ = None
-
-   def _train(self):
-       """Build SPODE estimators (Super Parent One Dependent Estimator)"""
-       self.models_ = []
-       class_edges = [(self.class_name_, f) for f in self.features_]
-       for idx in range(len(self.features_)):
-           feature_edges = [
-               (self.features_[idx], f)
-               for f in self.features_
-               if f != self.features_[idx]
-           ]
-           feature_edges.extend(class_edges)
-           model = BayesianNetwork(
-               feature_edges, show_progress=self.show_progress
-           )
-           model.fit(
-               self.dataset_,
-               estimator=BayesianEstimator,
-               prior_type="K2",
-           )
-           self.models_.append(model)
-
-   def plot(self, title=""):
-       for idx, model in enumerate(self.models_):
-           self.model_ = model
-           super().plot(title=f"{idx} {title}")
+   def fit(self, X, y, **kwargs):
+       self.estimator_ = Proposal(self)
+       self.estimator_.fit(X, y, **kwargs)
+       return self
+
+   def predict(self, X):
+       return self.estimator_.predict(X)
+
+
+class KDBNew(KDB):
+    def __init__(
+        self,
+        k=2,
+        show_progress=False,
+        random_state=None,
+        discretizer_depth=1e6,
+        discretizer_length=3,
+        discretizer_cuts=0,
+    ):
+        self.discretizer_depth = discretizer_depth
+        self.discretizer_length = discretizer_length
+        self.discretizer_cuts = discretizer_cuts
+        super().__init__(
+            k=k, show_progress=show_progress, random_state=random_state
+        )
+
+    def fit(self, X, y, **kwargs):
+        self.estimator_ = Proposal(self)
+        self.estimator_.fit(X, y, **kwargs)
+        return self
+
+    def predict(self, X):
+        return self.estimator_.predict(X)
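The *New classes are thin delegators: fit hands everything to a Proposal wrapper (defined further below), which discretizes first and then calls the parent class's fit on the discretized data. A hedged usage sketch, with X, y and features as assumed inputs:

model = TANNew(random_state=17)
model.fit(X, y, features=features, class_name="class")  # delegates to Proposal
y_pred = model.predict(X)                                # also via Proposal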
+class SPODENew(SPODE):
+    """This class implements a classifier for the SPODE algorithm similar to
+    TANNew and KDBNew"""
+
+    def __init__(
+        self,
+        random_state,
+        show_progress,
+        discretizer_depth=1e6,
+        discretizer_length=3,
+        discretizer_cuts=0,
+    ):
+        super().__init__(
+            random_state=random_state, show_progress=show_progress
+        )
+        self.discretizer_depth = discretizer_depth
+        self.discretizer_length = discretizer_length
+        self.discretizer_cuts = discretizer_cuts
+
+
+class AODENew(AODE):
+    def __init__(
+        self,
+        random_state=None,
+        show_progress=False,
+        discretizer_depth=1e6,
+        discretizer_length=3,
+        discretizer_cuts=0,
+    ):
+        self.discretizer_depth = discretizer_depth
+        self.discretizer_length = discretizer_length
+        self.discretizer_cuts = discretizer_cuts
+        super().__init__(
+            random_state=random_state,
+            show_progress=show_progress,
+            estimator=Proposal(
+                SPODENew(
+                    random_state=random_state,
+                    show_progress=show_progress,
+                    discretizer_depth=discretizer_depth,
+                    discretizer_length=discretizer_length,
+                    discretizer_cuts=discretizer_cuts,
+                )
+            ),
+        )
+
+    def _train(self, kwargs):
+        for dag in build_spodes(self.feature_names_in_, self.class_name_):
+            proposal = clone(self.estimator_)
+            proposal.estimator.dag_ = proposal.estimator.model_ = dag
+            self.estimators_.append(proposal.fit(self.X_, self.y_, **kwargs))
+        self.n_estimators_ = len(self.estimators_)
    def predict(self, X: np.ndarray) -> np.ndarray:
        check_is_fitted(self, ["X_", "y_", "fitted_"])
        # Input validation
-       X = self._validate_data(X, reset=False)
-       n_samples = X.shape[0]
-       n_estimators = len(self.models_)
-       result = np.empty((n_samples, n_estimators))
-       dataset = pd.DataFrame(X, columns=self.features_, dtype="int16")
-       for index, model in enumerate(self.models_):
-           result[:, index] = model.predict(dataset).values.ravel()
+       X = check_array(X)
+       result = np.empty((X.shape[0], self.n_estimators_))
+       for index, model in enumerate(self.estimators_):
+           result[:, index] = model.predict(X)
+       return mode(result, axis=1, keepdims=False).mode.ravel()
+
+   @property
+   def states_(self):
+       if hasattr(self, "fitted_"):
+           return sum(
+               [
+                   len(item)
+                   for model in self.estimators_
+                   for _, item in model.estimator.model_.states.items()
+               ]
+           ) / len(self.estimators_)
+       return 0
+
+   @property
+   def depth_(self):
+       return self.states_
+
+   def nodes_edges(self):
+       nodes = 0
+       edges = 0
+       if hasattr(self, "fitted_"):
+           nodes = sum([len(x.estimator.dag_) for x in self.estimators_])
+           edges = sum(
+               [len(x.estimator.dag_.edges()) for x in self.estimators_]
+           )
+       return nodes, edges
+
+   def plot(self, title=""):
+       warnings.simplefilter("ignore", UserWarning)
+       for idx, model in enumerate(self.estimators_):
+           model.estimator.plot(title=f"{idx} {title}")
+
+   def version(self):
+       if hasattr(self, "fitted_"):
+           return self.estimator_.estimator.version()
+       return SPODENew(None, False).version()
+class Proposal(BaseEstimator):
+    def __init__(self, estimator):
+        self.estimator = estimator
+        self.class_type = estimator.__class__
+
+    def fit(self, X, y, **kwargs):
+        # Check parameters
+        self.estimator._check_params(X, y, kwargs)
+        # Discretize train data
+        self.discretizer_ = FImdlp(
+            n_jobs=1,
+            max_depth=self.estimator.discretizer_depth,
+            min_length=self.estimator.discretizer_length,
+            max_cuts=self.estimator.discretizer_cuts,
+        )
+        self.Xd = self.discretizer_.fit_transform(X, y)
+        kwargs = self.update_kwargs(y, kwargs)
+        # Build the model
+        super(self.class_type, self.estimator).fit(self.Xd, y, **kwargs)
+        # Local discretization based on the model
+        self._local_discretization()
+        # self.check_integrity("fit", self.Xd)
+        self.fitted_ = True
+        return self
+
+    def predict(self, X):
+        # Check if fit had been called
+        check_is_fitted(self, ["fitted_"])
+        # Input validation
+        X = check_array(X)
+        Xd = self.discretizer_.transform(X)
+        # self.check_integrity("predict", Xd)
+        return super(self.class_type, self.estimator).predict(Xd)
+
+    def update_kwargs(self, y, kwargs):
+        features = (
+            kwargs["features"]
+            if "features" in kwargs
+            else default_feature_names(self.Xd.shape[1])
+        )
+        states = {
+            features[i]: self.discretizer_.get_states_feature(i)
+            for i in range(self.Xd.shape[1])
+        }
+        class_name = (
+            kwargs["class_name"]
+            if "class_name" in kwargs
+            else self.estimator.default_class_name()
+        )
+        states[class_name] = np.unique(y).tolist()
+        kwargs["state_names"] = states
+        self.state_names_ = states
+        self.features_ = features
+        kwargs["features"] = features
+        kwargs["class_name"] = class_name
+        return kwargs
+
+    def _local_discretization(self):
+        """Discretize each feature with its fathers and the class"""
+        upgrade = False
+        # order of local discretization is important; plain 0, 1, 2... order
+        # is not valid
+        ancestral_order = list(nx.topological_sort(self.estimator.dag_))
+        for feature in ancestral_order:
+            if feature == self.estimator.class_name_:
+                continue
+            idx = self.estimator.indexed_features_[feature]
+            fathers = self.estimator.dag_.get_parents(feature)
+            if len(fathers) > 1:
+                # First remove the class name as it will be added later
+                fathers.remove(self.estimator.class_name_)
+                # Get the fathers indices
+                features = [
+                    self.estimator.indexed_features_[f] for f in fathers
+                ]
+                # Update the discretization of the feature
+                self.Xd[:, idx] = self.discretizer_.join_fit(
+                    # each feature has to use previous discretization data
+                    target=idx,
+                    features=features,
+                    data=self.Xd,
+                )
+                upgrade = True
+        if upgrade:
+            # Update the dataset
+            self.estimator.X_ = self.Xd
+            self.estimator.build_dataset()
+            self.state_names_ = {
+                key: self.discretizer_.get_states_feature(value)
+                for key, value in self.estimator.indexed_features_.items()
+            }
+            states = {"state_names": self.state_names_}
+            # Update the model
+            self.estimator.model_.fit(
+                self.estimator.dataset_,
+                estimator=BayesianEstimator,
+                prior_type="K2",
+                **states,
+            )
+
+    # def check_integrity(self, source, X):
+    #     # print(f"Checking integrity of {source} data")
+    #     for i in range(X.shape[1]):
+    #         if not set(np.unique(X[:, i]).tolist()).issubset(
+    #             set(self.state_names_[self.features_[i]])
+    #         ):
+    #             print(
+    #                 "i",
+    #                 i,
+    #                 "features[i]",
+    #                 self.features_[i],
+    #                 "np.unique(X[:, i])",
+    #                 np.unique(X[:, i]),
+    #                 "np.array(state_names[features[i]])",
+    #                 np.array(self.state_names_[self.features_[i]]),
+    #             )
+    #             raise ValueError("Discretization error")
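The walk in _local_discretization visits features in topological order so a parent is always re-discretized before any child that conditions on it. A hedged standalone sketch of just the ordering step, using networkx with an invented DAG:

import networkx as nx

# Toy DAG: class -> f0, class -> f1, f0 -> f1.
dag = nx.DiGraph([("class", "f0"), ("class", "f1"), ("f0", "f1")])
for feature in nx.topological_sort(dag):
    if feature == "class":
        continue
    parents = list(dag.predecessors(feature))
    print(feature, "discretized jointly with parents", parents)
# f0 discretized jointly with parents ['class']
# f1 discretized jointly with parents ['class', 'f0']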
+class BoostSPODE(BayesBase):
+    def _check_params(self, X, y, kwargs):
+        expected_args = [
+            "class_name",
+            "features",
+            "state_names",
+            "sample_weight",
+            "weighted",
+            "sparent",
+        ]
+        return self._check_params_fit(X, y, expected_args, kwargs)
+
+    def _build(self, _):
+        class_edges = [(self.class_name_, f) for f in self.feature_names_in_]
+        feature_edges = [
+            (self.sparent_, f)
+            for f in self.feature_names_in_
+            if f != self.sparent_
+        ]
+        feature_edges.extend(class_edges)
+        self.dag_ = DAG(feature_edges)
+
+    def _train(self, kwargs):
+        states = dict(state_names=kwargs.get("state_names", []))
+        self.model_ = BayesianNetwork(self.dag_.edges(), show_progress=False)
+        self.model_.fit(
+            self.dataset_,
+            estimator=BayesianEstimator,
+            prior_type="K2",
+            weighted=self.weighted_,
+            **states,
+        )
+class BoostAODE(ClassifierMixin, BaseEnsemble):
+    def __init__(
+        self,
+        show_progress=False,
+        random_state=None,
+        estimator=None,
+    ):
+        self.show_progress = show_progress
+        self.random_state = random_state
+        super().__init__(estimator=estimator)
+
+    def _validate_estimator(self) -> None:
+        """Check the estimator and set the estimator_ attribute."""
+        super()._validate_estimator(
+            default=BoostSPODE(
+                random_state=self.random_state,
+                show_progress=self.show_progress,
+            )
+        )
+
+    def fit(self, X, y, **kwargs):
+        self.n_features_in_ = X.shape[1]
+        self.feature_names_in_ = kwargs.get(
+            "features", default_feature_names(self.n_features_in_)
+        )
+        self.class_name_ = kwargs.get("class_name", "class")
+        self.X_ = X
+        self.y_ = y
+        self.n_samples_ = X.shape[0]
+        self.estimators_ = []
+        self._validate_estimator()
+        self._train(kwargs)
+        self.fitted_ = True
+        # To keep compatibility with the benchmark platform
+        self.nodes_leaves = self.nodes_edges
+        return self
+
+    def version(self):
+        if hasattr(self, "fitted_"):
+            return self.estimator_.version()
+        return SPODE(None, False).version()
+
+    @property
+    def states_(self):
+        if hasattr(self, "fitted_"):
+            return sum(
+                [
+                    len(item)
+                    for model in self.estimators_
+                    for _, item in model.model_.states.items()
+                ]
+            ) / len(self.estimators_)
+        return 0
+
+    @property
+    def depth_(self):
+        return self.states_
+
+    def nodes_edges(self):
+        nodes = 0
+        edges = 0
+        if hasattr(self, "fitted_"):
+            nodes = sum([len(x.dag_) for x in self.estimators_])
+            edges = sum([len(x.dag_.edges()) for x in self.estimators_])
+        return nodes, edges
+
+    def plot(self, title=""):
+        warnings.simplefilter("ignore", UserWarning)
+        for idx, model in enumerate(self.estimators_):
+            model.plot(title=f"{idx} {title}")
+
+    def _train(self, kwargs):
+        """Build boosted SPODEs"""
+        weights = [1 / self.n_samples_] * self.n_samples_
+        selected_features = []
+        # Step 0: Set the finish condition
+        for _ in range(self.n_features_in_):
+            # Step 1: Build ranking with mutual information
+            features = (
+                CSelectKBestWeighted(
+                    self.X_, self.y_, weights, k=self.n_features_in_
+                )
+                .fit()
+                .get_features()
+            )
+            # Step 1.1: Select the feature to become the sparent
+            for n_feature in features:
+                if n_feature not in selected_features:
+                    selected_features.append(n_feature)
+                    break
+            feature = self.feature_names_in_[n_feature]
+            # Step 2: Build & train spode with the first feature as sparent
+            estimator = clone(self.estimator_)
+            _args = kwargs.copy()
+            _args["sparent"] = feature
+            _args["sample_weight"] = weights
+            _args["weighted"] = True
+            # Step 2.1: build dataset
+            # Step 2.2: Train the model
+            estimator.fit(self.X_, self.y_, **_args)
+            # Step 3: Compute errors (epsilon sub m & alpha sub m)
+            # Explanation in https://medium.datadriveninvestor.com/understanding-adaboost-and-scikit-learns-algorithm-c8d8af5ace10
+            y_pred = estimator.predict(self.X_)
+            em = np.sum(weights * (y_pred != self.y_)) / np.sum(weights)
+            am = np.log((1 - em) / em) + np.log(estimator.n_classes_ - 1)
+            # Step 3.2: Update weights for next classifier
+            weights = [
+                wm * np.exp(am * (ym != yp))
+                for wm, ym, yp in zip(weights, self.y_, y_pred)
+            ]
+            # Step 4: Add the new model
+            self.estimators_.append(estimator)
+        self.weights_ = weights
+
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        n_samples = X.shape[0]
+        n_estimators = len(self.estimators_)
+        result = np.empty((n_samples, n_estimators))
+        for index, estimator in enumerate(self.estimators_):
+            result[:, index] = estimator.predict(X)
+        return mode(result, axis=1, keepdims=False).mode.ravel()
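The error and weight computation in _train above is the multi-class (SAMME) variant of the AdaBoost update. A worked numeric sketch, all values invented:

import numpy as np

# 4 samples, 3 classes, one misclassified sample (index 2).
weights = np.array([0.25, 0.25, 0.25, 0.25])
y_true = np.array([0, 1, 2, 1])
y_pred = np.array([0, 1, 1, 1])
em = np.sum(weights * (y_pred != y_true)) / np.sum(weights)  # 0.25
am = np.log((1 - em) / em) + np.log(3 - 1)                   # log(3) + log(2)
weights = weights * np.exp(am * (y_true != y_pred))
print(weights)  # [0.25 0.25 1.5 0.25]: the mistake now carries 6x the weight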
bayesclass/feature_selection.py (new file, 93 lines)
@@ -0,0 +1,93 @@
# import numpy as np
# from sklearn.feature_selection import mutual_info_classif
# from sklearn.utils.validation import check_X_y, check_is_fitted
# from sklearn.feature_selection._univariate_selection import (
#     _BaseFilter,
#     _clean_nans,
# )

# """
# Compute the weighted mutual information between each feature and the
# target.
# Based on
# Silviu Guiaşu,
# Weighted entropy,
# Reports on Mathematical Physics,
# Volume 2, Issue 3,
# 1971,
# Pages 165-179,
# ISSN 0034-4877,
# https://doi.org/10.1016/0034-4877(71)90002-4.
# (https://www.sciencedirect.com/science/article/pii/0034487771900024)
# Abstract: Weighted entropy is the measure of information supplied by a
# probabilistic experiment whose elementary events are characterized both by
# their objective probabilities and by some qualitative (objective or
# subjective) weights. The properties, the axiomatics and the maximum value
# of the weighted entropy are given.
# """


# class SelectKBestWeighted(_BaseFilter):
#     def __init__(self, *, k=10):
#         super().__init__(score_func=mutual_info_classif)
#         self.k = k

#     def _check_params(self, X, y):
#         if self.k > X.shape[1] or self.k < 1:
#             raise ValueError(
#                 f"k must be between 1 and {X.shape[1]} got {self.k}."
#             )

#     def _get_support_mask(self):
#         check_is_fitted(self)
#         if self.k == "all":
#             return np.ones(self.scores_.shape, dtype=bool)
#         elif self.k == 0:
#             return np.zeros(self.scores_.shape, dtype=bool)
#         else:
#             scores = _clean_nans(self.scores_)
#             mask = np.zeros(scores.shape, dtype=bool)

#             # Request a stable sort. Mergesort takes more memory (~40MB per
#             # megafeature on x86-64).
#             mask[np.argsort(scores, kind="mergesort")[-self.k :]] = 1
#             return mask

#     def fit(self, X, y, sample_weight):
#         self.X_, self.y_ = check_X_y(X, y)
#         self._check_params(X, y)
#         self.n_features_in_ = X.shape[1]
#         self.sample_weight_ = sample_weight
#         # Compute the entropy of the target variable
#         entropy_y = -np.sum(
#             np.multiply(
#                 np.bincount(y, weights=sample_weight),
#                 np.log(np.bincount(y, weights=sample_weight)),
#             )
#         )

#         # Compute the mutual information between each feature and the target
#         mi = self.score_func(X, y)

#         # Compute the weighted entropy of each feature
#         entropy_weighted = []
#         for i in range(X.shape[1]):
#             # Compute the weighted frequency of each unique value of the
#             # feature
#             freq_weighted = np.bincount(X[:, i], weights=sample_weight)
#             freq_weighted = freq_weighted[freq_weighted != 0]

#             # Compute the weighted entropy of the feature
#             entropy_weighted.append(
#                 -np.sum(np.multiply(freq_weighted, np.log(freq_weighted)))
#                 / np.sum(sample_weight)
#             )

#         # Compute the weighted mutual information between each feature and
#         # the target
#         mi_weighted = mi * entropy_weighted / entropy_y

#         # Return the weighted mutual information scores
#         self.scores_ = mi_weighted
#         return self
Binary file not shown (baseline test image). After: 55 KiB
Binary file not shown (baseline test image). Before: 50 KiB, After: 49 KiB
Binary file not shown (baseline test image). After: 49 KiB
Binary file not shown (baseline test image). After: 44 KiB
bayesclass/tests/conftest.py (new file, 38 lines)
@@ -0,0 +1,38 @@
import pytest
from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp


@pytest.fixture
def iris():
    dataset = load_iris()
    X = dataset["data"]
    y = dataset["target"]
    features = dataset["feature_names"]
    # To make the iris dataset have the same values as our iris.arff dataset
    patch = {(34, 3): (0.2, 0.1), (37, 1): (3.6, 3.1), (37, 2): (1.4, 1.5)}
    for key, value in patch.items():
        X[key] = value[1]
    return X, y, features


@pytest.fixture
def data(iris):
    return iris[0], iris[1]


@pytest.fixture
def features(iris):
    return iris[2]


@pytest.fixture
def class_name():
    return "class"


@pytest.fixture
def data_disc(data):
    clf = FImdlp()
    X, y = data
    return clf.fit_transform(X, y), y
@@ -1,6 +1,5 @@
import pytest
import numpy as np
-from sklearn.datasets import load_iris
from sklearn.preprocessing import KBinsDiscretizer
from matplotlib.testing.decorators import image_comparison
from matplotlib.testing.conftest import mpl_test_settings
@@ -10,28 +9,21 @@ from bayesclass.clfs import AODE
from .._version import __version__


-@pytest.fixture
-def data():
-    X, y = load_iris(return_X_y=True)
-    enc = KBinsDiscretizer(encode="ordinal")
-    return enc.fit_transform(X), y
-
-
@pytest.fixture
def clf():
-   return AODE()
+   return AODE(random_state=17)


-def test_AODE_default_hyperparameters(data, clf):
+def test_AODE_default_hyperparameters(data_disc, clf):
    # Test default values of hyperparameters
    assert not clf.show_progress
-   assert clf.random_state is None
-   clf = AODE(show_progress=True, random_state=17)
-   assert clf.show_progress
    assert clf.random_state == 17
-   clf.fit(*data)
+   clf = AODE(show_progress=True)
+   assert clf.show_progress
+   assert clf.random_state is None
+   clf.fit(*data_disc)
    assert clf.class_name_ == "class"
-   assert clf.features_ == [
+   assert clf.feature_names_in_ == [
        "feature_0",
        "feature_1",
        "feature_2",
@@ -42,50 +34,66 @@ def test_AODE_default_hyperparameters(data, clf):
@image_comparison(
    baseline_images=["line_dashes_AODE"], remove_text=True, extensions=["png"]
)
-def test_AODE_plot(data, clf):
+def test_AODE_plot(data_disc, features, clf):
    # mpl_test_settings will automatically clean these internal side effects
    mpl_test_settings
-   dataset = load_iris(as_frame=True)
-   clf.fit(*data, features=dataset["feature_names"])
+   clf.fit(*data_disc, features=features)
    clf.plot("AODE Iris")


-def test_AODE_version(clf):
+def test_AODE_version(clf, features, data_disc):
    """Check AODE version."""
    assert __version__ == clf.version()
+   clf.fit(*data_disc, features=features)
+   assert __version__ == clf.version()


-def test_AODE_nodes_leaves(clf):
-   assert clf.nodes_leaves() == (0, 0)
+def test_AODE_nodes_edges(clf, data_disc):
+   assert clf.nodes_edges() == (0, 0)
+   clf.fit(*data_disc)
+   assert clf.nodes_leaves() == (20, 28)


-def test_AODE_classifier(data, clf):
-   clf.fit(*data)
-   attribs = ["classes_", "X_", "y_", "features_", "class_name_"]
+def test_AODE_states(clf, data_disc):
+   assert clf.states_ == 0
+   clf.fit(*data_disc)
+   assert clf.states_ == 19
+   assert clf.depth_ == clf.states_
+
+
+def test_AODE_classifier(data_disc, clf):
+   clf.fit(*data_disc)
+   attribs = [
+       "feature_names_in_",
+       "class_name_",
+       "n_features_in_",
+       "X_",
+       "y_",
+   ]
    for attr in attribs:
        assert hasattr(clf, attr)
-   X = data[0]
-   y = data[1]
+   X = data_disc[0]
+   y = data_disc[1]
    y_pred = clf.predict(X)
    assert y_pred.shape == (X.shape[0],)
-   assert sum(y == y_pred) == 147
+   assert sum(y == y_pred) == 146


-def test_AODE_wrong_num_features(data, clf):
+def test_AODE_wrong_num_features(data_disc, clf):
    with pytest.raises(
        ValueError,
        match="Number of features does not match the number of columns in X",
    ):
-       clf.fit(*data, features=["feature_1", "feature_2"])
+       clf.fit(*data_disc, features=["feature_1", "feature_2"])


-def test_AODE_wrong_hyperparam(data, clf):
+def test_AODE_wrong_hyperparam(data_disc, clf):
    with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
-       clf.fit(*data, wrong_param="wrong_param")
+       clf.fit(*data_disc, wrong_param="wrong_param")


-def test_AODE_error_size_predict(data, clf):
-   X, y = data
+def test_AODE_error_size_predict(data_disc, clf):
+   X, y = data_disc
    clf.fit(X, y)
    with pytest.raises(ValueError):
        X_diff_size = np.ones((10, X.shape[1] + 1))
bayesclass/tests/test_AODENew.py (new file, 123 lines)
@@ -0,0 +1,123 @@
import pytest
import numpy as np
from matplotlib.testing.decorators import image_comparison
from matplotlib.testing.conftest import mpl_test_settings


from bayesclass.clfs import AODENew
from .._version import __version__


@pytest.fixture
def clf():
    return AODENew(random_state=17)


def test_AODENew_default_hyperparameters(data, clf):
    # Test default values of hyperparameters
    assert not clf.show_progress
    assert clf.random_state == 17
    clf = AODENew(show_progress=True)
    assert clf.show_progress
    assert clf.random_state is None
    clf.fit(*data)
    assert clf.class_name_ == "class"
    assert clf.feature_names_in_ == [
        "feature_0",
        "feature_1",
        "feature_2",
        "feature_3",
    ]


@image_comparison(
    baseline_images=["line_dashes_AODENew"],
    remove_text=True,
    extensions=["png"],
)
def test_AODENew_plot(data, features, clf):
    # mpl_test_settings will automatically clean these internal side effects
    mpl_test_settings
    clf.fit(*data, features=features)
    clf.plot("AODE Iris")


def test_AODENew_version(clf, data):
    """Check AODENew version."""
    assert __version__ == clf.version()
    clf.fit(*data)
    assert __version__ == clf.version()


def test_AODENew_nodes_edges(clf, data):
    assert clf.nodes_edges() == (0, 0)
    clf.fit(*data)
    assert clf.nodes_leaves() == (20, 28)


def test_AODENew_states(clf, data):
    assert clf.states_ == 0
    clf.fit(*data)
    assert clf.states_ == 17.75
    assert clf.depth_ == clf.states_


def test_AODENew_classifier(data, clf):
    clf.fit(*data)
    attribs = [
        "feature_names_in_",
        "class_name_",
        "n_features_in_",
        "X_",
        "y_",
    ]
    for attr in attribs:
        assert hasattr(clf, attr)
    X = data[0]
    y = data[1]
    y_pred = clf.predict(X)
    assert y_pred.shape == (X.shape[0],)
    assert sum(y == y_pred) == 146


def test_AODENew_local_discretization(clf, data_disc):
    expected_data = [
        [-1, [0, -1], [0, -1], [0, -1]],
        [[1, -1], -1, [1, -1], [1, -1]],
        [[2, -1], [2, -1], -1, [2, -1]],
        [[3, -1], [3, -1], [3, -1], -1],
    ]
    clf.fit(*data_disc)
    for idx, estimator in enumerate(clf.estimators_):
        expected = expected_data[idx]
        for feature in range(4):
            computed = estimator.discretizer_.target_[feature]
            if type(computed) == list:
                for j, k in zip(expected[feature], computed):
                    assert j == k
            else:
                assert (
                    expected[feature]
                    == estimator.discretizer_.target_[feature]
                )


def test_AODENew_wrong_num_features(data, clf):
    with pytest.raises(
        ValueError,
        match="Number of features does not match the number of columns in X",
    ):
        clf.fit(*data, features=["feature_1", "feature_2"])


def test_AODENew_wrong_hyperparam(data, clf):
    with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
        clf.fit(*data, wrong_param="wrong_param")


def test_AODENew_error_size_predict(data, clf):
    X, y = data
    clf.fit(X, y)
    with pytest.raises(ValueError):
        X_diff_size = np.ones((10, X.shape[1] + 1))
        clf.predict(X_diff_size)
bayesclass/tests/test_BoostAODE.py (new file, 100 lines)
@@ -0,0 +1,100 @@
import pytest
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
from matplotlib.testing.decorators import image_comparison
from matplotlib.testing.conftest import mpl_test_settings


from bayesclass.clfs import BoostAODE
from .._version import __version__


@pytest.fixture
def clf():
    return BoostAODE(random_state=17)


def test_BoostAODE_default_hyperparameters(data_disc, clf):
    # Test default values of hyperparameters
    assert not clf.show_progress
    assert clf.random_state == 17
    clf = BoostAODE(show_progress=True)
    assert clf.show_progress
    assert clf.random_state is None
    clf.fit(*data_disc)
    assert clf.class_name_ == "class"
    assert clf.feature_names_in_ == [
        "feature_0",
        "feature_1",
        "feature_2",
        "feature_3",
    ]


# @image_comparison(
#     baseline_images=["line_dashes_AODE"], remove_text=True,
#     extensions=["png"]
# )
# def test_BoostAODE_plot(data_disc, features, clf):
#     # mpl_test_settings will automatically clean these internal side effects
#     mpl_test_settings
#     clf.fit(*data_disc, features=features)
#     clf.plot("AODE Iris")


# def test_BoostAODE_version(clf, features, data_disc):
#     """Check AODE version."""
#     assert __version__ == clf.version()
#     clf.fit(*data_disc, features=features)
#     assert __version__ == clf.version()


# def test_BoostAODE_nodes_edges(clf, data_disc):
#     assert clf.nodes_edges() == (0, 0)
#     clf.fit(*data_disc)
#     assert clf.nodes_leaves() == (20, 28)


# def test_BoostAODE_states(clf, data_disc):
#     assert clf.states_ == 0
#     clf.fit(*data_disc)
#     assert clf.states_ == 19
#     assert clf.depth_ == clf.states_


# def test_BoostAODE_classifier(data_disc, clf):
#     clf.fit(*data_disc)
#     attribs = [
#         "feature_names_in_",
#         "class_name_",
#         "n_features_in_",
#         "X_",
#         "y_",
#     ]
#     for attr in attribs:
#         assert hasattr(clf, attr)
#     X = data_disc[0]
#     y = data_disc[1]
#     y_pred = clf.predict(X)
#     assert y_pred.shape == (X.shape[0],)
#     assert sum(y == y_pred) == 146


# def test_BoostAODE_wrong_num_features(data_disc, clf):
#     with pytest.raises(
#         ValueError,
#         match="Number of features does not match the number of columns in X",
#     ):
#         clf.fit(*data_disc, features=["feature_1", "feature_2"])


# def test_BoostAODE_wrong_hyperparam(data_disc, clf):
#     with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
#         clf.fit(*data_disc, wrong_param="wrong_param")


# def test_BoostAODE_error_size_predict(data_disc, clf):
#     X, y = data_disc
#     clf.fit(X, y)
#     with pytest.raises(ValueError):
#         X_diff_size = np.ones((10, X.shape[1] + 1))
#         clf.predict(X_diff_size)
@@ -1,28 +1,21 @@
|
|||||||
import pytest
|
import pytest
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.datasets import load_iris
|
|
||||||
from sklearn.preprocessing import KBinsDiscretizer
|
from sklearn.preprocessing import KBinsDiscretizer
|
||||||
from matplotlib.testing.decorators import image_comparison
|
from matplotlib.testing.decorators import image_comparison
|
||||||
from matplotlib.testing.conftest import mpl_test_settings
|
from matplotlib.testing.conftest import mpl_test_settings
|
||||||
|
from pgmpy.models import BayesianNetwork
|
||||||
|
|
||||||
|
|
||||||
from bayesclass.clfs import KDB
|
from bayesclass.clfs import KDB
|
||||||
from .._version import __version__
|
from .._version import __version__
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def data():
|
|
||||||
X, y = load_iris(return_X_y=True)
|
|
||||||
enc = KBinsDiscretizer(encode="ordinal")
|
|
||||||
return enc.fit_transform(X), y
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def clf():
|
def clf():
|
||||||
return KDB(k=3)
|
return KDB(k=3, show_progress=False)
|
||||||
|
|
||||||
|
|
||||||
def test_KDB_default_hyperparameters(data, clf):
|
def test_KDB_default_hyperparameters(data_disc, clf):
|
||||||
# Test default values of hyperparameters
|
# Test default values of hyperparameters
|
||||||
assert not clf.show_progress
|
assert not clf.show_progress
|
||||||
assert clf.random_state is None
|
assert clf.random_state is None
|
||||||
@@ -31,9 +24,9 @@ def test_KDB_default_hyperparameters(data, clf):
|
|||||||
assert clf.show_progress
|
assert clf.show_progress
|
||||||
assert clf.random_state == 17
|
assert clf.random_state == 17
|
||||||
assert clf.k == 3
|
assert clf.k == 3
|
||||||
clf.fit(*data)
|
clf.fit(*data_disc)
|
||||||
assert clf.class_name_ == "class"
|
assert clf.class_name_ == "class"
|
||||||
assert clf.features_ == [
|
assert clf.feature_names_in_ == [
|
||||||
"feature_0",
|
"feature_0",
|
||||||
"feature_1",
|
"feature_1",
|
||||||
"feature_2",
|
"feature_2",
|
||||||
@@ -46,49 +39,85 @@ def test_KDB_version(clf):
     assert __version__ == clf.version()


-def test_KDB_nodes_leaves(clf):
-    assert clf.nodes_leaves() == (0, 0)
+def test_KDB_nodes_edges(clf, data_disc):
+    assert clf.nodes_edges() == (0, 0)
+    clf.fit(*data_disc)
+    assert clf.nodes_leaves() == (5, 9)


-def test_KDB_classifier(data, clf):
-    clf.fit(*data)
-    attribs = ["classes_", "X_", "y_", "features_", "class_name_"]
+def test_KDB_states(clf, data_disc):
+    assert clf.states_ == 0
+    clf.fit(*data_disc)
+    assert clf.states_ == 19
+    assert clf.depth_ == clf.states_
+
+
+def test_KDB_classifier(data_disc, clf):
+    clf.fit(*data_disc)
+    attribs = ["classes_", "X_", "y_", "feature_names_in_", "class_name_"]
     for attr in attribs:
         assert hasattr(clf, attr)
-    X = data[0]
-    y = data[1]
+    X = data_disc[0]
+    y = data_disc[1]
     y_pred = clf.predict(X)
     assert y_pred.shape == (X.shape[0],)
-    assert sum(y == y_pred) == 148
+    assert sum(y == y_pred) == 146
+
+
+def test_KDB_classifier_weighted(data_disc, clf):
+    sample_weight = [1] * data_disc[0].shape[0]
+    sample_weight[:50] = [0] * 50
+    clf.fit(*data_disc, sample_weight=sample_weight, weighted=True)
+    assert clf.score(*data_disc) == 0.64


 @image_comparison(
     baseline_images=["line_dashes_KDB"], remove_text=True, extensions=["png"]
 )
-def test_KDB_plot(data, clf):
+def test_KDB_plot(data_disc, features, clf):
     # mpl_test_settings will automatically clean these internal side effects
     mpl_test_settings
-    dataset = load_iris(as_frame=True)
-    clf.fit(*data, features=dataset["feature_names"])
+    clf.fit(*data_disc, features=features)
     clf.plot("KDB Iris")


-def test_KDB_wrong_num_features(data, clf):
+def test_KDB_wrong_num_features(data_disc, clf):
     with pytest.raises(
         ValueError,
         match="Number of features does not match the number of columns in X",
     ):
-        clf.fit(*data, features=["feature_1", "feature_2"])
+        clf.fit(*data_disc, features=["feature_1", "feature_2"])


-def test_KDB_wrong_hyperparam(data, clf):
+def test_KDB_wrong_hyperparam(data_disc, clf):
     with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
-        clf.fit(*data, wrong_param="wrong_param")
+        clf.fit(*data_disc, wrong_param="wrong_param")


-def test_KDB_error_size_predict(data, clf):
-    X, y = data
+def test_KDB_error_size_predict(data_disc, clf):
+    X, y = data_disc
     clf.fit(X, y)
     with pytest.raises(ValueError):
         X_diff_size = np.ones((10, X.shape[1] + 1))
         clf.predict(X_diff_size)
+
+
+def test_KDB_dont_do_cycles():
+    clf = KDB(k=4)
+    dag = BayesianNetwork(show_progress=False)
+    clf.feature_names_in_ = [
+        "feature_0",
+        "feature_1",
+        "feature_2",
+        "feature_3",
+    ]
+    nodes = list(range(4))
+    weights = np.ones((4, 4))
+    for idx in range(1, 4):
+        dag.add_edge(clf.feature_names_in_[0], clf.feature_names_in_[idx])
+    dag.add_edge(clf.feature_names_in_[1], clf.feature_names_in_[2])
+    dag.add_edge(clf.feature_names_in_[1], clf.feature_names_in_[3])
+    dag.add_edge(clf.feature_names_in_[2], clf.feature_names_in_[3])
+    for idx in range(4):
+        clf._add_m_edges(dag, idx, nodes, weights)
+    assert len(dag.edges()) == 6
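Two details worth noting in the new tests above. First, `test_KDB_classifier_weighted` zeroes the sample weights of the first 50 iris rows (all of class 0), and the expected score of 0.64 is 96/150: the classifier still resolves most of the two remaining classes. Second, `test_KDB_dont_do_cycles` saturates a 4-node DAG with all 6 forward edges before calling `_add_m_edges`; any further edge would close a cycle, so the edge count must stay at 6. A standalone illustration of that invariant with networkx (a sketch, not repository code):

import networkx as nx

# The test's DAG: every edge i -> j with i < j over 4 nodes (6 edges, acyclic)
dag = nx.DiGraph([(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)])
assert nx.is_directed_acyclic_graph(dag)

# Any additional edge must point "backwards" and therefore closes a cycle
dag.add_edge(3, 1)  # 1 -> 3 already exists, so 3 -> 1 creates a cycle
assert not nx.is_directed_acyclic_graph(dag)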
132
bayesclass/tests/test_KDBNew.py
Normal file
@@ -0,0 +1,132 @@
import pytest
import numpy as np
from matplotlib.testing.decorators import image_comparison
from matplotlib.testing.conftest import mpl_test_settings
from pgmpy.models import BayesianNetwork


from bayesclass.clfs import KDBNew
from .._version import __version__


@pytest.fixture
def clf():
    return KDBNew(k=3, show_progress=False)


def test_KDBNew_default_hyperparameters(data, clf):
    # Test default values of hyperparameters
    assert not clf.show_progress
    assert clf.random_state is None
    assert clf.theta == 0.03
    clf = KDBNew(show_progress=True, random_state=17, k=3)
    assert clf.show_progress
    assert clf.random_state == 17
    assert clf.k == 3
    clf.fit(*data)
    assert clf.class_name_ == "class"
    assert clf.feature_names_in_ == [
        "feature_0",
        "feature_1",
        "feature_2",
        "feature_3",
    ]


def test_KDBNew_version(clf):
    """Check KDBNew version."""
    assert __version__ == clf.version()


def test_KDBNew_nodes_edges(clf, data):
    assert clf.nodes_edges() == (0, 0)
    clf.fit(*data)
    assert clf.nodes_leaves() == (5, 9)


def test_KDBNew_states(clf, data):
    assert clf.states_ == 0
    clf.fit(*data)
    assert clf.states_ == 22
    assert clf.depth_ == clf.states_


def test_KDBNew_classifier(data, clf):
    clf.fit(*data)
    attribs = ["classes_", "X_", "y_", "feature_names_in_", "class_name_"]
    for attr in attribs:
        assert hasattr(clf, attr)
    X = data[0]
    y = data[1]
    y_pred = clf.predict(X)
    assert y_pred.shape == (X.shape[0],)
    assert sum(y == y_pred) == 145


def test_KDBNew_local_discretization(clf, data):
    expected = [[1, -1], -1, [0, 1, 3, -1], [1, -1]]
    clf.fit(*data)
    for feature in range(4):
        computed = clf.estimator_.discretizer_.target_[feature]
        if type(computed) == list:
            for j, k in zip(expected[feature], computed):
                assert j == k
        else:
            assert (
                expected[feature]
                == clf.estimator_.discretizer_.target_[feature]
            )


@image_comparison(
    baseline_images=["line_dashes_KDBNew"],
    remove_text=True,
    extensions=["png"],
)
def test_KDBNew_plot(data, features, class_name, clf):
    # mpl_test_settings will automatically clean these internal side effects
    mpl_test_settings
    clf.fit(*data, features=features, class_name=class_name)
    clf.plot("KDBNew Iris")


def test_KDBNew_wrong_num_features(data, clf):
    with pytest.raises(
        ValueError,
        match="Number of features does not match the number of columns in X",
    ):
        clf.fit(*data, features=["feature_1", "feature_2"])


def test_KDBNew_wrong_hyperparam(data, clf):
    with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
        clf.fit(*data, wrong_param="wrong_param")


def test_KDBNew_error_size_predict(data, clf):
    X, y = data
    clf.fit(X, y)
    with pytest.raises(ValueError):
        X_diff_size = np.ones((10, X.shape[1] + 1))
        clf.predict(X_diff_size)


def test_KDBNew_dont_do_cycles():
    clf = KDBNew(k=4)
    dag = BayesianNetwork(show_progress=False)
    clf.feature_names_in_ = [
        "feature_0",
        "feature_1",
        "feature_2",
        "feature_3",
    ]
    nodes = list(range(4))
    weights = np.ones((4, 4))
    for idx in range(1, 4):
        dag.add_edge(clf.feature_names_in_[0], clf.feature_names_in_[idx])
    dag.add_edge(clf.feature_names_in_[1], clf.feature_names_in_[2])
    dag.add_edge(clf.feature_names_in_[1], clf.feature_names_in_[3])
    dag.add_edge(clf.feature_names_in_[2], clf.feature_names_in_[3])
    for idx in range(4):
        clf._add_m_edges(dag, idx, nodes, weights)
    assert len(dag.edges()) == 6
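Note on `test_KDBNew_local_discretization`: the `expected` entries appear to encode, per feature, the variables against which fimdlp locally re-discretizes it, with `-1` presumably denoting the class variable and non-negative integers parent feature indices; this reading is inferred from the test data, not documented anywhere in this diff.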
bayesclass/tests/test_TAN.py
@@ -1,7 +1,5 @@
 import pytest
 import numpy as np
-from sklearn.datasets import load_iris
-from sklearn.preprocessing import KBinsDiscretizer
 from matplotlib.testing.decorators import image_comparison
 from matplotlib.testing.conftest import mpl_test_settings

@@ -10,29 +8,22 @@ from bayesclass.clfs import TAN
 from .._version import __version__


-@pytest.fixture
-def data():
-    X, y = load_iris(return_X_y=True)
-    enc = KBinsDiscretizer(encode="ordinal")
-    return enc.fit_transform(X), y
-
-
 @pytest.fixture
 def clf():
-    return TAN()
+    return TAN(random_state=17, show_progress=False)


-def test_TAN_default_hyperparameters(data, clf):
+def test_TAN_default_hyperparameters(data_disc, clf):
     # Test default values of hyperparameters
     assert not clf.show_progress
-    assert clf.random_state is None
-    clf = TAN(show_progress=True, random_state=17)
-    assert clf.show_progress
     assert clf.random_state == 17
-    clf.fit(*data)
+    clf = TAN(show_progress=True)
+    assert clf.show_progress
+    assert clf.random_state is None
+    clf.fit(*data_disc)
     assert clf.head_ == 0
     assert clf.class_name_ == "class"
-    assert clf.features_ == [
+    assert clf.feature_names_in_ == [
         "feature_0",
         "feature_1",
         "feature_2",
@@ -45,59 +36,73 @@ def test_TAN_version(clf):
     assert __version__ == clf.version()


-def test_TAN_nodes_leaves(clf):
-    assert clf.nodes_leaves() == (0, 0)
+def test_TAN_nodes_edges(clf, data_disc):
+    assert clf.nodes_edges() == (0, 0)
+    clf.fit(*data_disc, head="random")
+    assert clf.nodes_leaves() == (5, 7)


-def test_TAN_random_head(data):
-    clf = TAN(random_state=17)
-    clf.fit(*data, head="random")
+def test_TAN_states(clf, data_disc):
+    assert clf.states_ == 0
+    clf.fit(*data_disc)
+    assert clf.states_ == 19
+    assert clf.depth_ == clf.states_
+
+
+def test_TAN_random_head(clf, data_disc):
+    clf.fit(*data_disc, head="random")
     assert clf.head_ == 3


-def test_TAN_classifier(data, clf):
-    clf.fit(*data)
-    attribs = ["classes_", "X_", "y_", "head_", "features_", "class_name_"]
+def test_TAN_classifier(data_disc, clf):
+    clf.fit(*data_disc)
+    attribs = [
+        "classes_",
+        "X_",
+        "y_",
+        "head_",
+        "feature_names_in_",
+        "class_name_",
+    ]
     for attr in attribs:
         assert hasattr(clf, attr)
-    X = data[0]
-    y = data[1]
+    X = data_disc[0]
+    y = data_disc[1]
     y_pred = clf.predict(X)
     assert y_pred.shape == (X.shape[0],)
-    assert sum(y == y_pred) == 147
+    assert sum(y == y_pred) == 146


 @image_comparison(
     baseline_images=["line_dashes_TAN"], remove_text=True, extensions=["png"]
 )
-def test_TAN_plot(data, clf):
+def test_TAN_plot(data_disc, features, clf):
     # mpl_test_settings will automatically clean these internal side effects
     mpl_test_settings
-    dataset = load_iris(as_frame=True)
-    clf.fit(*data, features=dataset["feature_names"], head=0)
+    clf.fit(*data_disc, features=features, head=0)
     clf.plot("TAN Iris head=0")


-def test_KDB_wrong_num_features(data, clf):
+def test_TAN_wrong_num_features(data_disc, clf):
     with pytest.raises(
         ValueError,
         match="Number of features does not match the number of columns in X",
     ):
-        clf.fit(*data, features=["feature_1", "feature_2"])
+        clf.fit(*data_disc, features=["feature_1", "feature_2"])


-def test_TAN_wrong_hyperparam(data, clf):
+def test_TAN_wrong_hyperparam(data_disc, clf):
     with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
-        clf.fit(*data, wrong_param="wrong_param")
+        clf.fit(*data_disc, wrong_param="wrong_param")


-def test_TAN_head_out_of_range(data, clf):
+def test_TAN_head_out_of_range(data_disc, clf):
     with pytest.raises(ValueError, match="Head index out of range"):
-        clf.fit(*data, head=4)
+        clf.fit(*data_disc, head=4)


-def test_TAN_error_size_predict(data, clf):
-    X, y = data
+def test_TAN_error_size_predict(data_disc, clf):
+    X, y = data_disc
     clf.fit(X, y)
     with pytest.raises(ValueError):
         X_diff_size = np.ones((10, X.shape[1] + 1))
120
bayesclass/tests/test_TANNew.py
Normal file
@@ -0,0 +1,120 @@
import pytest
import numpy as np
from matplotlib.testing.decorators import image_comparison
from matplotlib.testing.conftest import mpl_test_settings


from bayesclass.clfs import TANNew
from .._version import __version__


@pytest.fixture
def clf():
    return TANNew(random_state=17)


def test_TANNew_default_hyperparameters(data, clf):
    # Test default values of hyperparameters
    assert not clf.show_progress
    assert clf.random_state == 17
    clf = TANNew(show_progress=True)
    assert clf.show_progress
    assert clf.random_state is None
    clf.fit(*data)
    assert clf.head_ == 0
    assert clf.class_name_ == "class"
    assert clf.feature_names_in_ == [
        "feature_0",
        "feature_1",
        "feature_2",
        "feature_3",
    ]


def test_TANNew_version(clf):
    """Check TANNew version."""
    assert __version__ == clf.version()


def test_TANNew_nodes_edges(clf, data):
    assert clf.nodes_edges() == (0, 0)
    clf.fit(*data, head="random")
    assert clf.nodes_leaves() == (5, 7)


def test_TANNew_states(clf, data):
    assert clf.states_ == 0
    clf.fit(*data)
    assert clf.states_ == 18
    assert clf.depth_ == clf.states_


def test_TANNew_random_head(clf, data):
    clf.fit(*data, head="random")
    assert clf.head_ == 3


def test_TANNew_local_discretization(clf, data):
    expected = [-1, [0, -1], [0, -1], [1, -1]]
    clf.fit(*data)
    for feature in range(4):
        assert (
            expected[feature] == clf.estimator_.discretizer_.target_[feature]
        )


def test_TANNew_classifier(data, clf):
    clf.fit(*data)
    attribs = [
        "classes_",
        "X_",
        "y_",
        "head_",
        "feature_names_in_",
        "class_name_",
    ]
    for attr in attribs:
        assert hasattr(clf, attr)
    X = data[0]
    y = data[1]
    y_pred = clf.predict(X)
    assert y_pred.shape == (X.shape[0],)
    assert sum(y == y_pred) == 146


@image_comparison(
    baseline_images=["line_dashes_TANNew"],
    remove_text=True,
    extensions=["png"],
)
def test_TANNew_plot(data, features, clf):
    # mpl_test_settings will automatically clean these internal side effects
    mpl_test_settings
    clf.fit(*data, features=features, head=0)
    clf.plot("TANNew Iris head=0")


def test_TANNew_wrong_num_features(data, clf):
    with pytest.raises(
        ValueError,
        match="Number of features does not match the number of columns in X",
    ):
        clf.fit(*data, features=["feature_1", "feature_2"])


def test_TANNew_wrong_hyperparam(data, clf):
    with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
        clf.fit(*data, wrong_param="wrong_param")


def test_TANNew_head_out_of_range(data, clf):
    with pytest.raises(ValueError, match="Head index out of range"):
        clf.fit(*data, head=4)


def test_TANNew_error_size_predict(data, clf):
    X, y = data
    clf.fit(X, y)
    with pytest.raises(ValueError):
        X_diff_size = np.ones((10, X.shape[1] + 1))
        clf.predict(X_diff_size)
@@ -1,14 +1,29 @@
 import pytest
+import numpy as np

 from sklearn.utils.estimator_checks import check_estimator

-from bayesclass.clfs import TAN, KDB, AODE
+from bayesclass.clfs import BayesBase, TAN, KDB, AODE


-@pytest.mark.parametrize("estimator", [TAN(), KDB(k=2), AODE()])
-# @pytest.mark.parametrize("estimator", [AODE()])
-def test_all_estimators(estimator):
+def test_more_tags():
+    expected = {
+        "requires_positive_X": True,
+        "requires_positive_y": True,
+        "preserve_dtype": [np.int32, np.int64],
+        "requires_y": True,
+    }
+    clf = BayesBase(None, True)
+    computed = clf._more_tags()
+    for key, value in expected.items():
+        assert key in computed
+        assert computed[key] == value
+
+
+# @pytest.mark.parametrize("estimators", [TAN(), KDB(k=2), AODE()])
+@pytest.mark.parametrize("estimators", [AODE()])
+def test_all_estimators(estimators):
     i = 0
-    for estimator, test in check_estimator(estimator, generate_only=True):
+    for estimator, test in check_estimator(estimators, generate_only=True):
         print(i := i + 1, test)
         # test(estimator)
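`_more_tags` is scikit-learn's estimator-tags hook: `BaseEstimator._get_tags()` merges each class's `_more_tags()` into the default tag set, which is exactly what the new `test_more_tags` asserts key by key. A quick interactive check (the `BayesBase(None, True)` construction is copied from the test; `_get_tags` is the private sklearn API of this sklearn generation):

from bayesclass.clfs import BayesBase

clf = BayesBase(None, True)
tags = clf._get_tags()  # sklearn defaults merged with BayesBase._more_tags()
assert tags["requires_y"] is True
assert tags["requires_positive_X"] is True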
32
patch_pgmpy_0.1.22.diff
Normal file
@@ -0,0 +1,32 @@
diff --git a/pgmpy/models/BayesianNetwork.py b/pgmpy/models/BayesianNetwork.py
index bd90122d..70ae38f7 100644
--- a/pgmpy/models/BayesianNetwork.py
+++ b/pgmpy/models/BayesianNetwork.py
@@ -27,7 +27,7 @@ class BayesianNetwork(DAG):
     Base class for Bayesian Models.
     """

-    def __init__(self, ebunch=None, latents=set()):
+    def __init__(self, ebunch=None, latents=set(), show_progress=False):
         """
         Initializes a Bayesian Model.
         A models stores nodes and edges with conditional probability
@@ -95,6 +95,7 @@ class BayesianNetwork(DAG):
         >>> len(G)  # number of nodes in graph
         3
         """
+        self.show_progress = show_progress
         super(BayesianNetwork, self).__init__(ebunch=ebunch, latents=latents)
         self.cpds = []
         self.cardinalities = defaultdict(int)
@@ -738,7 +739,9 @@ class BayesianNetwork(DAG):
                 show_progress=False,
             )
             for index, data_point in tqdm(
-                data_unique.iterrows(), total=data_unique.shape[0]
+                data_unique.iterrows(),
+                total=data_unique.shape[0],
+                disable=not self.show_progress,
             )
         )
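This vendored patch is what makes the `BayesianNetwork(show_progress=False)` calls in the test files above legal: it adds a `show_progress` flag to the pgmpy 0.1.22 constructor and uses it to disable the tqdm bar during prediction. The intended effect, once the patch has been applied to an installed pgmpy 0.1.22:

from pgmpy.models import BayesianNetwork

# Only valid with patch_pgmpy_0.1.22.diff applied to pgmpy 0.1.22
model = BayesianNetwork(show_progress=False)  # no progress bar in predict()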
pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools", "setuptools-scm", "wheel"]
+requires = ["setuptools", "setuptools-scm", "cython", "wheel", "torch"]
 build-backend = "setuptools.build_meta"

 [tool.setuptools]
@@ -25,6 +25,7 @@ dependencies = [
     "pgmpy",
     "networkx",
     "matplotlib",
+    "fimdlp",
 ]
 requires-python = ">=3.8"
 classifiers = [
@@ -38,9 +39,7 @@ classifiers = [
     "Operating System :: OS Independent",
     "Programming Language :: Python",
     "Programming Language :: Python",
-    "Programming Language :: Python :: 3.8",
-    "Programming Language :: Python :: 3.9",
-    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
 ]

 [project.optional-dependencies]
@@ -60,7 +59,7 @@ show_missing = true

 [tool.black]
 line-length = 79
-target_version = ['py38', 'py39', 'py310']
+target_version = ['py311']
 include = '\.pyi?$'
 exclude = '''
 /(
@@ -1,5 +1,6 @@
 numpy
 scipy
+pandas
 scikit-learn
 matplotlib
 networkx
41
setup.py
Normal file
@@ -0,0 +1,41 @@
"""
Calling
$python setup.py build_ext --inplace
will build the extension library in the current file.
"""

from setuptools import Extension, setup
from torch.utils.cpp_extension import (
    BuildExtension,
    CppExtension,
    include_paths,
)


setup(
    ext_modules=[
        Extension(
            name="bayesclass.cppSelectFeatures",
            sources=[
                "bayesclass/cSelectFeatures.pyx",
                "bayesclass/FeatureSelect.cpp",
            ],
            language="c++",
            include_dirs=["bayesclass"],
            extra_compile_args=[
                "-std=c++17",
            ],
        ),
        CppExtension(
            name="bayesclass.BayesNet",
            sources=[
                "bayesclass/BayesNetwork.pyx",
                "bayesclass/Network.cc",
                "bayesclass/Node.cc",
                "bayesclass/Metrics.cc",
            ],
            include_dirs=include_paths(),
        ),
    ],
    cmdclass={"build_ext": BuildExtension},
)
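Per the module docstring, the extensions are built in place with `python setup.py build_ext --inplace`; torch must therefore be importable at build time, which is why it now appears in the `[build-system]` requires of pyproject.toml above. After a successful build, the two modules should import like any other (module names taken from the `name=` arguments; usage beyond import is not shown in this diff):

# Built by: python setup.py build_ext --inplace
import bayesclass.cppSelectFeatures  # Cython + FeatureSelect.cpp extension
import bayesclass.BayesNet  # torch CppExtension wrapping Network/Node/Metrics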
|
Reference in New Issue
Block a user