Complete CFS tested with Python mufs

This commit is contained in:
Ricardo Montañana Gómez 2023-10-13 12:29:25 +02:00
parent 40d1dad5d8
commit 5022a4dc90
Signed by: rmontanana
GPG Key ID: 46064262FD9A7ADE
8 changed files with 123 additions and 35 deletions

18
.vscode/c_cpp_properties.json vendored Normal file
View File

@ -0,0 +1,18 @@
{
"configurations": [
{
"name": "Mac",
"includePath": [
"${workspaceFolder}/**"
],
"defines": [],
"macFrameworkPath": [
"/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks"
],
"cStandard": "c17",
"cppStandard": "c++17",
"compileCommands": "${workspaceFolder}/cmake-build-release/compile_commands.json"
}
],
"version": 4
}

View File

@ -60,24 +60,13 @@ namespace bayesnet {
{
return scoresKBest;
}
template <class T>
vector<pair<T, T>> Metrics::doCombinations(const vector<T>& source)
{
    // Build every unordered pair { source[i], source[j] } with i < j,
    // i.e. the n*(n-1)/2 two-element combinations of the input, in order.
    vector<pair<T, T>> result;
    const size_t n = source.size();
    // One allocation up front instead of O(log n) regrowths; n*(n-1) is 0 when n == 0.
    result.reserve(n * (n - 1) / 2);
    for (size_t i = 0; i < n; ++i) {        // size_t avoids signed/unsigned comparison
        for (size_t j = i + 1; j < n; ++j) {
            result.push_back({ source[i], source[j] });  // no per-outer-iteration copy of source[i]
        }
    }
    return result;
}
torch::Tensor Metrics::conditionalEdge(const torch::Tensor& weights)
{
auto result = vector<double>();
auto source = vector<string>(features);
source.push_back(className);
auto combinations = doCombinations<string>(source);
auto combinations = doCombinations(source);
// Compute class prior
auto margin = torch::zeros({ classNumStates }, torch::kFloat);
for (int value = 0; value < classNumStates; ++value) {
@ -123,6 +112,11 @@ namespace bayesnet {
torch::Tensor counts = feature.bincount(weights);
double totalWeight = counts.sum().item<double>();
torch::Tensor probs = counts.to(torch::kFloat) / totalWeight;
// cout << "Probs: ";
// for (int i = 0; i < probs.size(0); ++i) {
// cout << probs[i].item<double>() << ", ";
// }
// cout << endl;
torch::Tensor logProbs = torch::log(probs);
torch::Tensor entropy = -probs * logProbs;
return entropy.nansum().item<double>();

View File

@ -18,7 +18,17 @@ namespace bayesnet {
double entropy(const Tensor& feature, const Tensor& weights);
vector<string> features;
template <class T>
vector<pair<T, T>> doCombinations(const vector<T>& source)
{
    // Return all n*(n-1)/2 unordered pairs { source[i], source[j] } with i < j,
    // preserving input order within each pair. (The redundant forward
    // declaration of this method has been dropped — the definition suffices.)
    vector<pair<T, T>> result;
    const size_t n = source.size();
    // Reserve the exact result size; n*(n-1)/2 is 0 for empty/singleton input.
    result.reserve(n * (n - 1) / 2);
    for (size_t i = 0; i < n; ++i) {        // size_t avoids signed/unsigned comparison
        for (size_t j = i + 1; j < n; ++j) {
            result.push_back({ source[i], source[j] });
        }
    }
    return result;
}
public:
Metrics() = default;
Metrics(const torch::Tensor& samples, const vector<string>& features, const string& className, const int classNumStates);

View File

@ -2,11 +2,11 @@
#include <functional>
#include <limits.h>
#include "BoostAODE.h"
#include "BayesMetrics.h"
#include "Colors.h"
#include "Folding.h"
#include "Paths.h"
#include <openssl/evp.h>
#include "CFS.h"
namespace bayesnet {
BoostAODE::BoostAODE() : Ensemble() {}
@ -98,13 +98,15 @@ namespace bayesnet {
}
}
output += "]";
Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64);
int maxFeatures = 0;
auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, states.at(className).size(), weights_);
// std::size_t str_hash = std::hash<std::string>{}(output);
string str_hash = sha256(output);
stringstream oss;
oss << platform::Paths::cfs() << str_hash << ".json";
string name = oss.str();
ifstream file(name);
Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64);
if (file.is_open()) {
nlohmann::json cfsFeatures = nlohmann::json::parse(file);
file.close();

View File

@ -17,14 +17,22 @@ namespace bayesnet {
*/
auto x = samples.index({ a, "..." });
auto y = samples.index({ b, "..." });
return 2.0 * mutualInformation(y, x, weights) / (entropy(x, weights) + entropy(y, weights));
auto mu = mutualInformation(x, y, weights);
// cout << "Mutual Information: (" << a << ", " << b << ") = " << mu << endl;
auto hx = entropy(x, weights);
// cout << "Entropy X: " << hx << endl;
auto hy = entropy(y, weights);
// cout << "Entropy Y: " << hy << endl;
return 2.0 * mu / (hx + hy);
}
void CFS::computeSuLabels()
{
    // Compute the Symmetrical Uncertainty between every feature row and the
    // class labels (index -1 selects the label row of the samples tensor).
    // https://en.wikipedia.org/wiki/Symmetric_uncertainty
    // suLabels starts empty, so values must be appended (push_back), never
    // assigned through operator[] — that would index past the end.
    for (int i = 0; i < static_cast<int>(features.size()); ++i) {
        suLabels.push_back(symmetricalUncertainty(i, -1));
    }
}
@ -44,7 +52,7 @@ namespace bayesnet {
}
double rff = 0;
int n = cfsFeatures.size();
for (const auto& item : doCombinations<int>(cfsFeatures)) {
for (const auto& item : doCombinations(cfsFeatures)) {
rff += computeSuFeatures(item.first, item.second);
}
return rcf / sqrt(n + (n * n - n) * rff);
@ -58,25 +66,58 @@ namespace bayesnet {
auto feature = featureOrder[0];
cfsFeatures.push_back(feature);
cfsScores.push_back(suLabels[feature]);
cfsFeatures.erase(cfsFeatures.begin());
while (continueCondition) {
double merit = numeric_limits<double>::lowest();
int bestFeature = -1;
for (auto feature : featureOrder) {
cfsFeatures.push_back(feature);
auto meritNew = computeMerit(); // Compute merit with cfsFeatures
//cout << "MeritNew: " << meritNew << " Merit: " << merit << endl;
if (meritNew > merit) {
merit = meritNew;
bestFeature = feature;
}
cfsFeatures.pop_back();
}
if (bestFeature == -1) {
throw runtime_error("Feature not found");
}
cfsFeatures.push_back(bestFeature);
cfsScores.push_back(merit);
featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), feature), featureOrder.end());
featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), bestFeature), featureOrder.end());
continueCondition = computeContinueCondition(featureOrder);
}
fitted = true;
}
// Debug-only harness: dumps intermediate CFS quantities to stdout so they can
// be compared against an external reference implementation (per the commit
// message, the Python "mufs" package).
void CFS::test()
{
// Entropy of the class labels; row -1 of samples holds y (the label row).
cout << "H(y): " << entropy(samples.index({ -1, "..." }), weights) << endl;
// Echo the raw label values for manual inspection.
cout << "y: ";
auto y = samples.index({ -1, "..." });
for (int i = 0; i < y.size(0); ++i) {
cout << y[i].item<double>() << ", ";
}
cout << endl;
// Fill suLabels with the feature/label symmetrical-uncertainty values.
computeSuLabels();
// Retained debug dump of per-feature value distributions (disabled):
// cout << "Probabilities of features: " << endl;
// for (const auto& featureName : features) {
// int featureIdx = find(features.begin(), features.end(), featureName) - features.begin();
// cout << featureName << "(" << featureIdx << "): ";
// auto feature = samples.index({ featureIdx, "..." });
// torch::Tensor counts = feature.bincount(weights);
// double totalWeight = counts.sum().item<double>();
// torch::Tensor probs = counts.to(torch::kFloat) / totalWeight;
// for (int i = 0; i < probs.size(0); ++i) {
// cout << probs[i].item<double>() << ", ";
// }
// cout << endl;
// // for (int i = 0; i < x.size(0); ++i) {
// // cout << x[i].item<double>() << ", ";
// // }
// // cout << endl;
// }
}
bool CFS::computeContinueCondition(const vector<int>& featureOrder)
{
if (cfsFeatures.size() == maxFeatures || featureOrder.size() == 0) {

View File

@ -11,6 +11,7 @@ namespace bayesnet {
CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights);
virtual ~CFS() {};
void fit();
void test();
vector<int> getFeatures() const;
vector<double> getScores() const;
private:

View File

@ -9,7 +9,7 @@ add_executable(b_main main.cc Folding.cc Experiment.cc Datasets.cc Dataset.cc Mo
add_executable(b_manage manage.cc Results.cc Result.cc ReportConsole.cc ReportExcel.cc ReportBase.cc Datasets.cc Dataset.cc ExcelFile.cc)
add_executable(b_list list.cc Datasets.cc Dataset.cc)
add_executable(b_best best.cc BestResults.cc Result.cc Statistics.cc BestResultsExcel.cc ExcelFile.cc)
add_executable(testx testx.cpp Datasets.cc Dataset.cc Folding.cc)
add_executable(testx testx.cpp Datasets.cc Dataset.cc Folding.cc )
target_link_libraries(b_main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}")
if (${CMAKE_HOST_SYSTEM_NAME} MATCHES "Linux")
target_link_libraries(b_manage "${TORCH_LIBRARIES}" libxlsxwriter.so ArffFiles mdlp stdc++fs)

View File

@ -7,6 +7,7 @@
#include "Network.h"
#include "ArffFiles.h"
#include "CPPFImdlp.h"
#include "CFS.h"
using namespace std;
using namespace platform;
@ -191,22 +192,43 @@ int main()
// }
// cout << "***********************************************************************************************" << endl;
// }
const string file_name = "iris";
auto net = bayesnet::Network();
// const string file_name = "iris";
// auto net = bayesnet::Network();
// auto dt = Datasets(true, "Arff");
// auto raw = RawDatasets("iris", true);
// auto [X, y] = dt.getVectors(file_name);
// cout << "Dataset dims " << raw.dataset.sizes() << endl;
// cout << "weights dims " << raw.weights.sizes() << endl;
// cout << "States dims " << raw.statest.size() << endl;
// cout << "features: ";
// for (const auto& feature : raw.featurest) {
// cout << feature << ", ";
// net.addNode(feature);
// }
// net.addNode(raw.classNamet);
// cout << endl;
// net.fit(raw.dataset, raw.weights, raw.featurest, raw.classNamet, raw.statest);
auto dt = Datasets(true, "Arff");
auto raw = RawDatasets("iris", true);
auto [X, y] = dt.getVectors(file_name);
cout << "Dataset dims " << raw.dataset.sizes() << endl;
cout << "weights dims " << raw.weights.sizes() << endl;
cout << "States dims " << raw.statest.size() << endl;
cout << "features: ";
for (const auto& feature : raw.featurest) {
cout << feature << ", ";
net.addNode(feature);
for (const auto& name : dt.getNames()) {
//for (const auto& name : { "iris" }) {
auto [X, y] = dt.getTensors(name);
auto features = dt.getFeatures(name);
auto states = dt.getStates(name);
auto className = dt.getClassName(name);
int maxFeatures = 0;
auto classNumStates = states.at(className).size();
torch::Tensor weights = torch::full({ X.size(1) }, 1.0 / X.size(1), torch::kDouble);
auto dataset = X;
auto yresized = torch::transpose(y.view({ y.size(0), 1 }), 0, 1);
dataset = torch::cat({ dataset, yresized }, 0);
auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, classNumStates, weights);
cfs.fit();
cout << "Dataset: " << name << " CFS features: ";
for (const auto& feature : cfs.getFeatures()) {
cout << feature << ", ";
}
cout << "end." << endl;
}
net.addNode(raw.classNamet);
cout << endl;
net.fit(raw.dataset, raw.weights, raw.featurest, raw.classNamet, raw.statest);
}