diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..6faaf51 --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,18 @@ +{ + "configurations": [ + { + "name": "Mac", + "includePath": [ + "${workspaceFolder}/**" + ], + "defines": [], + "macFrameworkPath": [ + "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks" + ], + "cStandard": "c17", + "cppStandard": "c++17", + "compileCommands": "${workspaceFolder}/cmake-build-release/compile_commands.json" + } + ], + "version": 4 +} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 294f0bf..0a4515f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,7 +45,6 @@ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") # CMakes modules # -------------- set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH}) - include(AddGitSubmodule) if (CODE_COVERAGE) enable_testing() @@ -65,7 +64,11 @@ endif (ENABLE_CLANG_TIDY) add_git_submodule("lib/mdlp") add_git_submodule("lib/argparse") add_git_submodule("lib/json") -find_library(XLSXWRITER_LIB libxlsxwriter.dylib PATHS /usr/local/lib) + + +find_library(XLSXWRITER_LIB NAMES libxlsxwriter.dylib libxlsxwriter.so PATHS ${BayesNet_SOURCE_DIR}/lib/libxlsxwriter/lib) +message("XLSXWRITER_LIB=${XLSXWRITER_LIB}") + # Subdirectories # -------------- diff --git a/README.md b/README.md index 426be8d..ad2660c 100644 --- a/README.md +++ b/README.md @@ -27,11 +27,9 @@ export BOOST_ROOT=/path/to/library/ ```bash cd lib/libxlsxwriter make -sudo make install +make install DESTDIR=/home/rmontanana/Code PREFIX= ``` -It has to be installed in /usr/local/lib otherwise CMakeLists.txt has to be modified accordingly - Environment variable has to be set: ```bash diff --git a/src/BayesNet/BayesMetrics.cc b/src/BayesNet/BayesMetrics.cc index 623656e..6bd3bbb 100644 --- a/src/BayesNet/BayesMetrics.cc +++ b/src/BayesNet/BayesMetrics.cc @@ -60,17 +60,7 @@ namespace bayesnet { { return scoresKBest; } - vector> Metrics::doCombinations(const vector& source) - { - vector> result; - for (int i = 0; i < source.size(); ++i) { - string temp = source[i]; - for (int j = i + 1; j < source.size(); ++j) { - result.push_back({ temp, source[j] }); - } - } - return result; - } + torch::Tensor Metrics::conditionalEdge(const torch::Tensor& weights) { auto result = vector(); diff --git a/src/BayesNet/BayesMetrics.h b/src/BayesNet/BayesMetrics.h index 01841a7..66016a6 100644 --- a/src/BayesNet/BayesMetrics.h +++ b/src/BayesNet/BayesMetrics.h @@ -8,20 +8,39 @@ namespace bayesnet { using namespace torch; class Metrics { private: - Tensor samples; // nxm tensor used to fit the model - vector features; - string className; int classNumStates = 0; vector scoresKBest; vector featuresKBest; // sorted indices of the features - double entropy(const Tensor& feature, const Tensor& weights); double conditionalEntropy(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights); - vector> doCombinations(const vector&); + protected: + Tensor samples; // n+1xm tensor used to fit the model where samples[-1] is the y vector + string className; + double entropy(const Tensor& feature, const Tensor& weights); + vector features; + template + vector> doCombinations(const vector& source) + { + vector> result; + for (int i = 0; i < source.size(); ++i) { + T temp = source[i]; + for (int j = i + 1; j < source.size(); ++j) { + result.push_back({ temp, source[j] }); + } + } + return result; + } + template + T pop_first(vector& v) + { + T temp = v[0]; + v.erase(v.begin()); + return temp; + } public: Metrics() = default; Metrics(const torch::Tensor& samples, const vector& features, const string& className, const int classNumStates); Metrics(const vector>& vsamples, const vector& labels, const vector& features, const string& className, const int classNumStates); - vector SelectKBestWeighted(const torch::Tensor& weights, bool ascending=false, unsigned k = 0); + vector SelectKBestWeighted(const torch::Tensor& weights, bool ascending = false, unsigned k = 0); vector getScoresKBest() const; double mutualInformation(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights); vector conditionalEdgeWeights(vector& weights); // To use in Python diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc index c976408..fb38a7c 100644 --- a/src/BayesNet/BoostAODE.cc +++ b/src/BayesNet/BoostAODE.cc @@ -1,36 +1,22 @@ -#include "BoostAODE.h" #include -#include "BayesMetrics.h" +#include +#include +#include "BoostAODE.h" #include "Colors.h" #include "Folding.h" -#include +#include "Paths.h" +#include "CFS.h" +#include "FCBF.h" +#include "IWSS.h" namespace bayesnet { BoostAODE::BoostAODE() : Ensemble() {} void BoostAODE::buildModel(const torch::Tensor& weights) { // Models shall be built in trainModel - } - void BoostAODE::setHyperparameters(nlohmann::json& hyperparameters) - { - // Check if hyperparameters are valid - const vector validKeys = { "repeatSparent", "maxModels", "ascending", "convergence" }; - checkHyperparameters(validKeys, hyperparameters); - if (hyperparameters.contains("repeatSparent")) { - repeatSparent = hyperparameters["repeatSparent"]; - } - if (hyperparameters.contains("maxModels")) { - maxModels = hyperparameters["maxModels"]; - } - if (hyperparameters.contains("ascending")) { - ascending = hyperparameters["ascending"]; - } - if (hyperparameters.contains("convergence")) { - convergence = hyperparameters["convergence"]; - } - } - void BoostAODE::validationInit() - { + models.clear(); + n_models = 0; + // Prepare the validation dataset auto y_ = dataset.index({ -1, "..." }); if (convergence) { // Prepare train & validation sets from train data @@ -56,18 +42,79 @@ namespace bayesnet { X_train = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." }); y_train = y_; } - + } + void BoostAODE::setHyperparameters(nlohmann::json& hyperparameters) + { + // Check if hyperparameters are valid + const vector validKeys = { "repeatSparent", "maxModels", "ascending", "convergence", "threshold", "select_features" }; + checkHyperparameters(validKeys, hyperparameters); + if (hyperparameters.contains("repeatSparent")) { + repeatSparent = hyperparameters["repeatSparent"]; + } + if (hyperparameters.contains("maxModels")) { + maxModels = hyperparameters["maxModels"]; + } + if (hyperparameters.contains("ascending")) { + ascending = hyperparameters["ascending"]; + } + if (hyperparameters.contains("convergence")) { + convergence = hyperparameters["convergence"]; + } + if (hyperparameters.contains("threshold")) { + threshold = hyperparameters["threshold"]; + } + if (hyperparameters.contains("select_features")) { + auto selectedAlgorithm = hyperparameters["select_features"]; + vector algos = { "IWSS", "FCBF", "CFS" }; + selectFeatures = true; + algorithm = selectedAlgorithm; + if (find(algos.begin(), algos.end(), selectedAlgorithm) == algos.end()) { + throw invalid_argument("Invalid selectFeatures value [IWSS, FCBF, CFS]"); + } + } + } + unordered_set BoostAODE::initializeModels() + { + unordered_set featuresUsed; + Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64); + int maxFeatures = 0; + if (algorithm == "CFS") { + featureSelector = new CFS(dataset, features, className, maxFeatures, states.at(className).size(), weights_); + } else if (algorithm == "IWSS") { + if (threshold < 0 || threshold >0.5) { + throw invalid_argument("Invalid threshold value for IWSS [0, 0.5]"); + } + featureSelector = new IWSS(dataset, features, className, maxFeatures, states.at(className).size(), weights_, threshold); + } else if (algorithm == "FCBF") { + if (threshold < 1e-7 || threshold > 1) { + throw invalid_argument("Invalid threshold value [1e-7, 1]"); + } + featureSelector = new FCBF(dataset, features, className, maxFeatures, states.at(className).size(), weights_, threshold); + } + featureSelector->fit(); + auto cfsFeatures = featureSelector->getFeatures(); + for (const int& feature : cfsFeatures) { + // cout << "Feature: [" << feature << "] " << feature << " " << features.at(feature) << endl; + featuresUsed.insert(feature); + unique_ptr model = std::make_unique(feature); + model->fit(dataset, features, className, states, weights_); + models.push_back(std::move(model)); + significanceModels.push_back(1.0); + n_models++; + } + delete featureSelector; + return featuresUsed; } void BoostAODE::trainModel(const torch::Tensor& weights) { - models.clear(); - n_models = 0; + unordered_set featuresUsed; + if (selectFeatures) { + featuresUsed = initializeModels(); + } if (maxModels == 0) maxModels = .1 * n > 10 ? .1 * n : n; - validationInit(); Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64); bool exitCondition = false; - unordered_set featuresUsed; // Variables to control the accuracy finish condition double priorAccuracy = 0.0; double delta = 1.0; @@ -86,16 +133,16 @@ namespace bayesnet { unique_ptr model; auto feature = featureSelection[0]; if (!repeatSparent || featuresUsed.size() < featureSelection.size()) { - bool found = false; - for (auto feat : featureSelection) { + bool used = true; + for (const auto& feat : featureSelection) { if (find(featuresUsed.begin(), featuresUsed.end(), feat) != featuresUsed.end()) { continue; } - found = true; + used = false; feature = feat; break; } - if (!found) { + if (used) { exitCondition = true; continue; } @@ -135,7 +182,7 @@ namespace bayesnet { count++; } } - exitCondition = n_models == maxModels && repeatSparent || epsilon_t > 0.5 || count > tolerance; + exitCondition = n_models >= maxModels && repeatSparent || epsilon_t > 0.5 || count > tolerance; } if (featuresUsed.size() != features.size()) { status = WARNING; diff --git a/src/BayesNet/BoostAODE.h b/src/BayesNet/BoostAODE.h index 61e2e95..dd1cf75 100644 --- a/src/BayesNet/BoostAODE.h +++ b/src/BayesNet/BoostAODE.h @@ -1,7 +1,9 @@ #ifndef BOOSTAODE_H #define BOOSTAODE_H #include "Ensemble.h" +#include #include "SPODE.h" +#include "FeatureSelect.h" namespace bayesnet { class BoostAODE : public Ensemble { public: @@ -15,11 +17,16 @@ namespace bayesnet { private: torch::Tensor dataset_; torch::Tensor X_train, y_train, X_test, y_test; - void validationInit(); - bool repeatSparent = false; + unordered_set initializeModels(); + // Hyperparameters + bool repeatSparent = false; // if true, a feature can be selected more than once int maxModels = 0; bool ascending = false; //Process KBest features ascending or descending order bool convergence = false; //if true, stop when the model does not improve + bool selectFeatures = false; // if true, use feature selection + string algorithm = ""; // Selected feature selection algorithm + FeatureSelect* featureSelector = nullptr; + double threshold = -1; }; } #endif \ No newline at end of file diff --git a/src/BayesNet/CFS.cc b/src/BayesNet/CFS.cc new file mode 100644 index 0000000..f2ffc1e --- /dev/null +++ b/src/BayesNet/CFS.cc @@ -0,0 +1,72 @@ +#include "CFS.h" +#include +#include "bayesnetUtils.h" +namespace bayesnet { + void CFS::fit() + { + initialize(); + computeSuLabels(); + auto featureOrder = argsort(suLabels); // sort descending order + auto continueCondition = true; + auto feature = featureOrder[0]; + selectedFeatures.push_back(feature); + selectedScores.push_back(suLabels[feature]); + selectedFeatures.erase(selectedFeatures.begin()); + while (continueCondition) { + double merit = numeric_limits::lowest(); + int bestFeature = -1; + for (auto feature : featureOrder) { + selectedFeatures.push_back(feature); + // Compute merit with selectedFeatures + auto meritNew = computeMeritCFS(); + if (meritNew > merit) { + merit = meritNew; + bestFeature = feature; + } + selectedFeatures.pop_back(); + } + if (bestFeature == -1) { + // meritNew has to be nan due to constant features + break; + } + selectedFeatures.push_back(bestFeature); + selectedScores.push_back(merit); + featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), bestFeature), featureOrder.end()); + continueCondition = computeContinueCondition(featureOrder); + } + fitted = true; + } + bool CFS::computeContinueCondition(const vector& featureOrder) + { + if (selectedFeatures.size() == maxFeatures || featureOrder.size() == 0) { + return false; + } + if (selectedScores.size() >= 5) { + /* + "To prevent the best first search from exploring the entire + feature subset search space, a stopping criterion is imposed. + The search will terminate if five consecutive fully expanded + subsets show no improvement over the current best subset." + as stated in Mark A.Hall Thesis + */ + double item_ant = numeric_limits::lowest(); + int num = 0; + vector lastFive(selectedScores.end() - 5, selectedScores.end()); + for (auto item : lastFive) { + if (item_ant == numeric_limits::lowest()) { + item_ant = item; + } + if (item > item_ant) { + break; + } else { + num++; + item_ant = item; + } + } + if (num == 5) { + return false; + } + } + return true; + } +} \ No newline at end of file diff --git a/src/BayesNet/CFS.h b/src/BayesNet/CFS.h new file mode 100644 index 0000000..36b7c52 --- /dev/null +++ b/src/BayesNet/CFS.h @@ -0,0 +1,21 @@ +#ifndef CFS_H +#define CFS_H +#include +#include +#include "FeatureSelect.h" +using namespace std; +namespace bayesnet { + class CFS : public FeatureSelect { + public: + // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector + CFS(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) : + FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights) + { + } + virtual ~CFS() {}; + void fit() override; + private: + bool computeContinueCondition(const vector& featureOrder); + }; +} +#endif \ No newline at end of file diff --git a/src/BayesNet/CMakeLists.txt b/src/BayesNet/CMakeLists.txt index 2a120f3..cc0f5a5 100644 --- a/src/BayesNet/CMakeLists.txt +++ b/src/BayesNet/CMakeLists.txt @@ -5,5 +5,5 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet) include_directories(${BayesNet_SOURCE_DIR}/src/Platform) add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc - Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) + Mst.cc Proposal.cc CFS.cc FCBF.cc IWSS.cc FeatureSelect.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/BayesNet/FCBF.cc b/src/BayesNet/FCBF.cc new file mode 100644 index 0000000..db935af --- /dev/null +++ b/src/BayesNet/FCBF.cc @@ -0,0 +1,44 @@ +#include "bayesnetUtils.h" +#include "FCBF.h" +namespace bayesnet { + + FCBF::FCBF(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold) : + FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights), threshold(threshold) + { + if (threshold < 1e-7) { + throw std::invalid_argument("Threshold cannot be less than 1e-7"); + } + } + void FCBF::fit() + { + initialize(); + computeSuLabels(); + auto featureOrder = argsort(suLabels); // sort descending order + auto featureOrderCopy = featureOrder; + for (const auto& feature : featureOrder) { + // Don't self compare + featureOrderCopy.erase(featureOrderCopy.begin()); + if (suLabels.at(feature) == 0.0) { + // The feature has been removed from the list + continue; + } + if (suLabels.at(feature) < threshold) { + break; + } + // Remove redundant features + for (const auto& featureCopy : featureOrderCopy) { + double value = computeSuFeatures(feature, featureCopy); + if (value >= suLabels.at(featureCopy)) { + // Remove feature from list + suLabels[featureCopy] = 0.0; + } + } + selectedFeatures.push_back(feature); + selectedScores.push_back(suLabels[feature]); + if (selectedFeatures.size() == maxFeatures) { + break; + } + } + fitted = true; + } +} \ No newline at end of file diff --git a/src/BayesNet/FCBF.h b/src/BayesNet/FCBF.h new file mode 100644 index 0000000..aa7ff47 --- /dev/null +++ b/src/BayesNet/FCBF.h @@ -0,0 +1,18 @@ +#ifndef FCBF_H +#define FCBF_H +#include +#include +#include "FeatureSelect.h" +using namespace std; +namespace bayesnet { + class FCBF : public FeatureSelect { + public: + // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector + FCBF(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold); + virtual ~FCBF() {}; + void fit() override; + private: + double threshold = -1; + }; +} +#endif \ No newline at end of file diff --git a/src/BayesNet/FeatureSelect.cc b/src/BayesNet/FeatureSelect.cc new file mode 100644 index 0000000..11d929b --- /dev/null +++ b/src/BayesNet/FeatureSelect.cc @@ -0,0 +1,79 @@ +#include "FeatureSelect.h" +#include +#include "bayesnetUtils.h" +namespace bayesnet { + FeatureSelect::FeatureSelect(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) : + Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights) + + { + } + void FeatureSelect::initialize() + { + selectedFeatures.clear(); + selectedScores.clear(); + } + double FeatureSelect::symmetricalUncertainty(int a, int b) + { + /* + Compute symmetrical uncertainty. Normalize* information gain (mutual + information) with the entropies of the features in order to compensate + the bias due to high cardinality features. *Range [0, 1] + (https://www.sciencedirect.com/science/article/pii/S0020025519303603) + */ + auto x = samples.index({ a, "..." }); + auto y = samples.index({ b, "..." }); + auto mu = mutualInformation(x, y, weights); + auto hx = entropy(x, weights); + auto hy = entropy(y, weights); + return 2.0 * mu / (hx + hy); + } + void FeatureSelect::computeSuLabels() + { + // Compute Simmetrical Uncertainty between features and labels + // https://en.wikipedia.org/wiki/Symmetric_uncertainty + for (int i = 0; i < features.size(); ++i) { + suLabels.push_back(symmetricalUncertainty(i, -1)); + } + } + double FeatureSelect::computeSuFeatures(const int firstFeature, const int secondFeature) + { + // Compute Simmetrical Uncertainty between features + // https://en.wikipedia.org/wiki/Symmetric_uncertainty + try { + return suFeatures.at({ firstFeature, secondFeature }); + } + catch (const out_of_range& e) { + double result = symmetricalUncertainty(firstFeature, secondFeature); + suFeatures[{firstFeature, secondFeature}] = result; + return result; + } + } + double FeatureSelect::computeMeritCFS() + { + double result; + double rcf = 0; + for (auto feature : selectedFeatures) { + rcf += suLabels[feature]; + } + double rff = 0; + int n = selectedFeatures.size(); + for (const auto& item : doCombinations(selectedFeatures)) { + rff += computeSuFeatures(item.first, item.second); + } + return rcf / sqrt(n + (n * n - n) * rff); + } + vector FeatureSelect::getFeatures() const + { + if (!fitted) { + throw runtime_error("FeatureSelect not fitted"); + } + return selectedFeatures; + } + vector FeatureSelect::getScores() const + { + if (!fitted) { + throw runtime_error("FeatureSelect not fitted"); + } + return selectedScores; + } +} \ No newline at end of file diff --git a/src/BayesNet/FeatureSelect.h b/src/BayesNet/FeatureSelect.h new file mode 100644 index 0000000..46923c9 --- /dev/null +++ b/src/BayesNet/FeatureSelect.h @@ -0,0 +1,31 @@ +#ifndef FEATURE_SELECT_H +#define FEATURE_SELECT_H +#include +#include +#include "BayesMetrics.h" +using namespace std; +namespace bayesnet { + class FeatureSelect : public Metrics { + public: + // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector + FeatureSelect(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights); + virtual ~FeatureSelect() {}; + virtual void fit() = 0; + vector getFeatures() const; + vector getScores() const; + protected: + void initialize(); + void computeSuLabels(); + double computeSuFeatures(const int a, const int b); + double symmetricalUncertainty(int a, int b); + double computeMeritCFS(); + const torch::Tensor& weights; + int maxFeatures; + vector selectedFeatures; + vector selectedScores; + vector suLabels; + map, double> suFeatures; + bool fitted = false; + }; +} +#endif \ No newline at end of file diff --git a/src/BayesNet/IWSS.cc b/src/BayesNet/IWSS.cc new file mode 100644 index 0000000..f39f137 --- /dev/null +++ b/src/BayesNet/IWSS.cc @@ -0,0 +1,47 @@ +#include "IWSS.h" +#include +#include "bayesnetUtils.h" +namespace bayesnet { + IWSS::IWSS(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold) : + FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights), threshold(threshold) + { + if (threshold < 0 || threshold > .5) { + throw std::invalid_argument("Threshold has to be in [0, 0.5]"); + } + } + void IWSS::fit() + { + initialize(); + computeSuLabels(); + auto featureOrder = argsort(suLabels); // sort descending order + auto featureOrderCopy = featureOrder; + // Add first and second features to result + // First with its own score + auto first_feature = pop_first(featureOrderCopy); + selectedFeatures.push_back(first_feature); + selectedScores.push_back(suLabels.at(first_feature)); + // Second with the score of the candidates + selectedFeatures.push_back(pop_first(featureOrderCopy)); + auto merit = computeMeritCFS(); + selectedScores.push_back(merit); + for (const auto feature : featureOrderCopy) { + selectedFeatures.push_back(feature); + // Compute merit with selectedFeatures + auto meritNew = computeMeritCFS(); + double delta = merit != 0.0 ? abs(merit - meritNew) / merit : 0.0; + if (meritNew > merit || delta < threshold) { + if (meritNew > merit) { + merit = meritNew; + } + selectedScores.push_back(meritNew); + } else { + selectedFeatures.pop_back(); + break; + } + if (selectedFeatures.size() == maxFeatures) { + break; + } + } + fitted = true; + } +} \ No newline at end of file diff --git a/src/BayesNet/IWSS.h b/src/BayesNet/IWSS.h new file mode 100644 index 0000000..88a1034 --- /dev/null +++ b/src/BayesNet/IWSS.h @@ -0,0 +1,18 @@ +#ifndef IWSS_H +#define IWSS_H +#include +#include +#include "FeatureSelect.h" +using namespace std; +namespace bayesnet { + class IWSS : public FeatureSelect { + public: + // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector + IWSS(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold); + virtual ~IWSS() {}; + void fit() override; + private: + double threshold = -1; + }; +} +#endif \ No newline at end of file diff --git a/src/BayesNet/Node.h b/src/BayesNet/Node.h index 6758c5c..4979007 100644 --- a/src/BayesNet/Node.h +++ b/src/BayesNet/Node.h @@ -14,8 +14,8 @@ namespace bayesnet { int numStates; // number of states of the variable torch::Tensor cpTable; // Order of indices is 0-> node variable, 1-> 1st parent, 2-> 2nd parent, ... vector dimensions; // dimensions of the cpTable - public: vector> combinations(const vector&); + public: explicit Node(const string&); void clear(); void addParent(Node*); diff --git a/src/Platform/CMakeLists.txt b/src/Platform/CMakeLists.txt index 4111c34..3a565e1 100644 --- a/src/Platform/CMakeLists.txt +++ b/src/Platform/CMakeLists.txt @@ -9,14 +9,9 @@ add_executable(b_main main.cc Folding.cc Experiment.cc Datasets.cc Dataset.cc Mo add_executable(b_manage manage.cc Results.cc Result.cc ReportConsole.cc ReportExcel.cc ReportBase.cc Datasets.cc Dataset.cc ExcelFile.cc) add_executable(b_list list.cc Datasets.cc Dataset.cc) add_executable(b_best best.cc BestResults.cc Result.cc Statistics.cc BestResultsExcel.cc ExcelFile.cc) -add_executable(testx testx.cpp Datasets.cc Dataset.cc Folding.cc) +add_executable(testx testx.cpp Datasets.cc Dataset.cc Folding.cc ) target_link_libraries(b_main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}") -if (${CMAKE_HOST_SYSTEM_NAME} MATCHES "Linux") - target_link_libraries(b_manage "${TORCH_LIBRARIES}" libxlsxwriter.so ArffFiles mdlp stdc++fs) - target_link_libraries(b_best Boost::boost libxlsxwriter.so stdc++fs) -else() - target_link_libraries(b_manage "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" ArffFiles mdlp) - target_link_libraries(b_best Boost::boost "${XLSXWRITER_LIB}") -endif() +target_link_libraries(b_manage "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" ArffFiles mdlp) +target_link_libraries(b_best Boost::boost "${XLSXWRITER_LIB}") target_link_libraries(b_list ArffFiles mdlp "${TORCH_LIBRARIES}") -target_link_libraries(testx ArffFiles mdlp BayesNet "${TORCH_LIBRARIES}") \ No newline at end of file +target_link_libraries(testx ArffFiles BayesNet "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/Platform/Dataset.cc b/src/Platform/Dataset.cc index 02a36f9..f75fdbc 100644 --- a/src/Platform/Dataset.cc +++ b/src/Platform/Dataset.cc @@ -212,14 +212,4 @@ namespace platform { } return Xd; } - vector Dataset::split(const string& text, char delimiter) - { - vector result; - stringstream ss(text); - string token; - while (getline(ss, token, delimiter)) { - result.push_back(token); - } - return result; - } } \ No newline at end of file diff --git a/src/Platform/Dataset.h b/src/Platform/Dataset.h index fbc577e..21b619e 100644 --- a/src/Platform/Dataset.h +++ b/src/Platform/Dataset.h @@ -5,6 +5,7 @@ #include #include #include "CPPFImdlp.h" +#include "Utils.h" namespace platform { using namespace std; @@ -62,7 +63,6 @@ namespace platform { public: Dataset(const string& path, const string& name, const string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {}; explicit Dataset(const Dataset&); - static vector split(const string& text, char delimiter); string getName() const; string getClassName() const; vector getFeatures() const; diff --git a/src/Platform/Datasets.cc b/src/Platform/Datasets.cc index 717ccbc..4f53a2b 100644 --- a/src/Platform/Datasets.cc +++ b/src/Platform/Datasets.cc @@ -13,7 +13,7 @@ namespace platform { if (line.empty() || line[0] == '#') { continue; } - vector tokens = Dataset::split(line, ','); + vector tokens = split(line, ','); string name = tokens[0]; string className; if (tokens.size() == 1) { diff --git a/src/Platform/DotEnv.h b/src/Platform/DotEnv.h index c481310..7d5ee2b 100644 --- a/src/Platform/DotEnv.h +++ b/src/Platform/DotEnv.h @@ -4,7 +4,11 @@ #include #include #include -#include "Dataset.h" +#include +#include +#include "Utils.h" + +//#include "Dataset.h" namespace platform { class DotEnv { private: @@ -51,7 +55,7 @@ namespace platform { auto seeds_str = env["seeds"]; seeds_str = trim(seeds_str); seeds_str = seeds_str.substr(1, seeds_str.size() - 2); - auto seeds_str_split = Dataset::split(seeds_str, ','); + auto seeds_str_split = split(seeds_str, ','); transform(seeds_str_split.begin(), seeds_str_split.end(), back_inserter(seeds), [](const std::string& str) { return stoi(str); }); diff --git a/src/Platform/Experiment.cc b/src/Platform/Experiment.cc index dced445..311dbc7 100644 --- a/src/Platform/Experiment.cc +++ b/src/Platform/Experiment.cc @@ -3,7 +3,7 @@ #include "Datasets.h" #include "Models.h" #include "ReportConsole.h" -#include "DotEnv.h" +#include "Paths.h" namespace platform { using json = nlohmann::json; string get_date() @@ -134,8 +134,7 @@ namespace platform { } void Experiment::cross_validation(const string& fileName) { - auto env = platform::DotEnv(); - auto datasets = platform::Datasets(discretized, env.get("source_data")); + auto datasets = platform::Datasets(discretized, Paths::datasets()); // Get dataset auto [X, y] = datasets.getTensors(fileName); auto states = datasets.getStates(fileName); diff --git a/src/Platform/Paths.h b/src/Platform/Paths.h index 926568e..16d459c 100644 --- a/src/Platform/Paths.h +++ b/src/Platform/Paths.h @@ -1,11 +1,18 @@ #ifndef PATHS_H #define PATHS_H #include +#include "DotEnv.h" namespace platform { class Paths { public: static std::string results() { return "results/"; } static std::string excel() { return "excel/"; } + static std::string cfs() { return "cfs/"; } + static std::string datasets() + { + auto env = platform::DotEnv(); + return env.get("source_data"); + } }; } #endif \ No newline at end of file diff --git a/src/Platform/ReportBase.cc b/src/Platform/ReportBase.cc index 5f113a5..acb5581 100644 --- a/src/Platform/ReportBase.cc +++ b/src/Platform/ReportBase.cc @@ -58,8 +58,7 @@ namespace platform { } } else { if (data["score_name"].get() == "accuracy") { - auto env = platform::DotEnv(); - auto dt = Datasets(false, env.get("source_data")); + auto dt = Datasets(false, Paths::datasets()); dt.loadDataset(dataset); auto numClasses = dt.getNClasses(dataset); if (numClasses == 2) { diff --git a/src/Platform/ReportConsole.cc b/src/Platform/ReportConsole.cc index bb08ef3..c8e6890 100644 --- a/src/Platform/ReportConsole.cc +++ b/src/Platform/ReportConsole.cc @@ -53,13 +53,9 @@ namespace platform { const string status = compareResult(r["dataset"].get(), r["score"].get()); cout << status; cout << setw(12) << right << setprecision(6) << fixed << r["time"].get() << "±" << setw(6) << setprecision(4) << fixed << r["time_std"].get() << " "; - try { - cout << r["hyperparameters"].get(); - } - catch (const exception& err) { - cout << r["hyperparameters"]; - } + cout << r["hyperparameters"].dump(); cout << endl; + cout << flush; lastResult = r; totalScore += r["score"].get(); odd = !odd; diff --git a/src/Platform/Utils.h b/src/Platform/Utils.h new file mode 100644 index 0000000..3e24f05 --- /dev/null +++ b/src/Platform/Utils.h @@ -0,0 +1,19 @@ +#ifndef UTILS_H +#define UTILS_H +#include +#include +#include +namespace platform { + //static vector split(const string& text, char delimiter); + static std::vector split(const std::string& text, char delimiter) + { + std::vector result; + std::stringstream ss(text); + std::string token; + while (std::getline(ss, token, delimiter)) { + result.push_back(token); + } + return result; + } +} +#endif \ No newline at end of file diff --git a/src/Platform/list.cc b/src/Platform/list.cc index 8c386a5..581ee5f 100644 --- a/src/Platform/list.cc +++ b/src/Platform/list.cc @@ -3,7 +3,6 @@ #include "Paths.h" #include "Colors.h" #include "Datasets.h" -#include "DotEnv.h" using namespace std; const int BALANCE_LENGTH = 75; @@ -28,8 +27,7 @@ void outputBalance(const string& balance) int main(int argc, char** argv) { - auto env = platform::DotEnv(); - auto data = platform::Datasets(false, env.get("source_data")); + auto data = platform::Datasets(false, platform::Paths::datasets()); locale mylocale(cout.getloc(), new separated); locale::global(mylocale); cout.imbue(mylocale); diff --git a/src/Platform/main.cc b/src/Platform/main.cc index 62470c5..ecdf258 100644 --- a/src/Platform/main.cc +++ b/src/Platform/main.cc @@ -12,7 +12,7 @@ using namespace std; using json = nlohmann::json; -argparse::ArgumentParser manageArguments(int argc, char** argv) +argparse::ArgumentParser manageArguments() { auto env = platform::DotEnv(); argparse::ArgumentParser program("main"); @@ -48,44 +48,40 @@ argparse::ArgumentParser manageArguments(int argc, char** argv) }}); auto seed_values = env.getSeeds(); program.add_argument("-s", "--seeds").nargs(1, 10).help("Random seeds. Set to -1 to have pseudo random").scan<'i', int>().default_value(seed_values); + return program; +} + +int main(int argc, char** argv) +{ + string file_name, model_name, title; + json hyperparameters_json; + bool discretize_dataset, stratified, saveResults; + vector seeds; + vector filesToTest; + int n_folds; + auto program = manageArguments(); try { program.parse_args(argc, argv); - auto file_name = program.get("dataset"); - auto model_name = program.get("model"); - auto discretize_dataset = program.get("discretize"); - auto stratified = program.get("stratified"); - auto n_folds = program.get("folds"); - auto seeds = program.get>("seeds"); - auto title = program.get("title"); + file_name = program.get("dataset"); + model_name = program.get("model"); + discretize_dataset = program.get("discretize"); + stratified = program.get("stratified"); + n_folds = program.get("folds"); + seeds = program.get>("seeds"); auto hyperparameters = program.get("hyperparameters"); - auto saveResults = program.get("save"); + hyperparameters_json = json::parse(hyperparameters); + title = program.get("title"); if (title == "" && file_name == "") { throw runtime_error("title is mandatory if dataset is not provided"); } + saveResults = program.get("save"); } catch (const exception& err) { cerr << err.what() << endl; cerr << program; exit(1); } - return program; -} - -int main(int argc, char** argv) -{ - auto program = manageArguments(argc, argv); - auto file_name = program.get("dataset"); - auto model_name = program.get("model"); - auto discretize_dataset = program.get("discretize"); - auto stratified = program.get("stratified"); - auto n_folds = program.get("folds"); - auto seeds = program.get>("seeds"); - auto hyperparameters = program.get("hyperparameters"); - vector filesToTest; - auto env = platform::DotEnv(); - auto datasets = platform::Datasets(discretize_dataset, env.get("source_data")); - auto title = program.get("title"); - auto saveResults = program.get("save"); + auto datasets = platform::Datasets(discretize_dataset, platform::Paths::datasets()); if (file_name != "") { if (!datasets.isDataset(file_name)) { cerr << "Dataset " << file_name << " not found" << endl; @@ -102,12 +98,12 @@ int main(int argc, char** argv) /* * Begin Processing */ - + auto env = platform::DotEnv(); auto experiment = platform::Experiment(); experiment.setTitle(title).setLanguage("cpp").setLanguageVersion("14.0.3"); experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform(env.get("platform")); experiment.setStratified(stratified).setNFolds(n_folds).setScoreName("accuracy"); - experiment.setHyperparameters(json::parse(hyperparameters)); + experiment.setHyperparameters(hyperparameters_json); for (auto seed : seeds) { experiment.addRandomSeed(seed); } diff --git a/src/Platform/testx.cpp b/src/Platform/testx.cpp index 43ab29c..dfd6a21 100644 --- a/src/Platform/testx.cpp +++ b/src/Platform/testx.cpp @@ -1,5 +1,6 @@ #include "Folding.h" #include +#include "nlohmann/json.hpp" #include "map" #include #include @@ -7,6 +8,9 @@ #include "Network.h" #include "ArffFiles.h" #include "CPPFImdlp.h" +#include "CFS.h" +#include "IWSS.h" +#include "FCBF.h" using namespace std; using namespace platform; @@ -191,22 +195,54 @@ int main() // } // cout << "***********************************************************************************************" << endl; // } - const string file_name = "iris"; - auto net = bayesnet::Network(); + // const string file_name = "iris"; + // auto net = bayesnet::Network(); + // auto dt = Datasets(true, "Arff"); + // auto raw = RawDatasets("iris", true); + // auto [X, y] = dt.getVectors(file_name); + // cout << "Dataset dims " << raw.dataset.sizes() << endl; + // cout << "weights dims " << raw.weights.sizes() << endl; + // cout << "States dims " << raw.statest.size() << endl; + // cout << "features: "; + // for (const auto& feature : raw.featurest) { + // cout << feature << ", "; + // net.addNode(feature); + // } + // net.addNode(raw.classNamet); + // cout << endl; + // net.fit(raw.dataset, raw.weights, raw.featurest, raw.classNamet, raw.statest); auto dt = Datasets(true, "Arff"); - auto raw = RawDatasets("iris", true); - auto [X, y] = dt.getVectors(file_name); - cout << "Dataset dims " << raw.dataset.sizes() << endl; - cout << "weights dims " << raw.weights.sizes() << endl; - cout << "States dims " << raw.statest.size() << endl; - cout << "features: "; - for (const auto& feature : raw.featurest) { - cout << feature << ", "; - net.addNode(feature); + nlohmann::json output; + for (const auto& name : dt.getNames()) { + // for (const auto& name : { "iris" }) { + auto [X, y] = dt.getTensors(name); + auto features = dt.getFeatures(name); + auto states = dt.getStates(name); + auto className = dt.getClassName(name); + int maxFeatures = 0; + auto classNumStates = states.at(className).size(); + torch::Tensor weights = torch::full({ X.size(1) }, 1.0 / X.size(1), torch::kDouble); + auto dataset = X; + auto yresized = torch::transpose(y.view({ y.size(0), 1 }), 0, 1); + dataset = torch::cat({ dataset, yresized }, 0); + auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, classNumStates, weights); + auto fcbf = bayesnet::FCBF(dataset, features, className, maxFeatures, classNumStates, weights, 1e-7); + auto iwss = bayesnet::IWSS(dataset, features, className, maxFeatures, classNumStates, weights, 0.5); + cout << "Dataset: " << setw(20) << name << flush; + cfs.fit(); + cout << " CFS: " << setw(4) << cfs.getFeatures().size() << flush; + fcbf.fit(); + cout << " FCBF: " << setw(4) << fcbf.getFeatures().size() << flush; + iwss.fit(); + cout << " IWSS: " << setw(4) << iwss.getFeatures().size() << flush; + cout << endl; + output[name]["CFS"] = cfs.getFeatures(); + output[name]["FCBF"] = fcbf.getFeatures(); + output[name]["IWSS"] = iwss.getFeatures(); } - net.addNode(raw.classNamet); - cout << endl; - net.fit(raw.dataset, raw.weights, raw.featurest, raw.classNamet, raw.statest); + ofstream file("features_cpp.json"); + file << output; + file.close(); }