// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************

#include <ArffFiles.h>
#include "Proposal.h"

namespace bayesnet {
    Proposal::Proposal(torch::Tensor& dataset_, std::vector<std::string>& features_, std::string& className_) : pDataset(dataset_), pFeatures(features_), pClassName(className_) {}
    Proposal::~Proposal()
    {
        for (auto& [key, value] : discretizers) {
            delete value;
        }
    }
    void Proposal::checkInput(const torch::Tensor& X, const torch::Tensor& y)
    {
        if (!torch::is_floating_point(X)) {
            throw std::invalid_argument("X must be a floating point tensor");
        }
        if (torch::is_floating_point(y)) {
            throw std::invalid_argument("y must be an integer tensor");
        }
    }
    map<std::string, std::vector<int>> Proposal::localDiscretizationProposal(const map<std::string, std::vector<int>>& oldStates, Network& model)
    {
        // The order in which features are locally discretized matters, so a plain 0, 1, 2... pass is not good enough,
        // even though the affected features are re-discretized again once every feature's local discretization is done.
        auto order = model.topological_sort();
        auto& nodes = model.getNodes();
        map<std::string, std::vector<int>> states = oldStates;
        std::vector<int> indicesToReDiscretize;
        bool upgrade = false; // Flag to check if we need to upgrade the model
        for (auto feature : order) {
            auto nodeParents = nodes[feature]->getParents();
            if (nodeParents.size() < 2) continue; // Only has the class as parent
            upgrade = true;
            int index = find(pFeatures.begin(), pFeatures.end(), feature) - pFeatures.begin();
            indicesToReDiscretize.push_back(index); // We need to re-discretize this feature
            std::vector<std::string> parents;
            transform(nodeParents.begin(), nodeParents.end(), back_inserter(parents), [](const auto& p) { return p->getName(); });
            // Remove the class from the parents, as it will be added later
            parents.erase(remove(parents.begin(), parents.end(), pClassName), parents.end());
            // Get the indices of the parents
            std::vector<int> indices;
            indices.push_back(-1); // Add the class index (-1 addresses the last row of pDataset, which holds y)
            transform(parents.begin(), parents.end(), back_inserter(indices), [&](const auto& p) { return find(pFeatures.begin(), pFeatures.end(), p) - pFeatures.begin(); });
            // Now we fit the discretizer of the feature, conditioned on its parents and the class, i.e. discretizer.fit(X[index], X[indices] + y)
            std::vector<std::string> yJoinParents(Xf.size(1));
            for (auto idx : indices) {
                for (int i = 0; i < Xf.size(1); ++i) {
                    yJoinParents[i] += to_string(pDataset.index({ idx, i }).item<int>());
                }
            }
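            // Example with hypothetical values: if indices == { -1, 3 } and sample i has class label 2
            // (row -1 of pDataset) and discretized value 5 for feature 3, then yJoinParents[i] == "25";
            // factorize() below turns each distinct string, i.e. each distinct (class, parents)
            // configuration, into its own integer label for the supervised discretizer.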
            auto arff = ArffFiles();
            auto yxv = arff.factorize(yJoinParents);
            auto xvf_ptr = Xf.index({ index }).data_ptr<float>();
            auto xvf = std::vector<mdlp::precision_t>(xvf_ptr, xvf_ptr + Xf.size(1));
            discretizers[feature]->fit(xvf, yxv);
        }
        if (upgrade) {
            // Discretize X again (only the affected indices) with the newly fitted discretizers
            for (auto index : indicesToReDiscretize) {
                auto Xt_ptr = Xf.index({ index }).data_ptr<float>();
                auto Xt = std::vector<float>(Xt_ptr, Xt_ptr + Xf.size(1));
                pDataset.index_put_({ index, "..." }, torch::tensor(discretizers[pFeatures[index]]->transform(Xt)));
                auto xStates = std::vector<int>(discretizers[pFeatures[index]]->getCutPoints().size() + 1);
                iota(xStates.begin(), xStates.end(), 0);
                // Update the states of the feature/node
                states[pFeatures[index]] = xStates;
            }
            const torch::Tensor weights = torch::full({ pDataset.size(1) }, 1.0 / pDataset.size(1), torch::kDouble);
            model.fit(pDataset, weights, pFeatures, pClassName, states);
        }
        return states;
    }
    map<std::string, std::vector<int>> Proposal::fit_local_discretization(const torch::Tensor& y)
    {
        // Discretize the continuous input data and build pDataset (Classifier::dataset)
        int m = Xf.size(1);
        int n = Xf.size(0);
        map<std::string, std::vector<int>> states;
        pDataset = torch::zeros({ n + 1, m }, torch::kInt32);
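        // Layout assumed by the rest of the class: one feature per row (rows 0..n-1 hold the
        // discretized values of Xf) plus an extra last row for the class labels y, so that e.g.
        // a 4-feature, 150-sample problem produces a 5 x 150 integer tensor.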
        auto yv = std::vector<int>(y.data_ptr<int>(), y.data_ptr<int>() + y.size(0));
        // Discretize the input data feature by feature (each row of Xf holds one feature)
        for (auto i = 0; i < pFeatures.size(); ++i) {
            auto* discretizer = new mdlp::CPPFImdlp();
            auto Xt_ptr = Xf.index({ i }).data_ptr<float>();
            auto Xt = std::vector<float>(Xt_ptr, Xt_ptr + Xf.size(1));
            discretizer->fit(Xt, yv);
            pDataset.index_put_({ i, "..." }, torch::tensor(discretizer->transform(Xt)));
            auto xStates = std::vector<int>(discretizer->getCutPoints().size() + 1);
            iota(xStates.begin(), xStates.end(), 0);
            states[pFeatures[i]] = xStates;
            discretizers[pFeatures[i]] = discretizer;
        }
        int n_classes = torch::max(y).item<int>() + 1;
        auto yStates = std::vector<int>(n_classes);
        iota(yStates.begin(), yStates.end(), 0);
        states[pClassName] = yStates;
        pDataset.index_put_({ n, "..." }, y);
        return states;
    }
    torch::Tensor Proposal::prepareX(torch::Tensor& X)
    {
        auto Xtd = torch::zeros_like(X, torch::kInt32);
        for (int i = 0; i < X.size(0); ++i) {
            auto Xt = std::vector<float>(X[i].data_ptr<float>(), X[i].data_ptr<float>() + X.size(1));
            auto Xd = discretizers[pFeatures[i]]->transform(Xt);
            Xtd.index_put_({ i }, torch::tensor(Xd, torch::kInt32));
        }
        return Xtd;
    }
}
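
// -----------------------------------------------------------------
// Usage sketch (kept as a comment; illustrative only). It shows how a
// classifier that mixes in Proposal might drive these methods.
// SampleLdClassifier, buildModel() and model.predict() are hypothetical
// names; only checkInput, fit_local_discretization,
// localDiscretizationProposal and prepareX come from this file.
//
//   void SampleLdClassifier::fit(torch::Tensor& X, torch::Tensor& y)
//   {
//       checkInput(X, y);                                     // X must be float, y integer
//       Xf = X;                                               // keep the continuous data
//       states = fit_local_discretization(y);                 // class-conditional MDLP per feature
//       buildModel();                                         // build the network structure (hypothetical)
//       states = localDiscretizationProposal(states, model);  // re-discretize given each node's parents
//   }
//
//   torch::Tensor SampleLdClassifier::predict(torch::Tensor& X)
//   {
//       auto Xd = prepareX(X);                                // apply the fitted cut points
//       return model.predict(Xd);                             // hypothetical downstream call
//   }
// -----------------------------------------------------------------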