diff --git a/BUILD.bazel b/BUILD.bazel index a2561811..8a474563 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -48,6 +48,7 @@ cc_library( "include/albatross/serialize/Ransac", "include/albatross/src/cereal/ThreadPool.hpp", "include/albatross/src/cereal/block_utils.hpp", + "include/albatross/src/cereal/block_diagonal.hpp", "include/albatross/src/cereal/dataset.hpp", "include/albatross/src/cereal/distribution.hpp", "include/albatross/src/cereal/eigen.hpp", @@ -127,8 +128,10 @@ cc_library( "include/albatross/src/models/gp.hpp", "include/albatross/src/models/least_squares.hpp", "include/albatross/src/models/null_model.hpp", + "include/albatross/src/models/pic_gp.hpp", "include/albatross/src/models/ransac.hpp", "include/albatross/src/models/ransac_gp.hpp", + "include/albatross/src/models/sparse_common.hpp", "include/albatross/src/models/sparse_gp.hpp", "include/albatross/src/samplers/callbacks.hpp", "include/albatross/src/samplers/ensemble.hpp", @@ -228,6 +231,7 @@ swift_cc_test( "tests/test_models.cc", "tests/test_models.h", "tests/test_parameter_handling_mixin.cc", + "tests/test_pic_gp.cc", "tests/test_prediction.cc", "tests/test_radial.cc", "tests/test_random_utils.cc", diff --git a/WORKSPACE.bazel b/WORKSPACE.bazel index ab62240f..cc08b75b 100644 --- a/WORKSPACE.bazel +++ b/WORKSPACE.bazel @@ -1,12 +1,12 @@ workspace(name = "albatross") load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") - + http_archive( name = "rules_swiftnav", - sha256 = "8e00b694b6dce9c91d811e2e9e13e148af5f1dd329b85b7c5d2d9d2238aa92dc", - strip_prefix = "rules_swiftnav-1c51c97743c9632169dd7e5a228c6ce915893236", - url = "https://github.com/swift-nav/rules_swiftnav/archive/1c51c97743c9632169dd7e5a228c6ce915893236.tar.gz", + sha256 = "2f91790be7845e2eeb746dd33c59d1ba5d6ee85360a3218d2a4a77ead79f81b9", + strip_prefix = "rules_swiftnav-030c1c5050e7541ce66cd82f523939a525b1551c", + url = "https://github.com/swift-nav/rules_swiftnav/archive/030c1c5050e7541ce66cd82f523939a525b1551c.tar.gz", ) load("@rules_swiftnav//cc:repositories.bzl", "register_swift_cc_toolchains", "swift_cc_toolchain") diff --git a/doc/src/crappy-pic.rst b/doc/src/crappy-pic.rst new file mode 100644 index 00000000..0005dc14 --- /dev/null +++ b/doc/src/crappy-pic.rst @@ -0,0 +1,164 @@ +################################################# +PIC math scratchpad +################################################# + +.. _crappy-pic: + +------------------------------------------------------------------- +Relationship between :math:`\sigma_\cancel{B}` and :math:`\sigma_f` +------------------------------------------------------------------- + +It seems like what we need first of all is some way to get the PITC posterior variance based only on :math:`\cancel{B}` using only the full PITC posterior and calculations that scale only with :math:`|B|`. I'm not totally finished working out how :math:`(\sigma^2_{*})^{\cancel{B} \cancel{B}}` fits into the whole posterior :math:`(\sigma^2_*)^{PIC}` + +Observe that since :math:`\mathbf{\Lambda}` is a block-diagonal matrix, its inverse is too, so when it's in the middle of a quadratic form, the cross terms disappear. As far as I can tell, this is the only place we can additively separate :math:`\mathbf{Q}_{\cancel{B} \cancel{B}}` and :math:`\mathbf{K}_{B B}` without explicit cross terms. Start with the usual application of Woodbury's lemma to the PITC posterior covariance: + +.. 
math:: + + \newcommand{Lam}{\mathbf{\Lambda}} + \newcommand{Kuu}{\mathbf{K}_{uu}} + \newcommand{Kuf}{\mathbf{K}_{uf}} + \newcommand{Kfu}{\mathbf{K}_{fu}} + (\sigma_*^2) &= K_* - \mathbf{Q}_{* f} \left(\mathbf{K}_{fu} \mathbf{K}_{uu}^{-1} \mathbf{K}_{uf} + \mathbf{\Lambda} \right)^{-1} \mathbf{Q}_{f *} \\ + &= K_* - \mathbf{K}_{*u} \Kuu^{-1} \Kuf \left(\Kfu \Kuu^{-1} \Kuf + \Lam\right)^{-1} \Kfu \Kuu^{-1} \mathbf{K}_{u*} \\ + &= K_* - \mathbf{K}_{*u} \Kuu^{-1} \mathbf{K}_{u*} + \mathbf{K}_{*u} \left( \Kuu + \Kuf \Lam^{-1} \Kfu \right)^{-1} \mathbf{K}_{u*} + +Now break up :math:`\Kuf` and :math:`\Lam`: + +.. math:: + + \Kuf &= \begin{bmatrix} \mathbf{K}_{u \cancel{B}} & \mathbf{K}_{u B} \end{bmatrix} \\ + \Lam &= \begin{bmatrix} \Lam_{\cancel{B}} & \mathbf{0} \\ \mathbf{0} & \Lam_{B} \end{bmatrix} \\ + +so that + +.. math:: + + \Kuf \Lam^{-1} \Kfu &= \begin{bmatrix} \mathbf{K}_{u \cancel{B}} & \mathbf{K}_{u B} \end{bmatrix} + \begin{bmatrix} \Lam_{\cancel{B}}^{-1} & \mathbf{0} \\ \mathbf{0} & \Lam_{B}^{-1} \end{bmatrix} + \begin{bmatrix} \mathbf{K}_{\cancel{B} u} \\ \mathbf{K}_{B u} \end{bmatrix} \\ + &= \mathbf{K}_{u \cancel{B}} \Lam_{\cancel{B}}^{-1} \mathbf{K}_{\cancel{B} u} + + \mathbf{K}_{u B} \Lam_{B}^{-1} \mathbf{K}_{B u} \\ + \mathbf{K}_{u \cancel{B}} \Lam_{\cancel{B}}^{-1} \mathbf{K}_{\cancel{B} u} &= \Kuf \Lam^{-1} \Kfu - \mathbf{K}_{u B} \Lam_{B}^{-1} \mathbf{K}_{B u} + +This last one is the inverse term in the training-data dependent part of the PITC posterior covariance using only the training points in :math:`\cancel{B}`, which I think is what we want to combine with the dense posterior from the training points in :math:`B`: + +.. math:: + (\sigma_*^2)^{\cancel{B} \cancel{B}} = K_* - \mathbf{K}_{*u} \Kuu^{-1} \mathbf{K}_{u*} + \mathbf{K}_{*u} \left( \Kuu + \mathbf{K}_{u \cancel{B}} \Lam_{\cancel{B}}^{-1} \mathbf{K}_{\cancel{B} u} \right)^{-1} \mathbf{K}_{u*} + +which we can write in terms of only the full PITC prior and the prior for group :math:`B`: + +.. math:: + (\sigma_*^2)^{\cancel{B} \cancel{B}} = K_* - \mathbf{K}_{*u} \Kuu^{-1} \mathbf{K}_{u*} + \mathbf{K}_{*u} \left( \Kuu + \Kuf \Lam^{-1} \Kfu - \mathbf{K}_{u B} \Lam_{B}^{-1} \mathbf{K}_{B u} \right)^{-1} \mathbf{K}_{u*} + +If you set :math:`B = \emptyset`, removing the correction for the training points in block :math:`B` entirely, you have the original PITC posterior covariance based on the full set of training points :math:`f`, as expected. Now we only have to compute :math:`\Kuf \Lam^{-1} \Kfu` (the term that scales with the full training set :math:`f`) once, and nothing that scales with :math:`|\cancel{B}|`. This still involves an :math:`O(M^3)` decomposition for each group :math:`B`. I have been assuming our group size :math:`|B|` is significantly smaller than the number of inducing points :math:`M`, so it's not great, but an interesting thing happens if we use the "downdate" form of Woodbury's lemma in the opposite direction to the usual way: + +.. math:: + \left(A - B C^{-1} B^T \right)^{-1} &= A^{-1} + A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \\ + +Letting :math:`\mathbf{A} = \Kuu + \Kuf \Lam^{-1} \Kfu` (playing the part of :math:`A` in Woodbury's lemma): + +.. 
math:: + + (\sigma_*^2)^{\cancel{B} \cancel{B}} &= K_* - \mathbf{K}_{*u} \Kuu^{-1} \mathbf{K}_{u*} + \mathbf{K}_{*u} \left( \mathbf{A}^{-1} + \mathbf{A}^{-1} \mathbf{K}_{u B} \left(\Lam_{B} - \mathbf{K}_{B u} \mathbf{A}^{-1} \mathbf{K}_{u B}\right)^{-1} \mathbf{K}_{B u} \mathbf{A}^{-1} \right) \mathbf{K}_{u*} \\ + &= K_* - \mathbf{K}_{*u} \Kuu^{-1} \mathbf{K}_{u*} + \mathbf{K}_{*u} \mathbf{A}^{-1} \mathbf{K}_{u*} + \mathbf{K}_{*u} \mathbf{A}^{-1} \mathbf{K}_{u B} \left(\Lam_{B} - \mathbf{K}_{B u} \mathbf{A}^{-1} \mathbf{K}_{u B}\right)^{-1} \mathbf{K}_{B u} \mathbf{A}^{-1} \mathbf{K}_{u*} \\ + +I find the use of Woodbury's lemma to downdate a bit uncomfortable -- does this work with real symmetric matrices? But we would only need to decompose :math:`\mathbf{A}` (and also :math:`\Kuu`) at a cost of :math:`O(M^3)` once; the inversion to be calculated per group costs :math:`O(|B|^3)`. I also don't immediately spot a slick QR decomposition way to do this per-group term; it's pretty symmetrical. The only thing that jumps out is precomputing :math:`\mathbf{K}_{B u} \mathbf{A}^{-\frac{1}{2}}`. + +---------------------------------------------- +Derivation of Woodbury's Lemma for Differences +---------------------------------------------- + +This is specifically for symmetric matrices, to convince myself the downdate formula is legit. We would like to compute a rank-:math:`k` downdate to the inverse of a matrix :math:`A`. + +.. math:: + \left(A - B C^{-1} B^T \right)^{-1} &= A^{-1} + A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \\ + +To be extra clear, let's say :math:`A \in \mathbb{R}^{n \times n}`, :math:`B \in \mathbb{R}^{n \times k}`, :math:`C \in \mathbb{R}^{k \times k}`. We put the relevant matrices into a block matrix :math:`M \in \mathbb{R}^{m \times m}` for :math:`m = n + k`, and label the conforming blocks of its inverse: + +.. math:: + M &= \begin{pmatrix} A & B \\ B^T & C\end{pmatrix} \\ + M^{-1} &= \begin{pmatrix} W & X \\ X^T & Y\end{pmatrix} \\ + M M^{-1} &= \begin{pmatrix} \mathbf{I} & \mathbf{0} \\ \mathbf{0} & \mathbf{I} \end{pmatrix} \\ + M^{-1} M &= \begin{pmatrix} \mathbf{I} & \mathbf{0} \\ \mathbf{0} & \mathbf{I} \end{pmatrix} + +The blocks of :math:`M M^{-1}` yield the following equations: + +.. math:: + AW + BX^T &= \mathbf{I} \\ + AX + BY &= \mathbf{0} \\ + B^T W + CX^T &= \mathbf{0} \\ + B^T X + CY &= \mathbf{I} \\ + +Rearrange the middle two: + +.. math:: + X &= -A^{-1} B Y \\ + X^T &= -C^{-1} B^T W + +If we do the same for :math:`M^{-1} M`: + +.. math:: + X &= -W B C^{-1} \\ + X^T &= -Y B^T A^{-1} + +These blocks are equal: + +.. math:: + C^{-1} B^T W &= Y B^T A^{-1} \\ + A^{-1} B Y &= W B C^{-1} \\ + +Now use the middle two equations from :math:`M M^{-1}` and plug them into the first and last equations, respectively: + +.. math:: + W - A^{-1} B C^{-1} B^T W &= A^{-1} \\ + \left(I - A^{-1} B C^{-1} B^T\right) W &= A^{-1} \\ + -C^{-1} B^T A^{-1} B Y + Y &= C^{-1} \\ + \left(I - C^{-1} B^T A^{-1} B\right)Y &= C^{-1} \\ + +Assuming :math:`\left(I - A^{-1} B C^{-1} B^T\right)` and :math:`\left(I - C^{-1} B^T A^{-1} B\right)` are invertible, rearrange: + +.. math:: + W &= \left(I - A^{-1} B C^{-1} B^T\right)^{-1} A^{-1} \\ + &= \left(A - B C^{-1} B^T\right)^{-1} \\ + Y &= \left(I - C^{-1} B^T A^{-1} B\right)^{-1} C^{-1} \\ + &= \left(C - B^T A^{-1} B\right)^{-1} + +Now use equality of the off-diagonal blocks from the two ways to multiply :math:`M` and :math:`M^{-1}` above (don't you wish Sphinx would number equations?) and substitute: + +.. 
math:: + C^{-1} B^T \left(A - B C^{-1} B^T\right)^{-1} &= \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \\ + \left(A - B C^{-1} B^T\right)^{-1} B C^{-1} &= A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} \\ + +Let's look just at the term :math:`\left(A - B C^{-1} B^T\right)` and right-multiply by :math:`-A^{-1}`: + +.. math:: + \left(A - B C^{-1} B^T\right) \left(-A^{-1}\right) &= -\mathbf{I} + B C^{-1} B^T A^{-1} \\ + \mathbf{I} + \left(A - B C^{-1} B^T\right) \left(-A^{-1}\right) &= B C^{-1} B^T A^{-1} + +Now let's return to the previous equation and right-multiply by :math:`B^T A^{-1}`: + +.. math:: + \left(A - B C^{-1} B^T\right)^{-1} B C^{-1} B^T A^{-1} &= A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \\ + +Substitute the previous result for :math:`B C^{-1} B^T A^{-1}`: + +.. math:: + \left(A - B C^{-1} B^T\right)^{-1} \left( \mathbf{I} + \left(A - B C^{-1} B^T\right) \left(-A^{-1}\right) \right) &= A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \\ + \left(A - B C^{-1} B^T\right)^{-1} - A^{-1} &= A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \\ + \left(A - B C^{-1} B^T \right)^{-1} &= A^{-1} + A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \blacksquare + +which is the difference form of Woodbury's lemma, or a rank-:math:`k` downdate of :math:`A^{-1}`. The main assumption seems to be that :math:`\left(I - A^{-1} B C^{-1} B^T\right)` and :math:`\left(I - C^{-1} B^T A^{-1} B\right)` are invertible. + +------------------------ +Blockwise inversion of S +------------------------ + +I tried to invert :math:`\mathbf{S}` blockwise; it didn't work out but I'm leaving it in here just to look back at: + +.. math:: + \newcommand{VV}{\mathbf{V}} + \mathbf{U} &= \mathbf{Q}_{* \cancel{B}} \mathbf{S}^{-1}_{\cancel{B} B} \VV_{B *} \\ + &= \mathbf{Q}_{* \cancel{B}} \left( \mathbf{Q}_{\cancel{B} \cancel{B}}^{-1} \mathbf{Q}_{\cancel{B} B} \left(\mathbf{Q}_{B B} - \mathbf{Q}_{B \cancel{B}} \mathbf{Q}_{\cancel{B} \cancel{B}}^{-1} \mathbf{Q}_{\cancel{B} B}\right)^{-1} \right) \VV_{B *} + + +How are we going to avoid the :math:`O(|\cancel{B}|^3)` inversion for :math:`\mathbf{Q}_{\cancel{B} \cancel{B}}`? Also we are going to get cross-terms with both :math:`\mathbf{Q}_{* \cancel{B}}` and :math:`\mathbf{Q}_{B B}` involved, but maybe they are OK because they are only :math:`O(|\cancel{B}| |B|)`? \ No newline at end of file diff --git a/doc/src/index.rst b/doc/src/index.rst index e6176d7c..d82bfc21 100644 --- a/doc/src/index.rst +++ b/doc/src/index.rst @@ -97,6 +97,8 @@ and plot the results (though this'll require a numerical python environment), custom-models references + crappy-pic + ######### Credit diff --git a/doc/src/sparse-gp-details.rst b/doc/src/sparse-gp-details.rst index ce5bab0a..288aece6 100644 --- a/doc/src/sparse-gp-details.rst +++ b/doc/src/sparse-gp-details.rst @@ -338,3 +338,98 @@ However, we found that while the posterior mean predictions were numerically sta You should be able to find this implementation and details using git history. +-------------------- +PIC +-------------------- + +The way I started partitioning the covariance term into blocks is as follows: + +.. 
math:: + + (\sigma_*^2)^{PIC} &= K_* - \tilde{\mathbf{K}}^{PIC}_{*f} \left[ \tilde{\mathbf{K}}^{PITC}_{ff} \right]^{-1} \tilde{\mathbf{K}}^{PIC}_{f*} + \sigma^2 \\ + &= K_* - \begin{bmatrix} \mathbf{Q}_{* \cancel{B}} & \mathbf{K}_{* B} \end{bmatrix} \left(\mathbf{Q}_{ff} + \mathtt{blkdiag}(\mathbf{K}_{ff} - \mathbf{Q}_{ff})\right)^{-1} \begin{bmatrix} \mathbf{Q}_{\cancel{B} *} \\ \mathbf{K}_{B *} \end{bmatrix} \\ + &= K_* - \begin{bmatrix} \mathbf{Q}_{* \cancel{B}} & \mathbf{K}_{* B} \end{bmatrix} \left(\mathbf{K}_{fu} \mathbf{K}_{uu}^{-1} \mathbf{K}_{uf} + \mathbf{\Lambda} \right)^{-1} \begin{bmatrix} \mathbf{Q}_{\cancel{B} *} \\ \mathbf{K}_{B *} \end{bmatrix} + +The problem with doing this for the PIC covariance (vs. the PIC mean and PITC) is that we can't left-multiply the whole thing by :math:`\mathbf{K}_{uu}^{-1} \mathbf{K}_{uf}` (which in those instances leads to applying Woodbury's lemma to reduce the inverse to the size of the number of inducing points :math:`M`), because :math:`\mathbf{K}_{*B}` is not a low-rank approximation using the inducing points. We can instead break up the inverse term into blocks: + +.. math:: + \newcommand{VV}{\mathbf{V}} + (\sigma_*^2)^{PIC} &= K_* - + \begin{bmatrix} \mathbf{Q}_{* \cancel{B}} & \mathbf{K}_{* B}\end{bmatrix} + \begin{bmatrix} \mathbf{Q}_{\cancel{B} \cancel{B}} & \mathbf{Q}_{\cancel{B} B} \\ \mathbf{Q}_{B \cancel{B}} & \mathbf{Q}_{B B} \end{bmatrix}^{-1} + \begin{bmatrix} \mathbf{Q}_{\cancel{B} *} \\ \mathbf{K}_{B *}\end{bmatrix} + +If we substitute :math:`\mathbf{K}_{* B} = \mathbf{Q}_{* B} + \VV_{* B}` as with the mean, it doesn't work out nicely: + +.. math:: + (\sigma_*^2)^{PIC} &= K_* - + \begin{bmatrix} \mathbf{Q}_{* \cancel{B}} & \mathbf{Q}_{* B} + \VV_{* B} \end{bmatrix} + \underbrace{\begin{bmatrix} \mathbf{Q}_{\cancel{B} \cancel{B}} & \mathbf{Q}_{\cancel{B} B} \\ \mathbf{Q}_{B \cancel{B}} & \mathbf{Q}_{B B} \end{bmatrix}^{-1}}_{\mathbf{S}^{-1}} + \begin{bmatrix} \mathbf{Q}_{\cancel{B} *} \\ \mathbf{Q}_{B *} + \VV_{B *}\end{bmatrix} \\ + &= K_* - \mathbf{Q}_{**}^{PITC} - \underbrace{\mathbf{Q}_{* f} \mathbf{S}^{-1}_{f B} \VV_{B *}}_{\mathbf{U}} - \mathbf{U}^T - \mathbf{V}_{* B} \mathbf{S}^{-1}_{B B} \mathbf{V}_{B *} + +Now we have three correction terms to apply to the PITC posterior covariance. The best thing I can think of is to apply Woodbury's lemma, but in the opposite direction to the usual one: + +.. math:: + \newcommand{Lam}{\mathbf{\Lambda}} + \newcommand{Kuu}{\mathbf{K}_{uu}} + \newcommand{Kuf}{\mathbf{K}_{uf}} + \newcommand{Kfu}{\mathbf{K}_{fu}} + \mathbf{S}^{-1} &= \left(\Kfu \Kuu^{-1} \Kuf + \Lam\right)^{-1} \\ + &= \Lam^{-1} - \Lam^{-1} \Kfu \left( \Kuu + \Kuf \Lam^{-1} \Kfu \right)^{-1} \Kuf \Lam^{-1} + +which involves decomposing the block-diagonal matrix :math:`\Lam`, whose blocks have size :math:`|B|`, and a matrix of size :math:`M`, the size of the inducing point set. In practice, after we precompute terms, we have a sequence of triangular factors that we can subset as needed to pick out :math:`B` and :math:`\cancel{B}`. (Confusingly, one of these useful decompositions is the QR decomposition of the unrelated matrix :math:`B` in the PITC derivation above.) + +.. math:: + \mathbf{U} &= \mathbf{Q}_{* f} \mathbf{S}^{-1}_{f B} \VV_{B *} \\ + &= \mathbf{Q}_{* f} \left( \Lam^{-1} - \Lam^{-1} \Kfu \left( \Kuu + \Kuf \Lam^{-1} \Kfu \right)^{-1} \Kuf \Lam^{-1} \right)_{f B} \VV_{B *} + +This looks appealingly like we could just keep combining instances of the QR decomposition of :math:`B`, but that would leave out the :math:`\mathbf{K}_{uu}` part. 
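+
+As a sanity check on this "opposite direction" form, here is a minimal dense-Eigen sketch of applying :math:`\mathbf{S}^{-1}` to a vector. This is only an illustration with placeholder dense matrices (``Lambda``, ``K_fu``, ``K_uu``); the actual fit in ``pic_gp.hpp`` keeps :math:`\Lam` as a ``BlockDiagonalLDLT`` and handles the inner :math:`M \times M` term through the QR decomposition of :math:`B`.
+
+.. code-block:: c++
+
+   #include <Eigen/Dense>
+
+   // Apply S^-1 = (K_fu K_uu^-1 K_uf + Lambda)^-1 to a vector without
+   // forming any N x N inverse, using the Woodbury form above.
+   Eigen::VectorXd apply_S_inverse(const Eigen::MatrixXd &Lambda,
+                                   const Eigen::MatrixXd &K_fu,
+                                   const Eigen::MatrixXd &K_uu,
+                                   const Eigen::VectorXd &v) {
+     const Eigen::LDLT<Eigen::MatrixXd> Lambda_ldlt(Lambda);
+     // The inner M x M matrix K_uu + K_uf Lambda^-1 K_fu, factored once.
+     const Eigen::LDLT<Eigen::MatrixXd> inner_ldlt(
+         K_uu + K_fu.transpose() * Lambda_ldlt.solve(K_fu));
+     const Eigen::VectorXd Lambda_inv_v = Lambda_ldlt.solve(v);
+     return Lambda_inv_v -
+            Lambda_ldlt.solve(
+                K_fu * inner_ldlt.solve(K_fu.transpose() * Lambda_inv_v));
+   }
+
+The only expensive factorizations here are the (block-diagonal) decomposition of :math:`\Lam` and a single :math:`M \times M` factorization, which is the point of rearranging :math:`\mathbf{S}^{-1}` this way.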
+ +The open question is how to efficiently distill this into various correction terms for each group that don't require operations that scale with :math:`\cancel{B}`, since the paper promises :math:`O((|B| + M)^2)` for predictive covariances after precomputation. In principle, using sparse vectors / matrices for the :math:`*` target components, combined with Eigen's expression templates, should bring the complexity of some of these repeated solves for mostly empty vectors (for :math:`B`) down to :math:`O(|B|)`, and likewise for the inducing points. + +At this point, our cross-terms are preceded by :math:`Q_{* f}`, which expands to :math:`K_{*u} K_{uu}^{-1} K_{u f}`. So we should actually be able to precompute everything except :math:`K_{*u}`, leaving prediction-time computations to scale with :math:`M`! + +So for the variance, we must precompute: + + - :math:`P^TLDL^TP = \Lam` + - :math:`\mathbf{G} = \Lam^{-1} \Kfu` + - :math:`L_{uu} L_{uu}^T = \Kuu` + - :math:`QRP^T = B = \begin{bmatrix}\Lam^{-\frac{1}{2}}\Kfu \\ L_{uu} \end{bmatrix}` such that :math:`\mathbf{S}^{-1} = \Lam^{-1} - \mathbf{G} \left(B^T B\right)^{-1} \mathbf{G}^T`, and blocks :math:`\mathbf{S}^{-1}_{a b}` can be obtained by choosing the right rows / columns with which to do permutations and back-substitutions. + - :math:`\mathbf{W} = \Kuu^{-1} \mathbf{K}_{u f} \mathbf{S}^{-1}` + +For the mean, we compute the information vector :math:`v` as in PITC. + +Then, for each group :math:`B`, we compute: + + - :math:`\mathbf{Y}_B = \Kuu^{-1} \mathbf{K}_{u B}` + - :math:`v_b = \Lam_{B B}^{-1} \left( y_B - \mathbf{K}_{B u} v \right)` + +Given that we have already computed :math:`\Kfu`, we can form :math:`\mathbf{K}_{u B}` and :math:`\mathbf{K}_{u \cancel{B}}` efficiently in Eigen using sparse selector matrices with a single nonzero entry per row or column to be selected. + +Then at prediction time, we must compute: + + - :math:`\mathbf{K}_{* B}`, :math:`O(|B|)` + - :math:`\mathbf{K}_{* u}`, :math:`O(M)` + - :math:`\mathbf{Q}_{* B} = \mathbf{K}_{* u} \mathbf{Y}_B`, :math:`O(M^2)` with the existing decomposition + - :math:`\VV_{* B} = \mathbf{K}_{* B} - \mathbf{Q}_{* B}`, :math:`O(|B|^2)` + - :math:`\mathbf{Q}_{**}^{PITC}` as with PITC + - :math:`\mathbf{U} = \mathbf{K}_{* u} \mathbf{W}_B \VV_{B *}`, :math:`O(M + |B|)`? + - :math:`\VV_{* B} \mathbf{S}^{-1}_{B B} \VV_{B *}`, :math:`O(|B|^2)` + +To compute :math:`\mathbf{V}_{* B} \mathbf{S}^{-1}_{B B} \mathbf{V}_{B *}`, we form a (mostly zero) column of :math:`\mathbf{V}` for each feature, break the two terms of :math:`\mathbf{S}^{-1}` into symmetric parts, multiply each by :math:`\mathbf{V}`, and subtract, here in excruciating notational detail: + +.. 
math:: + \VV_{* B} \mathbf{S}^{-1} \VV_{B *} &= \VV_{* B} \left( \Lam^{-1} - \Lam^{-1} \Kfu \left( \Kuu + \Kuf \Lam^{-1} \Kfu \right)^{-1} \Kuf \Lam^{-1} \right)_{B B} \VV_{B *} \\ + &= \VV_{* B} \left( \left(P_\Lam L_\Lam^{-T} D_\Lam^{-\frac{1}{2}}\right) \underbrace{\left(D_\Lam^{-\frac{1}{2}} L_\Lam^{-1} P_\Lam^T\right)}_{Z_\Lam} - \mathbf{G}^T (B^T B)^{-1} \mathbf{G} \right) \VV_{B *} \\ + &= \VV_{* B} \left( \mathbf{Z}_\Lam^T \mathbf{Z}_\Lam - \mathbf{G}^T (P_u R_u^{-1} R_u^{-T} P_u^T) \mathbf{G} \right) \VV_{B *} \\ + &= \VV_{* B} \left( \mathbf{Z}_\Lam^T \mathbf{Z}_\Lam - \mathbf{G}^T (P_u R_u^{-1}) \underbrace{(R_u^{-T} P_u^T) \mathbf{G}}_{\mathbf{Z}_u} \right) \VV_{B *} \\ + &= \VV_{* B} \left( \mathbf{Z}_\Lam^T \mathbf{Z}_\Lam - \mathbf{Z}_u^T \mathbf{Z}_u \right) \VV_{B *} \\ + &= \VV_{* B} \mathbf{Z}_\Lam^T \underbrace{\mathbf{Z}_\Lam \VV_{B *}}_{\mathbf{\xi}_\Lam} - \VV_{* B} \mathbf{Z}_u^T \underbrace{\mathbf{Z}_u \VV_{B *}}_{\mathbf{\xi}_u} \\ + &= \mathbf{\xi}_\Lam^T \mathbf{\xi}_\Lam - \mathbf{\xi}_u^T \mathbf{\xi}_u \\ + +Note that the left-hand (subscript :math:`\Lam`) term is the decomposition of a block-diagonal matrix, so it will only contain cross-terms between features corresponding to the same local block. This calculation can be done blockwise. The right-hand (subscript :math:`u`) term projects the features through the local block of the training dataset and then through the inducing points, so the cross-terms are not in general sparse, and this calculation involves a lot more careful indexing. + +The same breakdown of :math:`\mathbf{S}^{-1}` can be used to compute :math:`\mathbf{W}` during the fit step. + +The notation :math:`\mathbf{W}_B` indicates that the relevant columns of :math:`\mathbf{W}` are used on the right-hand side. This is mathematically equivalent to making :math:`\VV_{B *}` have dimension :math:`N \times p` and be zero outside block :math:`B`. Computationally the right factor must be a sparse object to preserve the desired asymptotics. \ No newline at end of file diff --git a/include/albatross/SparseGP b/include/albatross/SparseGP index 2a26aab2..0f14113e 100644 --- a/include/albatross/SparseGP +++ b/include/albatross/SparseGP @@ -18,11 +18,14 @@ #include #include "GP" +// #include "Common" #include "linalg/Utils" #include #include #include +#include #include +#include #endif diff --git a/include/albatross/serialize/GP b/include/albatross/serialize/GP index f54633c5..2ecee09c 100644 --- a/include/albatross/serialize/GP +++ b/include/albatross/serialize/GP @@ -18,6 +18,7 @@ #include "../src/cereal/block_utils.hpp" #include "../src/cereal/serializable_ldlt.hpp" +#include "../src/cereal/block_diagonal.hpp" #include "../src/cereal/gp.hpp" #include "../src/cereal/representations.hpp" #include "../src/cereal/serializable_spqr.hpp" diff --git a/include/albatross/src/cereal/block_diagonal.hpp b/include/albatross/src/cereal/block_diagonal.hpp new file mode 100644 index 00000000..12279858 --- /dev/null +++ b/include/albatross/src/cereal/block_diagonal.hpp @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2019 Swift Navigation Inc. + * Contact: Swift Navigation + * + * This source is subject to the license found in the file 'LICENSE' which must + * be distributed together with this source. All other rights reserved. + * + * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, + * EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE. 
+ */ + +#ifndef ALBATROSS_SRC_CEREAL_BLOCK_DIAGONAL_HPP_ +#define ALBATROSS_SRC_CEREAL_BLOCK_DIAGONAL_HPP_ + +namespace cereal { + +template +inline void serialize(Archive &archive, + const albatross::BlockDiagonalLDLT &ldlt, + const std::uint32_t ) { + archive(cereal::make_nvp("blocks", ldlt.blocks)); +} + +} // namespace cereal + +#endif /* ALBATROSS_SRC_CEREAL_BLOCK_DIAGONAL_HPP_ */ diff --git a/include/albatross/src/cereal/gp.hpp b/include/albatross/src/cereal/gp.hpp index b3926397..a50e5e55 100644 --- a/include/albatross/src/cereal/gp.hpp +++ b/include/albatross/src/cereal/gp.hpp @@ -17,7 +17,9 @@ using albatross::Fit; using albatross::GaussianProcessBase; using albatross::GPFit; using albatross::LinearCombination; + using albatross::SparseGPFit; +using albatross::PICGPFit; #ifndef GP_SERIALIZATION_VERSION #define GP_SERIALIZATION_VERSION 2 @@ -51,6 +53,28 @@ inline void serialize(Archive &archive, Fit> &fit, } } +template +inline void +serialize(Archive &archive, + Fit> &fit, + const std::uint32_t ) { + archive(cereal::make_nvp("train_features", fit.train_features)); + archive(cereal::make_nvp("inducing_features", fit.inducing_features)); + archive(cereal::make_nvp("train_covariance", fit.train_covariance)); + archive(cereal::make_nvp("sigma_R", fit.sigma_R)); + archive(cereal::make_nvp("P", fit.P)); + archive(cereal::make_nvp("mean_w", fit.mean_w)); + archive(cereal::make_nvp("W", fit.W)); + archive(cereal::make_nvp("covariance_Y", fit.covariance_Y)); + archive(cereal::make_nvp("Z", fit.Z)); + archive(cereal::make_nvp("A_ldlt", fit.A_ldlt)); + archive(cereal::make_nvp("measurement_groups", fit.measurement_groups)); + archive(cereal::make_nvp("information", fit.information)); + archive(cereal::make_nvp("numerical_rank", fit.numerical_rank)); + archive(cereal::make_nvp("cols_Bs", fit.cols_Bs)); +} + template inline void save(Archive &archive, diff --git a/include/albatross/src/covariance_functions/measurement.hpp b/include/albatross/src/covariance_functions/measurement.hpp index 732093d4..51dbd1d5 100644 --- a/include/albatross/src/covariance_functions/measurement.hpp +++ b/include/albatross/src/covariance_functions/measurement.hpp @@ -110,6 +110,18 @@ MeasurementOnly measurement_only(const SubCovariance &cov) { return MeasurementOnly(cov); } + // template R without_measurement(T &&t); + +template T without_measurement(Measurement &&m) { + return m.value; +} +template const T &without_measurement(const Measurement &m) { + return m.value; +} + +template T without_measurement(T &&t) { return t; } +template const T &without_measurement(const T &t) { return t; } + } // namespace albatross #endif /* INCLUDE_ALBATROSS_SRC_COVARIANCE_FUNCTIONS_MEASUREMENT_HPP_ */ diff --git a/include/albatross/src/models/pic_gp.hpp b/include/albatross/src/models/pic_gp.hpp new file mode 100644 index 00000000..61534b6a --- /dev/null +++ b/include/albatross/src/models/pic_gp.hpp @@ -0,0 +1,1152 @@ +/* + * Copyright (C) 2024 Swift Navigation Inc. + * Contact: Swift Navigation + * + * This source is subject to the license found in the file 'LICENSE' which must + * be distributed together with this source. All other rights reserved. + * + * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, + * EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE. 
+ */ + +#ifndef INCLUDE_ALBATROSS_MODELS_PIC_GP_H_ +#define INCLUDE_ALBATROSS_MODELS_PIC_GP_H_ + +namespace albatross { + +template +class PICGaussianProcessRegression; + +template +struct PICGPFit {}; + +template +using PICGroupIndexType = + typename std::result_of::type; + +template struct PICGroup { + // Row indices are into the original set of features into the data + // used to form S. + Eigen::Index initial_row; + Eigen::Index block_size; + Eigen::Index block_index; + albatross::RegressionDataset dataset; +}; + +template +using PICGroupMap = std::map, + PICGroup>; + +template +struct Fit> { + using PermutationIndices = Eigen::Matrix; + using GroupIndexType = PICGroupIndexType; + using GroupMap = PICGroupMap; + + std::vector train_features; + std::vector inducing_features; + Eigen::SerializableLDLT train_covariance; + Eigen::MatrixXd sigma_R; + Eigen::PermutationMatrixX P; + std::vector mean_w; + Eigen::MatrixXd W; + std::vector covariance_Y; + Eigen::MatrixXd Z; + BlockDiagonalLDLT A_ldlt; + GroupMap measurement_groups; + Eigen::VectorXd information; + Eigen::Index numerical_rank; + std::vector> cols_Bs; + GrouperFunction grouper; + + Fit(){}; + + Fit(const std::vector &features_, + const std::vector &inducing_features_, + const Eigen::SerializableLDLT &train_covariance_, + const Eigen::MatrixXd &sigma_R_, const Eigen::PermutationMatrixX &P_, + const std::vector &mean_w_, const Eigen::MatrixXd &W_, + const std::vector &covariance_Y_, + const Eigen::MatrixXd &Z_, const BlockDiagonalLDLT &A_ldlt_, + const GroupMap &measurement_groups_, const Eigen::VectorXd &information_, + Eigen::Index numerical_rank_, + const std::vector> cols_Bs_, + GrouperFunction grouper_) + : train_features(features_), inducing_features(inducing_features_), + train_covariance(train_covariance_), sigma_R(sigma_R_), P(P_), + mean_w(mean_w_), W(W_), covariance_Y(covariance_Y_), Z(Z_), + A_ldlt(A_ldlt_), measurement_groups(measurement_groups_), + information(information_), numerical_rank(numerical_rank_), + cols_Bs(cols_Bs_), grouper(grouper_) {} + + void shift_mean(const Eigen::VectorXd &mean_shift) { + ALBATROSS_ASSERT(mean_shift.size() == information.size()); + information += train_covariance.solve(mean_shift); + } + + bool operator==( + const Fit> + &other) const { + return (train_features == other.train_features && + inducing_features == other.inducing_features && + train_covariance == other.train_covariance && + sigma_R == other.sigma_R && P.indices() == other.P.indices() && + mean_w == other.mean_w && W == W && + covariance_Y == other.covariance_Y && Z == other.Z && + A_ldlt == other.A_ldlt && + measurement_groups == other.measurement_groups && + information == other.information && + numerical_rank == other.numerical_rank && cols_Bs == other.cols_Bs); + } +}; + +/* + * This class implements an approximation technique for Gaussian processes + * which relies on an assumption that all observations are independent (or + * groups of observations are independent) conditional on a set of inducing + * points. 
The method is based off: + * + * [1] Sparse Gaussian Processes using Pseudo-inputs + * Edward Snelson, Zoubin Ghahramani + * http://www.gatsby.ucl.ac.uk/~snelson/SPGP_up.pdf + * + * Though the code uses notation closer to that used in this (excellent) + * overview of these methods: + * + * [2] A Unifying View of Sparse Approximate Gaussian Process Regression + * Joaquin Quinonero-Candela, Carl Edward Rasmussen + * http://www.jmlr.org/papers/volume6/quinonero-candela05a/quinonero-candela05a.pdf + * + * Very broadly speaking this method starts with a prior over the observations, + * + * [f] ~ N(0, K_ff) + * + * where K_ff(i, j) = covariance_function(features[i], features[j]) and f + * represents the function value. + * + * It then uses a set of inducing points, u, and makes some assumptions about + * the conditional distribution: + * + * [f|u] ~ N(K_fu K_uu^-1 u, K_ff - Q_ff) + * + * Where Q_ff = K_fu K_uu^-1 K_uf represents the variance in f that is + * explained by u. + * + * For FITC (Fully Independent Training Contitional) the assumption is that + * K_ff - Qff is diagonal, for PITC (Partially Independent Training Conditional) + * that it is block diagonal. These assumptions lead to an efficient way of + * inferring the posterior distribution for some new location f*, + * + * [f*|f=y] ~ N(K_*u S K_uf A^-1 y, K_** - Q_** + K_*u S K_u*) + * + * Where S = (K_uu + K_uf A^-1 K_fu)^-1 and A = diag(K_ff - Q_ff) and "diag" + * may mean diagonal or block diagonal. Regardless we end up with O(m^2n) + * complexity instead of O(n^3) of direct Gaussian processes. (Note that in [2] + * S is called sigma and A is lambda.) + * + * Of course, the implementation details end up somewhat more complex in order + * to improve numerical stability. Here we use an approach based off the QR + * decomposition which is described in + * + * Stable and Efficient Gaussian Process Calculations + * http://www.jmlr.org/papers/volume10/foster09a/foster09a.pdf + * + * A more detailed (but more likely to be out of date) description of + * the details can be found on the albatross documentation. A short + * description follows. It starts by setting up the Sparse Gaussian process + * covariances + * + * [f|u] ~ N(K_fu K_uu^-1 u, K_ff - Q_ff) + * + * We then set, + * + * A = K_ff - Q_ff + * = K_ff - K_fu K_uu^-1 K_uf + * + * which can be thought of as the covariance in the training data which + * is not be explained by the inducing points. The fundamental + * assumption in these sparse Gaussian processes is that A is sparse, in + * this case block diagonal. 
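+ *
+ * For example (schematically), if the grouper splits the training data
+ * into two independent groups g1 and g2, then
+ *
+ *   A = |A_g1   0 |
+ *       | 0   A_g2|
+ *
+ * with one dense block per group and zeros between groups.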
+ * + * We then build a matrix B and use its QR decomposition (with pivoting P) + * + * B = |A^-1/2 K_fu| = |Q_1| R P^T + * |K_uu^{T/2} | |Q_2| + * + * After which we can get the information vector (see _fit_impl) + * + * v = (K_uu + K_uf A^-1 K_fu)^-1 K_uf A^-1 y + * = (B^T B) B^T A^-1/2 y + * = P R^-1 Q_1^T A^-1/2 y + * + * and can make predictions for new locations (see _predict_impl), + * + * [f*|f=y] ~ N(K_*u S K_uf A^-1 y, K_** - Q_** + K_*u S K_u*) + * ~ N(m, C) + * + * where we have + * + * m = K_*u S K_uf A^-1 y + * = K_*u v + * + * and + * + * C = K_** - Q_** + K_*u S K_u* + * + * using + * + * Q_** = K_*u K_uu^-1 K_u* + * = (K_uu^{-1/2} K_u*)^T (K_uu^{-1/2} K_u*) + * = Q_sqrt^T Q_sqrt + * and + * + * K_*u S K_u* = K_*u (K_uu + K_uf A^-1 K_fu)^-1 K_u* + * = K_*u (B^T B)^-1 K_u* + * = K_*u (P R R^T P^T)^-1 K_u* + * = (P R^-T K_u*)^T (P R^-T K_u*) + * = S_sqrt^T S_sqrt + */ +template +class PICGaussianProcessRegression + : public GaussianProcessBase> { +public: + using Base = GaussianProcessBase< + CovFunc, MeanFunc, + PICGaussianProcessRegression>; + + template + using GroupIndexType = PICGroupIndexType; + + template + using GroupMap = PICGroupMap; + + PICGaussianProcessRegression() : Base() { initialize_params(); }; + + PICGaussianProcessRegression(const CovFunc &covariance_function, + const MeanFunc &mean_function) + : Base(covariance_function, mean_function) { + initialize_params(); + }; + PICGaussianProcessRegression(CovFunc &&covariance_function, + MeanFunc &&mean_function) + : Base(std::move(covariance_function), std::move(mean_function)) { + initialize_params(); + }; + + PICGaussianProcessRegression( + const CovFunc &covariance_function, const MeanFunc &mean_function, + const GrouperFunction &independent_group_function, + const InducingPointStrategy &inducing_point_strategy, + const std::string &model_name) + : Base(covariance_function, mean_function, model_name), + inducing_point_strategy_(inducing_point_strategy), + independent_group_function_(independent_group_function) { + initialize_params(); + }; + PICGaussianProcessRegression(CovFunc &&covariance_function, + MeanFunc &&mean_function, + GrouperFunction &&independent_group_function, + InducingPointStrategy &&inducing_point_strategy, + const std::string &model_name) + : Base(std::move(covariance_function), std::move(mean_function), + model_name), + inducing_point_strategy_(std::move(inducing_point_strategy)), + independent_group_function_(std::move(independent_group_function)) { + initialize_params(); + }; + + void initialize_params() { + measurement_nugget_ = { + details::DEFAULT_NUGGET, + LogScaleUniformPrior(PARAMETER_EPSILON, PARAMETER_MAX)}; + inducing_nugget_ = {details::DEFAULT_NUGGET, + LogScaleUniformPrior(PARAMETER_EPSILON, PARAMETER_MAX)}; + } + + ParameterStore get_params() const override { + auto params = map_join(this->mean_function_.get_params(), + this->covariance_function_.get_params()); + params[details::measurement_nugget_name()] = measurement_nugget_; + params[details::inducing_nugget_name()] = inducing_nugget_; + return params; + } + + void set_param(const std::string &name, const Parameter ¶m) override { + if (name == details::measurement_nugget_name()) { + measurement_nugget_ = param; + return; + } else if (name == details::inducing_nugget_name()) { + inducing_nugget_ = param; + return; + } + const bool success = set_param_if_exists_in_any( + name, param, &this->covariance_function_, &this->mean_function_); + ALBATROSS_ASSERT(success); + } + + template ::type, + GrouperFunction>::value>::type> + 
void update_grouper_function(NewGrouper &&f) { + independent_group_function_ = std::forward(f); + } + + template + auto + _update_impl(const Fit> &old_fit, + const std::vector &features, + const MarginalDistribution &targets) const { + assert(false && "Cannot call update() on a PIC GP yet!"); + + BlockDiagonalLDLT A_ldlt; + Eigen::SerializableLDLT K_uu_ldlt; + Eigen::MatrixXd K_fu; + Eigen::VectorXd y; + Eigen::SerializableLDLT K_PITC_ldlt; + compute_internal_components(old_fit.train_features, features, targets, + &A_ldlt, &K_uu_ldlt, &K_fu, &y, &K_PITC_ldlt); + + const Eigen::Index n_old = old_fit.sigma_R.rows(); + const Eigen::Index n_new = A_ldlt.rows(); + const Eigen::Index k = old_fit.sigma_R.cols(); + Eigen::MatrixXd B = Eigen::MatrixXd::Zero(n_old + n_new, k); + + ALBATROSS_ASSERT(n_old == k); + + // Form: + // B = |R_old P_old^T| = |Q_1| R P^T + // |A^{-1/2} K_fu| |Q_2| + for (Eigen::Index i = 0; i < old_fit.permutation_indices.size(); ++i) { + const Eigen::Index &pi = old_fit.permutation_indices.coeff(i); + B.col(pi).topRows(i + 1) = old_fit.sigma_R.col(i).topRows(i + 1); + } + B.bottomRows(n_new) = A_ldlt.sqrt_solve(K_fu); + const auto B_qr = QRImplementation::compute(B, Base::threads_.get()); + + // Form: + // y_aug = |R_old P_old^T v_old| + // |A^{-1/2} y | + ALBATROSS_ASSERT(old_fit.information.size() == n_old); + Eigen::VectorXd y_augmented(n_old + n_new); + for (Eigen::Index i = 0; i < old_fit.permutation_indices.size(); ++i) { + y_augmented[i] = + old_fit.information[old_fit.permutation_indices.coeff(i)]; + } + y_augmented.topRows(n_old) = + old_fit.sigma_R.template triangularView() * + y_augmented.topRows(n_old); + + y_augmented.bottomRows(n_new) = A_ldlt.sqrt_solve(y, Base::threads_.get()); + const Eigen::VectorXd v = B_qr->solve(y_augmented); + + Eigen::MatrixXd R = get_R(*B_qr); + if (B_qr->rank() < B_qr->cols()) { + // Inflate the diagonal of R in an attempt to avoid singularity + R.diagonal() += + Eigen::VectorXd::Constant(B_qr->cols(), details::cSparseRNugget); + } + using FitType = + Fit>; + return FitType(old_fit.train_features, old_fit.train_covariance, R, + get_P(*B_qr), v, B_qr->rank()); + } + + // Here we create the QR decomposition of: + // + // B = |A^-1/2 K_fu| = |Q_1| R P^T + // |K_uu^{T/2} | |Q_2| + // + // which corresponds to the inverse square root of Sigma + // + // Sigma = (B^T B)^-1 + std::unique_ptr + compute_sigma_qr(const Eigen::SerializableLDLT &K_uu_ldlt, + const BlockDiagonalLDLT &A_ldlt, + const Eigen::MatrixXd &K_fu) const { + Eigen::MatrixXd B(A_ldlt.rows() + K_uu_ldlt.rows(), K_uu_ldlt.rows()); + B.topRows(A_ldlt.rows()) = A_ldlt.sqrt_solve(K_fu); + B.bottomRows(K_uu_ldlt.rows()) = K_uu_ldlt.sqrt_transpose(); + return QRImplementation::compute(B, Base::threads_.get()); + }; + + template < + typename FeatureType, + std::enable_if_t< + has_call_operator::value, int> = 0> + auto _fit_impl(const std::vector &features, + const MarginalDistribution &targets) const { + // Determine the set of inducing points, u. 
+ const auto u = + inducing_point_strategy_(this->covariance_function_, features); + ALBATROSS_ASSERT(u.size() > 0 && "Empty inducing points!"); + + BlockDiagonalLDLT A_ldlt; + Eigen::SerializableLDLT K_uu_ldlt; + Eigen::MatrixXd K_fu; + Eigen::VectorXd y; + GroupMap measurement_groups; + compute_internal_components(u, features, targets, &A_ldlt, &K_uu_ldlt, + &K_fu, &y, &measurement_groups); + auto B_qr = compute_sigma_qr(K_uu_ldlt, A_ldlt, K_fu); + + // To make a prediction, we will need to compute cross-terms with + // sub-blocks of S^-1, where + // + // S^-1 = A^-1 - A^-1 K_fu (K_uu + K_uf A^-1 K_fu)^-1 K_uf A^-1 + // \____________________________________________/ + // Z^T Z + // + // The inverse part of the right-hand term of this is B_qr above, + // and we already have A_ldlt, so we can form Z. + // + // Now to compute sub-blocks of S^-1, we just index into Z^T Z + // appropriately, and if the term is diagonal, also solve using + // the blocks of A^-1. + const Eigen::MatrixXd Z = sqrt_solve(*B_qr, A_ldlt.solve(K_fu).transpose()); + + Eigen::VectorXd y_augmented = Eigen::VectorXd::Zero(B_qr->rows()); + y_augmented.topRows(y.size()) = A_ldlt.sqrt_solve(y, Base::threads_.get()); + const Eigen::VectorXd v = B_qr->solve(y_augmented); + + using InducingPointFeatureType = typename std::decay::type; + + const Eigen::MatrixXd W = K_uu_ldlt.solve( + (A_ldlt.solve(K_fu) - Z.transpose() * Z * K_fu).transpose()); + + // TODO(@peddie): a lot of this can be batched + std::vector covariance_Y(measurement_groups.size()); + std::vector covariance_Ynot(measurement_groups.size()); + std::vector mean_w(measurement_groups.size()); + std::vector> cols_Bs(measurement_groups.size()); + std::vector> cols_Cs(measurement_groups.size()); + const auto block_start_indices = A_ldlt.block_to_row_map(); + + const auto precompute_block = [&](std::size_t block_number, + Eigen::Index start_row) -> void { + const Eigen::Index block_size = A_ldlt.blocks[block_number].rows(); + // K_fu is already computed, so we can form K_uB and K_uA by + // appropriate use of sparse indexing matrices and avoid an O(N + // M) operation. + // + // This nonsense would be a lot more straightforward with Eigen + // 3.4's slicing and indexing API. 
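+ //
+ // cols_B is a (features.size() x block_size) selector matrix with a
+ // single 1 per column, so e.g. K_fu.transpose() * cols_B picks out the
+ // columns of K_uf belonging to this block; cols_C selects the
+ // complementary columns (everything outside the block).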
+ Eigen::SparseMatrix cols_B(cast::to_index(features.size()), block_size); + cols_B.reserve(block_size); + Eigen::SparseMatrix cols_C(cast::to_index(features.size()), + cast::to_index(features.size()) - block_size); + cols_C.reserve(cast::to_index(features.size()) - block_size); + + for (Eigen::Index i = 0; i < block_size; ++i) { + cols_B.insert(start_row + i, i) = 1.; + } + cols_B.makeCompressed(); + + for (Eigen::Index i = 0, j = 0; i < cast::to_index(features.size());) { + if (i == start_row) { + i += block_size; + continue; + } + cols_C.insert(i, j) = 1.; + i++; + j++; + } + cols_C.makeCompressed(); + + Eigen::Index col = 0; + for (Eigen::Index k = 0; k < cols_C.outerSize(); ++k) { + for (decltype(cols_C)::InnerIterator it(cols_C, k); it; ++it) { + assert(it.col() == col++); + } + } + + cols_Bs[block_number] = cols_B; + // v_b \in R^b = A_BB^-1 (y_b - K_Bu v) + Eigen::MatrixXd ydiff_b = cols_B.transpose() * (y - K_fu * v); + const Eigen::MatrixXd mean_w_full = + A_ldlt.blocks[block_number].solve(ydiff_b); + mean_w[block_number] = mean_w_full; + // Y \in R^(u x b) = K_uu^-1 K_uB + covariance_Y[block_number] = K_uu_ldlt.solve(K_fu.transpose() * cols_B); + }; + + apply_map(block_start_indices, precompute_block, Base::threads_.get()); + + using FitType = + Fit>; + return FitType(features, u, K_uu_ldlt, get_R(*B_qr), get_P(*B_qr), mean_w, + W, covariance_Y, Z, A_ldlt, measurement_groups, v, + B_qr->rank(), cols_Bs, independent_group_function_); + } + + template + auto fit_from_prediction(const std::vector &new_inducing_points, + const JointDistribution &prediction_) const { + FitModel>> + output(*this, + Fit>()); + Fit> &new_fit = + output.get_fit(); + + new_fit.train_features = new_inducing_points; + + const Eigen::MatrixXd K_zz = + this->covariance_function_(new_inducing_points, Base::threads_.get()); + new_fit.train_covariance = Eigen::SerializableLDLT(K_zz); + + // We're going to need to take the sqrt of the new covariance which + // could be extremely small, so here we add a small nugget to avoid + // numerical instability + JointDistribution prediction(prediction_); + prediction.covariance.diagonal() += Eigen::VectorXd::Constant( + cast::to_index(prediction.size()), 1, details::DEFAULT_NUGGET); + new_fit.information = new_fit.train_covariance.solve(prediction.mean); + + // Here P is the posterior covariance at the new inducing points. If + // we consider the case where we rebase and then use the resulting fit + // to predict the new inducing points then we see that the predictive + // covariance (see documentation above) would be,: + // + // C = K_zz - Q_zz + K_zz Sigma K_zz + // + // We can use this, knowing that at the inducing points K_zz = Q_zz, to + // derive our updated Sigma, + // + // C = K_zz - K_zz + K_zz Sigma K_zz + // C = K_zz Sigma K_zz + // Sigma = K_zz^-1 C K_zz^-1 + // + // And since we need to store Sigma in sqrt form we get, + // + // Sigma = (B_z^T B_z)^-1 + // = K_zz^-1 C K_zz^-1 + // + // So by setting: + // + // B_z = C^{-1/2} K_z + // + // We can then compute and store the QR decomposition of B + // as we do in a normal fit. + const Eigen::SerializableLDLT C_ldlt(prediction.covariance); + const Eigen::MatrixXd sigma_inv_sqrt = C_ldlt.sqrt_solve(K_zz); + const auto B_qr = QRImplementation::compute(sigma_inv_sqrt, nullptr); + + new_fit.P = get_P(*B_qr); + new_fit.sigma_R = get_R(*B_qr); + new_fit.numerical_rank = B_qr->rank(); + + return output; + } + + // This is included to allow the SparseGP + // to be compatible with fits generated + // using a standard GP. 
+ using Base::_predict_impl; + + template + Eigen::VectorXd _predict_impl( + const std::vector &features, + const Fit> + &sparse_gp_fit, + PredictTypeIdentity &&) const { + const auto find_group = [this, &sparse_gp_fit](const auto &feature) { + const auto group = sparse_gp_fit.measurement_groups.find( + independent_group_function_(without_measurement(feature))); + if (group == sparse_gp_fit.measurement_groups.end()) { + std::cerr << "Group mapping failure for feature '" + << without_measurement(feature) << "' (group index '" + << independent_group_function_(without_measurement(feature)) + << "')!" << std::endl; + assert(group != sparse_gp_fit.measurement_groups.end() && + "TODO(@peddie): the group function in a PIC GP model must " + "cover the entire feature domain in any fit."); + } + return group; + }; + const Eigen::MatrixXd K_up = + this->covariance_function_(sparse_gp_fit.inducing_features, features); + Eigen::VectorXd mean_correction = Eigen::VectorXd::Zero(features.size()); + for (Eigen::Index j = 0; j < features.size(); ++j) { + const auto group = find_group(features[j]); + const std::vector fvec = {features[j]}; + + const Eigen::VectorXd features_cov = + this->covariance_function_(group->second.dataset.features, fvec); + const Eigen::VectorXd kpuy = + K_up.transpose().row(j) * + sparse_gp_fit.covariance_Y[group->second.block_index]; + const Eigen::VectorXd Vbp = features_cov - kpuy; + mean_correction[j] = + Vbp.dot(sparse_gp_fit.mean_w[group->second.block_index]); + } + + Eigen::VectorXd mean = + K_up.transpose() * sparse_gp_fit.information + mean_correction; + + this->mean_function_.add_to(features, &mean); + return mean; + } + + template + MarginalDistribution _predict_impl( + const std::vector &features, + const Fit> + &sparse_gp_fit, + PredictTypeIdentity &&) const { + // std::cout << "_predict_impl() for MarginalDistribution" << std::endl; + const auto K_up = + this->covariance_function_(sparse_gp_fit.inducing_features, features); + Eigen::VectorXd mean = gp_mean_prediction(K_up, sparse_gp_fit.information); + this->mean_function_.add_to(features, &mean); + + Eigen::MatrixXd WV = Eigen::MatrixXd::Zero( + sparse_gp_fit.inducing_features.size(), features.size()); + + const auto find_group = [this, &sparse_gp_fit](const auto &feature) { + const auto group = sparse_gp_fit.measurement_groups.find( + independent_group_function_(without_measurement(feature))); + if (group == sparse_gp_fit.measurement_groups.end()) { + std::cerr << "Group mapping failure for feature '" + << without_measurement(feature) << "' (group index '" + << independent_group_function_(without_measurement(feature)) + << "')!" 
<< std::endl; + assert(group != sparse_gp_fit.measurement_groups.end() && + "TODO(@peddie): the group function in a PIC GP model must " + "cover the entire feature domain in any fit."); + } + return group; + }; + + Eigen::VectorXd mean_correction = Eigen::VectorXd::Zero(features.size()); + std::vector feature_to_block; + for (Eigen::Index j = 0; j < features.size(); ++j) { + const auto group = find_group(features[j]); + feature_to_block.push_back( + std::distance(sparse_gp_fit.measurement_groups.begin(), group)); + const std::vector fvec = {features[j]}; + + const Eigen::VectorXd features_cov = + this->covariance_function_(group->second.dataset.features, fvec); + const Eigen::VectorXd kpuy = + K_up.transpose().row(j) * + sparse_gp_fit.covariance_Y[group->second.block_index]; + const Eigen::VectorXd Vbp = features_cov - kpuy; + mean_correction[j] = + Vbp.dot(sparse_gp_fit.mean_w[group->second.block_index]); + const Eigen::VectorXd wvj = + sparse_gp_fit.W * sparse_gp_fit.cols_Bs[feature_to_block[j]] * Vbp; + WV.col(j) = wvj; + } + + Eigen::VectorXd VSV_diag(features.size()); + + for (Eigen::Index row = 0; row < VSV_diag.size(); ++row) { + assert(row < feature_to_block.size()); + const auto row_group = find_group(features[row]); + // TODO(@peddie): these are K, not V! + const Eigen::VectorXd Q_row_p = + K_up.transpose().row(row) * + sparse_gp_fit.covariance_Y[row_group->second.block_index]; + const std::vector row_fvec = {features[row]}; + const Eigen::VectorXd V_row_p = + this->covariance_function_(row_group->second.dataset.features, + row_fvec) - + Q_row_p; + + Eigen::VectorXd xi_a = sparse_gp_fit.A_ldlt.blocks[feature_to_block[row]] + .sqrt_solve(V_row_p) + .col(0); + + Eigen::VectorXd xi_z = + sparse_gp_fit.Z.block(0, row_group->second.initial_row, + sparse_gp_fit.Z.rows(), + row_group->second.block_size) * + V_row_p; + + VSV_diag(row) = xi_a.dot(xi_a) - xi_z.dot(xi_z); + } + + const Eigen::VectorXd U_diag = (K_up.transpose() * WV).diagonal(); + + Eigen::VectorXd marginal_variance(cast::to_index(features.size())); + for (Eigen::Index i = 0; i < marginal_variance.size(); ++i) { + marginal_variance[i] = this->covariance_function_( + features[cast::to_size(i)], features[cast::to_size(i)]); + } + + const Eigen::MatrixXd Q_sqrt = + sparse_gp_fit.train_covariance.sqrt_solve(K_up); + const Eigen::VectorXd Q_diag = + Q_sqrt.cwiseProduct(Q_sqrt).array().colwise().sum(); + marginal_variance -= Q_diag; + + const Eigen::MatrixXd S_sqrt = + sqrt_solve(sparse_gp_fit.sigma_R, sparse_gp_fit.P, K_up); + const Eigen::VectorXd S_diag = + S_sqrt.cwiseProduct(S_sqrt).array().colwise().sum(); + marginal_variance += S_diag; + + mean += mean_correction; + + return MarginalDistribution(mean, + marginal_variance - (2 * U_diag + VSV_diag)); + } + + template + JointDistribution _predict_impl( + const std::vector &features, + const Fit> + &sparse_gp_fit, + PredictTypeIdentity &&) const { + const auto find_group = [this, &sparse_gp_fit](const auto &feature) { + const auto group = sparse_gp_fit.measurement_groups.find( + independent_group_function_(without_measurement(feature))); + if (group == sparse_gp_fit.measurement_groups.end()) { + std::cerr << "Group mapping failure for feature '" + << without_measurement(feature) << "' (group index '" + << independent_group_function_(without_measurement(feature)) + << "')!" 
<< std::endl; + assert(group != sparse_gp_fit.measurement_groups.end() && + "TODO(@peddie): the group function in a PIC GP model must " + "cover the entire feature domain in any fit."); + } + return group; + }; + + // K_up + const Eigen::MatrixXd K_up = + this->covariance_function_(sparse_gp_fit.inducing_features, features); + const Eigen::MatrixXd prior_cov = this->covariance_function_(features); + + Eigen::MatrixXd WV = Eigen::MatrixXd::Zero( + cast::to_index(sparse_gp_fit.inducing_features.size()), cast::to_index(features.size())); + + Eigen::VectorXd mean_correction = Eigen::VectorXd::Zero(cast::to_index(features.size())); + std::vector feature_to_block; + for (Eigen::Index j = 0; j < cast::to_index(features.size()); ++j) { + const auto group = find_group(features[cast::to_size(j)]); + feature_to_block.push_back(cast::to_size( + std::distance(sparse_gp_fit.measurement_groups.begin(), group))); + const std::vector fvec = {features[cast::to_size(j)]}; + + const Eigen::VectorXd features_cov = + this->covariance_function_(group->second.dataset.features, fvec); + const Eigen::VectorXd kpuy = + K_up.transpose().row(j) * + sparse_gp_fit.covariance_Y[cast::to_size(group->second.block_index)]; + const Eigen::VectorXd Vbp = features_cov - kpuy; + mean_correction[j] = + Vbp.dot(sparse_gp_fit.mean_w[cast::to_size(group->second.block_index)]); + const Eigen::VectorXd wvj = + sparse_gp_fit.W * sparse_gp_fit.cols_Bs[feature_to_block[cast::to_size(j)]] * Vbp; + WV.col(j) = wvj; + } + + Eigen::MatrixXd VSV(features.size(), features.size()); + + for (std::size_t row = 0; row < features.size(); ++row) { + for (std::size_t col = 0; col <= row; ++col) { + Eigen::Index row_index = cast::to_index(row); + Eigen::Index col_index = cast::to_index(col); + const auto row_group = find_group(features[row]); + const auto column_group = find_group(features[col]); + // TODO(@peddie): these are K, not V! 
+ const Eigen::VectorXd Q_row_p = + K_up.transpose().row(row_index) * + sparse_gp_fit.covariance_Y[cast::to_size(row_group->second.block_index)]; + const std::vector row_fvec = {features[row]}; + const Eigen::VectorXd V_row_p = + this->covariance_function_(row_group->second.dataset.features, + row_fvec) - + Q_row_p; + const Eigen::VectorXd Q_column_p = + K_up.transpose().row(col_index) * + sparse_gp_fit.covariance_Y[cast::to_size(column_group->second.block_index)]; + const std::vector column_fvec = {features[col]}; + const Eigen::VectorXd V_column_p = + this->covariance_function_(column_group->second.dataset.features, + column_fvec) - + Q_column_p; + + VSV(row_index, col_index) = 0.; + + assert(row < feature_to_block.size()); + assert(col < feature_to_block.size()); + if (feature_to_block[row] == feature_to_block[col]) { + VSV(row_index, col_index) = + sparse_gp_fit.A_ldlt.blocks[feature_to_block[row]] + .sqrt_solve(V_row_p) + .col(0) + .dot(sparse_gp_fit.A_ldlt.blocks[feature_to_block[col]] + .sqrt_solve(V_column_p) + .col(0)); + } + + const Eigen::MatrixXd rowblock = sparse_gp_fit.Z.block( + 0, row_group->second.initial_row, sparse_gp_fit.Z.rows(), + row_group->second.block_size); + const Eigen::MatrixXd columnblock = sparse_gp_fit.Z.block( + 0, column_group->second.initial_row, sparse_gp_fit.Z.rows(), + column_group->second.block_size); + + VSV(row_index, col_index) -= (rowblock * V_row_p).dot(columnblock * V_column_p); + } + } + + VSV.triangularView() = VSV.transpose(); + + const Eigen::MatrixXd U = K_up.transpose() * WV; + + const Eigen::MatrixXd S_sqrt = + sqrt_solve(sparse_gp_fit.sigma_R, sparse_gp_fit.P, K_up); + + const Eigen::MatrixXd Q_sqrt = + sparse_gp_fit.train_covariance.sqrt_solve(K_up); + + const Eigen::MatrixXd max_explained = Q_sqrt.transpose() * Q_sqrt; + const Eigen::MatrixXd unexplained = S_sqrt.transpose() * S_sqrt; + + const Eigen::MatrixXd pitc_covariance = + prior_cov - max_explained + unexplained; + + const Eigen::MatrixXd pic_correction = U + U.transpose() + VSV; + + JointDistribution pred(K_up.transpose() * sparse_gp_fit.information + + mean_correction, + pitc_covariance - pic_correction); + + this->mean_function_.add_to(features, &pred.mean); + + // Debug comparison + + // Eigen::MatrixXd K_pic(sparse_gp_fit.train_features.size(), + // features.size()); for (std::size_t i = 0; i < features.size(); ++i) { + // const auto group = find_group(features[i]); + // K_pic.col(i) = sparse_gp_fit.cols_Cs[feature_to_block[i]] * + // K_up.transpose() * + // sparse_gp_fit.covariance_Ynot[feature_to_block[i]]; + // std::cout << "Ynot: K_pic.col(" << i << "): " << + // K_pic.col(i).transpose() + // << std::endl; + // K_pic.col(i).segment(group->second.initial_row, + // group->second.block_size) = + // this->covariance_function_(std::vector{features[i]}, + // group->second.dataset.features); + // std::cout << "K*: K_pic.col(" << i << "): " << K_pic.col(i).transpose() + // << std::endl; + // } + + // std::cout << "K_pic (" << K_pic.rows() << "x" << K_pic.cols() << "):\n" + // << K_pic << std::endl; + + // const Eigen::MatrixXd K_pic_pitc = + // sparse_gp_fit.K_PITC_ldlt.sqrt_solve(K_pic); + // JointDistribution alt_pred(K_up.transpose() * sparse_gp_fit.information, + // prior_cov - K_pic_pitc.transpose() * + // K_pic_pitc); + + // std::cout << "alt covariance (" << alt_pred.covariance.rows() << "x" + // << alt_pred.covariance.cols() << "):\n" + // << alt_pred.covariance << std::endl; + + return pred; + } + + template + double log_likelihood(const RegressionDataset &dataset) 
const { + const auto u = + inducing_point_strategy_(this->covariance_function_, dataset.features); + + BlockDiagonalLDLT A_ldlt; + Eigen::SerializableLDLT K_uu_ldlt; + Eigen::MatrixXd K_fu; + Eigen::VectorXd y; + compute_internal_components(u, dataset.features, dataset.targets, &A_ldlt, + &K_uu_ldlt, &K_fu, &y); + const auto B_qr = compute_sigma_qr(K_uu_ldlt, A_ldlt, K_fu); + // The log likelihood for y ~ N(0, K) is: + // + // L = 1/2 (n log(2 pi) + log(|K|) + y^T K^-1 y) + // + // where in our case we have + // K = A + Q_ff + // and + // Q_ff = K_fu K_uu^-1 K_uf + // + // First we get the determinant, |K|: + // + // |K| = |A + Q_ff| + // = |K_uu + K_uf A^-1 K_fu| |K_uu^-1| |A| + // = |P R^T Q^T Q R P^T| |K_uu^-1| |A| + // = |R^T R| |K_uu^-1| |A| + // = |R|^2 |A| / |K_uu| + // + // Where the first equality comes from the matrix determinant lemma. + // https://en.wikipedia.org/wiki/Matrix_determinant_lemma#Generalization + // + // After which we can take the log: + // + // log(|K|) = 2 log(|R|) + log(|A|) - log(|K_uu|) + // + const double log_det_a = A_ldlt.log_determinant(); + + const double log_det_r = + B_qr->matrixR().diagonal().array().cwiseAbs().log().sum(); + const double log_det_K_uu = K_uu_ldlt.log_determinant(); + const double log_det = log_det_a + 2 * log_det_r - log_det_K_uu; + + // q = y^T K^-1 y + // = y^T (A + Q_ff)^-1 y + // = y^T (A^-1 - A^-1 K_fu (K_uu + K_uf A^-1 K_fu)^-1 K_uf A^-1) y + // = y^T A^-1 y - y^T A^-1 K_fu (K_uu + K_uf A^-1 K_fu)^-1 K_uf A^-1) y + // = y^T A^-1 y - y^T A^-1 K_fu (R^T R)^-1 K_uf A^-1) y + // = y^T A^-1 y - (R^-T K_uf A^-1 y)^T (R^-T K_uf A^-1 y) + // = y^T y_a - y_b^T y_b + // + // with y_b = R^-T K_uf y_a + const Eigen::VectorXd y_a = A_ldlt.solve(y); + + Eigen::VectorXd y_b = K_fu.transpose() * y_a; + y_b = sqrt_solve(*B_qr, y_b); + + double log_quadratic = y.transpose() * y_a; + log_quadratic -= y_b.transpose() * y_b; + + const double rank = cast::to_double(y.size()); + const double log_dimension = rank * log(2 * M_PI); + + return -0.5 * (log_det + log_quadratic + log_dimension) + + this->prior_log_likelihood(); + } + + InducingPointStrategy get_inducing_point_strategy() const { + return inducing_point_strategy_; + } + +private: + // This method takes care of a lot of the common book keeping required to + // setup the Sparse Gaussian Process problem. Namely, we want to get from + // possibly unordered features to a structured representation + // in the form K_ff = A_ff + Q_ff where Q_ff = K_fu K_uu^-1 K_uf and + // A_ff is block diagonal and is formed by subtracting Q_ff from K_ff. + // + template + void compute_internal_components( + const std::vector &inducing_features, + const std::vector &out_of_order_features, + const MarginalDistribution &out_of_order_targets, + BlockDiagonalLDLT *A_ldlt, Eigen::SerializableLDLT *K_uu_ldlt, + Eigen::MatrixXd *K_fu, Eigen::VectorXd *y, + GroupMap *measurement_groups) const { + + ALBATROSS_ASSERT(A_ldlt != nullptr); + ALBATROSS_ASSERT(K_uu_ldlt != nullptr); + ALBATROSS_ASSERT(K_fu != nullptr); + ALBATROSS_ASSERT(y != nullptr); + + const auto indexer = + group_by(out_of_order_features, independent_group_function_).indexers(); + + const auto out_of_order_measurement_features = + as_measurements(out_of_order_features); + + std::vector reordered_inds; + BlockDiagonal K_ff; + Eigen::Index begin_row_index = 0; + Eigen::Index block_index = 0; + // TODO(@peddie): compute these blocks asynchronously? 
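+ //
+ // For each group: remember where its rows begin in the reordered
+ // problem, stash the group's dataset for prediction-time lookups, and
+ // form the corresponding dense diagonal block of K_ff (with the
+ // measurement noise variance added to its diagonal).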
+    for (const auto &pair : indexer) {
+      reordered_inds.insert(reordered_inds.end(), pair.second.begin(),
+                            pair.second.end());
+      measurement_groups->operator[](pair.first) = PICGroup{
+          begin_row_index,
+          static_cast<Eigen::Index>(pair.second.size()),
+          block_index,
+          albatross::RegressionDataset<FeatureType>(
+              subset(out_of_order_features, pair.second),
+              out_of_order_targets.subset(pair.second))};
+      // measurement_groups->emplace(
+      //     std::piecewise_construct, std::forward_as_tuple(pair.first),
+      //     std::forward_as_tuple(
+      //         begin_row_index,
+      //         static_cast<Eigen::Index>(pair.second.size()), block_index,
+      //         albatross::RegressionDataset<FeatureType>(
+      //             subset(out_of_order_features, pair.second),
+      //             out_of_order_targets.subset(pair.second))));
+      auto subset_features =
+          subset(out_of_order_measurement_features, pair.second);
+      K_ff.blocks.emplace_back(this->covariance_function_(subset_features));
+      K_ff.blocks.back().diagonal() +=
+          subset(out_of_order_targets.covariance.diagonal(), pair.second);
+      begin_row_index += cast::to_index(pair.second.size());
+      ++block_index;
+    }
+
+    const auto features =
+        subset(out_of_order_measurement_features, reordered_inds);
+    auto targets = subset(out_of_order_targets, reordered_inds);
+
+    // Remove the mean function before storing the (zero-mean) targets in y.
+    this->mean_function_.remove_from(
+        subset(out_of_order_features, reordered_inds), &targets.mean);
+    *y = targets.mean;
+
+    *K_fu = this->covariance_function_(features, inducing_features,
+                                       Base::threads_.get());
+
+    auto K_uu =
+        this->covariance_function_(inducing_features, Base::threads_.get());
+
+    K_uu.diagonal() +=
+        inducing_nugget_.value * Eigen::VectorXd::Ones(K_uu.rows());
+
+    *K_uu_ldlt = K_uu.ldlt();
+    // P is such that:
+    //     Q_ff = K_fu K_uu^-1 K_uf
+    //          = K_fu K_uu^-T/2 K_uu^-1/2 K_uf
+    //          = P^T P
+    const Eigen::MatrixXd P = K_uu_ldlt->sqrt_solve(K_fu->transpose());
+
+    // We only need the diagonal blocks of Q_ff to get A.
+    BlockDiagonal Q_ff_diag;
+    Eigen::Index i = 0;
+    for (const auto &pair : indexer) {
+      Eigen::Index cols = cast::to_index(pair.second.size());
+      auto P_cols = P.block(0, i, P.rows(), cols);
+      Q_ff_diag.blocks.emplace_back(P_cols.transpose() * P_cols);
+      i += cols;
+    }
+    auto A = K_ff - Q_ff_diag;
+
+    // It's possible that the inducing points will perfectly describe
+    // some of the data, in which case we need to add a bit of extra
+    // noise to make sure lambda is invertible.
+    for (auto &b : A.blocks) {
+      b.diagonal() +=
+          measurement_nugget_.value * Eigen::VectorXd::Ones(b.rows());
+    }
+
+    *A_ldlt = A.ldlt();
+
+    // const Eigen::MatrixXd K_uuuf = K_uu_ldlt->sqrt_solve(K_fu->transpose());
+    // Eigen::MatrixXd Kfuf = K_uuuf.transpose() * K_uuuf;
+    // Eigen::Index row_offset = 0;
+    // Eigen::Index col_offset = 0;
+    // for (auto &b : A.blocks) {
+    //   Kfuf.block(row_offset, col_offset, b.rows(), b.cols()) += b;
+    //   row_offset += b.rows();
+    //   col_offset += b.cols();
+    // }
+
+    // // std::cout << "Kfuf (" << Kfuf.rows() << "x" << Kfuf.cols() << "):\n"
+    // //           << Kfuf.format(Eigen::FullPrecision) << std::endl;
+
+    // *K_PITC_ldlt = Kfuf.ldlt();
+  }
+
+  Parameter measurement_nugget_;
+  Parameter inducing_nugget_;
+  InducingPointStrategy inducing_point_strategy_;
+  GrouperFunction independent_group_function_;
+};
+
+// rebase_inducing_points takes a sparse GP which was fit using some set of
+// inducing points and creates a new fit relative to new inducing points.
+// Note that this will NOT be equivalent to having fit the model with
+// the new inducing points, since some information may have been lost in
+// the process.
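+//
+// A minimal usage sketch (the model, dataset, and inducing-point names here
+// are hypothetical):
+//
+//   const auto fit_model = pic.fit(dataset);
+//   const auto rebased =
+//       rebase_inducing_points(fit_model, new_inducing_points);
+//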
+template <typename ModelType, typename FitType, typename NewFeatureType>
+auto rebase_inducing_points(
+    const FitModel<ModelType, FitType> &fit_model,
+    const std::vector<NewFeatureType> &new_inducing_points) {
+  return fit_model.get_model().fit_from_prediction(
+      new_inducing_points, fit_model.predict(new_inducing_points).joint());
+}
+
+template <typename CovFunc, typename MeanFunc, typename GrouperFunction,
+          typename InducingPointStrategy>
+using SparseQRPicGaussianProcessRegression =
+    PICGaussianProcessRegression<CovFunc, MeanFunc, GrouperFunction,
+                                 InducingPointStrategy, SPQRImplementation>;
+
+template <typename CovFunc, typename MeanFunc, typename GrouperFunction,
+          typename InducingPointStrategy,
+          typename QRImplementation = DenseQRImplementation>
+auto pic_gp_from_covariance_and_mean(
+    CovFunc &&covariance_function, MeanFunc &&mean_function,
+    GrouperFunction &&grouper_function, InducingPointStrategy &&strategy,
+    const std::string &model_name,
+    QRImplementation qr __attribute__((unused)) = DenseQRImplementation{}) {
+  return PICGaussianProcessRegression<
+      typename std::decay<CovFunc>::type, typename std::decay<MeanFunc>::type,
+      typename std::decay<GrouperFunction>::type,
+      typename std::decay<InducingPointStrategy>::type,
+      typename std::decay<QRImplementation>::type>(
+      std::forward<CovFunc>(covariance_function),
+      std::forward<MeanFunc>(mean_function),
+      std::forward<GrouperFunction>(grouper_function),
+      std::forward<InducingPointStrategy>(strategy), model_name);
+};
+
+template <typename CovFunc, typename MeanFunc, typename GrouperFunction,
+          typename QRImplementation = DenseQRImplementation>
+auto pic_gp_from_covariance_and_mean(
+    CovFunc &&covariance_function, MeanFunc &&mean_function,
+    GrouperFunction &&grouper_function, const std::string &model_name,
+    QRImplementation qr = DenseQRImplementation{}) {
+  return pic_gp_from_covariance_and_mean(
+      std::forward<CovFunc>(covariance_function),
+      std::forward<MeanFunc>(mean_function),
+      std::forward<GrouperFunction>(grouper_function),
+      StateSpaceInducingPointStrategy(), model_name, qr);
+};
+
+template <typename CovFunc, typename GrouperFunction,
+          typename InducingPointStrategy,
+          typename QRImplementation = DenseQRImplementation>
+auto pic_gp_from_covariance(CovFunc &&covariance_function,
+                            GrouperFunction &&grouper_function,
+                            InducingPointStrategy &&strategy,
+                            const std::string &model_name,
+                            QRImplementation qr = DenseQRImplementation{}) {
+  return pic_gp_from_covariance_and_mean(
+      std::forward<CovFunc>(covariance_function), ZeroMean(),
+      std::forward<GrouperFunction>(grouper_function),
+      std::forward<InducingPointStrategy>(strategy), model_name, qr);
+};
+
+template <typename CovFunc, typename GrouperFunction,
+          typename QRImplementation = DenseQRImplementation>
+auto pic_gp_from_covariance(CovFunc covariance_function,
+                            GrouperFunction grouper_function,
+                            const std::string &model_name,
+                            QRImplementation qr = DenseQRImplementation{}) {
+  return pic_gp_from_covariance_and_mean<
+      CovFunc, decltype(ZeroMean()), GrouperFunction,
+      decltype(StateSpaceInducingPointStrategy()), QRImplementation>(
+      std::forward<CovFunc>(covariance_function), ZeroMean(),
+      std::forward<GrouperFunction>(grouper_function),
+      StateSpaceInducingPointStrategy(), model_name, qr);
+};
+
+} // namespace albatross
+
+#endif /* INCLUDE_ALBATROSS_MODELS_PIC_GP_H_ */
diff --git a/include/albatross/src/models/sparse_common.hpp b/include/albatross/src/models/sparse_common.hpp
new file mode 100644
index 00000000..b5267b99
--- /dev/null
+++ b/include/albatross/src/models/sparse_common.hpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2024 Swift Navigation Inc.
+ * Contact: Swift Navigation
+ *
+ * This source is subject to the license found in the file 'LICENSE' which must
+ * be distributed together with this source. All other rights reserved.
+ *
+ * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND,
+ * EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#ifndef INCLUDE_ALBATROSS_MODELS_SPARSE_COMMON_H_
+#define INCLUDE_ALBATROSS_MODELS_SPARSE_COMMON_H_
+
+namespace albatross {
+
+namespace details {
+
+constexpr double DEFAULT_NUGGET = 1e-8;
+
+inline std::string measurement_nugget_name() { return "measurement_nugget"; }
+
+inline std::string inducing_nugget_name() { return "inducing_nugget"; }
+
+static constexpr double cSparseRNugget = 1.e-10;
+
+} // namespace details
+
+struct UniformlySpacedInducingPoints {
+
+  UniformlySpacedInducingPoints(std::size_t num_points_ = 10)
+      : num_points(num_points_) {}
+
+  template <typename CovarianceFunction>
+  std::vector<double> operator()(const CovarianceFunction &cov ALBATROSS_UNUSED,
+                                 const std::vector<double> &features) const {
+    double min = *std::min_element(features.begin(), features.end());
+    double max = *std::max_element(features.begin(), features.end());
+    return linspace(min, max, num_points);
+  }
+
+  std::size_t num_points;
+};
+
+struct StateSpaceInducingPointStrategy {
+
+  template <typename CovarianceFunction, typename FeatureType,
+            std::enable_if_t<has_valid_state_space_representation<
+                                 CovarianceFunction, FeatureType>::value,
+                             int> = 0>
+  auto operator()(const CovarianceFunction &cov,
+                  const std::vector<FeatureType> &features) const {
+    return cov.state_space_representation(features);
+  }
+
+  template <typename CovarianceFunction, typename FeatureType,
+            std::enable_if_t<!has_valid_state_space_representation<
+                                 CovarianceFunction, FeatureType>::value,
+                             int> = 0>
+  auto
+  operator()(const CovarianceFunction &cov ALBATROSS_UNUSED,
+             const std::vector<FeatureType> &features ALBATROSS_UNUSED) const
+      ALBATROSS_FAIL(
+          CovarianceFunction,
+          "Covariance function is missing state_space_representation method, "
+          "be sure _ssr_impl has been defined for the types concerned");
+};
+
+struct SPQRImplementation {
+  using QRType = Eigen::SPQR<Eigen::SparseMatrix<double>>;
+
+  static std::unique_ptr<QRType> compute(const Eigen::MatrixXd &m,
+                                         ThreadPool *threads) {
+    return SPQR_create(m.sparseView(), threads);
+  }
+};
+
+struct DenseQRImplementation {
+  using QRType = Eigen::ColPivHouseholderQR<Eigen::MatrixXd>;
+
+  static std::unique_ptr<QRType> compute(const Eigen::MatrixXd &m,
+                                         ThreadPool *threads
+                                         __attribute__((unused))) {
+    return std::make_unique<QRType>(m);
+  }
+};
+
+} // namespace albatross
+
+#endif // INCLUDE_ALBATROSS_MODELS_SPARSE_COMMON_H_
\ No newline at end of file
diff --git a/include/albatross/src/models/sparse_gp.hpp b/include/albatross/src/models/sparse_gp.hpp
index 36234e4e..90985cb4 100644
--- a/include/albatross/src/models/sparse_gp.hpp
+++ b/include/albatross/src/models/sparse_gp.hpp
@@ -15,81 +15,10 @@ namespace albatross {
 
-namespace details {
-
-constexpr double DEFAULT_NUGGET = 1e-8;
-
-inline std::string measurement_nugget_name() { return "measurement_nugget"; }
-
-inline std::string inducing_nugget_name() { return "inducing_nugget"; }
-
-static constexpr double cSparseRNugget = 1.e-10;
-
-} // namespace details
-
 template <typename CovFunc, typename MeanFunc, typename GrouperFunction,
           typename InducingPointStrategy, typename QRImplementation>
 class SparseGaussianProcessRegression;
 
-struct UniformlySpacedInducingPoints {
-
-  UniformlySpacedInducingPoints(std::size_t num_points_ = 10)
-      : num_points(num_points_) {}
-
-  template <typename CovarianceFunction>
-  std::vector<double> operator()(const CovarianceFunction &cov ALBATROSS_UNUSED,
-                                 const std::vector<double> &features) const {
-    double min = *std::min_element(features.begin(), features.end());
-    double max = *std::max_element(features.begin(), features.end());
-    return linspace(min, max, num_points);
-  }
-
-  std::size_t num_points;
-};
-
-struct StateSpaceInducingPointStrategy {
-
-  template <typename CovarianceFunction, typename FeatureType,
-            std::enable_if_t<has_valid_state_space_representation<
-                                 CovarianceFunction, FeatureType>::value,
-                             int> = 0>
-  auto operator()(const CovarianceFunction &cov,
-                  const std::vector<FeatureType> &features) const {
-    return cov.state_space_representation(features);
-  }
-
-  template <typename CovarianceFunction, typename FeatureType,
-            std::enable_if_t<!has_valid_state_space_representation<
-                                 CovarianceFunction, FeatureType>::value,
-                             int> = 0>
-  auto
-  operator()(const CovarianceFunction &cov ALBATROSS_UNUSED,
-             const std::vector<FeatureType> &features ALBATROSS_UNUSED) const
-      ALBATROSS_FAIL(
-          CovarianceFunction,
-          "Covariance function is 
missing state_space_representation method, " - "be sure _ssr_impl has been defined for the types concerned"); -}; - -struct SPQRImplementation { - using QRType = Eigen::SPQR>; - - static std::unique_ptr compute(const Eigen::MatrixXd &m, - ThreadPool *threads) { - return SPQR_create(m.sparseView(), threads); - } -}; - -struct DenseQRImplementation { - using QRType = Eigen::ColPivHouseholderQR; - - static std::unique_ptr compute(const Eigen::MatrixXd &m, - ThreadPool *threads - __attribute__((unused))) { - return std::make_unique(m); - } -}; - template struct SparseGPFit {}; template struct Fit> { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 32a13262..77fd9db0 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -48,6 +48,7 @@ swift_add_tool(albatross_unit_tests test_serializable_ldlt.cc test_serialize.cc test_sparse_gp.cc + test_pic_gp.cc test_stats.cc test_thread_pool.cc test_traits_cereal.cc diff --git a/tests/test_pic_gp.cc b/tests/test_pic_gp.cc new file mode 100644 index 00000000..0849d487 --- /dev/null +++ b/tests/test_pic_gp.cc @@ -0,0 +1,742 @@ +/* + * Copyright (C) 2024 Swift Navigation Inc. + * Contact: Swift Navigation + * + * This source is subject to the license found in the file 'LICENSE' which must + * be distributed together with this source. All other rights reserved. + * + * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, + * EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include +#include + +// #include "albatross/src/eigen/serializable_ldlt.hpp" +// #include "albatross/src/linalg/block_diagonal.hpp" +#include "test_models.h" +#include +#include + +#include +#include + +namespace albatross { + +struct LeaveOneIntervalOut { + + LeaveOneIntervalOut(){}; + explicit LeaveOneIntervalOut(double group_domain_size_) + : group_domain_size(group_domain_size_){}; + + int operator()(const double &f) const { + return static_cast(floor(f / group_domain_size)); + } + + int operator()(const Measurement &f) const { + return static_cast(floor(f.value / group_domain_size)); + } + + double group_domain_size = 5.; +}; + +TEST(TestPicGP, TestPredictionExists) { + const auto dataset = make_toy_linear_data(); + UniformlySpacedInducingPoints strategy(8); + LeaveOneIntervalOut grouper; + const auto pic = + pic_gp_from_covariance(make_simple_covariance_function(), grouper, + strategy, "pic", DenseQRImplementation{}); + + const auto pic_fit = pic.fit(dataset); + + const auto test_features = linspace(0.1, 9.9, 11); + + const auto pic_pred = + pic_fit.predict_with_measurement_noise(test_features).joint(); + + EXPECT_GT(pic_pred.mean.size(), 0); +} + +static constexpr std::size_t kNumRandomFits = 1000; + +static constexpr std::size_t kMaxSize = 20000; + +TEST(TestPicGP, ComparePrediction) { + std::default_random_engine gen(22); + std::uniform_int_distribution data_sizes{1, kMaxSize}; + std::uniform_int_distribution predict_sizes{1, 200}; + // std::uniform_real_distribution group_domain_sizes{1.e-3, 7.}; + + auto workers = make_shared_thread_pool(8); + + std::ofstream csv_out( + "/home/peddie/orion-engine/third_party/albatross/times.csv"); + + csv_out << "idx,n_train,n_inducing,n_blocks,blocks_used,n_predict,dense_fit_" + "time_ns,dense_pred_" + "time_ns,pic_fit_time_ns,pic_pred_time_ns,mean_err_norm,cov_err_" + "norm\n"; + + for (std::size_t i = 0; i < kNumRandomFits; ++i) { + // static constexpr double kLargestTrainPoint = + // 
static_cast(kNumTrainPoints) - 1.; + // static constexpr double kSpatialBuffer = 0; + // static constexpr std::size_t kNumBlocks = 5; + + // UniformlySpacedInducingPoints strategy(kNumInducingPoints); + // LeaveOneIntervalOut grouper( + // kLargestTrainPoint / static_cast(kNumBlocks) + 1e-8); + const std::size_t n_train = data_sizes(gen); + const auto n_train_fraction = [&n_train](double div) { + return std::max(std::size_t{1}, static_cast(ceil( + static_cast(n_train) / div))); + }; + std::uniform_int_distribution inducing_sizes{ + n_train_fraction(30.), n_train_fraction(5.)}; + const std::size_t n_inducing_points = inducing_sizes(gen); + const double largest_training_point = static_cast(n_train) - 1.; + std::uniform_int_distribution num_blocks_dist{2, 50}; + const std::size_t n_blocks = num_blocks_dist(gen); + + const std::size_t n_predict = predict_sizes(gen); + + UniformlySpacedInducingPoints strategy(n_inducing_points); + LeaveOneIntervalOut grouper(largest_training_point / + static_cast(n_blocks)); + auto direct = + gp_from_covariance(make_simple_covariance_function(), "direct"); + auto pic = + pic_gp_from_covariance(make_simple_covariance_function(), grouper, + strategy, "pic", DenseQRImplementation{}); + pic.set_thread_pool(workers); + + auto dataset = make_toy_linear_data(5, 1, 0.1, n_train); + const auto begin_direct_fit = std::chrono::steady_clock::now(); + auto direct_fit = direct.fit(dataset); + const auto end_direct_fit = std::chrono::steady_clock::now(); + const auto begin_pic_fit = std::chrono::steady_clock::now(); + auto pic_fit = pic.fit(dataset); + const auto end_pic_fit = std::chrono::steady_clock::now(); + + std::set< + std::result_of_t> + train_indices; + std::transform(dataset.features.begin(), dataset.features.end(), + std::inserter(train_indices, train_indices.begin()), + grouper); + auto test_features = linspace(0, largest_training_point, n_predict); + // const std::size_t test_features_before = test_features.size(); + test_features.erase( + std::remove_if(test_features.begin(), test_features.end(), + [&train_indices, &grouper](const auto &f) { + // std::cout << "f: " << f << "; group: " + // << grouper(f) + // << std::endl; + return train_indices.count(grouper(f)) == 0; + }), + test_features.end()); + + if (test_features.empty()) { + continue; + } + + // const std::size_t test_features_after = test_features.size(); + + // std::cout << "train_indices (" << train_indices.size() << "): "; + // for (const auto &index : train_indices) { + // std::cout << index << ", "; + // } + // std::cout << std::endl; + + // std::cout << "test_features, " << test_features_before << " -> " + // << test_features_after << ": "; + // for (const auto &feature : test_features) { + // std::cout << feature << ", "; + // } + // std::cout << std::endl; + const auto begin_direct_pred = std::chrono::steady_clock::now(); + auto direct_pred = + direct_fit.predict_with_measurement_noise(test_features).joint(); + const auto end_direct_pred = std::chrono::steady_clock::now(); + auto pic_pred = + pic_fit.predict_with_measurement_noise(test_features).joint(); + const auto end_pic_pred = std::chrono::steady_clock::now(); + + csv_out << i << ',' << n_train << ',' << n_inducing_points << ',' + << n_blocks << ',' << train_indices.size() << ',' << n_predict + << ',' + << std::chrono::duration_cast( + end_direct_fit - begin_direct_fit) + .count() + << ',' + << std::chrono::duration_cast( + end_direct_pred - begin_direct_pred) + .count() + << ',' + << std::chrono::duration_cast( + end_pic_fit - 
begin_pic_fit) + .count() + << ',' + << std::chrono::duration_cast( + end_pic_pred - end_direct_pred) + .count() + << ',' << (pic_pred.mean - direct_pred.mean).norm() << ',' + << (pic_pred.covariance - direct_pred.covariance).norm() << '\n'; + + const double pic_error = (pic_pred.mean - direct_pred.mean).norm(); + EXPECT_LT(pic_error, 1e-4); + // << "|u|: " << n_inducing_points << "; |f|: " << dataset.size() + // << "; |p|: " << test_features.size() + // << "; B width: " << grouper.group_domain_size + // << "; n_B: " << train_indices.size(); + + // const double pic_cov_error = + // (pic_pred.covariance - direct_pred.covariance).norm(); + // EXPECT_LT(pic_cov_error, 1e-6) + // << "|u|: " << n_inducing_points << "; |f|: " << dataset.size() + // << "; |p|: " << test_features.size() + // << "; B width: " << grouper.group_domain_size + // << "; n_B: " << train_indices.size(); + + if (i % 10 == 0) { + csv_out.flush(); + std::cout << i << std::endl; + } + } + csv_out << std::endl; + csv_out.close(); +} + +template +class BruteForcePIC + : public CovarianceFunction< + BruteForcePIC> { +public: + CovarianceType cov_; + std::vector inducing_points_; + GrouperFunction grouper_; + Eigen::LDLT K_uu_ldlt_; + + BruteForcePIC(const std::vector &inducing_points, + CovarianceType &&cov, GrouperFunction &&grouper) + : cov_{cov}, inducing_points_{inducing_points}, grouper_{grouper}, + K_uu_ldlt_{cov_(inducing_points_, inducing_points_).ldlt()} {} + + template double _call_impl(const X &x, const X &y) const { + if (grouper_(x) == grouper_(y)) { + return cov_(x, y); + } + + Eigen::VectorXd K_xu(inducing_points_.size()); + Eigen::VectorXd K_uy(inducing_points_.size()); + for (Eigen::Index i = 0; + i < static_cast(inducing_points_.size()); ++i) { + K_xu[i] = cov_(x, inducing_points_[cast::to_size(i)]); + K_uy[i] = cov_(inducing_points_[cast::to_size(i)], y); + } + // const Eigen::VectorXd K_uy = cov_(inducing_points_, y); + return K_xu.dot(K_uu_ldlt_.solve(K_uy)); + } +}; + +template +auto make_brute_force_pic_covariance( + const std::vector &inducing_points, + CovarianceType &&cov, GrouperFunction &&grouper) { + return BruteForcePIC( + inducing_points, std::forward(cov), + std::forward(grouper)); +} + +TEST(TestPicGP, ScalarEquivalence) { + static constexpr std::size_t kNumTrainPoints = 3; + static constexpr std::size_t kNumTestPoints = 1; + static constexpr std::size_t kNumInducingPoints = 3; + + UniformlySpacedInducingPoints strategy(kNumInducingPoints); + // Set the grouper so that all the simple test data falls within one + // group. 
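+  // (make_toy_linear_data appears to place its features in roughly
+  // [0, kNumTrainPoints), so a domain size of 10 should put every training
+  // point in block 0.)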
+ LeaveOneIntervalOut grouper(10); + auto direct = gp_from_covariance(make_simple_covariance_function(), "direct"); + auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper, + strategy, "pic", DenseQRImplementation{}); + + auto dataset = make_toy_linear_data(5, 1, 0.1, kNumTrainPoints); + + auto direct_fit = direct.fit(dataset); + auto pic_fit = pic.fit(dataset); + + auto test_features = linspace(0.1, 9.9, kNumTestPoints); + auto direct_pred = + direct_fit.predict_with_measurement_noise(test_features).joint(); + auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).joint(); + const double pic_error = (pic_pred.mean - direct_pred.mean).norm(); + EXPECT_LT(pic_error, 1e-7) + << "|u|: " << kNumInducingPoints << "; |f|: " << dataset.size() + << "; |p|: " << test_features.size() + << "; B width: " << grouper.group_domain_size; + + const double pic_cov_error = + (pic_pred.covariance - direct_pred.covariance).norm(); + EXPECT_LT(pic_cov_error, 1e-7) + << "|u|: " << kNumInducingPoints << "; |f|: " << dataset.size() + << "; |p|: " << test_features.size() + << "; B width: " << grouper.group_domain_size + << "; n_B: " << kNumTrainPoints << "\n" + << pic_pred.covariance << "\n" + << direct_pred.covariance; +} + +TEST(TestPicGP, SingleBlockEquivalence) { + static constexpr std::size_t kNumTrainPoints = 10; + static constexpr std::size_t kNumTestPoints = 10; + static constexpr std::size_t kNumInducingPoints = 4; + + UniformlySpacedInducingPoints strategy(kNumInducingPoints); + // Set the grouper so that all the simple test data falls within one + // group. + LeaveOneIntervalOut grouper(10); + auto direct = gp_from_covariance(make_simple_covariance_function(), "direct"); + auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper, + strategy, "pic", DenseQRImplementation{}); + + auto dataset = make_toy_linear_data(5, 1, 0.1, kNumTrainPoints); + + auto direct_fit = direct.fit(dataset); + auto pic_fit = pic.fit(dataset); + + auto test_features = linspace(0.1, 9.9, kNumTestPoints); + auto direct_pred = + direct_fit.predict_with_measurement_noise(test_features).joint(); + auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).joint(); + const double pic_error = (pic_pred.mean - direct_pred.mean).norm(); + EXPECT_LT(pic_error, 2e-7) + << "|u|: " << kNumInducingPoints << "; |f|: " << dataset.size() + << "; |p|: " << test_features.size() + << "; B width: " << grouper.group_domain_size << "\n" + << pic_pred.mean.transpose() << "\n" + << direct_pred.mean.transpose(); + + const double pic_cov_error = + (pic_pred.covariance - direct_pred.covariance).norm(); + EXPECT_LT(pic_cov_error, 2e-7) + << "|u|: " << kNumInducingPoints << "; |f|: " << dataset.size() + << "; |p|: " << test_features.size() + << "; B width: " << grouper.group_domain_size + << "; n_B: " << kNumTrainPoints << "\n" + << pic_pred.covariance << "\n" + << direct_pred.covariance; +} + +template +JointDistribution test_pic(const RegressionDataset &data, + const std::vector &predict, + CovarianceType &&cov, InducingStrategy &&inducing, + GrouperFunction &&grouper) { + const auto u = inducing(cov, data.features); + + auto bfp_cov = make_brute_force_pic_covariance(inducing(cov, data.features), + cov, grouper); + + BlockDiagonal K_ff; + const auto indexers = group_by(data.features, grouper).indexers(); + std::vector reordered_inds; + for (const auto &pair : indexers) { + reordered_inds.insert(reordered_inds.end(), pair.second.begin(), + pair.second.end()); + 
K_ff.blocks.emplace_back(cov(subset(data.features, pair.second))); + K_ff.blocks.back().diagonal() += + subset(data.targets.covariance.diagonal(), pair.second); + } + const std::vector ordered_features = + subset(data.features, reordered_inds); + + Eigen::MatrixXd K_uu = cov(u, u); + K_uu.diagonal() += Eigen::VectorXd::Constant(K_uu.rows(), 1e-8); + const Eigen::MatrixXd K_fu = cov(ordered_features, u); + + const Eigen::SerializableLDLT K_uu_ldlt = K_uu.ldlt(); + const Eigen::MatrixXd P = K_uu_ldlt.sqrt_solve(K_fu.transpose()); + const Eigen::MatrixXd Q_ff = P.transpose() * P; + BlockDiagonal Q_ff_diag; + Eigen::Index i = 0; + for (const auto &pair : indexers) { + const Eigen::Index cols = cast::to_index(pair.second.size()); + auto P_cols = P.block(0, i, P.rows(), cols); + Q_ff_diag.blocks.emplace_back(P_cols.transpose() * P_cols); + i += cols; + } + // auto Lambda = K_ff - Q_ff_diag; + + Eigen::MatrixXd Q_ff_lambda = Q_ff; + Eigen::Index k = 0; + for (const auto &block : K_ff.blocks) { + Q_ff_lambda.block(k, k, block.rows(), block.rows()) = block; + Q_ff_lambda.block(k, k, block.rows(), block.rows()).diagonal() += + Eigen::VectorXd::Constant(block.rows(), 1e-2); + k += block.rows(); + } + + const auto print_matrix = [](const Eigen::MatrixXd &m, std::string &&label) { + std::cout << label << " (" << m.rows() << "x" << m.cols() << "):\n" + << m.format(Eigen::FullPrecision) << std::endl; + }; + + const auto print_vector = [](const Eigen::VectorXd &v, std::string &&label) { + std::cout << label << " (" << v.size() + << "): " << v.transpose().format(Eigen::FullPrecision) + << std::endl; + }; + + print_matrix(Q_ff_lambda, "Q_ff + Lambda"); + // const BlockDiagonalLDLT Lambda_ldlt = Lambda.ldlt(); + + print_matrix(bfp_cov(ordered_features), "BFP: Q_ff + Lambda"); + + print_matrix(Q_ff_lambda - bfp_cov(ordered_features), + "Q_ff_lambda difference"); + + const Eigen::SerializableLDLT S = Q_ff_lambda.ldlt(); + + print_matrix(S.matrixL(), "PITC L"); + print_vector(S.vectorD(), "PITC D"); + + Eigen::MatrixXd K_PIC(ordered_features.size(), predict.size()); + Eigen::MatrixXd K_PITC(ordered_features.size(), predict.size()); + + Eigen::MatrixXd V_PIC = + Eigen::MatrixXd::Zero(ordered_features.size(), predict.size()); + + Eigen::MatrixXd Y = K_uu_ldlt.solve(K_fu.transpose()); + print_matrix(Y, "Y"); + + Eigen::MatrixXd W = K_uu_ldlt.solve(S.solve(K_fu).transpose()); + print_matrix(W, "W"); + + for (Eigen::Index f = 0; f < cast::to_index(ordered_features.size()); ++f) { + for (Eigen::Index p = 0; p < cast::to_index(predict.size()); ++p) { + const std::vector pv{predict[p]}; + K_PITC(f, p) = K_fu.row(f).dot(K_uu_ldlt.solve(cov(u, pv)).col(0)); + if (grouper(predict[p]) == grouper(ordered_features[f])) { + K_PIC(f, p) = cov(ordered_features[f], predict[p]); + V_PIC(f, p) = + K_PIC(f, p) - K_fu.row(f).dot(K_uu_ldlt.solve(cov(u, pv)).col(0)); + } else { + K_PIC(f, p) = K_fu.row(f).dot(K_uu_ldlt.solve(cov(u, pv)).col(0)); + } + } + } + + const Eigen::MatrixXd K_pu = cov(predict, u); + // Eigen::Index j = 0; + // Eigen::Index blk = 0; + // std::vector WW; + + const Eigen::MatrixXd WBW = K_uu_ldlt.solve(S.solve(K_fu).transpose()); + // Eigen::MatrixXd WBW = + // Eigen::MatrixXd::Zero(u.size(), ordered_features.size()); + + // Eigen::MatrixXd K_Su = S.solve(K_fu); + // std::stringstream Ksus; + // Ksus << "K_Su[" << blk << "]"; + // print_matrix(K_Su, Ksus.str()); + + // for (const auto &block : K_ff.blocks) { + // // Eigen::MatrixXd K_Au = K_fu; + // // K_Au.middleRows(j, block.cols()) = + // // 
Eigen::MatrixXd::Zero(block.cols(), K_Au.cols()); + // // std::stringstream Kaus; + // // std::cout << "Block " << blk << " j = " << j << " size = " << + // block.cols() + // // << std::endl; + // // Kaus << "K_Au[" << blk << "]"; + // // print_matrix(K_Au, Kaus.str()); + + // Eigen::MatrixXd WB = K_uu_ldlt.solve(K_Su.transpose()); + // std::stringstream WBs; + // WBs << "W_B[" << blk << "]"; + // print_matrix(WB, WBs.str()); + // WB.leftCols(j) = Eigen::MatrixXd::Zero(K_fu.cols(), j); + // WB.rightCols(ordered_features.size() - j - block.cols()) = + // Eigen::MatrixXd::Zero(K_fu.cols(), + // ordered_features.size() - j - block.cols()); + // print_matrix(WB, WBs.str()); + + // print_matrix(WB * V_PIC, "WB * V"); + + // WW.push_back(WB); + // WBW += WB; + + // blk++; + // j += block.cols(); + // } + + print_matrix(K_PIC, "K_PIC"); + print_matrix(bfp_cov(ordered_features, predict), "BFP: K_PIC"); + + print_matrix(K_PIC - bfp_cov(ordered_features, predict), "K_PIC error"); + print_matrix(K_PITC, "K_PITC"); + + print_matrix(V_PIC, "V_PIC"); + + print_matrix(WBW, "W"); + const Eigen::MatrixXd U = K_pu * WBW * V_PIC; + print_matrix(U, "U"); + + auto SV = S.sqrt_solve(V_PIC); + const Eigen::MatrixXd VSV = + V_PIC.transpose() * S.solve(V_PIC); // SV.transpose() * SV; + print_matrix(VSV, "VSV"); + + const Eigen::MatrixXd predict_prior = cov(predict); + + print_matrix(predict_prior, "prior"); + + print_matrix(bfp_cov(predict), "BFP: prior"); + + print_matrix(predict_prior - bfp_cov(predict), "prior error"); + + // auto KK = K_PITC_ldlt.sqrt_solve(K_PIC); + // const Eigen::MatrixXd explained_cov = KK.transpose() * KK; + const Eigen::MatrixXd explained_cov = K_PIC.transpose() * S.solve(K_PIC); + print_matrix(explained_cov, "explained"); + + const Eigen::VectorXd PIC_mean = + K_PIC.transpose() * S.solve(data.targets.mean); + print_vector(PIC_mean, "PIC mean"); + const Eigen::MatrixXd predict_cov = predict_prior - explained_cov; + + print_matrix(predict_cov, "K_**"); + + const Eigen::MatrixXd PITC_cov = + predict_prior - K_PITC.transpose() * S.solve(K_PITC); + + const Eigen::MatrixXd PIC_cov = PITC_cov - U - U.transpose() - VSV; + print_matrix(PITC_cov, "PTIC cov"); + print_matrix(PIC_cov, "PIC cov"); + + print_matrix(predict_cov - PITC_cov, "PIC - PITC"); + print_matrix(predict_cov - PIC_cov, "PIC - factored"); + + return JointDistribution{PIC_mean, PIC_cov}; +} + +TEST(TestPicGP, BruteForceEquivalenceOneBlock) { + static constexpr std::size_t kNumTrainPoints = 10; + static constexpr std::size_t kNumTestPoints = 10; + static constexpr std::size_t kNumInducingPoints = 5; + + UniformlySpacedInducingPoints strategy(kNumInducingPoints); + LeaveOneIntervalOut grouper(10); + auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper, + strategy, "pic", DenseQRImplementation{}); + + auto dataset = make_toy_linear_data(5, 1, 0.1, kNumTrainPoints); + auto bfp_cov = make_brute_force_pic_covariance( + strategy(make_simple_covariance_function(), dataset.features), + make_simple_covariance_function(), grouper); + auto bfp = gp_from_covariance(bfp_cov); + + auto bfp_fit = bfp.fit(dataset); + auto pic_fit = pic.fit(dataset); + + auto test_features = linspace(0.1, 9.9, kNumTestPoints); + auto bfp_pred = bfp_fit.predict_with_measurement_noise(test_features).joint(); + auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).joint(); + std::cout << "BFP mean (" << bfp_pred.mean.size() + << "): " << bfp_pred.mean.transpose().format(Eigen::FullPrecision) + << std::endl; + const double 
pic_error = (pic_pred.mean - bfp_pred.mean).norm(); + // const auto test_result = + // test_pic(dataset, test_features, make_simple_covariance_function(), + // strategy, grouper); + + EXPECT_LT(pic_error, 1e-8); + // EXPECT_LT((pic_pred.mean - test_result.mean).norm(), 1e-8); + + const double pic_cov_error = + (pic_pred.covariance - bfp_pred.covariance).norm(); + EXPECT_LT(pic_cov_error, 1e-7); +} + +TEST(TestPicGP, BruteForceEquivalenceMultipleBlocks) { + static constexpr std::size_t kNumTrainPoints = 10; + static constexpr std::size_t kNumTestPoints = 10; + static constexpr std::size_t kNumInducingPoints = 5; + + UniformlySpacedInducingPoints strategy(kNumInducingPoints); + LeaveOneIntervalOut grouper(2); + auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper, + strategy, "pic", DenseQRImplementation{}); + + auto dataset = make_toy_linear_data(5, 1, 0.1, kNumTrainPoints); + auto bfp_cov = make_brute_force_pic_covariance( + strategy(make_simple_covariance_function(), dataset.features), + make_simple_covariance_function(), grouper); + auto bfp = gp_from_covariance(bfp_cov); + + auto bfp_fit = bfp.fit(dataset); + auto pic_fit = pic.fit(dataset); + + auto test_features = linspace(0.1, 9.9, kNumTestPoints); + auto bfp_pred = bfp_fit.predict_with_measurement_noise(test_features).joint(); + auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).joint(); + // std::cout << "BFP mean (" << bfp_pred.mean.size() + // << "): " << bfp_pred.mean.transpose().format(Eigen::FullPrecision) + // << std::endl; + const double pic_error = (pic_pred.mean - bfp_pred.mean).norm(); + // const auto test_result = + // test_pic(dataset, test_features, make_simple_covariance_function(), + // strategy, grouper); + + EXPECT_LT(pic_error, 1e-7); + // EXPECT_LT((pic_pred.mean - test_result.mean).norm(), 1e-8); + + const double pic_cov_error = + (pic_pred.covariance - bfp_pred.covariance).norm(); + EXPECT_LT(pic_cov_error, 1e-7); +} + +TEST(TestPicGP, EmitCSV) { + static constexpr std::size_t kNumTrainPoints = 80; + static constexpr std::size_t kNumTestPoints = 300; + static constexpr std::size_t kNumInducingPoints = 20; + + static constexpr double kLargestTrainPoint = + static_cast(kNumTrainPoints) - 1.; + static constexpr double kSpatialBuffer = 0; + static constexpr std::size_t kNumBlocks = 5; + + UniformlySpacedInducingPoints strategy(kNumInducingPoints); + LeaveOneIntervalOut grouper(kLargestTrainPoint / + static_cast(kNumBlocks) + 1e-8); + // LeaveOneIntervalOut grouper(10); + + auto direct = gp_from_covariance(make_simple_covariance_function(), "direct"); + + auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper, + strategy, "pic", DenseQRImplementation{}); + auto pitc = + sparse_gp_from_covariance(make_simple_covariance_function(), grouper, + strategy, "sparse", DenseQRImplementation{}); + + auto dataset = + make_toy_linear_data(-kLargestTrainPoint / 2., 1, 0.1, kNumTrainPoints); + // auto dataset = make_toy_linear_data(5, 1, 0.1, kNumTrainPoints); + + auto bfp_cov = make_brute_force_pic_covariance( + strategy(make_simple_covariance_function(), dataset.features), + make_simple_covariance_function(), grouper); + auto bfp = gp_from_covariance(bfp_cov); + + auto test_features = linspace( + kSpatialBuffer, kLargestTrainPoint - kSpatialBuffer, kNumTestPoints); + // auto test_features = linspace(0.1, 9.9, kNumTestPoints); + auto direct_fit = direct.fit(dataset); + auto pic_fit = pic.fit(dataset); + auto pitc_fit = pitc.fit(dataset); + // std::cout << "BFP: "; 
+ auto bfp_fit = bfp.fit(dataset); + + auto direct_pred = + direct_fit.predict_with_measurement_noise(test_features).joint(); + auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).joint(); + auto pitc_pred = + pitc_fit.predict_with_measurement_noise(test_features).joint(); + // std::cout << "BFP: "; + auto bfp_pred = bfp_fit.predict_with_measurement_noise(test_features).joint(); + + std::ofstream csv_out( + "/home/peddie/orion-engine/third_party/albatross/example.csv"); + + csv_out << "type,idx,x,mean,marginal,group\n"; + for (std::size_t pred = 0; pred < direct_pred.size(); ++pred) { + csv_out << "dense," << std::setprecision(16) << pred << ',' + << test_features[pred] << ',' << direct_pred.mean[cast::to_index(pred)] << ',' + << direct_pred.covariance(cast::to_index(pred), cast::to_index(pred)) << ',' + << grouper(test_features[pred]) << '\n'; + } + + for (std::size_t pred = 0; pred < pitc_pred.size(); ++pred) { + csv_out << "pitc," << std::setprecision(16) << pred << ',' + << test_features[pred] << ',' << pitc_pred.mean[cast::to_index(pred)] << ',' + << pitc_pred.covariance(cast::to_index(pred), cast::to_index(pred)) << ',' + << grouper(test_features[pred]) << '\n'; + } + + for (std::size_t pred = 0; pred < pic_pred.size(); ++pred) { + csv_out << "pic," << std::setprecision(16) << pred << ',' + << test_features[pred] << ',' << pic_pred.mean[cast::to_index(pred)] << ',' + << pic_pred.covariance(cast::to_index(pred), cast::to_index(pred)) << ',' + << grouper(test_features[pred]) << '\n'; + } + + for (std::size_t pred = 0; pred < bfp_pred.size(); ++pred) { + csv_out << "bfp," << std::setprecision(16) << pred << ',' + << test_features[pred] << ',' << bfp_pred.mean[cast::to_index(pred)] << ',' + << bfp_pred.covariance(cast::to_index(pred), cast::to_index(pred)) << ',' + << grouper(test_features[pred]) << '\n'; + } + + csv_out << std::endl; + csv_out.close(); + + std::ofstream points_out("/home/peddie/orion-engine/third_party/" + "albatross/example_points.csv"); + + points_out << "type,x,y\n"; + for (std::size_t i = 0; i < dataset.size(); ++i) { + points_out << "train," << dataset.features[i] << ',' + << dataset.targets.mean[cast::to_index(i)] << '\n'; + } + + for (const auto &f : pic_fit.get_fit().inducing_features) { + points_out << "inducing," << f << ",0\n"; + } + points_out << std::endl; + points_out.close(); + + // std::cout << "BFP L: " + // << Eigen::MatrixXd(bfp_fit.get_fit().train_covariance.matrixL()) + // .format(Eigen::FullPrecision) + // << std::endl; + // std::cout << "BFP D: " + // << + // bfp_fit.get_fit().train_covariance.vectorD().transpose().format( + // Eigen::FullPrecision) + // << std::endl; + + // const auto test_result = + // test_pic(dataset, test_features, make_simple_covariance_function(), + // strategy, grouper); + + // std::cout << "bfp_pred.covariance (" << bfp_pred.covariance.rows() << "x" + // << bfp_pred.covariance.cols() << "):\n" + // << bfp_pred.covariance << std::endl; + + const double pic_error = (pic_pred.mean - direct_pred.mean).norm(); + EXPECT_LT(pic_error, 6.5e-7); + // EXPECT_LT((pic_pred.mean - test_result.mean).norm(), 1e-7); + // << "|u|: " << kNumInducingPoints << "; |f|: " << dataset.size() + // << "; |p|: " << test_features.size() + // << "; B width: " << grouper.group_domain_size << "\n" + // << pic_pred.mean.transpose() << "\n" + // << direct_pred.mean.transpose(); + + const double pic_cov_error = + (pic_pred.covariance - direct_pred.covariance).norm(); + EXPECT_LT(pic_cov_error, 5e-7); + // << "|u|: " << kNumInducingPoints 
<< "; |f|: " << dataset.size() + // << "; |p|: " << test_features.size() + // << "; B width: " << grouper.group_domain_size + // << "; n_B: " << kNumTrainPoints << "\n" + // << pic_pred.covariance << "\n" + // << direct_pred.covariance; +} + +} // namespace albatross \ No newline at end of file