Commit 5dbe078

Merge pull request #79 from SciML/gpu_offload
Add GPU offloading
2 parents: 8bd920a + 93829c3

File tree

10 files changed: +363 additions, −236 deletions

.buildkite/pipeline.yml

Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@
+steps:
+  - label: "GPU"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1"
+      - JuliaCI/julia-test#v1:
+          coverage: false # 1000x slowdown
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    env:
+      GROUP: 'GPU'
+      JULIA_PKG_SERVER: "" # it often struggles with our large artifacts
+      # SECRET_CODECOV_TOKEN: "..."
+    timeout_in_minutes: 30
+    # Don't run Buildkite if the commit message includes the text [skip tests]
+    if: build.message !~ /\[skip tests\]/
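This Buildkite job sets `GROUP: 'GPU'`, while the GitHub Actions matrix below adds `group: Core`, so the test runner dispatches on a `GROUP` environment variable. The actual `test/runtests.jl` is among the changed files not captured on this page; a minimal sketch of how such gating typically looks, using the `SafeTestsets` and `Pkg` test dependencies this commit adds to `Project.toml` (the file and testset names here are hypothetical):

```julia
# test/runtests.jl -- hypothetical sketch of GROUP-based test gating
using Pkg, SafeTestsets

const GROUP = get(ENV, "GROUP", "All")

if GROUP == "All" || GROUP == "Core"
    @safetestset "Basic Tests" begin include("basictests.jl") end
end

if GROUP == "GPU"
    Pkg.add("CUDA")  # GPU-only dependency, installed just on the GPU queue
    @safetestset "GPU Tests" begin include("gputests.jl") end
end
```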

.github/workflows/CI.yml

Lines changed: 2 additions & 2 deletions

@@ -1,13 +1,11 @@
 name: CI
 on:
   - push
-  - pull_request
 jobs:
   test:
     name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
     runs-on: ${{ matrix.os }}
     strategy:
-      fail-fast: false
       matrix:
         version:
           - '1'
@@ -16,6 +14,8 @@ jobs:
           - ubuntu-latest
         arch:
           - x64
+        group:
+          - Core
     steps:
       - uses: actions/checkout@v2
       - uses: julia-actions/setup-julia@v1

Project.toml

Lines changed: 3 additions & 1 deletion

@@ -37,7 +37,9 @@ julia = "1.6"
 
 [extras]
 Pardiso = "46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2"
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "Pardiso"]
+test = ["Test", "Pardiso", "Pkg", "SafeTestsets"]

docs/src/solvers/solvers.md

Lines changed: 13 additions & 4 deletions

@@ -40,27 +40,27 @@ like CUDA.
 These overloads tend to work for many array types, such as `CuArrays` for GPU-accelerated
 solving, using the overloads provided by the respective packages. Given that this can be
 customized per-package, details given below describe a subset of important arrays
-(`Matrix`, `SparseMatrixCSC`, `CuMatrix`, etc.)
+(`Matrix`, `SparseMatrixCSC`, `CuMatrix`, etc.)
 
 - `LUFactorization(pivot=LinearAlgebra.RowMaximum())`: Julia's built in `lu`.
   - On dense matrices this uses the current BLAS implementation of the user's computer
     which by default is OpenBLAS but will use MKL if the user does `using MKL` in their
-    system.
+    system.
   - On sparse matrices this will use UMFPACK from SuiteSparse. Note that this will not
     cache the symbolic factorization.
   - On CuMatrix it will use a CUDA-accelerated LU from CuSolver.
   - On BandedMatrix and BlockBandedMatrix it will use a banded LU.
 - `QRFactorization(pivot=LinearAlgebra.NoPivot(),blocksize=16)`: Julia's built in `qr`.
   - On dense matrices this uses the current BLAS implementation of the user's computer
     which by default is OpenBLAS but will use MKL if the user does `using MKL` in their
-    system.
+    system.
   - On sparse matrices this will use SPQR from SuiteSparse
   - On CuMatrix it will use a CUDA-accelerated QR from CuSolver.
   - On BandedMatrix and BlockBandedMatrix it will use a banded QR.
 - `SVDFactorization(full=false,alg=LinearAlgebra.DivideAndConquer())`: Julia's built in `svd`.
   - On dense matrices this uses the current BLAS implementation of the user's computer
     which by default is OpenBLAS but will use MKL if the user does `using MKL` in their
-    system.
+    system.
 - `GenericFactorization(fact_alg)`: Constructs a linear solver from a generic
   factorization algorithm `fact_alg` which complies with the Base.LinearAlgebra
   factorization API. Quoting from Base:
@@ -119,6 +119,15 @@ Base.@kwdef struct PardisoJL <: SciMLLinearSolveAlgorithm
 end
 ```
 
+### CUDA.jl
+
+Note that `CuArrays` are supported by `GenericFactorization` in the "normal" way.
+The following are non-standard GPU factorization routines.
+
+- `GPUOffloadFactorization`: An offloading technique used to GPU-accelerate CPU-based
+  computations. Requires a sufficiently large `A` to overcome the data transfer
+  costs.
+
 ### IterativeSolvers.jl
 
 - `IterativeSolversJL_CG(args...;kwargs...)`: A generic CG implementation
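For context, a minimal usage sketch of the new algorithm (assumes CUDA.jl is installed and a GPU is present; `GPUOffloadFactorization` only becomes available after `using CUDA` triggers the `@require` hook added in `src/LinearSolve.jl` below):

```julia
using LinearSolve
using CUDA  # loading CUDA activates src/cuda.jl via @require

A = rand(4000, 4000)  # A should be large enough to amortize transfer costs
b = rand(4000)
prob = LinearProblem(A, b)
sol = solve(prob, GPUOffloadFactorization())  # factorize and solve on the GPU
sol.u  # solution vector, copied back to the CPU
```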

src/LinearSolve.jl

Lines changed: 1 addition & 1 deletion

@@ -44,6 +44,7 @@ function __init__()
         IS_OPENBLAS[] = occursin("openblas", BLAS.get_config().loaded_libs[1].libname)
     end
 
+    @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" include("cuda.jl")
     @require Pardiso="46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2" include("pardiso.jl")
 end
 
@@ -52,6 +53,5 @@ export LUFactorization, SVDFactorization, QRFactorization, GenericFactorization,
 export KrylovJL, KrylovJL_CG, KrylovJL_GMRES, KrylovJL_BICGSTAB, KrylovJL_MINRES,
        IterativeSolversJL, IterativeSolversJL_CG, IterativeSolversJL_GMRES,
        IterativeSolversJL_BICGSTAB, IterativeSolversJL_MINRES
-export DefaultLinSolve
 
 end

src/cuda.jl

Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+struct GPUOffloadFactorization <: AbstractFactorization end
+
+function SciMLBase.solve(cache::LinearCache, alg::GPUOffloadFactorization; kwargs...)
+    if cache.isfresh
+        fact = do_factorization(alg, CUDA.CuArray(cache.A), cache.b, cache.u)
+        cache = set_cacheval(cache, fact)
+    end
+
+    copyto!(cache.u, cache.b)
+    y = Array(ldiv!(cache.cacheval, CUDA.CuArray(cache.u)))
+    SciMLBase.build_linear_solution(alg, y, nothing, cache)
+end
+
+function do_factorization(alg::GPUOffloadFactorization, A, b, u)
+    A isa Union{AbstractMatrix,AbstractDiffEqOperator} ||
+        error("LU is not defined for $(typeof(A))")
+
+    if A isa AbstractDiffEqOperator
+        A = A.A
+    end
+    fact = qr(CUDA.CuArray(A))
+    return fact
+end
+
+function LinearAlgebra.ldiv!(x::CUDA.CuArray, _qr::CUDA.CUSOLVER.CuQR, b::CUDA.CuArray)
+    _x = UpperTriangular(_qr.R) \ (_qr.Q' * reshape(b, length(b), 1))
+    x .= vec(_x)
+    CUDA.unsafe_free!(_x)
+    return x
+end
+# make `\` work
+LinearAlgebra.ldiv!(F::CUDA.CUSOLVER.CuQR, b::CUDA.CuArray) = (x = similar(b); ldiv!(x, F, b); x)
+
+export GPUOffloadFactorization
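The custom `ldiv!` above is the standard QR solve, x = R⁻¹(Qᵀb), carried out on the GPU. A CPU analogue for illustration (a sketch, not part of the commit):

```julia
using LinearAlgebra

A = rand(100, 100); b = rand(100)
F = qr(A)                                # A = Q * R
x = UpperTriangular(F.R) \ (F.Q' * b)    # back-substitute after applying Qᵀ
@assert A * x ≈ b
```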

test/basictests.jl

Lines changed: 235 additions & 0 deletions

@@ -0,0 +1,235 @@
+using LinearSolve, LinearAlgebra, SparseArrays
+using Test
+
+n = 8
+A = Matrix(I,n,n)
+b = ones(n)
+A1 = A/1; b1 = rand(n); x1 = zero(b)
+A2 = A/2; b2 = rand(n); x2 = zero(b)
+
+prob1 = LinearProblem(A1, b1; u0=x1)
+prob2 = LinearProblem(A2, b2; u0=x2)
+
+cache_kwargs = (;verbose=true, abstol=1e-8, reltol=1e-8, maxiter=30,)
+
+function test_interface(alg, prob1, prob2)
+    A1 = prob1.A; b1 = prob1.b; x1 = prob1.u0
+    A2 = prob2.A; b2 = prob2.b; x2 = prob2.u0
+
+    y = solve(prob1, alg; cache_kwargs...)
+    @test A1 * y ≈ b1
+
+    cache = SciMLBase.init(prob1,alg; cache_kwargs...) # initialize cache
+    y = solve(cache)
+    @test A1 * y ≈ b1
+
+    cache = LinearSolve.set_A(cache,copy(A2))
+    y = solve(cache)
+    @test A2 * y ≈ b1
+
+    cache = LinearSolve.set_b(cache,b2)
+    y = solve(cache)
+    @test A2 * y ≈ b2
+
+    return
+end
+
+@testset "LinearSolve" begin
+
+@testset "Default Linear Solver" begin
+    test_interface(nothing, prob1, prob2)
+
+    A1 = prob1.A; b1 = prob1.b; x1 = prob1.u0
+    y = solve(prob1)
+    @test A1 * y ≈ b1
+
+    _prob = LinearProblem(SymTridiagonal(A1), b1; u0=x1)
+    y = solve(_prob)
+    @test A1 * y ≈ b1
+
+    _prob = LinearProblem(Tridiagonal(A1), b1; u0=x1)
+    y = solve(_prob)
+    @test A1 * y ≈ b1
+
+    _prob = LinearProblem(Symmetric(A1), b1; u0=x1)
+    y = solve(_prob)
+    @test A1 * y ≈ b1
+
+    _prob = LinearProblem(Hermitian(A1), b1; u0=x1)
+    y = solve(_prob)
+    @test A1 * y ≈ b1
+
+    _prob = LinearProblem(sparse(A1), b1; u0=x1)
+    y = solve(_prob)
+    @test A1 * y ≈ b1
+end
+
+@testset "UMFPACK Factorization" begin
+    A1 = A/1; b1 = rand(n); x1 = zero(b)
+    A2 = A/2; b2 = rand(n); x2 = zero(b)
+
+    prob1 = LinearProblem(sparse(A1), b1; u0=x1)
+    prob2 = LinearProblem(sparse(A2), b2; u0=x2)
+    test_interface(UMFPACKFactorization(), prob1, prob2)
+
+    # Test that refactoring wrong throws.
+    cache = SciMLBase.init(prob1,UMFPACKFactorization(); cache_kwargs...) # initialize cache
+    y = solve(cache)
+    cache = LinearSolve.set_A(cache,sprand(n, n, 0.8))
+    @test_throws ArgumentError solve(cache)
+end
+
+@testset "KLU Factorization" begin
+    A1 = A/1; b1 = rand(n); x1 = zero(b)
+    A2 = A/2; b2 = rand(n); x2 = zero(b)
+
+    prob1 = LinearProblem(sparse(A1), b1; u0=x1)
+    prob2 = LinearProblem(sparse(A2), b2; u0=x2)
+    test_interface(KLUFactorization(), prob1, prob2)
+
+    # Test that refactoring wrong throws.
+    cache = SciMLBase.init(prob1,KLUFactorization(); cache_kwargs...) # initialize cache
+    y = solve(cache)
+    X = copy(A1)
+    X[8,8] = 0.0
+    X[7,8] = 1.0
+    cache = LinearSolve.set_A(cache,sparse(X))
+    @test_throws ArgumentError solve(cache)
+end
+
+@testset "Concrete Factorizations" begin
+    for alg in (
+                LUFactorization(),
+                QRFactorization(),
+                SVDFactorization(),
+                RFLUFactorization()
+               )
+        @testset "$alg" begin
+            test_interface(alg, prob1, prob2)
+        end
+    end
+end
+
+@testset "Generic Factorizations" begin
+    for fact_alg in (
+                     lu, lu!,
+                     qr, qr!,
+                     cholesky,
+                     #cholesky!,
+                     # ldlt, ldlt!,
+                     bunchkaufman, bunchkaufman!,
+                     lq, lq!,
+                     svd, svd!,
+                     LinearAlgebra.factorize,
+                    )
+        @testset "fact_alg = $fact_alg" begin
+            alg = GenericFactorization(fact_alg=fact_alg)
+            test_interface(alg, prob1, prob2)
+        end
+    end
+end
+
+@testset "KrylovJL" begin
+    kwargs = (;gmres_restart=5,)
+    for alg in (
+                ("Default",KrylovJL(kwargs...)),
+                ("CG",KrylovJL_CG(kwargs...)),
+                ("GMRES",KrylovJL_GMRES(kwargs...)),
+                # ("BICGSTAB",KrylovJL_BICGSTAB(kwargs...)),
+                ("MINRES",KrylovJL_MINRES(kwargs...)),
+               )
+        @testset "$(alg[1])" begin
+            test_interface(alg[2], prob1, prob2)
+        end
+    end
+end
+
+@testset "IterativeSolversJL" begin
+    kwargs = (;gmres_restart=5,)
+    for alg in (
+                ("Default", IterativeSolversJL(kwargs...)),
+                ("CG", IterativeSolversJL_CG(kwargs...)),
+                ("GMRES",IterativeSolversJL_GMRES(kwargs...)),
+                # ("BICGSTAB",IterativeSolversJL_BICGSTAB(kwargs...)),
+                # ("MINRES",IterativeSolversJL_MINRES(kwargs...)),
+               )
+        @testset "$(alg[1])" begin
+            test_interface(alg[2], prob1, prob2)
+        end
+    end
+end
+
+@testset "PardisoJL" begin
+    @test_throws UndefVarError alg = PardisoJL()
+
+    using Pardiso, SparseArrays
+
+    A1 = sparse([ 1. 0 -2  3
+                  0  5  1  2
+                 -2  1  4 -7
+                  3  2 -7  5 ])
+    b1 = rand(4)
+    prob1 = LinearProblem(A1, b1)
+
+    lambda = 3
+    e = ones(n)
+    e2 = ones(n-1)
+    A2 = spdiagm(-1 => im*e2, 0 => lambda*e, 1 => -im*e2)
+    b2 = rand(n) + im * zeros(n)
+
+    prob2 = LinearProblem(A2, b2)
+
+    for alg in (
+                PardisoJL(),
+                MKLPardisoFactorize(),
+                MKLPardisoIterate(),
+               )
+
+        u = solve(prob1, alg; cache_kwargs...).u
+        @test A1 * u ≈ b1
+
+        u = solve(prob2, alg; cache_kwargs...).u
+        @test eltype(u) <: Complex
+        @test_broken A2 * u ≈ b2
+    end
+
+end
+
+@testset "Preconditioners" begin
+    @testset "Vector Diagonal Preconditioner" begin
+        s = rand(n)
+        Pl, Pr = Diagonal(s),LinearSolve.InvPreconditioner(Diagonal(s))
+
+        x = rand(n,n)
+        y = rand(n,n)
+
+        mul!(y, Pl, x); @test y ≈ s .* x
+        mul!(y, Pr, x); @test y ≈ s .\ x
+
+        y .= x; ldiv!(Pl, x); @test x ≈ s .\ y
+        y .= x; ldiv!(Pr, x); @test x ≈ s .* y
+
+        ldiv!(y, Pl, x); @test y ≈ s .\ x
+        ldiv!(y, Pr, x); @test y ≈ s .* x
+    end
+
+    @testset "ComposePreconditioner" begin
+        s1 = rand(n)
+        s2 = rand(n)
+
+        x = rand(n,n)
+        y = rand(n,n)
+
+        P1 = Diagonal(s1)
+        P2 = Diagonal(s2)
+
+        P = LinearSolve.ComposePreconditioner(P1,P2)
+
+        # ComposePreconditioner
+        ldiv!(y, P, x); @test y ≈ ldiv!(P2, ldiv!(P1, x))
+        y .= x; ldiv!(P, x); @test x ≈ ldiv!(P2, ldiv!(P1, y))
+    end
+end
+
+end # testset
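For reference, the composition checked in the last testset applies `P1` first and then `P2`; for diagonal preconditioners this reduces to elementwise division. A standalone sketch of the identity being tested (not part of the commit):

```julia
using LinearAlgebra

s1, s2 = rand(4), rand(4)
P1, P2 = Diagonal(s1), Diagonal(s2)
x = rand(4)

y = P2 \ (P1 \ x)            # what ComposePreconditioner's ldiv! computes
@assert y ≈ x ./ (s1 .* s2)  # elementwise for diagonal preconditioners
```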
