Commit 5dbe078

Merge pull request #79 from SciML/gpu_offload
Add GPU offloading
2 parents: 8bd920a + 93829c3

File tree

10 files changed: +363 additions, −236 deletions

.buildkite/pipeline.yml

Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@
+steps:
+  - label: "GPU"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1"
+      - JuliaCI/julia-test#v1:
+          coverage: false # 1000x slowdown
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    env:
+      GROUP: 'GPU'
+      JULIA_PKG_SERVER: "" # it often struggles with our large artifacts
+      # SECRET_CODECOV_TOKEN: "..."
+    timeout_in_minutes: 30
+    # Don't run Buildkite if the commit message includes the text [skip tests]
+    if: build.message !~ /\[skip tests\]/
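This Buildkite job sets `GROUP: 'GPU'`, while the GitHub Actions matrix below adds `group: Core`, so the test runner dispatches on a `GROUP` environment variable. The actual `test/runtests.jl` is among the changed files not captured on this page; a minimal sketch of how such gating typically looks, using the `SafeTestsets` and `Pkg` test dependencies this commit adds to `Project.toml` (the file and testset names here are hypothetical):

```julia
# test/runtests.jl -- hypothetical sketch of GROUP-based test gating
using Pkg, SafeTestsets

const GROUP = get(ENV, "GROUP", "All")

if GROUP == "All" || GROUP == "Core"
    @safetestset "Basic Tests" begin include("basictests.jl") end
end

if GROUP == "GPU"
    Pkg.add("CUDA")  # GPU-only dependency, installed just on the GPU queue
    @safetestset "GPU Tests" begin include("gputests.jl") end
end
```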

.github/workflows/CI.yml

Lines changed: 2 additions & 2 deletions

@@ -1,13 +1,11 @@
 name: CI
 on:
   - push
-  - pull_request
 jobs:
   test:
     name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
     runs-on: ${{ matrix.os }}
     strategy:
-      fail-fast: false
       matrix:
         version:
           - '1'
@@ -16,6 +14,8 @@ jobs:
           - ubuntu-latest
         arch:
           - x64
+        group:
+          - Core
     steps:
       - uses: actions/checkout@v2
       - uses: julia-actions/setup-julia@v1

Project.toml

Lines changed: 3 additions & 1 deletion

@@ -37,7 +37,9 @@ julia = "1.6"
 
 [extras]
 Pardiso = "46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2"
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "Pardiso"]
+test = ["Test", "Pardiso", "Pkg", "SafeTestsets"]

docs/src/solvers/solvers.md

Lines changed: 13 additions & 4 deletions

@@ -40,27 +40,27 @@ like CUDA.
 These overloads tend to work for many array types, such as `CuArrays` for GPU-accelerated
 solving, using the overloads provided by the respective packages. Given that this can be
 customized per-package, details given below describe a subset of important arrays
-(`Matrix`, `SparseMatrixCSC`, `CuMatrix`, etc.)
+(`Matrix`, `SparseMatrixCSC`, `CuMatrix`, etc.)
 
 - `LUFactorization(pivot=LinearAlgebra.RowMaximum())`: Julia's built in `lu`.
   - On dense matrices this uses the current BLAS implementation of the user's computer
     which by default is OpenBLAS but will use MKL if the user does `using MKL` in their
-    system.
+    system.
   - On sparse matrices this will use UMFPACK from SuiteSparse. Note that this will not
     cache the symbolic factorization.
   - On CuMatrix it will use a CUDA-accelerated LU from CuSolver.
   - On BandedMatrix and BlockBandedMatrix it will use a banded LU.
 - `QRFactorization(pivot=LinearAlgebra.NoPivot(),blocksize=16)`: Julia's built in `qr`.
   - On dense matrices this uses the current BLAS implementation of the user's computer
     which by default is OpenBLAS but will use MKL if the user does `using MKL` in their
-    system.
+    system.
   - On sparse matrices this will use SPQR from SuiteSparse
   - On CuMatrix it will use a CUDA-accelerated QR from CuSolver.
   - On BandedMatrix and BlockBandedMatrix it will use a banded QR.
 - `SVDFactorization(full=false,alg=LinearAlgebra.DivideAndConquer())`: Julia's built in `svd`.
   - On dense matrices this uses the current BLAS implementation of the user's computer
     which by default is OpenBLAS but will use MKL if the user does `using MKL` in their
-    system.
+    system.
 - `GenericFactorization(fact_alg)`: Constructs a linear solver from a generic
   factorization algorithm `fact_alg` which complies with the Base.LinearAlgebra
   factorization API. Quoting from Base:
@@ -119,6 +119,15 @@ Base.@kwdef struct PardisoJL <: SciMLLinearSolveAlgorithm
 end
 ```
 
+### CUDA.jl
+
+Note that `CuArrays` are supported by `GenericFactorization` in the "normal" way.
+The following are non-standard GPU factorization routines.
+
+- `GPUOffloadFactorization`: An offloading technique used to GPU-accelerate CPU-based
+  computations. Requires a sufficiently large `A` to overcome the data transfer
+  costs.
+
 ### IterativeSolvers.jl
 
 - `IterativeSolversJL_CG(args...;kwargs...)`: A generic CG implementation
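For context, a minimal usage sketch of the new algorithm (assumes CUDA.jl is installed and a GPU is present; `GPUOffloadFactorization` only becomes available after `using CUDA` triggers the `@require` hook added in `src/LinearSolve.jl` below):

```julia
using LinearSolve
using CUDA  # loading CUDA activates src/cuda.jl via @require

A = rand(4000, 4000)  # A should be large enough to amortize transfer costs
b = rand(4000)
prob = LinearProblem(A, b)
sol = solve(prob, GPUOffloadFactorization())  # factorize and solve on the GPU
sol.u  # solution vector, copied back to the CPU
```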

src/LinearSolve.jl

Lines changed: 1 addition & 1 deletion

@@ -44,6 +44,7 @@ function __init__()
         IS_OPENBLAS[] = occursin("openblas", BLAS.get_config().loaded_libs[1].libname)
     end
 
+    @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" include("cuda.jl")
     @require Pardiso="46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2" include("pardiso.jl")
 end
 
@@ -52,6 +53,5 @@ export LUFactorization, SVDFactorization, QRFactorization, GenericFactorization,
 export KrylovJL, KrylovJL_CG, KrylovJL_GMRES, KrylovJL_BICGSTAB, KrylovJL_MINRES,
        IterativeSolversJL, IterativeSolversJL_CG, IterativeSolversJL_GMRES,
        IterativeSolversJL_BICGSTAB, IterativeSolversJL_MINRES
-export DefaultLinSolve
 
 end

src/cuda.jl

Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+struct GPUOffloadFactorization <: AbstractFactorization end
+
+function SciMLBase.solve(cache::LinearCache, alg::GPUOffloadFactorization; kwargs...)
+    if cache.isfresh
+        fact = do_factorization(alg, CUDA.CuArray(cache.A), cache.b, cache.u)
+        cache = set_cacheval(cache, fact)
+    end
+
+    copyto!(cache.u, cache.b)
+    y = Array(ldiv!(cache.cacheval, CUDA.CuArray(cache.u)))
+    SciMLBase.build_linear_solution(alg, y, nothing, cache)
+end
+
+function do_factorization(alg::GPUOffloadFactorization, A, b, u)
+    A isa Union{AbstractMatrix,AbstractDiffEqOperator} ||
+        error("LU is not defined for $(typeof(A))")
+
+    if A isa AbstractDiffEqOperator
+        A = A.A
+    end
+    fact = qr(CUDA.CuArray(A))
+    return fact
+end
+
+function LinearAlgebra.ldiv!(x::CUDA.CuArray, _qr::CUDA.CUSOLVER.CuQR, b::CUDA.CuArray)
+    _x = UpperTriangular(_qr.R) \ (_qr.Q' * reshape(b, length(b), 1))
+    x .= vec(_x)
+    CUDA.unsafe_free!(_x)
+    return x
+end
+# make `\` work
+LinearAlgebra.ldiv!(F::CUDA.CUSOLVER.CuQR, b::CUDA.CuArray) = (x = similar(b); ldiv!(x, F, b); x)
+
+export GPUOffloadFactorization
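The custom `ldiv!` above is the standard QR solve, x = R⁻¹(Qᵀb), carried out on the GPU. A CPU analogue for illustration (a sketch, not part of the commit):

```julia
using LinearAlgebra

A = rand(100, 100); b = rand(100)
F = qr(A)                                # A = Q * R
x = UpperTriangular(F.R) \ (F.Q' * b)    # back-substitute after applying Qᵀ
@assert A * x ≈ b
```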

test/basictests.jl

Lines changed: 235 additions & 0 deletions

@@ -0,0 +1,235 @@
+using LinearSolve, LinearAlgebra, SparseArrays
+using Test
+
+n = 8
+A = Matrix(I,n,n)
+b = ones(n)
+A1 = A/1; b1 = rand(n); x1 = zero(b)
+A2 = A/2; b2 = rand(n); x2 = zero(b)
+
+prob1 = LinearProblem(A1, b1; u0=x1)
+prob2 = LinearProblem(A2, b2; u0=x2)
+
+cache_kwargs = (;verbose=true, abstol=1e-8, reltol=1e-8, maxiter=30,)
+
+function test_interface(alg, prob1, prob2)
+    A1 = prob1.A; b1 = prob1.b; x1 = prob1.u0
+    A2 = prob2.A; b2 = prob2.b; x2 = prob2.u0
+
+    y = solve(prob1, alg; cache_kwargs...)
+    @test A1 * y ≈ b1
+
+    cache = SciMLBase.init(prob1,alg; cache_kwargs...) # initialize cache
+    y = solve(cache)
+    @test A1 * y ≈ b1
+
+    cache = LinearSolve.set_A(cache,copy(A2))
+    y = solve(cache)
+    @test A2 * y ≈ b1
+
+    cache = LinearSolve.set_b(cache,b2)
+    y = solve(cache)
+    @test A2 * y ≈ b2
+
+    return
+end
+
+@testset "LinearSolve" begin
+
+@testset "Default Linear Solver" begin
+    test_interface(nothing, prob1, prob2)
+
+    A1 = prob1.A; b1 = prob1.b; x1 = prob1.u0
+    y = solve(prob1)
+    @test A1 * y ≈ b1
+
+    _prob = LinearProblem(SymTridiagonal(A1), b1; u0=x1)
+    y = solve(_prob)
+    @test A1 * y ≈ b1
+
+    _prob = LinearProblem(Tridiagonal(A1), b1; u0=x1)
+    y = solve(_prob)
+    @test A1 * y ≈ b1
+
+    _prob = LinearProblem(Symmetric(A1), b1; u0=x1)
+    y = solve(_prob)
+    @test A1 * y ≈ b1
+
+    _prob = LinearProblem(Hermitian(A1), b1; u0=x1)
+    y = solve(_prob)
+    @test A1 * y ≈ b1
+
+    _prob = LinearProblem(sparse(A1), b1; u0=x1)
+    y = solve(_prob)
+    @test A1 * y ≈ b1
+end
+
+@testset "UMFPACK Factorization" begin
+    A1 = A/1; b1 = rand(n); x1 = zero(b)
+    A2 = A/2; b2 = rand(n); x2 = zero(b)
+
+    prob1 = LinearProblem(sparse(A1), b1; u0=x1)
+    prob2 = LinearProblem(sparse(A2), b2; u0=x2)
+    test_interface(UMFPACKFactorization(), prob1, prob2)
+
+    # Test that refactoring wrong throws.
+    cache = SciMLBase.init(prob1,UMFPACKFactorization(); cache_kwargs...) # initialize cache
+    y = solve(cache)
+    cache = LinearSolve.set_A(cache,sprand(n, n, 0.8))
+    @test_throws ArgumentError solve(cache)
+end
+
+@testset "KLU Factorization" begin
+    A1 = A/1; b1 = rand(n); x1 = zero(b)
+    A2 = A/2; b2 = rand(n); x2 = zero(b)
+
+    prob1 = LinearProblem(sparse(A1), b1; u0=x1)
+    prob2 = LinearProblem(sparse(A2), b2; u0=x2)
+    test_interface(KLUFactorization(), prob1, prob2)
+
+    # Test that refactoring wrong throws.
+    cache = SciMLBase.init(prob1,KLUFactorization(); cache_kwargs...) # initialize cache
+    y = solve(cache)
+    X = copy(A1)
+    X[8,8] = 0.0
+    X[7,8] = 1.0
+    cache = LinearSolve.set_A(cache,sparse(X))
+    @test_throws ArgumentError solve(cache)
+end
+
+@testset "Concrete Factorizations" begin
+    for alg in (
+                LUFactorization(),
+                QRFactorization(),
+                SVDFactorization(),
+                RFLUFactorization()
+               )
+        @testset "$alg" begin
+            test_interface(alg, prob1, prob2)
+        end
+    end
+end
+
+@testset "Generic Factorizations" begin
+    for fact_alg in (
+                     lu, lu!,
+                     qr, qr!,
+                     cholesky,
+                     #cholesky!,
+                     # ldlt, ldlt!,
+                     bunchkaufman, bunchkaufman!,
+                     lq, lq!,
+                     svd, svd!,
+                     LinearAlgebra.factorize,
+                    )
+        @testset "fact_alg = $fact_alg" begin
+            alg = GenericFactorization(fact_alg=fact_alg)
+            test_interface(alg, prob1, prob2)
+        end
+    end
+end
+
+@testset "KrylovJL" begin
+    kwargs = (;gmres_restart=5,)
+    for alg in (
+                ("Default",KrylovJL(kwargs...)),
+                ("CG",KrylovJL_CG(kwargs...)),
+                ("GMRES",KrylovJL_GMRES(kwargs...)),
+                # ("BICGSTAB",KrylovJL_BICGSTAB(kwargs...)),
+                ("MINRES",KrylovJL_MINRES(kwargs...)),
+               )
+        @testset "$(alg[1])" begin
+            test_interface(alg[2], prob1, prob2)
+        end
+    end
+end
+
+@testset "IterativeSolversJL" begin
+    kwargs = (;gmres_restart=5,)
+    for alg in (
+                ("Default", IterativeSolversJL(kwargs...)),
+                ("CG", IterativeSolversJL_CG(kwargs...)),
+                ("GMRES",IterativeSolversJL_GMRES(kwargs...)),
+                # ("BICGSTAB",IterativeSolversJL_BICGSTAB(kwargs...)),
+                # ("MINRES",IterativeSolversJL_MINRES(kwargs...)),
+               )
+        @testset "$(alg[1])" begin
+            test_interface(alg[2], prob1, prob2)
+        end
+    end
+end
+
+@testset "PardisoJL" begin
+    @test_throws UndefVarError alg = PardisoJL()
+
+    using Pardiso, SparseArrays
+
+    A1 = sparse([ 1. 0 -2  3
+                  0  5  1  2
+                 -2  1  4 -7
+                  3  2 -7  5 ])
+    b1 = rand(4)
+    prob1 = LinearProblem(A1, b1)
+
+    lambda = 3
+    e = ones(n)
+    e2 = ones(n-1)
+    A2 = spdiagm(-1 => im*e2, 0 => lambda*e, 1 => -im*e2)
+    b2 = rand(n) + im * zeros(n)
+
+    prob2 = LinearProblem(A2, b2)
+
+    for alg in (
+                PardisoJL(),
+                MKLPardisoFactorize(),
+                MKLPardisoIterate(),
+               )
+
+        u = solve(prob1, alg; cache_kwargs...).u
+        @test A1 * u ≈ b1
+
+        u = solve(prob2, alg; cache_kwargs...).u
+        @test eltype(u) <: Complex
+        @test_broken A2 * u ≈ b2
+    end
+
+end
+
+@testset "Preconditioners" begin
+    @testset "Vector Diagonal Preconditioner" begin
+        s = rand(n)
+        Pl, Pr = Diagonal(s),LinearSolve.InvPreconditioner(Diagonal(s))
+
+        x = rand(n,n)
+        y = rand(n,n)
+
+        mul!(y, Pl, x); @test y ≈ s .* x
+        mul!(y, Pr, x); @test y ≈ s .\ x
+
+        y .= x; ldiv!(Pl, x); @test x ≈ s .\ y
+        y .= x; ldiv!(Pr, x); @test x ≈ s .* y
+
+        ldiv!(y, Pl, x); @test y ≈ s .\ x
+        ldiv!(y, Pr, x); @test y ≈ s .* x
+    end
+
+    @testset "ComposePreconditioner" begin
+        s1 = rand(n)
+        s2 = rand(n)
+
+        x = rand(n,n)
+        y = rand(n,n)
+
+        P1 = Diagonal(s1)
+        P2 = Diagonal(s2)
+
+        P = LinearSolve.ComposePreconditioner(P1,P2)
+
+        # ComposePreconditioner
+        ldiv!(y, P, x); @test y ≈ ldiv!(P2, ldiv!(P1, x))
+        y .= x; ldiv!(P, x); @test x ≈ ldiv!(P2, ldiv!(P1, y))
+    end
+end
+
+end # testset
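For reference, the composition checked in the last testset applies `P1` first and then `P2`; for diagonal preconditioners this reduces to elementwise division. A standalone sketch of the identity being tested (not part of the commit):

```julia
using LinearAlgebra

s1, s2 = rand(4), rand(4)
P1, P2 = Diagonal(s1), Diagonal(s2)
x = rand(4)

y = P2 \ (P1 \ x)            # what ComposePreconditioner's ldiv! computes
@assert y ≈ x ./ (s1 .* s2)  # elementwise for diagonal preconditioners
```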
