Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexander-Barth committed Dec 5, 2024
1 parent 5637fb3 commit 07a6ee5
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 17 deletions.
12 changes: 6 additions & 6 deletions ext/AMDGPUExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,16 @@ function interpnd_d!(pos::AbstractVector{<:NTuple{N}},A,vec) where N
return nothing
end

# Host-side launcher for the `interpnd_d!` kernel on a `ROCArray` input.
#
# NOTE(review): this span is rendered as a commit diff. Each pair of
# near-identical lines is the pre-commit line (`A_d`, `vec_d`) immediately
# followed by its renamed replacement (`A`, `vec`); only one of each pair
# exists in the real file.
#
# Arguments (as used below):
#   pos - vector of N-tuples; its length determines the launch size
#   A   - `ROCArray` forwarded unchanged to the kernel
#   vec - forwarded unchanged to the kernel (presumably the output buffer;
#         confirm against `interpnd_d!`)
function interpnd!(pos::AbstractVector{<:NTuple{N}},A_d::ROCArray,vec_d) where N
function interpnd!(pos::AbstractVector{<:NTuple{N}},A::ROCArray,vec) where N
# The whole launch sequence is wrapped in AMDGPU.@sync.
AMDGPU.@sync begin
len = length(pos)
# Build the kernel object without running it (launch=false); it is invoked
# explicitly at the end of this block once the launch sizes are known.
kernel = @roc launch=false interpnd_d!(pos,A_d,vec_d)
kernel = @roc launch=false interpnd_d!(pos,A,vec)
# Ask AMDGPU for a launch configuration for this kernel.
config = AMDGPU.launch_configuration(kernel)
# Never request a group larger than the number of positions.
groupsize = min(len, config.groupsize)
# Ceiling division of len by groupsize.
# NOTE(review): if `pos` is empty, groupsize is 0 and `cld(len, 0)` would
# raise a DivideError - confirm callers never pass an empty `pos`.
gridsize = cld(len, groupsize)
@debug gridsize,groupsize

kernel(pos,A_d,vec_d; groupsize, gridsize)
kernel(pos,A,vec; groupsize, gridsize)
end
end

Expand Down Expand Up @@ -80,15 +80,15 @@ function interp_adjn_d!(pos::AbstractVector{<:NTuple{N}},values,B) where N
end


# Host-side launcher for the `interp_adjn_d!` kernel on a `ROCArray` of values.
#
# NOTE(review): this span is rendered as a commit diff. Each pair of
# near-identical lines is the pre-commit line (`values_d`, `B_d`) immediately
# followed by its renamed replacement (`values`, `B`); only one of each pair
# exists in the real file.
#
# `B` is overwritten: it is zero-filled here and then passed to the kernel.
function interp_adjn!(pos::AbstractVector{<:NTuple{N}},values_d::ROCArray,B_d) where N
B_d .= 0
function interp_adjn!(pos::AbstractVector{<:NTuple{N}},values::ROCArray,B) where N
# Reset the destination in place before the kernel runs.
B .= 0

AMDGPU.@sync begin
# NOTE(review): `len` is assigned but not used in the lines visible here.
len = length(pos)
# Disabled alternative: one group per 256 positions.
#numgridsize = ceil(Int, length(pos)/256)
# must be one
# NOTE(review): the reason for the single-group restriction is not stated
# here - TODO document why a larger gridsize is not allowed.
numgridsize = 1
# Fixed launch: 256 work-items per group, `numgridsize` as the grid size.
@roc groupsize=256 gridsize=numgridsize interp_adjn_d!(pos,values_d,B_d)
@roc groupsize=256 gridsize=numgridsize interp_adjn_d!(pos,values,B)
end
end

Expand Down
24 changes: 13 additions & 11 deletions ext/CUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ function interpnd_d!(pos::AbstractVector{<:NTuple{N}},A,vec) where N

@inbounds for i = index:stride:length(pos)
p = pos[i]
ind = floor.(Int,p)
#ind = floor.(Int,p)
ind = unsafe_trunc.(Int32,floor.(p))

# interpolation coefficients
c = p .- ind
Expand All @@ -27,27 +28,28 @@ function interpnd_d!(pos::AbstractVector{<:NTuple{N}},A,vec) where N
return nothing
end

# Host-side launcher for the `interpnd_d!` kernel on a `CuArray` input.
#
# NOTE(review): this span is rendered as a commit diff. Each pair of
# near-identical lines is the pre-commit line (`d_A`, `vec_d`) immediately
# followed by its renamed replacement (`A`, `vec`); only one of each pair
# exists in the real file.
#
# Arguments (as used below):
#   pos - vector of N-tuples; its length determines the launch size
#   A   - `CuArray` forwarded unchanged to the kernel
#   vec - forwarded unchanged to the kernel (presumably the output buffer;
#         confirm against `interpnd_d!`)
function interpnd!(pos::AbstractVector{<:NTuple{N}},d_A::CuArray,vec_d) where N
function interpnd!(pos::AbstractVector{<:NTuple{N}},A::CuArray,vec) where N
# The whole launch sequence is wrapped in CUDA.@sync.
CUDA.@sync begin
len = length(pos)
# Build the kernel object without running it (launch=false); it is invoked
# explicitly at the end of this block once the launch sizes are known.
kernel = @cuda launch=false interpnd_d!(pos,d_A,vec_d)
kernel = @cuda launch=false interpnd_d!(pos,A,vec)
# Query a launch configuration for the compiled kernel function.
config = launch_configuration(kernel.fun)
# Never request more threads per block than there are positions.
threads = min(len, config.threads)
# Ceiling division of len by threads.
# NOTE(review): if `pos` is empty, threads is 0 and `cld(len, 0)` would
# raise a DivideError - confirm callers never pass an empty `pos`.
blocks = cld(len, threads)
@debug blocks,threads

kernel(pos,d_A,vec_d; threads, blocks)
kernel(pos,A,vec; threads, blocks)
end
end


function interp_adjn_d!(pos::AbstractVector{<:NTuple{N}},values,A2) where N
function interp_adjn_d!(pos::AbstractVector{<:NTuple{N}},values,B) where N
index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
stride = gridDim().x * blockDim().x

@inbounds for i = index:stride:length(pos)
p = pos[i]
ind = floor.(Int,p)
#ind = floor.(Int,p)
ind = unsafe_trunc.(Int32,floor.(p))

# interpolation coefficients
c = p .- ind
Expand All @@ -57,24 +59,24 @@ function interp_adjn_d!(pos::AbstractVector{<:NTuple{N}},values,A2) where N

cc = prod(ntuple(n -> (offset[n] == 1 ? c[n] : 1-c[n]),Val(N)))

I = LinearIndices(A2)[p2...]
CUDA.atomic_add!(pointer(A2,I), values[i] * cc)
I = LinearIndices(B)[p2...]
CUDA.atomic_add!(pointer(B,I), values[i] * cc)
end
end

return nothing
end


# Host-side launcher for the `interp_adjn_d!` kernel on a `CuArray` of values.
#
# NOTE(review): this span is rendered as a commit diff. Each pair of
# near-identical lines is the pre-commit line (`cuvalues`, `d_A2`) immediately
# followed by its renamed replacement (`values`, `B`); only one of each pair
# exists in the real file.
#
# `B` is overwritten: it is zero-filled here and then passed to the kernel.
function interp_adjn!(pos::AbstractVector{<:NTuple{N}},cuvalues::CuArray,d_A2) where N
d_A2 .= 0
function interp_adjn!(pos::AbstractVector{<:NTuple{N}},values::CuArray,B) where N
# Reset the destination in place before the kernel runs.
B .= 0

CUDA.@sync begin
# NOTE(review): `len` is assigned but not used in the lines visible here.
len = length(pos)
# Disabled alternative: one block per 256 positions.
#numblocks = ceil(Int, length(pos)/256)
# must be one
# NOTE(review): the reason for the single-block restriction is not stated
# here - TODO document why more blocks are not allowed.
numblocks = 1
# Fixed launch: 256 threads per block, `numblocks` blocks.
@cuda threads=256 blocks=numblocks interp_adjn_d!(pos,cuvalues,d_A2)
@cuda threads=256 blocks=numblocks interp_adjn_d!(pos,values,B)
end
end

Expand Down

0 comments on commit 07a6ee5

Please sign in to comment.