Skip to content

Commit

Permalink
Support HIP
Browse files Browse the repository at this point in the history
  • Loading branch information
WeiqunZhang committed Sep 20, 2024
1 parent 104a8ca commit c0757e0
Show file tree
Hide file tree
Showing 10 changed files with 82 additions and 130 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/cuda.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
name: cuda

on: [push, pull_request]
on:
push:
pull_request:
schedule:
- cron: "22 22 * * 6"

concurrency:
group: ${{ github.ref }}-${{ github.head_ref }}-cuda
Expand Down
6 changes: 5 additions & 1 deletion .github/workflows/gcc.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
name: GCC

on: [push, pull_request]
on:
push:
pull_request:
schedule:
- cron: "22 22 * * 6"

concurrency:
group: ${{ github.ref }}-${{ github.head_ref }}-linux-gcc
Expand Down
18 changes: 10 additions & 8 deletions Src/AMReX_Arena.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "AMReX_Arena.H"
#include "AMReX_BLassert.H"
#include "AMReX_Gpu.H"

namespace amrex
Expand All @@ -8,11 +9,12 @@ void* allocate_host (std::size_t sz)
{
#if defined(AMREX_USE_CUDA)
void* p;
cudaHostAlloc(&p, sz, cudaHostAllocMapped);
AMREX_CUDA_SAFE_CALL(cudaHostAlloc(&p, sz, cudaHostAllocMapped));
return p;
#elif defined(AMREX_USE_HIP)
void* p;
hipHostAlloc(&p, sz, hipHostAllocMapped | hipHostMallocNonCoherent);
AMREX_HIP_SAFE_CALL(hipHostMalloc(&p, sz, hipHostMallocMapped |
hipHostMallocNonCoherent));
return p;
#elif defined(AMREX_USE_SYCL)
return sycl::malloc_host(...);
Expand All @@ -24,9 +26,9 @@ void* allocate_host (std::size_t sz)
void free_host (void* pt)
{
#if defined(AMREX_USE_CUDA)
cudaFreeHost(pt);
AMREX_CUDA_SAFE_CALL(cudaFreeHost(pt));
#elif defined(AMREX_USE_HIP)
hipHostFree(pt);
AMREX_HIP_SAFE_CALL(hipHostFree(pt));
#elif defined(AMREX_USE_SYCL)
sycl::free(...);
#else
Expand All @@ -38,9 +40,9 @@ void* allocate_device (std::size_t sz)
{
void* p;
#if defined(AMREX_USE_CUDA)
cudaMalloc(&p, sz);
AMREX_CUDA_SAFE_CALL(cudaMalloc(&p, sz));
#elif defined(AMREX_USE_HIP)
hipMalloc(&p, sz);
AMREX_HIP_SAFE_CALL(hipMalloc(&p, sz));
#elif defined(AMREX_USE_SYCL)
p = sycl::malloc_device(...);
#else
Expand All @@ -52,9 +54,9 @@ void* allocate_device (std::size_t sz)
void free_device (void* pt)
{
#if defined(AMREX_USE_CUDA)
cudaFree(pt);
AMREX_CUDA_SAFE_CALL(cudaFree(pt));
#elif defined(AMREX_USE_HIP)
hipFree(pt);
AMREX_HIP_SAFE_CALL(hipFree(pt));
#elif defined(AMREX_USE_SYCL)
sycl::free(...);
#else
Expand Down
29 changes: 28 additions & 1 deletion Src/AMReX_BLassert.H
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#include "AMReX_Extension.H"
#include "AMReX_Gpu.H"
#include <cassert>
#include <stdexcept>
#include <string>

namespace amrex {

Expand All @@ -29,4 +31,29 @@ void Assert (const char* EX, const char* file, int line, const char* msg)
#define AMREX_ALWAYS_ASSERT_WITH_MESSAGE(EX,MSG) (EX)?((void)0):amrex::Assert( # EX , __FILE__, __LINE__ , # MSG)
#define AMREX_ALWAYS_ASSERT(EX) (EX)?((void)0):amrex::Assert( # EX , __FILE__, __LINE__)

#endif /*BL_BL_ASSERT_H*/

#if defined (AMREX_USE_CUDA)

#define AMREX_CUDA_SAFE_CALL(call) { \
auto amrex_i_err = call; \
if (cudaSuccess != amrex_i_err) { \
std::string errStr(std::string("CUDA error in file ") + __FILE__ \
+ " line " + std::to_string(__LINE__) \
+ ": " + cudaGetErrorString(amrex_i_err)); \
throw std::runtime_error(errStr); \
}}

#elif defined (AMREX_USE_HIP)

#define AMREX_HIP_SAFE_CALL(call) { \
auto amrex_i_err = call; \
if (hipSuccess != amrex_i_err) { \
std::string errStr(std::string("HIP error in file ") + __FILE__ \
+ " line " + std::to_string(__LINE__) \
+ " " + hipGetErrorString(amrex_i_err)); \
throw std::runtime_error(errStr); \
}}

#endif

#endif
1 change: 0 additions & 1 deletion Src/AMReX_BLassert.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#include "AMReX_BLassert.H"

#include <cstdio>
#include <stdexcept>

namespace amrex
{
Expand Down
6 changes: 4 additions & 2 deletions Src/AMReX_Gpu.H
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@

#if defined(AMREX_USE_GPU) && !defined(AMREX_USE_SYCL)

#if defined(AMREX_USE_HIP)
#if defined(AMREX_USE_CUDA)
#include <cuda_runtime.h>
#elif defined(AMREX_USE_HIP)
#include <hip/hip_runtime.h>
#endif

Expand Down Expand Up @@ -106,7 +108,7 @@ namespace Gpu {
void streamSynchronize ();
}

#if defined(AMREX_USE_CUDA)
#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelFor (T n, L const& f)
Expand Down
11 changes: 7 additions & 4 deletions Src/AMReX_Gpu.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include "AMReX_BLassert.H"
#include "AMReX_Gpu.H"

#ifdef AMREX_USE_GPU
Expand Down Expand Up @@ -25,9 +26,9 @@ void setStream (gpuStream_t a_stream)
void streamSynchronize ()
{
#if defined(AMREX_USE_CUDA)
cudaStreamSynchronize(gpu_stream);
AMREX_CUDA_SAFE_CALL(cudaStreamSynchronize(gpu_stream));
#elif defined(AMREX_USE_HIP)
hipStreamSynchronize(gpu_stream);
AMREX_HIP_SAFE_CALL(hipStreamSynchronize(gpu_stream));
#elif defined(AMREX_USE_SYCL)
static_assert(false);
#else
Expand All @@ -38,9 +39,11 @@ void streamSynchronize ()
void htod_memcpy (void* p_d, void const* p_h, std::size_t sz)
{
#if defined(AMREX_USE_CUDA)
cudaMemcpyAsync(p_d, p_h, sz, cudaMemcpyHostToDevice, gpu_stream);
AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(p_d, p_h, sz, cudaMemcpyHostToDevice,
gpu_stream));
#elif defined(AMREX_USE_HIP)
hipMemcpyAsync(p_d, p_h, sz, hipMemcpyHostToDevice, gpu_stream);
AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d, p_h, sz, hipMemcpyHostToDevice,
gpu_stream));
#elif defined(AMREX_USE_SYCL)
static_assert(false);
#else
Expand Down
2 changes: 1 addition & 1 deletion Tests/GPU/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ int main (int argc, char* argv[])
{
amrex::ignore_unused(argc, argv);

std::size_t N = 256*256*256;
std::size_t N = 256*256*256*8;
auto* p = (double*)allocate_device(N*sizeof(double));

Parser parser("epsilon/kp*2*x/w0**2*exp(-(x**2+y**2)/w0**2)*sin(k0*z)");
Expand Down
7 changes: 0 additions & 7 deletions Tools/GNUMake/Make.defs
Original file line number Diff line number Diff line change
Expand Up @@ -439,13 +439,6 @@ else ifeq ($(USE_HIP),TRUE)
AMD_ARCH = $(AMREX_AMD_ARCH)
endif

# For AMD GPUs, the wavefront is 64 except for gfx10??.
ifeq ($(findstring gfx10,$(AMD_ARCH)),)
DEFINES += -DAMREX_AMDGCN_WAVEFRONT_SIZE=64
else
DEFINES += -DAMREX_AMDGCN_WAVEFRONT_SIZE=32
endif

ifeq ($(HIP_SAVE_TEMPS),TRUE)
ifeq ($(USE_GPU_RDC),TRUE)
$(warning *** HIP_SAVE_TEMPS requires USE_GPU_RDC=FALSE to obtain the assembly files for AMD GPU kernels.)
Expand Down
126 changes: 22 additions & 104 deletions Tools/GNUMake/comps/hip.mak
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,6 @@ ifneq ($(NO_CONFIG_CHECKING),TRUE)
endif

CXX = $(HIP_PATH)/bin/hipcc
CC = $(HIP_PATH)/bin/hipcc
FC = gfortran
F90 = gfortran

ifdef CXXSTD
CXXSTD := $(strip $(CXXSTD))
Expand All @@ -24,12 +21,6 @@ endif

# Generic flags, always used
CXXFLAGS = -std=$(CXXSTD) -m64
CFLAGS = -std=c11 -m64

FFLAGS = -ffixed-line-length-none -fno-range-check -fno-second-underscore
F90FLAGS = -ffree-line-length-none -fno-range-check -fno-second-underscore -fimplicit-none

FMODULES = -J$(fmoddir) -I $(fmoddir)

# rdc support
ifeq ($(USE_GPU_RDC),TRUE)
Expand All @@ -44,110 +35,37 @@ HIPCC_FLAGS += -pthread

CXXFLAGS += $(HIPCC_FLAGS)

# add fopenmp targeting the gnu library
ifeq ($(USE_OMP),TRUE)
CXXFLAGS += -fopenmp=libgomp
CFLAGS += -fopenmp=libgomp
HIPCC_FLAGS += -fopenmp=libgomp
endif

ifneq ($(BL_NO_FORT),TRUE)

# Taken straight from gnu
# ask gfortran the name of the library to link in. First check for the
# static version. If it returns only the name w/o a path, then it
# was not found. In that case, ask for the shared-object version.
gfortran_liba = $(shell $(F90) -print-file-name=libgfortran.a)
gfortran_libso = $(shell $(F90) -print-file-name=libgfortran.so)
# =============================================================================================

ifneq ($(gfortran_liba),libgfortran.a) # if found the full path is printed, thus `neq`.
LIBRARY_LOCATIONS += $(dir $(gfortran_liba))
ifeq ($(DEBUG),TRUE)
CXXFLAGS += -g -O1
else
LIBRARY_LOCATIONS += $(dir $(gfortran_libso))
CXXFLAGS += -gline-tables-only -fdebug-info-for-profiling -O3
endif

override XTRALIBS += -lgfortran

quadmath_liba = $(shell $(F90) -print-file-name=libquadmath.a)
quadmath_libso = $(shell $(F90) -print-file-name=libquadmath.so)

ifneq ($(quadmath_liba),libquadmath.a)
override XTRALIBS += -lquadmath
else ifneq ($(quadmath_libso),libquadmath.so)
override XTRALIBS += -lquadmath
endif
ifeq ($(WARN_ALL),TRUE)
warning_flags = -Wall -Wextra -Wunreachable-code -Wnull-dereference
warning_flags += -Wfloat-conversion -Wextra-semi

endif # BL_NO_FORT

# =============================================================================================

ifeq ($(HIP_COMPILER),clang)

ifeq ($(DEBUG),TRUE)
CXXFLAGS += -g -O1 -munsafe-fp-atomics
CFLAGS += -g -O0

FFLAGS += -g -O0 -ggdb -fbounds-check -fbacktrace -Wuninitialized -Wunused -ffpe-trap=invalid,zero -finit-real=snan -finit-integer=2147483647 -ftrapv
F90FLAGS += -g -O0 -ggdb -fbounds-check -fbacktrace -Wuninitialized -Wunused -ffpe-trap=invalid,zero -finit-real=snan -finit-integer=2147483647 -ftrapv

else # DEBUG=FALSE flags

CXXFLAGS += -gline-tables-only -fdebug-info-for-profiling -O3 -munsafe-fp-atomics
CFLAGS += -gline-tables-only -fdebug-info-for-profiling -O3
FFLAGS += -g1 -O3
F90FLAGS += -g1 -O3
warning_flags += -Wpedantic

ifneq ($(WARN_SHADOW),FALSE)
warning_flags += -Wshadow
endif

ifeq ($(WARN_ALL),TRUE)
warning_flags = -Wall -Wextra -Wunreachable-code -Wnull-dereference
warning_flags += -Wfloat-conversion -Wextra-semi

warning_flags += -Wpedantic

ifneq ($(WARN_SHADOW),FALSE)
warning_flags += -Wshadow
endif

CXXFLAGS += $(warning_flags) -Woverloaded-virtual
CFLAGS += $(warning_flags)
endif

ifeq ($(WARN_ERROR),TRUE)
CXXFLAGS += -Werror -Wno-deprecated-declarations -Wno-gnu-zero-variadic-macro-arguments
CFLAGS += -Werror
endif

# Generic HIP info
ROC_PATH=$(realpath $(dir $(HIP_PATH)))
SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include $(HIP_PATH)/include

# rocRand
SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include/hiprand $(ROC_PATH)/include/rocrand
LIBRARY_LOCATIONS += $(ROC_PATH)/lib
LIBRARIES += -Wl,--rpath=$(ROC_PATH)/lib -lhiprand -lrocrand

# rocPrim - Header only
SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include/rocprim

# rocThrust - Header only
# SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include/rocthrust

# rocTracer
ifeq ($(USE_ROCTX),TRUE)
CXXFLAGS += -DAMREX_USE_ROCTX
HIPCC_FLAGS += -DAMREX_USE_ROCTX
LIBRARY_LOCATIONS += $(ROC_PATH)/lib
LIBRARIES += -Wl,--rpath=$(ROC_PATH)/lib -lroctracer64 -lroctx64
endif
CXXFLAGS += $(warning_flags) -Woverloaded-virtual
CFLAGS += $(warning_flags)
endif

# hipcc passes a lot of unused arguments to clang
LEGACY_DEPFLAGS += -Wno-unused-command-line-argument
ifeq ($(WARN_ERROR),TRUE)
CXXFLAGS += -Werror -Wno-deprecated-declarations -Wno-gnu-zero-variadic-macro-arguments
CFLAGS += -Werror
endif

# =============================================================================================
# Generic HIP info
ROC_PATH=$(realpath $(dir $(HIP_PATH)))
SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include $(HIP_PATH)/include

else ifeq ($(HIP_COMPILER),nvcc)
$(error HIP_COMPILER nvcc is not supported at this time. Use USE_CUDA to compile for NVIDIA platforms.)
endif
# hipcc passes a lot of unused arguments to clang
LEGACY_DEPFLAGS += -Wno-unused-command-line-argument

# =============================================================================================

0 comments on commit c0757e0

Please sign in to comment.