From c0757e0267c5472f342a43ee058919a0f7225e08 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Fri, 20 Sep 2024 14:49:51 -0700 Subject: [PATCH] Support HIP --- .github/workflows/cuda.yml | 6 +- .github/workflows/gcc.yml | 6 +- Src/AMReX_Arena.cpp | 18 +++--- Src/AMReX_BLassert.H | 29 ++++++++- Src/AMReX_BLassert.cpp | 1 - Src/AMReX_Gpu.H | 6 +- Src/AMReX_Gpu.cpp | 11 ++-- Tests/GPU/main.cpp | 2 +- Tools/GNUMake/Make.defs | 7 -- Tools/GNUMake/comps/hip.mak | 126 +++++++----------------------------- 10 files changed, 82 insertions(+), 130 deletions(-) diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 5d1f8a8..cf19a88 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -1,6 +1,10 @@ name: cuda -on: [push, pull_request] +on: + push: + pull_request: + schedule: + - cron: "22 22 * * 6" concurrency: group: ${{ github.ref }}-${{ github.head_ref }}-cuda diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml index bd6a9ab..58c1001 100644 --- a/.github/workflows/gcc.yml +++ b/.github/workflows/gcc.yml @@ -1,6 +1,10 @@ name: GCC -on: [push, pull_request] +on: + push: + pull_request: + schedule: + - cron: "22 22 * * 6" concurrency: group: ${{ github.ref }}-${{ github.head_ref }}-linux-gcc diff --git a/Src/AMReX_Arena.cpp b/Src/AMReX_Arena.cpp index ef9efba..3f20ad4 100644 --- a/Src/AMReX_Arena.cpp +++ b/Src/AMReX_Arena.cpp @@ -1,4 +1,5 @@ #include "AMReX_Arena.H" +#include "AMReX_BLassert.H" #include "AMReX_Gpu.H" namespace amrex @@ -8,11 +9,12 @@ void* allocate_host (std::size_t sz) { #if defined(AMREX_USE_CUDA) void* p; - cudaHostAlloc(&p, sz, cudaHostAllocMapped); + AMREX_CUDA_SAFE_CALL(cudaHostAlloc(&p, sz, cudaHostAllocMapped)); return p; #elif defined(AMREX_USE_HIP) void* p; - hipHostAlloc(&p, sz, hipHostAllocMapped | hipHostMallocNonCoherent); + AMREX_HIP_SAFE_CALL(hipHostMalloc(&p, sz, hipHostMallocMapped | + hipHostMallocNonCoherent)); return p; #elif defined(AMREX_USE_SYCL) return sycl::malloc_host(...); @@ -24,9 +26,9 @@ void* allocate_host (std::size_t sz) void free_host (void* pt) { #if defined(AMREX_USE_CUDA) - cudaFreeHost(pt); + AMREX_CUDA_SAFE_CALL(cudaFreeHost(pt)); #elif defined(AMREX_USE_HIP) - hipHostFree(pt); + AMREX_HIP_SAFE_CALL(hipHostFree(pt)); #elif defined(AMREX_USE_SYCL) sycl::free(...); #else @@ -38,9 +40,9 @@ void* allocate_device (std::size_t sz) { void* p; #if defined(AMREX_USE_CUDA) - cudaMalloc(&p, sz); + AMREX_CUDA_SAFE_CALL(cudaMalloc(&p, sz)); #elif defined(AMREX_USE_HIP) - hipMalloc(&p, sz); + AMREX_HIP_SAFE_CALL(hipMalloc(&p, sz)); #elif defined(AMREX_USE_SYCL) p = sycl::malloc_device(...); #else @@ -52,9 +54,9 @@ void* allocate_device (std::size_t sz) void free_device (void* pt) { #if defined(AMREX_USE_CUDA) - cudaFree(pt); + AMREX_CUDA_SAFE_CALL(cudaFree(pt)); #elif defined(AMREX_USE_HIP) - hipFree(pt); + AMREX_HIP_SAFE_CALL(hipFree(pt)); #elif defined(AMREX_USE_SYCL) sycl::free(...); #else diff --git a/Src/AMReX_BLassert.H b/Src/AMReX_BLassert.H index dc0100c..fc85342 100644 --- a/Src/AMReX_BLassert.H +++ b/Src/AMReX_BLassert.H @@ -6,6 +6,8 @@ #include "AMReX_Extension.H" #include "AMReX_Gpu.H" #include +#include +#include namespace amrex { @@ -29,4 +31,29 @@ void Assert (const char* EX, const char* file, int line, const char* msg) #define AMREX_ALWAYS_ASSERT_WITH_MESSAGE(EX,MSG) (EX)?((void)0):amrex::Assert( # EX , __FILE__, __LINE__ , # MSG) #define AMREX_ALWAYS_ASSERT(EX) (EX)?((void)0):amrex::Assert( # EX , __FILE__, __LINE__) -#endif /*BL_BL_ASSERT_H*/ + +#if defined (AMREX_USE_CUDA) + +#define AMREX_CUDA_SAFE_CALL(call) { \ + auto amrex_i_err = call; \ + if (cudaSuccess != amrex_i_err) { \ + std::string errStr(std::string("CUDA error in file ") + __FILE__ \ + + " line " + std::to_string(__LINE__) \ + + ": " + cudaGetErrorString(amrex_i_err)); \ + throw std::runtime_error(errStr); \ + }} + +#elif defined (AMREX_USE_HIP) + +#define AMREX_HIP_SAFE_CALL(call) { \ + auto amrex_i_err = call; \ + if (hipSuccess != amrex_i_err) { \ + std::string errStr(std::string("HIP error in file ") + __FILE__ \ + + " line " + std::to_string(__LINE__) \ + + " " + hipGetErrorString(amrex_i_err)); \ + throw std::runtime_error(errStr); \ + }} + +#endif + +#endif diff --git a/Src/AMReX_BLassert.cpp b/Src/AMReX_BLassert.cpp index 2041712..9abc1e2 100644 --- a/Src/AMReX_BLassert.cpp +++ b/Src/AMReX_BLassert.cpp @@ -1,7 +1,6 @@ #include "AMReX_BLassert.H" #include -#include namespace amrex { diff --git a/Src/AMReX_Gpu.H b/Src/AMReX_Gpu.H index 3098dd3..6d452a1 100644 --- a/Src/AMReX_Gpu.H +++ b/Src/AMReX_Gpu.H @@ -8,7 +8,9 @@ #if defined(AMREX_USE_GPU) && !defined(AMREX_USE_SYCL) -#if defined(AMREX_USE_HIP) +#if defined(AMREX_USE_CUDA) +#include +#elif defined(AMREX_USE_HIP) #include #endif @@ -106,7 +108,7 @@ namespace Gpu { void streamSynchronize (); } -#if defined(AMREX_USE_CUDA) +#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) template > > void ParallelFor (T n, L const& f) diff --git a/Src/AMReX_Gpu.cpp b/Src/AMReX_Gpu.cpp index e761518..20eb824 100644 --- a/Src/AMReX_Gpu.cpp +++ b/Src/AMReX_Gpu.cpp @@ -1,3 +1,4 @@ +#include "AMReX_BLassert.H" #include "AMReX_Gpu.H" #ifdef AMREX_USE_GPU @@ -25,9 +26,9 @@ void setStream (gpuStream_t a_stream) void streamSynchronize () { #if defined(AMREX_USE_CUDA) - cudaStreamSynchronize(gpu_stream); + AMREX_CUDA_SAFE_CALL(cudaStreamSynchronize(gpu_stream)); #elif defined(AMREX_USE_HIP) - hipStreamSynchronize(gpu_stream); + AMREX_HIP_SAFE_CALL(hipStreamSynchronize(gpu_stream)); #elif defined(AMREX_USE_SYCL) static_assert(false); #else @@ -38,9 +39,11 @@ void streamSynchronize () void htod_memcpy (void* p_d, void const* p_h, std::size_t sz) { #if defined(AMREX_USE_CUDA) - cudaMemcpyAsync(p_d, p_h, sz, cudaMemcpyHostToDevice, gpu_stream); + AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(p_d, p_h, sz, cudaMemcpyHostToDevice, + gpu_stream)); #elif defined(AMREX_USE_HIP) - hipMemcpyAsync(p_d, p_h, sz, hipMemcpyHostToDevice, gpu_stream); + AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d, p_h, sz, hipMemcpyHostToDevice, + gpu_stream)); #elif defined(AMREX_USE_SYCL) static_assert(false); #else diff --git a/Tests/GPU/main.cpp b/Tests/GPU/main.cpp index 74d289f..365f739 100644 --- a/Tests/GPU/main.cpp +++ b/Tests/GPU/main.cpp @@ -11,7 +11,7 @@ int main (int argc, char* argv[]) { amrex::ignore_unused(argc, argv); - std::size_t N = 256*256*256; + std::size_t N = 256*256*256*8; auto* p = (double*)allocate_device(N*sizeof(double)); Parser parser("epsilon/kp*2*x/w0**2*exp(-(x**2+y**2)/w0**2)*sin(k0*z)"); diff --git a/Tools/GNUMake/Make.defs b/Tools/GNUMake/Make.defs index 0f3b6bc..11fa915 100644 --- a/Tools/GNUMake/Make.defs +++ b/Tools/GNUMake/Make.defs @@ -439,13 +439,6 @@ else ifeq ($(USE_HIP),TRUE) AMD_ARCH = $(AMREX_AMD_ARCH) endif - # For AMD GPUs, the wavefront is 64 except for gfx10??. - ifeq ($(findstring gfx10,$(AMD_ARCH)),) - DEFINES += -DAMREX_AMDGCN_WAVEFRONT_SIZE=64 - else - DEFINES += -DAMREX_AMDGCN_WAVEFRONT_SIZE=32 - endif - ifeq ($(HIP_SAVE_TEMPS),TRUE) ifeq ($(USE_GPU_RDC),TRUE) $(warning *** HIP_SAVE_TEMPS requires USE_GPU_RDC=FALSE to obtain the assembly files for AMD GPU kernels.) diff --git a/Tools/GNUMake/comps/hip.mak b/Tools/GNUMake/comps/hip.mak index 87bb3e9..53c4c8c 100644 --- a/Tools/GNUMake/comps/hip.mak +++ b/Tools/GNUMake/comps/hip.mak @@ -12,9 +12,6 @@ ifneq ($(NO_CONFIG_CHECKING),TRUE) endif CXX = $(HIP_PATH)/bin/hipcc -CC = $(HIP_PATH)/bin/hipcc -FC = gfortran -F90 = gfortran ifdef CXXSTD CXXSTD := $(strip $(CXXSTD)) @@ -24,12 +21,6 @@ endif # Generic flags, always used CXXFLAGS = -std=$(CXXSTD) -m64 -CFLAGS = -std=c11 -m64 - -FFLAGS = -ffixed-line-length-none -fno-range-check -fno-second-underscore -F90FLAGS = -ffree-line-length-none -fno-range-check -fno-second-underscore -fimplicit-none - -FMODULES = -J$(fmoddir) -I $(fmoddir) # rdc support ifeq ($(USE_GPU_RDC),TRUE) @@ -44,110 +35,37 @@ HIPCC_FLAGS += -pthread CXXFLAGS += $(HIPCC_FLAGS) -# add fopenmp targeting the gnu library -ifeq ($(USE_OMP),TRUE) - CXXFLAGS += -fopenmp=libgomp - CFLAGS += -fopenmp=libgomp - HIPCC_FLAGS += -fopenmp=libgomp -endif - -ifneq ($(BL_NO_FORT),TRUE) - -# Taken straight from gnu -# ask gfortran the name of the library to link in. First check for the -# static version. If it returns only the name w/o a path, then it -# was not found. In that case, ask for the shared-object version. -gfortran_liba = $(shell $(F90) -print-file-name=libgfortran.a) -gfortran_libso = $(shell $(F90) -print-file-name=libgfortran.so) +# ============================================================================================= -ifneq ($(gfortran_liba),libgfortran.a) # if found the full path is printed, thus `neq`. - LIBRARY_LOCATIONS += $(dir $(gfortran_liba)) +ifeq ($(DEBUG),TRUE) + CXXFLAGS += -g -O1 else - LIBRARY_LOCATIONS += $(dir $(gfortran_libso)) + CXXFLAGS += -gline-tables-only -fdebug-info-for-profiling -O3 endif -override XTRALIBS += -lgfortran - -quadmath_liba = $(shell $(F90) -print-file-name=libquadmath.a) -quadmath_libso = $(shell $(F90) -print-file-name=libquadmath.so) - -ifneq ($(quadmath_liba),libquadmath.a) - override XTRALIBS += -lquadmath -else ifneq ($(quadmath_libso),libquadmath.so) - override XTRALIBS += -lquadmath -endif +ifeq ($(WARN_ALL),TRUE) + warning_flags = -Wall -Wextra -Wunreachable-code -Wnull-dereference + warning_flags += -Wfloat-conversion -Wextra-semi -endif # BL_NO_FORT - -# ============================================================================================= - -ifeq ($(HIP_COMPILER),clang) - - ifeq ($(DEBUG),TRUE) - CXXFLAGS += -g -O1 -munsafe-fp-atomics - CFLAGS += -g -O0 - - FFLAGS += -g -O0 -ggdb -fbounds-check -fbacktrace -Wuninitialized -Wunused -ffpe-trap=invalid,zero -finit-real=snan -finit-integer=2147483647 -ftrapv - F90FLAGS += -g -O0 -ggdb -fbounds-check -fbacktrace -Wuninitialized -Wunused -ffpe-trap=invalid,zero -finit-real=snan -finit-integer=2147483647 -ftrapv - - else # DEBUG=FALSE flags - - CXXFLAGS += -gline-tables-only -fdebug-info-for-profiling -O3 -munsafe-fp-atomics - CFLAGS += -gline-tables-only -fdebug-info-for-profiling -O3 - FFLAGS += -g1 -O3 - F90FLAGS += -g1 -O3 + warning_flags += -Wpedantic + ifneq ($(WARN_SHADOW),FALSE) + warning_flags += -Wshadow endif - ifeq ($(WARN_ALL),TRUE) - warning_flags = -Wall -Wextra -Wunreachable-code -Wnull-dereference - warning_flags += -Wfloat-conversion -Wextra-semi - - warning_flags += -Wpedantic - - ifneq ($(WARN_SHADOW),FALSE) - warning_flags += -Wshadow - endif - - CXXFLAGS += $(warning_flags) -Woverloaded-virtual - CFLAGS += $(warning_flags) - endif - - ifeq ($(WARN_ERROR),TRUE) - CXXFLAGS += -Werror -Wno-deprecated-declarations -Wno-gnu-zero-variadic-macro-arguments - CFLAGS += -Werror - endif - - # Generic HIP info - ROC_PATH=$(realpath $(dir $(HIP_PATH))) - SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include $(HIP_PATH)/include - - # rocRand - SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include/hiprand $(ROC_PATH)/include/rocrand - LIBRARY_LOCATIONS += $(ROC_PATH)/lib - LIBRARIES += -Wl,--rpath=$(ROC_PATH)/lib -lhiprand -lrocrand - - # rocPrim - Header only - SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include/rocprim - - # rocThrust - Header only - # SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include/rocthrust - - # rocTracer - ifeq ($(USE_ROCTX),TRUE) - CXXFLAGS += -DAMREX_USE_ROCTX - HIPCC_FLAGS += -DAMREX_USE_ROCTX - LIBRARY_LOCATIONS += $(ROC_PATH)/lib - LIBRARIES += -Wl,--rpath=$(ROC_PATH)/lib -lroctracer64 -lroctx64 - endif + CXXFLAGS += $(warning_flags) -Woverloaded-virtual + CFLAGS += $(warning_flags) +endif - # hipcc passes a lot of unused arguments to clang - LEGACY_DEPFLAGS += -Wno-unused-command-line-argument +ifeq ($(WARN_ERROR),TRUE) + CXXFLAGS += -Werror -Wno-deprecated-declarations -Wno-gnu-zero-variadic-macro-arguments + CFLAGS += -Werror +endif -# ============================================================================================= +# Generic HIP info +ROC_PATH=$(realpath $(dir $(HIP_PATH))) +SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include $(HIP_PATH)/include -else ifeq ($(HIP_COMPILER),nvcc) - $(error HIP_COMPILER nvcc is not supported at this time. Use USE_CUDA to compile for NVIDIA platforms.) -endif +# hipcc passes a lot of unused arguments to clang +LEGACY_DEPFLAGS += -Wno-unused-command-line-argument -# =============================================================================================