diff --git a/src/alge/Makefile.am b/src/alge/Makefile.am index 0118586293..d3660e1030 100644 --- a/src/alge/Makefile.am +++ b/src/alge/Makefile.am @@ -198,6 +198,7 @@ libcsalge_a_SOURCES += \ cs_benchmark_cuda.cu \ cs_blas_cuda.cu \ cs_gradient_cuda.cu \ +cs_convection_diffusion_cuda.cu \ cs_matrix_spmv_cuda.cu \ cs_sles_it_cuda.cu \ cs_sles_pc_cuda.cu diff --git a/src/alge/cs_alge_cuda.cuh b/src/alge/cs_alge_cuda.cuh new file mode 100644 index 0000000000..11c5123553 --- /dev/null +++ b/src/alge/cs_alge_cuda.cuh @@ -0,0 +1,428 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ +#pragma once + +#include "cs_defs.h" + +/*---------------------------------------------------------------------------- + * Standard C library headers + *----------------------------------------------------------------------------*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(HAVE_MPI) +#include +#endif + +#include + +/*---------------------------------------------------------------------------- + * Local headers + *----------------------------------------------------------------------------*/ + +#include "bft_error.h" +#include "bft_mem.h" + +#include "cs_base_accel.h" +#include "cs_base_cuda.h" +#include "cs_blas.h" +#include "cs_cell_to_vertex.h" +#include "cs_ext_neighborhood.h" +#include "cs_field.h" +#include "cs_field_pointer.h" +#include "cs_halo.h" +#include "cs_halo_perio.h" +#include "cs_log.h" +#include "cs_math.h" +#include "cs_mesh.h" +#include "cs_mesh_adjacencies.h" +#include "cs_mesh_quantities.h" +#include "cs_parall.h" +#include "cs_porous_model.h" +#include "cs_prototypes.h" +#include "cs_timer.h" +#include "cs_timer_stats.h" + +BEGIN_C_DECLS + + typedef cs_real_t cs_cocg_t; + typedef cs_real_t cs_cocg_6_t[6]; + typedef cs_real_t cs_cocg_33_t[3][3]; + +END_C_DECLS + +template +static void +_sync_or_copy_real_h2d(const T *val_h, + cs_lnum_t n_vals, + int device_id, + cudaStream_t stream, + const T **val_d, + void **buf_d) +{ + const T *_val_d = NULL; + void *_buf_d = NULL; + + cs_alloc_mode_t alloc_mode = cs_check_device_ptr(val_h); + size_t size = n_vals * sizeof(T); + + if (alloc_mode == CS_ALLOC_HOST) { + CS_CUDA_CHECK(cudaMalloc(&_buf_d, size)); + cs_cuda_copy_h2d(_buf_d, val_h, size); + _val_d = (const T *)_buf_d; + } + else { + _val_d = (const T *)cs_get_device_ptr((void *)val_h); + + if (alloc_mode == CS_ALLOC_HOST_DEVICE_SHARED) + cudaMemPrefetchAsync(val_h, size, device_id, stream); + else + 
cs_sync_h2d(val_h); + } + + *val_d = _val_d; + *buf_d = _buf_d; +} + +/* Compute gridsize*/ + +static unsigned int +get_gridsize(unsigned int size, unsigned int blocksize){ + unsigned int gridsize = (unsigned int)ceil((double)size / blocksize); + + return gridsize; +} + + +__device__ static cs_real_t +cs_math_fabs_cuda(cs_real_t x) +{ + cs_real_t ret = (x < 0) ? -x : x; + + return ret; +} + +__device__ static cs_real_t +cs_math_3_dot_product_cuda(const cs_real_t u[3], + const cs_real_t v[3]) +{ + cs_real_t prod = u[0]*v[0] + u[1]*v[1] + u[2]*v[2]; + + return prod; +} + +__global__ static void +_set_one_to_coeff_b(const cs_lnum_t n_b_faces, + cs_real_33_t *_bc_coeff_b) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_idx >= n_b_faces){ + return; + } + + cs_lnum_t f_id = c_idx / 3; + size_t i = c_idx % 3; + + _bc_coeff_b[f_id][i][i] = 1; +} + +__device__ static void cs_math_3_normalize_cuda(const cs_real_t in[3], + cs_real_t out[3]) +{ + cs_real_t norm = sqrt(in[0]*in[0] + + in[1]*in[1] + + in[2]*in[2]); + + cs_real_t inverse_norm = 1. / norm; + + out[0] = inverse_norm * in[0]; + out[1] = inverse_norm * in[1]; + out[2] = inverse_norm * in[2]; +} + +__device__ static cs_real_t cs_math_3_square_norm_cuda(const cs_real_t in[3]){ + cs_real_t norm = in[0]*in[0] + in[1]*in[1] + in[2]*in[2]; + return norm; +} + +__device__ static void _math_6_inv_cramer_sym_in_place_cuda(cs_cocg_t in[6]){ + cs_real_t in00 = in[1]*in[2] - in[4]*in[4]; + cs_real_t in01 = in[4]*in[5] - in[3]*in[2]; + cs_real_t in02 = in[3]*in[4] - in[1]*in[5]; + cs_real_t in11 = in[0]*in[2] - in[5]*in[5]; + cs_real_t in12 = in[3]*in[5] - in[0]*in[4]; + cs_real_t in22 = in[0]*in[1] - in[3]*in[3]; + + cs_real_t det_inv = 1. / (in[0]*in00 + in[3]*in01 + in[5]*in02); + + in[0] = in00 * det_inv; + in[1] = in11 * det_inv; + in[2] = in22 * det_inv; + in[3] = in01 * det_inv; + in[4] = in12 * det_inv; + in[5] = in02 * det_inv; +} + +template +__device__ static void +_fact_crout_pp_cuda(cs_real_t *ad) +{ + cs_real_t aux[d_size]; + for (int kk = 0; kk < d_size - 1; kk++) { + int kk_d_size = kk*(kk + 1)/2; + for (int ii = kk + 1; ii < d_size; ii++) { + int ii_d_size = ii*(ii + 1)/2; + aux[ii] = ad[ii_d_size + kk]; + ad[ii_d_size + kk] = ad[ii_d_size + kk] + / ad[kk_d_size + kk]; + for (int jj = kk + 1; jj < ii + 1; jj++) { + ad[ii_d_size + jj] = ad[ii_d_size + jj] - ad[ii_d_size + kk]*aux[jj]; + } + } + } +} + +template +__device__ static void +_fw_and_bw_ldtl_pp_cuda(const cs_real_t mat[], + cs_real_t x[], + const cs_real_t b[]) +{ + cs_real_t aux[d_size]; + + for (int ii = 0; ii < d_size; ii++) { + int ii_d_size = ii*(ii + 1)/2; + aux[ii] = b[ii]; + for (int jj = 0; jj < ii; jj++) { + aux[ii] -= aux[jj]*mat[ii_d_size + jj]; + } + } + + for (int ii = 0; ii < d_size; ii++) { + int ii_d_size = ii*(ii + 1)/2; + aux[ii] /= mat[ii_d_size + ii]; + } + + for (int ii = d_size - 1; ii >= 0; ii--) { + x[ii] = aux[ii]; + for (int jj = d_size - 1; jj > ii; jj--) { + int jj_d_size = jj*(jj + 1)/2; + x[ii] -= x[jj]*mat[jj_d_size + ii]; + } + } +} + +template +__device__ uint32_t _conflict_mask(uint32_t mask, V v) noexcept { +#if __CUDA_ARCH__ >= 700 + return __match_any_sync(mask, v); +#else + uint32_t lanemask_eq = 1u << (threadIdx.x % 32); + if (!(mask & lanemask_eq)) + return 0; + uint32_t ref, ballot; + int leader; + goto entry; +loop: + mask &= ~ballot; +entry: + leader = __ffs(mask) - 1; + ref = __shfl_sync(mask, v, leader); + ballot = __ballot_sync(mask, v == ref); + if (!(ballot & lanemask_eq)) + goto loop; + return ballot; 
+#endif +} + +template +__device__ bool _reduce_add(uint32_t mask, uint32_t peers, T& v) noexcept { + int laneid = threadIdx.x % 32; + uint32_t lanemask_lt = (1u << laneid) - 1; + uint32_t lanemask_gt = -2u << laneid; + int rank = __popc(peers & lanemask_lt); + bool is_leader = rank == 0; + + peers &= lanemask_gt; + while (__any_sync(mask, peers)) { + int next = __ffs(peers); + + auto tmp = v.shuffle(mask, next - 1); + if (next) { + v.add(tmp); + } + + peers &= __ballot_sync(mask, !(rank & 1)); + + rank >>= 1; + } + + return is_leader; +} + + +template +class AtomicCell { + private: + T value = {}; + public: + using inner_type = T; + public: + __device__ AtomicCell() noexcept = default; + __device__ AtomicCell(T value) noexcept : value(value) {} + __device__ void add(const AtomicCell&restrict other) restrict noexcept { + value += other.value; + } + __device__ void atomic_add(const AtomicCell&restrict other) restrict noexcept { + atomicAdd(&value, other.value); + } + __device__ AtomicCell exchange(const AtomicCell&restrict other) restrict noexcept { + AtomicCell previous = *this; + *this = other; + return previous; + } + __device__ AtomicCell atomic_exchange(const AtomicCell&restrict other) restrict noexcept { + return AtomicCell(atomicExch(&value, other.value)); + } + __device__ AtomicCell shuffle(uint32_t mask, unsigned laneid) const noexcept { + return AtomicCell(__shfl_sync(mask, value, laneid)); + } + __device__ uint32_t conflict_mask(uint32_t mask) const noexcept { + return _conflict_mask(mask, (uintptr_t)this); + } + __device__ bool reduce_add(uint32_t mask, uint32_t peers) noexcept { + return _reduce_add(mask, peers, *this); + } + __device__ void conflict_free_add(uint32_t mask, AtomicCell other) noexcept { + uint32_t peers = conflict_mask(mask); + if (other.reduce_add(mask, peers)) { + atomic_add(other); + } + } + __device__ inner_type& operator*() noexcept { + return value; + } + __device__ inner_type const& operator*() const noexcept { + return value; + } + __device__ inner_type* operator->() noexcept { + return &value; + } + __device__ inner_type const* operator->() const noexcept { + return &value; + } + __device__ inner_type& get() noexcept { + return value; + } + __device__ inner_type const& get() const noexcept { + return value; + } + static __device__ AtomicCell& ref(inner_type& r) noexcept { + return reinterpret_cast(r); + } + static __device__ AtomicCell const& ref(inner_type const& r) noexcept { + return reinterpret_cast(r); + } +}; + +template +class AtomicCell { + private: + AtomicCell data[Head]; + public: + using inner_type = typename AtomicCell::inner_type[Head]; + public: + __device__ AtomicCell() noexcept = default; + __device__ void add(const AtomicCell&restrict other) restrict noexcept { + for (size_t i = 0; i < Head; ++i) { + data[i].add(other.data[i]); + } + } + __device__ void atomic_add(const AtomicCell&restrict other) restrict noexcept { + for (size_t i = 0; i < Head; ++i) { + data[i].atomic_add(other.data[i]); + } + } + __device__ AtomicCell exchange(const AtomicCell&restrict other) restrict noexcept { + AtomicCell previous; + for (size_t i = 0; i < Head; ++i) { + previous.data[i] = data[i].exchange(other.data[i]); + } + return previous; + } + __device__ AtomicCell atomic_exchange(const AtomicCell&restrict other) restrict noexcept { + AtomicCell previous; + for (size_t i = 0; i < Head; ++i) { + previous.data[i] = data[i].atomic_exchange(other.data[i]); + } + return previous; + } + __device__ AtomicCell shuffle(uint32_t mask, unsigned laneid) const noexcept 
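Note: the `_conflict_mask`, `_reduce_add` and `AtomicCell` helpers above implement warp-aggregated atomics: lanes of a warp that target the same address first combine their contributions with register shuffles, and only a leader lane issues the global `atomicAdd`. This matters for face-to-cell scatters where many lanes of a warp share a destination cell. The following usage sketch is not part of the patch; the kernel name and the `face_val`/`c_sum` arrays are hypothetical, only `AtomicCell` and the mesh types come from the code above, and double-precision `atomicAdd` assumes compute capability 6.0 or newer.

/* Hypothetical sketch: warp-aggregated scatter of one value per face
 * into a cell-based accumulator. */
__global__ static void
_scatter_sum_example(cs_lnum_t          n_i_faces,
                     const cs_lnum_2_t *i_face_cells,
                     const cs_real_t   *face_val,  /* hypothetical input */
                     cs_real_t         *c_sum)     /* hypothetical output */
{
  cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x;

  /* Build the participation mask before any lane exits. */
  uint32_t mask = __ballot_sync(0xffffffff, f_id < n_i_faces);
  if (f_id >= n_i_faces)
    return;

  cs_lnum_t c_id = i_face_cells[f_id][0];

  /* Lanes writing to the same c_sum[c_id] are detected by
   * conflict_mask(), reduced with shuffles in reduce_add(), and only
   * the leader lane performs the atomic add. */
  AtomicCell<cs_real_t>::ref(c_sum[c_id])
    .conflict_free_add(mask, AtomicCell<cs_real_t>(face_val[f_id]));
}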
{ + AtomicCell shuffled; + for (size_t i = 0; i < Head; ++i) { + shuffled.data[i] = data[i].shuffle(mask, laneid); + } + return shuffled; + } + __device__ uint32_t conflict_mask(uint32_t mask) const noexcept { + return _conflict_mask(mask, (uintptr_t)this); + } + __device__ bool reduce_add(uint32_t mask, uint32_t peers) noexcept { + return _reduce_add(mask, peers, *this); + } + __device__ void conflict_free_add(uint32_t mask, AtomicCell other) noexcept { + uint32_t peers = conflict_mask(mask); + if (other.reduce_add(mask, peers)) { + atomic_add(other); + } + } + __device__ AtomicCell& operator[](size_t i) noexcept { + return data[i]; + } + __device__ AtomicCell const& operator[](size_t i) const noexcept { + return data[i]; + } + __device__ inner_type& get() noexcept { + return reinterpret_cast(*this); + } + __device__ inner_type const& get() const noexcept { + return reinterpret_cast(*this); + } + static __device__ AtomicCell& ref(inner_type& r) noexcept { + return reinterpret_cast(r); + } + static __device__ AtomicCell const& ref(inner_type const& r) noexcept { + return reinterpret_cast(r); + } +}; diff --git a/src/alge/cs_convection_diffusion.cxx b/src/alge/cs_convection_diffusion.cxx index c35c15bbf2..3ebbad926a 100644 --- a/src/alge/cs_convection_diffusion.cxx +++ b/src/alge/cs_convection_diffusion.cxx @@ -80,7 +80,7 @@ /*---------------------------------------------------------------------------- * Header for the current file *----------------------------------------------------------------------------*/ - +#include "time.h" #include "cs_convection_diffusion.h" #include "cs_convection_diffusion_priv.h" @@ -1211,6 +1211,271 @@ cs_slope_test_gradient(int f_id, } +#if defined(HAVE_OPENMP_TARGET) +// #pragma omp declare target +// const cs_real_t cs_math_zero_threshold = FLT_MIN; +// #pragma omp end declare target + +void +cs_slope_test_gradient_vector_target(const int inc, + const cs_halo_type_t halo_type, + const cs_real_33_t *grad, + cs_real_33_t *grdpa, + const cs_real_3_t *pvar, + const cs_real_3_t *coefa, + const cs_real_33_t *coefb, + const cs_real_t *i_massflux) +{ + const cs_mesh_t *m = cs_glob_mesh; + const cs_mesh_adjacencies_t *madj = cs_glob_mesh_adjacencies; + const cs_halo_t *halo = m->halo; + cs_mesh_quantities_t *fvq = cs_glob_mesh_quantities; + + const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_b_cells = m->n_b_cells; + const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; + const cs_lnum_t n_i_faces = m->n_i_faces; + const cs_lnum_t n_b_faces = m->n_b_faces; + + const cs_lnum_2_t *restrict i_face_cells + = (const cs_lnum_2_t *restrict)m->i_face_cells; + const cs_lnum_t *restrict b_face_cells + = (const cs_lnum_t *restrict)m->b_face_cells; + const cs_lnum_t *restrict b_cells + = (const cs_lnum_t *restrict)m->b_cells; + const cs_real_t *restrict cell_vol = fvq->cell_vol; + const cs_real_3_t *restrict cell_cen + = (const cs_real_3_t *restrict)fvq->cell_cen; + const cs_real_3_t *restrict i_f_face_normal + = (const cs_real_3_t *restrict)fvq->i_f_face_normal; + const cs_real_3_t *restrict b_f_face_normal + = (const cs_real_3_t *restrict)fvq->b_f_face_normal; + const cs_real_3_t *restrict i_face_cog + = (const cs_real_3_t *restrict)fvq->i_face_cog; + const cs_real_3_t *restrict diipb + = (const cs_real_3_t *restrict)fvq->diipb; + const cs_lnum_t *restrict cell_cells_idx + = (const cs_lnum_t *restrict)madj->cell_cells_idx; + const cs_lnum_t *restrict cell_b_faces_idx + = (const cs_lnum_t *restrict)madj->cell_b_faces_idx; + const cs_lnum_t *restrict cell_cells + = 
(const cs_lnum_t *restrict)madj->cell_cells; + const short int *restrict cell_i_faces_sgn + = (const short int *restrict)madj->cell_i_faces_sgn; + const cs_lnum_t *restrict cell_i_faces + = (const cs_lnum_t *restrict)madj->cell_i_faces; + const cs_lnum_t *restrict cell_b_faces + = (const cs_lnum_t *restrict)madj->cell_b_faces; + + const int n_i_groups = m->i_face_numbering->n_groups; + const int n_i_threads = m->i_face_numbering->n_threads; + const int n_b_threads = m->b_face_numbering->n_threads; + const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index; + const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index; + + bool scatter = true; + +#pragma omp target data map(tofrom: grdpa[0:n_cells_ext]) \ + map(to: grad[0:n_cells_ext], \ + i_face_cog[0:n_i_faces], \ + cell_i_faces_sgn[0:n_i_faces], \ + cell_i_faces[0:n_i_faces], \ + cell_cen[0:n_cells_ext], \ + pvar[0:n_cells_ext], \ + i_massflux[0:n_i_faces], \ + i_f_face_normal[0:n_i_faces], \ + b_face_cells[0:n_b_faces], \ + coefb[0:n_b_faces], \ + coefa[0:n_b_faces], \ + cell_cells_idx[0:n_cells_ext], \ + cell_cells[0:n_cells_ext], \ + b_cells[0:n_cells], \ + cell_b_faces_idx[0:n_cells+1], \ + cell_vol[0:n_cells_ext], \ + i_face_cells[0:n_i_faces]) +{ + if(scatter){ + #pragma omp target teams distribute parallel for \ + schedule(static,1) + for (cs_lnum_t face_id = 0; face_id < n_i_faces; face_id++){ + + cs_real_t difv[3], djfv[3]; + + cs_lnum_t ii = i_face_cells[face_id][0]; + cs_lnum_t jj = i_face_cells[face_id][1]; + + for (int jsou = 0; jsou < 3; jsou++) { + difv[jsou] = i_face_cog[face_id][jsou] - cell_cen[ii][jsou]; + djfv[jsou] = i_face_cog[face_id][jsou] - cell_cen[jj][jsou]; + } + + /* x-y-z component, p = u, v, w */ + + for (int isou = 0; isou < 3; isou++) { + cs_real_t pif = pvar[ii][isou]; + cs_real_t pjf = pvar[jj][isou]; + for (int jsou = 0; jsou < 3; jsou++) { + pif = pif + grad[ii][isou][jsou]*difv[jsou]; + pjf = pjf + grad[jj][isou][jsou]*djfv[jsou]; + } + + cs_real_t pfac = pjf; + if (i_massflux[face_id] > 0.) 
pfac = pif; + + /* U gradient */ + + cs_real_t vfac[3]; + + for (int jsou = 0; jsou < 3; jsou++) { + vfac[jsou] = pfac*i_f_face_normal[face_id][jsou]; + #pragma omp atomic + grdpa[ii][isou][jsou] = grdpa[ii][isou][jsou] + vfac[jsou]; + #pragma omp atomic + grdpa[jj][isou][jsou] = grdpa[jj][isou][jsou] - vfac[jsou]; + } + } + + } + + #pragma omp target teams distribute parallel for \ + schedule(static,1) if(m->n_b_faces > CS_THR_MIN) + for (cs_lnum_t face_id = 0; face_id < n_b_faces; face_id++) { + + cs_real_t diipbv[3]; + cs_lnum_t ii = b_face_cells[face_id]; + + for (int jsou = 0; jsou < 3; jsou++){ + diipbv[jsou] = diipb[face_id][jsou]; + } + + /* x-y-z components, p = u, v, w */ + + for (int isou = 0; isou < 3; isou++) { + cs_real_t pfac = inc*coefa[face_id][isou]; + /*coefu is a matrix */ + for (int jsou = 0; jsou < 3; jsou++) + pfac += coefb[face_id][jsou][isou]*( pvar[ii][jsou] + + grad[ii][jsou][0]*diipbv[0] + + grad[ii][jsou][1]*diipbv[1] + + grad[ii][jsou][2]*diipbv[2]); + + for (int jsou = 0; jsou < 3; jsou++){ + #pragma omp atomic + grdpa[ii][isou][jsou] += pfac*b_f_face_normal[face_id][jsou]; + } + } + + } + + } + else{ + #pragma omp target teams distribute parallel for \ + schedule(static,1) + for (cs_lnum_t ii = 0; ii < n_cells; ii++){ + + cs_lnum_t s_id = cell_cells_idx[ii]; + cs_lnum_t e_id = cell_cells_idx[ii+1]; + + cs_real_t difv[3], djfv[3]; + + cs_lnum_t jj, face_id, face_sgn; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + jj = cell_cells[index]; + face_id = cell_i_faces[index]; + face_sgn = cell_i_faces_sgn[index]; + + for (int jsou = 0; jsou < 3; jsou++) { + difv[jsou] = i_face_cog[face_id][jsou] - cell_cen[ii][jsou]; + djfv[jsou] = i_face_cog[face_id][jsou] - cell_cen[jj][jsou]; + } + + for (int isou = 0; isou < 3; isou++) { + cs_real_t pif = pvar[ii][isou]; + cs_real_t pjf = pvar[jj][isou]; + for (int jsou = 0; jsou < 3; jsou++) { + pif = pif + grad[ii][isou][jsou]*difv[jsou]; + pjf = pjf + grad[jj][isou][jsou]*djfv[jsou]; + } + + cs_real_t pfac = pjf; + if (i_massflux[face_id]*face_sgn > 0.) 
pfac = pif; + + pfac *= face_sgn; + + cs_real_t vfac[3]; + + for (int jsou = 0; jsou < 3; jsou++) { + vfac[jsou] = pfac*i_f_face_normal[face_id][jsou]; + grdpa[ii][isou][jsou] = grdpa[ii][isou][jsou] + vfac[jsou]; + } + } + } + + } + + #pragma omp target teams distribute parallel for \ + schedule(static,1) if(m->n_b_faces > CS_THR_MIN) + for (cs_lnum_t c_idx = 0; c_idx < n_b_cells; c_idx++) { + + cs_lnum_t ii = b_cells[c_idx]; + + cs_lnum_t s_id = cell_b_faces_idx[ii]; + cs_lnum_t e_id = cell_b_faces_idx[ii+1]; + + cs_lnum_t face_id; + + cs_real_t diipbv[3]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + face_id = cell_b_faces[index]; + + for (int jsou = 0; jsou < 3; jsou++){ + diipbv[jsou] = diipb[face_id][jsou]; + } + + /* x-y-z components, p = u, v, w */ + + for (int isou = 0; isou < 3; isou++) { + cs_real_t pfac = inc*coefa[face_id][isou]; + /*coefu is a matrix */ + for (int jsou = 0; jsou < 3; jsou++) + pfac += coefb[face_id][jsou][isou]*( pvar[ii][jsou] + + grad[ii][jsou][0]*diipbv[0] + + grad[ii][jsou][1]*diipbv[1] + + grad[ii][jsou][2]*diipbv[2]); + + for (int jsou = 0; jsou < 3; jsou++){ + grdpa[ii][isou][jsou] += pfac*b_f_face_normal[face_id][jsou]; + } + } + } + + } + } + + #pragma omp target teams distribute parallel for + for (cs_lnum_t cell_id = 0; cell_id < n_cells; cell_id++) { + cs_real_t unsvol = 1./cell_vol[cell_id]; + for (int isou = 0; isou < 3; isou++) { + for (int jsou = 0; jsou < 3; jsou++){ + grdpa[cell_id][isou][jsou] = grdpa[cell_id][isou][jsou]*unsvol; + } + } + } +} + /* Handle parallelism and periodicity */ + + if (halo != NULL) { + cs_halo_sync_var_strided(halo, halo_type, (cs_real_t *)grdpa, 9); + if (m->n_init_perio > 0) + cs_halo_perio_sync_var_sym_tens(halo, halo_type, (cs_real_t *)grdpa); + } +} +#endif + /*----------------------------------------------------------------------------*/ /*! * \brief Compute the upwind gradient used in the pure SOLU schemes @@ -3996,6 +4261,20 @@ cs_face_convection_scalar(int idtvar, BFT_FREE(courant); } +void cs_math_3_normalize_target_cd(const cs_real_t in[3], + cs_real_t out[3]) +{ + cs_real_t norm = sqrt(in[0]*in[0] + + in[1]*in[1] + + in[2]*in[2]); + + cs_real_t inverse_norm = 1. / norm; + + out[0] = inverse_norm * in[0]; + out[1] = inverse_norm * in[1]; + out[2] = inverse_norm * in[2]; +} + /*----------------------------------------------------------------------------*/ /*! * \brief Add the explicit part of the convection/diffusion terms of a transport @@ -4107,7 +4386,10 @@ cs_convection_diffusion_vector(int idtvar, cs_mesh_quantities_t *fvq = cs_glob_mesh_quantities; const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_b_cells = m->n_b_cells; const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; + const cs_lnum_t n_i_faces = m->n_i_faces; + const cs_lnum_t n_b_faces = m->n_b_faces; const int n_i_groups = m->i_face_numbering->n_groups; const int n_i_threads = m->i_face_numbering->n_threads; const int n_b_threads = m->b_face_numbering->n_threads; @@ -4270,67 +4552,216 @@ cs_convection_diffusion_vector(int idtvar, - when we have convection, we are not in pure upwind and we have not shunted the slope test. 
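Note: cs_slope_test_gradient_vector_target above keeps two accumulation strategies behind the `scatter` flag. The face-based (scatter) loops need `#pragma omp atomic` because each face updates both adjacent cells, while the cell-based (gather) loops walk the cell-to-face adjacency (`cell_cells_idx`, `cell_i_faces`, `cell_i_faces_sgn`) and update only the current cell, at the price of reading every interior face twice. The reduced scalar sketch below illustrates the trade-off; it is not part of the patch, the `face_val`/`c_sum` arrays are hypothetical, and only the adjacency arrays and types come from the function above.

/* Hypothetical sketch: the same face-to-cell sum written in scatter
 * and in gather form, for a scalar quantity. */
static void
_face_to_cell_sum_sketch(cs_lnum_t          n_cells,
                         cs_lnum_t          n_i_faces,
                         bool               scatter,
                         const cs_lnum_2_t *i_face_cells,
                         const cs_lnum_t   *cell_cells_idx,
                         const cs_lnum_t   *cell_i_faces,
                         const short int   *cell_i_faces_sgn,
                         const cs_real_t   *face_val,  /* hypothetical */
                         cs_real_t         *c_sum)     /* hypothetical */
{
  const cs_lnum_t n_cell_faces = cell_cells_idx[n_cells];

  #pragma omp target data map(to: i_face_cells[0:n_i_faces],        \
                                  face_val[0:n_i_faces],            \
                                  cell_cells_idx[0:n_cells+1],      \
                                  cell_i_faces[0:n_cell_faces],     \
                                  cell_i_faces_sgn[0:n_cell_faces]) \
                          map(tofrom: c_sum[0:n_cells])
  {
    if (scatter) {
      /* Scatter: one iteration per face; atomics are required because
         both adjacent cells are updated concurrently. */
      #pragma omp target teams distribute parallel for
      for (cs_lnum_t f_id = 0; f_id < n_i_faces; f_id++) {
        cs_lnum_t ii = i_face_cells[f_id][0];
        cs_lnum_t jj = i_face_cells[f_id][1];
        #pragma omp atomic
        c_sum[ii] += face_val[f_id];
        #pragma omp atomic
        c_sum[jj] -= face_val[f_id];
      }
    }
    else {
      /* Gather: one iteration per cell; no atomics, but each interior
         face is read twice, and the face sign replaces the +/- of the
         scatter form. */
      #pragma omp target teams distribute parallel for
      for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) {
        for (cs_lnum_t i = cell_cells_idx[c_id];
             i < cell_cells_idx[c_id+1];
             i++)
          c_sum[c_id] += cell_i_faces_sgn[i] * face_val[cell_i_faces[i]];
      }
    }
  }
}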
*/ - if ( (idiffp != 0 && ircflp == 1) || ivisep == 1 - || ( iconvp != 0 && iupwin == 0 - && (ischcp == 0 || ircflp == 1 || isstpp == 0))) { - if (f_id != -1) { - /* Get the calculation option from the field */ - if (f->type & CS_FIELD_VARIABLE && eqp.iwgrec == 1) { - if (eqp.idiff > 0) { - int key_id = cs_field_key_id("gradient_weighting_id"); - int diff_id = cs_field_get_key_int(f, key_id); - if (diff_id > -1) { - cs_field_t *weight_f = cs_field_by_id(diff_id); - gweight = weight_f->val; - cs_field_synchronize(weight_f, halo_type); + + /* Timing the computation */ + + clock_t start, stop, start_slope, stop_slope; + unsigned long elapsed, elapsed_cuda, elapsed_slope; + + cs_real_33_t *grad_cpu, *grad_gpu; + cs_real_33_t *grdpa_cpu, *grdpa_gpu; + + bool compute_cuda; + bool compute_cpu; + bool res_cpu; + bool perf; + bool accuracy; + +#if defined(HAVE_CUDA) + compute_cuda = (cs_get_device_id() > -1) ? true : false; +#else + compute_cuda = false; +#endif + +res_cpu = !compute_cuda; + +#if defined(DEBUG) + compute_cpu = true; + perf = true; + accuracy = true; +#elif defined(NDEBUG) + compute_cpu = true; + perf = false; + accuracy = false; +#else + compute_cpu = false; + perf = false; + accuracy = false; +#endif + + + // Pour l'instant ces lignes sont pour moi + // Elles seront à enlever + // compute_cuda = true; + compute_cpu = true; + // res_cpu = false; + + // A ne pas garder dans la version finale + perf = true; + // accuracy = false; + +#if defined(HAVE_CUDA) + if(compute_cuda){ + if(!res_cpu){ + grad_gpu = grad; + grdpa_gpu = grdpa; + } else { + BFT_MALLOC(grad_gpu, n_cells_ext, cs_real_33_t); + BFT_MALLOC(grdpa_gpu, n_cells_ext, cs_real_33_t); + } + if(perf){ + start = clock(); + } + + bool flag1 = ( (idiffp != 0 && ircflp == 1) || ivisep == 1 + || ( iconvp != 0 && iupwin == 0 + && (ischcp == 0 || ircflp == 1 || isstpp == 0))); + + if (flag1) { + + if (f_id != -1) { + /* Get the calculation option from the field */ + if (f->type & CS_FIELD_VARIABLE && eqp.iwgrec == 1) { + if (eqp.idiff > 0) { + int key_id = cs_field_key_id("gradient_weighting_id"); + int diff_id = cs_field_get_key_int(f, key_id); + if (diff_id > -1) { + cs_field_t *weight_f = cs_field_by_id(diff_id); + gweight = weight_f->val; + cs_field_synchronize(weight_f, halo_type); + } } } } - } - cs_gradient_vector_synced_input(var_name, - gradient_type, - halo_type, - inc, - nswrgp, - iwarnp, - imligp, - epsrgp, - climgp, - coefav, - coefbv, - _pvar, - gweight, /* weighted gradient */ - cpl, - grad); + cs_gradient_vector_synced_input(var_name, + gradient_type, + halo_type, + inc, + nswrgp, + iwarnp, + imligp, + epsrgp, + climgp, + coefav, + coefbv, + _pvar, + gweight, /* weighted gradient */ + cpl, + grad_gpu); + } + + bool flag2 = (iconvp > 0 && iupwin == 0 && isstpp == 0); + + cs_convection_diffusion_vector_cuda(m, + cs_glob_mesh_adjacencies, + fvq, + _pvar, + i_massflux, + grad_gpu, + grdpa_gpu, + coefav, + coefbv, + inc, + flag1, + flag2, + perf); - } - else { -# pragma omp parallel for - for (cs_lnum_t cell_id = 0; cell_id < n_cells_ext; cell_id++) { - for (int isou = 0; isou < 3; isou++) { - for (int jsou = 0; jsou < 3; jsou++) - grad[cell_id][isou][jsou] = 0.; + /* Handle parallelism and periodicity */ + if (flag2){ + if (halo != NULL) { + cs_halo_sync_var_strided(halo, halo_type, (cs_real_t *)grdpa_gpu, 9); + if (m->n_init_perio > 0) + cs_halo_perio_sync_var_sym_tens(halo, halo_type, (cs_real_t *)grdpa_gpu); } } + + if(perf){ + stop = clock(); + elapsed_cuda = (stop - start) * 1e6 / CLOCKS_PER_SEC; + } } +#endif - /* 
====================================================================== - ---> Compute uncentered gradient grdpa for the slope test - ======================================================================*/ + if(compute_cpu){ + if(res_cpu){ + grad_cpu = grad; + grdpa_cpu = grdpa; + } else { + BFT_MALLOC(grad_cpu, n_cells_ext, cs_real_33_t); + BFT_MALLOC(grdpa_cpu, n_cells_ext, cs_real_33_t); + } -# pragma omp parallel for - for (cs_lnum_t cell_id = 0; cell_id < n_cells_ext; cell_id++) { - for (int jsou = 0; jsou < 3; jsou++) { - for (int isou = 0; isou < 3; isou++) - grdpa[cell_id][isou][jsou] = 0.; + if(perf){ + start = clock(); } - } - if (iconvp > 0 && iupwin == 0 && isstpp == 0) { + if ( (idiffp != 0 && ircflp == 1) || ivisep == 1 + || ( iconvp != 0 && iupwin == 0 + && (ischcp == 0 || ircflp == 1 || isstpp == 0))) { + + if (f_id != -1) { + /* Get the calculation option from the field */ + if (f->type & CS_FIELD_VARIABLE && eqp.iwgrec == 1) { + if (eqp.idiff > 0) { + int key_id = cs_field_key_id("gradient_weighting_id"); + int diff_id = cs_field_get_key_int(f, key_id); + if (diff_id > -1) { + cs_field_t *weight_f = cs_field_by_id(diff_id); + gweight = weight_f->val; + cs_field_synchronize(weight_f, halo_type); + } + } + } + } + + cs_gradient_vector_synced_input(var_name, + gradient_type, + halo_type, + inc, + nswrgp, + iwarnp, + imligp, + epsrgp, + climgp, + coefav, + coefbv, + _pvar, + gweight, /* weighted gradient */ + cpl, + grad_cpu); + } + else { + # pragma omp parallel for + for (cs_lnum_t cell_id = 0; cell_id < n_cells_ext; cell_id++) { + for (int isou = 0; isou < 3; isou++) { + for (int jsou = 0; jsou < 3; jsou++) + grad_cpu[cell_id][isou][jsou] = 0.; + } + } + } + +/* ====================================================================== + ---> Compute uncentered gradient grdpa for the slope test + ======================================================================*/ - _slope_test_gradient_strided<3>(inc, + # pragma omp parallel for + for (cs_lnum_t cell_id = 0; cell_id < n_cells_ext; cell_id++) { + for (int jsou = 0; jsou < 3; jsou++) { + for (int isou = 0; isou < 3; isou++) + grdpa_cpu[cell_id][isou][jsou] = 0.; + } + } + + if (iconvp > 0 && iupwin == 0 && isstpp == 0) { + + + if(compute_cpu){ + if(perf){ + start_slope = clock(); + } + _slope_test_gradient_strided<3>(inc, halo_type, (const cs_real_33_t *)grad, grdpa, @@ -4339,22 +4770,110 @@ cs_convection_diffusion_vector(int idtvar, coefbv, i_massflux); - } - - /* ====================================================================== - ---> Contribution from interior faces - ======================================================================*/ + if(perf){ + stop_slope = clock(); + elapsed_slope = (stop_slope - start_slope) * 1e6 / CLOCKS_PER_SEC; + printf("convection compute Slope time in us: CPU = %ld\n", elapsed_slope); + } + } - n_upwind = 0; + #if defined(HAVE_OPENMP_TARGET) + if(compute_cuda){ + if(perf){ + start_slope = clock(); + } + cs_slope_test_gradient_vector_target(inc, + halo_type, + (const cs_real_33_t *)grad_cpu, + grdpa_cpu, + _pvar, + coefav, + coefbv, + i_massflux); + if(perf){ + stop_slope = clock(); + elapsed_slope = (stop_slope - start_slope) * 1e6 / CLOCKS_PER_SEC; + printf("convection compute Slope time in us: OMP = %ld\n", elapsed_slope); + } + } + #endif + } - if (n_cells_ext > n_cells) { -# pragma omp parallel for if(n_cells_ext -n_cells > CS_THR_MIN) - for (cs_lnum_t cell_id = n_cells; cell_id < n_cells_ext; cell_id++) { - for (int isou = 0; isou < 3; isou++) - rhs[cell_id][isou] = 0.; + 
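Note: the CPU and OpenMP slope-test paths above are timed with clock(), while the companion changes in cs_gradient.cxx use std::chrono; clock() reports processor time consumed by the process rather than elapsed wall-clock time, so it can overstate multi-threaded regions and misrepresent time spent waiting on a device. The sketch below shows one way the repeated start/stop blocks could be expressed with a wall-clock helper; the helper name is hypothetical and it is not part of the patch.

/* Hypothetical helper: wall-clock timing of an arbitrary code block,
 * mirroring the std::chrono usage already present in cs_gradient.cxx. */
#include <chrono>
#include <cstdio>

template <typename F>
static long
_time_us(const char  *label,
         F          &&body)
{
  auto t0 = std::chrono::high_resolution_clock::now();
  body();
  auto t1 = std::chrono::high_resolution_clock::now();
  long us = (long)std::chrono::duration_cast<std::chrono::microseconds>
              (t1 - t0).count();
  printf("%s time in us: %ld\n", label, us);
  return us;
}

/* Possible use, replacing one of the perf blocks above:
 *
 *   _time_us("convection compute Slope (CPU)", [&]() {
 *     _slope_test_gradient_strided<3>(inc, halo_type,
 *                                     (const cs_real_33_t *)grad_cpu,
 *                                     grdpa_cpu, _pvar,
 *                                     coefav, coefbv, i_massflux);
 *   });
 */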
if(perf){ + stop = clock(); + elapsed = (stop - start) * 1e6 / CLOCKS_PER_SEC; } } + /* Performances */ + if(perf){ + #if defined(HAVE_CUDA) + if(compute_cuda){ + printf("convection Compute and tranferts time in us: CUDA = %ld\n", elapsed_cuda); + } + #endif + + if(compute_cpu){ + printf("convection compute time in us: CPU = %ld\n", elapsed); + } + } + + /* Accuracy grad_cpu and grad_gpu */ + if(accuracy){ + #if defined(HAVE_CUDA) + if(compute_cuda){ + if(compute_cpu){ + cs_real_t cpu, cuda; + double err; + for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { + for (int j =0; j < 3; ++j) { + cpu = grdpa_cpu[c_id][i][j]; + cuda = grdpa_gpu[c_id][i][j]; + err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); + if (err> 1e-6) { + printf("slop_test DIFFERENCE @%d-%d-%d: CPU = %.17f\tCUDA = %.17f\tdiff = %.17f\tdiff relative = %.17f\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), err); + } + } + } + } + } + } + #endif + } + +// Free memory +#if defined(HAVE_CUDA) + if(compute_cuda){ + if(res_cpu){ + BFT_FREE(grad_gpu); + BFT_FREE(grdpa_gpu); + } + } +#endif + +// Free memory + if(compute_cpu){ + if(!res_cpu){ + BFT_FREE(grad_cpu); + BFT_FREE(grdpa_cpu); + } + } + + + /* ====================================================================== + ---> Contribution from interior faces + ======================================================================*/ + + n_upwind = 0; + + if (n_cells_ext > n_cells) { +# pragma omp parallel for if(n_cells_ext -n_cells > CS_THR_MIN) + for (cs_lnum_t cell_id = n_cells; cell_id < n_cells_ext; cell_id++) { + for (int isou = 0; isou < 3; isou++) + rhs[cell_id][isou] = 0.; + } + } /* --> Pure upwind flux =====================*/ @@ -4925,7 +5444,11 @@ cs_convection_diffusion_vector(int idtvar, /* Unsteady */ } else { - + // ---------------OMP and CUDA here --------------------- +if(compute_cpu){ + if(perf){ + start_slope = clock(); + } for (int g_id = 0; g_id < n_i_groups; g_id++) { # pragma omp parallel for reduction(+:n_upwind) for (int t_id = 0; t_id < n_i_threads; t_id++) { @@ -5047,6 +5570,164 @@ cs_convection_diffusion_vector(int idtvar, } } } + if(perf){ + stop_slope = clock(); + elapsed_slope = (stop_slope - start_slope) * 1e6 / CLOCKS_PER_SEC; + printf("idtvar => 0 unsteady i_faces time in us: CPU = %ld\n", elapsed_slope); + } +} //compute_cpu + + #if defined(HAVE_OPENMP_TARGET) + if(compute_cuda){ + if(perf){ + start_slope = clock(); + } + #pragma omp target data map(tofrom: rhs[0:n_cells_ext]) \ + map(to: i_face_cells[0:n_i_faces], \ + i_massflux[0:n_i_faces], \ + i_f_face_factor[0:n_i_faces], \ + i_face_u_normal[0:n_i_faces], \ + i_visc[0:n_i_faces], \ + i_face_cog[0:n_i_faces], \ + i_dist[0:n_i_faces], \ + weight[0:n_i_faces], \ + diipf[0:n_i_faces], \ + djjpf[0:n_i_faces], \ + i_pvar[0:n_i_faces], \ + grad[0:n_cells_ext], \ + grdpa[0:n_cells_ext], \ + cell_cen[0:n_cells_ext], \ + _pvar[0:n_cells_ext]) + { + #pragma omp target teams distribute parallel for reduction(+:n_upwind) \ + firstprivate(cs_math_zero_threshold, \ + iconvp, thetap, ischcp, blencp, blend_st, \ + imasac, idiffp, ircflp) \ + schedule(static,1) + for (cs_lnum_t face_id = 0; face_id < n_i_faces; face_id++) { + + cs_lnum_t ii = i_face_cells[face_id][0]; + cs_lnum_t jj = i_face_cells[face_id][1]; + + cs_real_t fluxi[3], fluxj[3] ; + for (int isou = 0; isou < 3; isou++) { + fluxi[isou] = 0; + fluxj[isou] = 0; + } + cs_real_3_t pip, pjp; + cs_real_3_t pif, pjf; + bool upwind_switch = false; + cs_real_3_t _pi, _pj; + + for (int i = 0; i < 3; i++) { + _pi[i] 
= _pvar[ii][i]; + _pj[i] = _pvar[jj][i]; + } + + /* Scaling due to mass balance in porous modelling */ + if (i_f_face_factor != NULL) { + const cs_real_t *n = i_face_u_normal[face_id]; + cs_math_3_normal_scaling(n, i_f_face_factor[face_id][0], _pi); + cs_math_3_normal_scaling(n, i_f_face_factor[face_id][1], _pj); + } + + cs_real_t bldfrp = (cs_real_t) ircflp; + /* Local limitation of the reconstruction */ + if (df_limiter != NULL && ircflp > 0) + bldfrp = cs_math_fmax(cs_math_fmin(df_limiter[ii], df_limiter[jj]), + 0.); + + cs_i_cd_unsteady_slope_test_strided<3>(&upwind_switch, + iconvp, + bldfrp, + ischcp, + blencp, + blend_st, + weight[face_id], + i_dist[face_id], + cell_cen[ii], + cell_cen[jj], + i_face_u_normal[face_id], + i_face_cog[face_id], + diipf[face_id], + djjpf[face_id], + i_massflux[face_id], + grad[ii], + grad[jj], + grdpa[ii], + grdpa[jj], + _pi, + _pj, + pif, + pjf, + pip, + pjp); + + cs_i_conv_flux_strided<3>(iconvp, + thetap, + imasac, + _pvar[ii], + _pvar[jj], + pif, + pif, /* no relaxation */ + pjf, + pjf, /* no relaxation */ + i_massflux[face_id], + fluxi, + fluxj); + + + cs_i_diff_flux_strided<3>(idiffp, + thetap, + pip, + pjp, + pip, /* no relaxation */ + pjp, /* no relaxation */ + i_visc[face_id], + fluxi, + fluxj); + + if (upwind_switch) { + + /* in parallel, face will be counted by one and only one rank */ + if (ii < n_cells) + n_upwind++; + + if (v_slope_test != NULL) { + v_slope_test[ii] += fabs(i_massflux[face_id]) / cell_vol[ii]; + v_slope_test[jj] += fabs(i_massflux[face_id]) / cell_vol[jj]; + } + } + /* Saving velocity at internal faces, if needed */ + if (i_pvar != NULL) { + if (i_massflux[face_id] >= 0.) { + for (cs_lnum_t i = 0; i < 3; i++) + i_pvar[face_id][i] += thetap * pif[i]; + } + else { + for (cs_lnum_t i = 0; i < 3; i++) + i_pvar[face_id][i] += thetap * pjf[i]; + } + } + + for (int isou = 0; isou < 3; isou++) { + + #pragma omp atomic + rhs[ii][isou] -= fluxi[isou]; + #pragma omp atomic + rhs[jj][isou] += fluxj[isou]; + + } /* isou */ + + } + } // target data + if(perf){ + stop_slope = clock(); + elapsed_slope = (stop_slope - start_slope) * 1e6 / CLOCKS_PER_SEC; + printf("idtvar => 0 unsteady i_faces time in us: OMP = %ld\n", elapsed_slope); + } + } // compute_cuda + #endif } /* idtvar */ @@ -5266,7 +5947,11 @@ cs_convection_diffusion_vector(int idtvar, /* Unsteady */ } else { - + // ---------------OMP and CUDA here --------------------- +if(compute_cpu){ + if(perf){ + start_slope = clock(); + } # pragma omp parallel for if(m->n_b_faces > CS_THR_MIN) for (int t_id = 0; t_id < n_b_threads; t_id++) { for (cs_lnum_t face_id = b_group_index[t_id*2]; @@ -5460,6 +6145,240 @@ cs_convection_diffusion_vector(int idtvar, BFT_FREE(df_limiter_local); } } + if(perf){ + stop_slope = clock(); + elapsed_slope = (stop_slope - start_slope) * 1e6 / CLOCKS_PER_SEC; + printf("idtvar => 0 unsteady b_faces time in us: CPU = %ld\n", elapsed_slope); + } +} // compute_cpu + +#if defined(HAVE_OPENMP_TARGET) +if(compute_cuda){ + if(perf){ + start_slope = clock(); + } +#pragma omp target data map(tofrom: rhs[0:n_cells_ext]) \ + map(to: b_face_cells[0:n_b_faces], \ + b_massflux[0:n_b_faces], \ + b_f_face_factor[0:n_b_faces], \ + b_face_u_normal[0:n_b_faces], \ + bc_type[0:n_b_faces], \ + b_visc[0:n_b_faces], \ + b_face_cells[0:n_b_faces], \ + b_face_surf[0:n_b_faces], \ + coefav[0:n_b_faces], \ + coefbv[0:n_b_faces], \ + cofafv[0:n_b_faces], \ + cofbfv[0:n_b_faces], \ + diipb[0:n_b_faces], \ + b_pvar[0:n_b_faces], \ + grad[0:n_cells_ext], \ + grdpa[0:n_cells_ext], \ + 
_pvar[0:n_cells_ext]) +{ + #pragma omp target teams distribute parallel for \ + private(pvar_distant, pvar_local, df_limiter_local) \ + firstprivate(cs_math_zero_threshold, iconvp, thetap, ischcp, blencp, blend_st, \ + imasac, idiffp, ircflp, inc, n_local, n_distant) \ + schedule(static,1) if(m->n_b_faces > CS_THR_MIN) + for (cs_lnum_t face_id = 0; face_id < n_b_faces; face_id++) { + + cs_lnum_t ii = b_face_cells[face_id]; + + cs_real_t fluxi[3]; + for (int isou = 0; isou < 3; isou++) { + fluxi[isou] = 0; + } + cs_real_3_t pip; + cs_real_3_t _pi; + cs_real_t pfac[3]; + + for (int i = 0; i < 3; i++) { + _pi[i] = _pvar[ii][i]; + } + + /* Scaling due to mass balance in porous modelling */ + if (b_f_face_factor != NULL) { + const cs_real_t *n = b_face_u_normal[face_id]; + cs_math_3_normal_scaling(n, b_f_face_factor[face_id], _pi); + } + + cs_real_t bldfrp = (cs_real_t) ircflp; + /* Local limitation of the reconstruction */ + if (df_limiter != NULL && ircflp > 0) + bldfrp = cs_math_fmax(df_limiter[ii], 0.); + + cs_b_cd_unsteady_strided<3>(bldfrp, + diipb[face_id], + grad[ii], + _pi, + pip); + cs_b_upwind_flux_strided<3>(iconvp, + thetap, + imasac, + inc, + bc_type[face_id], + _pi, + _pi, /* no relaxation */ + pip, + coefav[face_id], + coefbv[face_id], + b_massflux[face_id], + pfac, + fluxi); + + /* Saving velocity on boundary faces */ + if (b_pvar != NULL) { + if (b_massflux[face_id] >= 0.) { + for (cs_lnum_t i = 0; i < 3; i++) + b_pvar[face_id][i] += thetap * _pi[i]; + } + else { + for (cs_lnum_t i = 0; i < 3; i++) { + b_pvar[face_id][i] += thetap * pfac[i]; + } + } + } + + cs_b_diff_flux_strided<3>(idiffp, + thetap, + inc, + pip, + cofafv[face_id], + cofbfv[face_id], + b_visc[face_id], + fluxi); + + for(int isou = 0; isou < 3; isou++) { + #pragma omp atomic + rhs[ii][isou] -= fluxi[isou]; + } + + } + } + + /* The variable is internally coupled and an implicit contribution + * is required */ + if (icoupl > 0) { + /* Prepare data for sending */ + BFT_MALLOC(pvar_distant, n_distant, cs_real_3_t); + + for (cs_lnum_t ii = 0; ii < n_distant; ii++) { + cs_lnum_t face_id = faces_distant[ii]; + cs_lnum_t jj = b_face_cells[face_id]; + + cs_real_3_t pip; + cs_real_3_t _pj; + + for (int i = 0; i < 3; i++) { + _pj[i] = _pvar[jj][i]; + } + + cs_real_t bldfrp = (cs_real_t) ircflp; + /* Local limitation of the reconstruction */ + /* Note: to be treated exactly as a internal face, should be a bending + * between the two cells... 
*/ + if (df_limiter != NULL && ircflp > 0) + bldfrp = cs_math_fmax(df_limiter[jj], 0.); + + /* Scaling due to mass balance in porous modelling */ + if (b_f_face_factor != NULL) { + const cs_real_t *n = b_face_u_normal[face_id]; + cs_math_3_normal_scaling(n, b_f_face_factor[face_id], _pj); + } + + cs_b_cd_unsteady_strided<3>(bldfrp, + diipb[face_id], + grad[jj], + _pj, + pip); + + for (int k = 0; k < 3; k++) + pvar_distant[ii][k] = pip[k]; + } + + /* Receive data */ + BFT_MALLOC(pvar_local, n_local, cs_real_3_t); + cs_internal_coupling_exchange_var(cpl, + 3, /* Dimension */ + (cs_real_t *)pvar_distant, + (cs_real_t *)pvar_local); + + if (df_limiter != NULL) { + BFT_MALLOC(df_limiter_local, n_local, cs_real_t); + cs_internal_coupling_exchange_var(cpl, + 1, /* Dimension */ + df_limiter, + df_limiter_local); + } + + /* Flux contribution */ + assert(f != NULL); + cs_real_t *hintp = f->bc_coeffs->hint; + cs_real_t *hextp = f->bc_coeffs->rcodcl2; + for (cs_lnum_t ii = 0; ii < n_local; ii++) { + cs_lnum_t face_id = faces_local[ii]; + cs_lnum_t jj = b_face_cells[face_id]; + cs_real_t surf = b_face_surf[face_id]; + cs_real_t pip[3], pjp[3]; + cs_real_t fluxi[3] = {0., 0., 0.}; + cs_real_3_t _pj; + + for (int i = 0; i < 3; i++) { + _pj[i] = _pvar[jj][i]; + } + + /* Scaling due to mass balance in porous modelling */ + if (b_f_face_factor != NULL) { + const cs_real_t *n = b_face_u_normal[face_id]; + cs_math_3_normal_scaling(n, b_f_face_factor[face_id], _pj); + } + + cs_real_t bldfrp = (cs_real_t) ircflp; + /* Local limitation of the reconstruction */ + if (df_limiter != NULL && ircflp > 0) + bldfrp = cs_math_fmax(cs_math_fmin(df_limiter_local[ii], + df_limiter[jj]), + 0.); + + cs_b_cd_unsteady_strided<3>(bldfrp, + diipb[face_id], + grad[jj], + _pj, + pip); + + for (int k = 0; k < 3; k++) + pjp[k] = pvar_local[ii][k]; + + cs_real_t hint = hintp[face_id]; + cs_real_t hext = hextp[face_id]; + cs_real_t heq = _calc_heq(hint, hext)*surf; + + cs_b_diff_flux_coupling_strided<3>(idiffp, + pip, + pjp, + heq, + fluxi); + + for (int k = 0; k < 3; k++) + #pragma omp atomic + rhs[jj][k] -= thetap * fluxi[k]; + } + + BFT_FREE(pvar_local); + /* Sending structures are no longer needed */ + BFT_FREE(pvar_distant); + if (df_limiter != NULL) { + BFT_FREE(df_limiter_local); + } + } // target data + if(perf){ + stop_slope = clock(); + elapsed_slope = (stop_slope - start_slope) * 1e6 / CLOCKS_PER_SEC; + printf("idtvar => 0 unsteady b_faces time in us: OMP = %ld\n", elapsed_slope); + } +} // compute_cuda +#endif } /* idtvar */ /* Boundary convective flux imposed at some faces (tags in icvfli array) */ diff --git a/src/alge/cs_convection_diffusion_cuda.cu b/src/alge/cs_convection_diffusion_cuda.cu new file mode 100644 index 0000000000..c110ababfd --- /dev/null +++ b/src/alge/cs_convection_diffusion_cuda.cu @@ -0,0 +1,291 @@ +#include "cs_alge_cuda.cuh" + +#include "cs_convection_diffusion.h" +#include "cs_convection_diffusion_priv.h" + +#include "cs_slope_test_gradient_vector_cuda_scatter.cuh" +#include "cs_slope_test_gradient_vector_cuda_gather.cuh" + +/*---------------------------------------------------------------------------- + * _gradient_vector the gradient of a vector using a given gradient of + * this vector (typically lsq). + * + * parameters: + * m <-- pointer to associated mesh structure + * fvq <-- pointer to associated finite volume quantities + * cpl <-- structure associated with internal coupling, or NULL + * inc <-- if 0, solve on increment; 1 otherwise + * coefav <-- B.C. 
coefficients for boundary face normals + * coefbv <-- B.C. coefficients for boundary face normals + * pvar <-- variable + * c_weight <-- weighted gradient coefficient variable + * r_grad --> gradient used for reconstruction + * grad --> gradient of pvar (du_i/dx_j : grad[][i][j]) + *----------------------------------------------------------------------------*/ +extern "C" void +cs_convection_diffusion_vector_cuda(const cs_mesh_t *mesh, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_real_3_t *restrict pvar, + const cs_real_t i_massflux[], + const cs_real_33_t *grad, + cs_real_33_t *grdpa, + const cs_real_3_t *restrict coefav, + const cs_real_33_t *restrict coefbv, + const int inc, + const bool flag1, + const bool flag2, + const bool perf) +{ + const cs_lnum_t n_cells = mesh->n_cells; + const cs_lnum_t n_b_cells = mesh->n_b_cells; + const cs_lnum_t n_cells_ext = mesh->n_cells_with_ghosts; + const cs_lnum_t n_i_faces = mesh->n_i_faces; + const cs_lnum_t n_b_faces = mesh->n_b_faces; + + int device_id; + cudaGetDevice(&device_id); + + cudaStream_t stream; + cudaStreamCreate(&stream); + + cudaEvent_t start, mem_h2d, init, f_i, f_b, f_f, stop; + float msec = 0.0f; + CS_CUDA_CHECK(cudaEventCreate(&start)); + CS_CUDA_CHECK(cudaEventCreate(&mem_h2d)); + CS_CUDA_CHECK(cudaEventCreate(&init)); + CS_CUDA_CHECK(cudaEventCreate(&f_i)); + CS_CUDA_CHECK(cudaEventCreate(&f_b)); + CS_CUDA_CHECK(cudaEventCreate(&f_f)); + CS_CUDA_CHECK(cudaEventCreate(&stop)); + + + // Record the start event + CS_CUDA_CHECK(cudaEventRecord(start, stream)); + + unsigned int blocksize = 256; + + cs_real_33_t *grad_d = NULL; + CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells_ext * sizeof(cs_real_33_t))); + + cs_real_33_t *grdpa_d; + CS_CUDA_CHECK(cudaMalloc(&grdpa_d, n_cells_ext * sizeof(cs_real_33_t))); + + cs_gnum_t n_upwind; + const cs_lnum_2_t *restrict i_face_cells + = (const cs_lnum_2_t *restrict)cs_get_device_ptr_const_pf(mesh->i_face_cells); + + const cs_lnum_t *restrict b_face_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(mesh->b_face_cells); + + const cs_real_3_t *restrict cell_cen + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_cen); + + const cs_real_3_t *restrict diipb + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->diipb); + + const cs_real_3_t *restrict b_f_face_normal + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_f_face_normal); + + const cs_lnum_t *restrict cell_cells_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells_idx); + + const cs_lnum_t *restrict cell_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells); + + const cs_lnum_t *restrict b_cells + = (cs_lnum_t *restrict)cs_get_device_ptr_const_pf(mesh->b_cells); + + const cs_lnum_t *restrict cell_b_faces + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces); + + const cs_lnum_t *restrict cell_b_faces_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces_idx); + + cs_real_t *restrict i_massflux_d; + CS_CUDA_CHECK(cudaMalloc(&i_massflux_d, sizeof(cs_real_t)*n_i_faces)); + cs_cuda_copy_h2d(i_massflux_d, (void *)i_massflux, sizeof(cs_real_t)*n_i_faces); + + cs_real_3_t *restrict i_face_cog; + CS_CUDA_CHECK(cudaMalloc(&i_face_cog, sizeof(cs_real_3_t)*n_i_faces)); + cs_cuda_copy_h2d(i_face_cog, (void *)fvq->i_face_cog, sizeof(cs_real_3_t)*n_i_faces); + + cs_real_3_t *restrict i_f_face_normal; + CS_CUDA_CHECK(cudaMalloc(&i_f_face_normal, 
sizeof(cs_real_3_t)*n_i_faces)); + cs_cuda_copy_h2d(i_f_face_normal, (void *)fvq->i_f_face_normal, sizeof(cs_real_3_t)*n_i_faces); + + cs_real_t *restrict cell_vol; + CS_CUDA_CHECK(cudaMalloc(&cell_vol, sizeof(cs_real_t)*n_cells)); + cs_cuda_copy_h2d(cell_vol, (void *)fvq->cell_vol, sizeof(cs_real_t)*n_cells); + + cs_mesh_adjacencies_update_cell_i_faces(); + const cs_lnum_t n_cells_i_face = (madj->cell_cells_idx[n_cells]); + + cs_lnum_t *restrict cell_i_faces; + CS_CUDA_CHECK(cudaMalloc(&cell_i_faces, sizeof(cs_lnum_t)*n_cells_i_face)); + cs_cuda_copy_h2d(cell_i_faces, madj->cell_i_faces, sizeof(cs_lnum_t)*n_cells_i_face); + + short int *restrict cell_i_faces_sgn; + CS_CUDA_CHECK(cudaMalloc(&cell_i_faces_sgn, sizeof(short int)*n_cells_i_face)); + cs_cuda_copy_h2d(cell_i_faces_sgn, madj->cell_i_faces_sgn, sizeof(short int)*n_cells_i_face); + + + void *_coefb_d, *_coefa_d, *_pvar_d; + + const cs_real_3_t * coefa_d = NULL; + const cs_real_3_t * pvar_d = NULL; + const cs_real_33_t * coefb_d = NULL; + + /* Initialization */ + + _sync_or_copy_real_h2d(pvar, n_cells_ext, device_id, stream, + &pvar_d, &_pvar_d); + _sync_or_copy_real_h2d(coefav, n_b_faces, device_id, stream, + &coefa_d, &_coefa_d); + _sync_or_copy_real_h2d(coefbv, n_b_faces, device_id, stream, + &coefb_d, &_coefb_d); + + if(flag1){ + cs_cuda_copy_h2d(grad_d, grad, sizeof(cs_real_33_t)*n_cells_ext); + } + else{ + cudaMemset(grad_d, 0, n_cells_ext * sizeof(cs_real_33_t)); + } + + CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); + + cudaMemset(grdpa_d, 0, n_cells_ext * sizeof(cs_real_33_t)); + + CS_CUDA_CHECK(cudaEventRecord(init, stream)); + + if (flag2) { + cs_slope_test_gradient_vector_cuda_i<<<(unsigned int)ceil((double)n_i_faces / blocksize), blocksize, 0, stream>>> + (n_i_faces, + i_face_cells, + i_face_cog, + cell_cen, + pvar_d, + i_massflux_d, + i_f_face_normal, + grad_d, + grdpa_d); + + + // cs_slope_test_gradient_vector_cuda_i_gather<<<(unsigned int)ceil((double)n_cells / blocksize), blocksize, 0, stream>>> + // (n_cells, + // i_face_cog, + // cell_cen, + // pvar_d, + // i_massflux_d, + // i_f_face_normal, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn, + // grad_d, + // grdpa_d); + + CS_CUDA_CHECK(cudaEventRecord(f_i, stream)); + + cs_slope_test_gradient_vector_cuda_b<<>> + (n_b_faces, + pvar_d, + b_face_cells, + diipb, + inc, + coefa_d, + coefb_d, + b_f_face_normal, + grad_d, + grdpa_d); + + + // cs_slope_test_gradient_vector_cuda_b_gather<<>> + // (n_b_cells, + // pvar_d, + // diipb, + // inc, + // coefa_d, + // coefb_d, + // b_f_face_normal, + // b_cells, + // cell_b_faces, + // cell_b_faces_idx, + // grad_d, + // grdpa_d); + + CS_CUDA_CHECK(cudaEventRecord(f_b, stream)); + + cs_slope_test_gradient_vector_cuda_f<<>> + (n_cells * 3 * 3, + cell_vol, + grdpa_d); + + CS_CUDA_CHECK(cudaEventRecord(f_f, stream)); + + } + + n_upwind = 0; + + /* Sync to host */ + if (grdpa_d != NULL) { + size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + cs_cuda_copy_d2h(grdpa, grdpa_d, size); + } + else + cs_sync_d2h(grdpa); + + CS_CUDA_CHECK(cudaEventRecord(stop, stream)); + CS_CUDA_CHECK(cudaEventSynchronize(stop)); + + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); + + if(perf){ + printf("convection_diffusion Kernels:\n"); + printf("Execution time in us: \t"); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); + printf("Init = %f\t", msec*1000.f); + + if (flag2) { + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, f_i)); + printf("f_i = %f\t", 
msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, f_i, f_b)); + printf("f_b = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, f_b, f_f)); + printf("f_f = %f\t", msec*1000.f); + + } + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); + printf("Total = %f\n", msec*1000.f); + } + + if (!flag1){ + CS_CUDA_CHECK(cudaFree(grad_d)); + } + + if (_pvar_d != NULL) + CS_CUDA_CHECK(cudaFree(_pvar_d)); + if (_coefa_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefa_d)); + if (_coefb_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefb_d)); + + CS_CUDA_CHECK(cudaFree(grad_d)); + CS_CUDA_CHECK(cudaFree(grdpa_d)); + CS_CUDA_CHECK(cudaFree(i_massflux_d)); + CS_CUDA_CHECK(cudaFree(i_f_face_normal)); + CS_CUDA_CHECK(cudaFree(cell_vol)); + CS_CUDA_CHECK(cudaFree(cell_i_faces)); + CS_CUDA_CHECK(cudaFree(cell_i_faces_sgn)); + CS_CUDA_CHECK(cudaFree(i_face_cog)); +} diff --git a/src/alge/cs_convection_diffusion_priv.h b/src/alge/cs_convection_diffusion_priv.h index 7cb0c1c71d..6c5a06ca5b 100644 --- a/src/alge/cs_convection_diffusion_priv.h +++ b/src/alge/cs_convection_diffusion_priv.h @@ -27,6 +27,35 @@ /*----------------------------------------------------------------------------*/ +/*---------------------------------------------------------------------------- + * Local headers + *----------------------------------------------------------------------------*/ + +#include "cs_base.h" +#include "cs_base_accel.h" +#include "cs_halo.h" +#include "cs_internal_coupling.h" +#include "cs_mesh.h" +#include "cs_mesh_quantities.h" + +/*----------------------------------------------------------------------------*/ + +BEGIN_C_DECLS + +/*! \cond DOXYGEN_SHOULD_SKIP_THIS */ + +/*============================================================================ + * Macro definitions + *============================================================================*/ + +/*============================================================================= + * Local type definitions + *============================================================================*/ + +/* Type for symmetric least-squares covariance matrices + as they are adimensional, single-precision should be usable here */ + + #include "cs_defs.h" /*---------------------------------------------------------------------------- @@ -50,6 +79,37 @@ * Global variables *============================================================================*/ +/*============================================================================= + * Semi-private function prototypes + *============================================================================*/ + +#if defined(HAVE_CUDA) + +void +cs_convection_diffusion_vector_cuda(const cs_mesh_t *mesh, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_real_3_t *restrict pvar, + const cs_real_t i_massflux[], + const cs_real_33_t *grad, + cs_real_33_t *grdpa, + const cs_real_3_t *restrict coefav, + const cs_real_33_t *restrict coefbv, + const int inc, + const bool flag1, + const bool flag2, + const bool perf); + +#endif + +/* defined(HAVE_CUDA) */ + +/*! 
(DOXYGEN_SHOULD_SKIP_THIS) \endcond */ + +/*----------------------------------------------------------------------------*/ + +END_C_DECLS + /*============================================================================ * Public inlined function *============================================================================*/ diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index c4e44ad5e9..ece9a34171 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -39,6 +39,14 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include #if defined(HAVE_MPI) #include @@ -190,7 +198,7 @@ const cs_e2n_sum_t _e2n_sum_type = CS_E2N_SUM_SCATTER; /* Strided LSQ gradient variant */ -static int _use_legacy_strided_lsq_gradient = false; +static int _use_legacy_strided_lsq_gradient = true; /*============================================================================ * Private function definitions @@ -690,6 +698,31 @@ _sync_scalar_gradient_halo(const cs_mesh_t *m, } } +/* Compute the unit in the last place (ULP) */ +template +typename std::enable_if::is_integer, T>::type +cs_diff_ulp(T x, T y) +{ + // Since `epsilon()` is the gap size (ULP, unit in the last place) + // of floating-point numbers in interval [1, 2), we can scale it to + // the gap size in interval [2^e, 2^{e+1}), where `e` is the exponent + // of `x` and `y`. + + // If `x` and `y` have different gap sizes (which means they have + // different exponents), we take the smaller one. Taking the bigger + // one is also reasonable, I guess. + const T m = std::min(std::fabs(x), std::fabs(y)); + + // Subnormal numbers have fixed exponent, which is `min_exponent - 1`. + const int exp = m < std::numeric_limits::min() + ? std::numeric_limits::min_exponent - 1 + : std::ilogb(m); + + // We divide the absolute difference by the epsilon times the exponent (1 ulp) + return std::fabs(x - y) / std::ldexp(std::numeric_limits::epsilon(), exp); +} + + /*---------------------------------------------------------------------------- * Synchronize strided gradient ghost cell values. * @@ -5449,6 +5482,8 @@ _reconstruct_strided_gradient(const cs_mesh_t *m, cs_real_t (*restrict r_grad)[stride][3], cs_real_t (*restrict grad)[stride][3]) { + using grad_t = cs_real_t[stride][3]; + const cs_lnum_t n_cells = m->n_cells; const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; @@ -5483,27 +5518,114 @@ _reconstruct_strided_gradient(const cs_mesh_t *m, /* Initialize gradient */ /*---------------------*/ - /* Initialization */ + /* Timing the computation */ -# pragma omp parallel for - for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { - for (cs_lnum_t i = 0; i < stride; i++) { - for (cs_lnum_t j = 0; j < 3; j++) - grad[c_id][i][j] = 0.0; + std::chrono::high_resolution_clock::time_point start, stop; + std::chrono::microseconds elapsed, elapsed_cuda; + + grad_t *grad_cpu, *grad_gpu; + + bool compute_cuda; + bool compute_cpu; + bool res_cpu; + bool perf; + bool accuracy; + +#if defined(HAVE_CUDA) + compute_cuda = (cs_get_device_id() > -1) ? 
true : false; +#else + compute_cuda = false; +#endif + +res_cpu = !compute_cuda; + +#if defined(DEBUG) + compute_cpu = true; + perf = true; + accuracy = true; +#elif defined(NDEBUG) + compute_cpu = true; + perf = false; + accuracy = false; +#else + compute_cpu = false; + perf = false; + accuracy = false; +#endif + + + // Pour l'instant ces lignes sont pour moi + // Elles seront à enlever + // compute_cuda = true; + // compute_cpu = true; + // res_cpu = false; + + // A ne pas garder dans la version finale + // perf = false; + // accuracy = false; + + +#if defined(HAVE_CUDA) + if(compute_cuda){ + if(!res_cpu){ + grad_gpu = grad; + } else { + BFT_MALLOC(grad_gpu, n_cells_ext, grad_t); + } + if(perf){ + start = std::chrono::high_resolution_clock::now(); + } + + cs_reconstruct_vector_gradient_cuda(m, + madj, + fvq, + halo_type, + inc, + coefav, + coefbv, + pvar, + c_weight, + r_grad, + grad_gpu, + cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION, + perf); + if(perf){ + stop = std::chrono::high_resolution_clock::now(); + elapsed_cuda = std::chrono::duration_cast(stop - start); } } +#endif - /* Interior faces contribution */ + if(compute_cpu){ + if(res_cpu){ + grad_cpu = grad; + } else { + BFT_MALLOC(grad_cpu, n_cells_ext, grad_t); + } + + if(perf){ + start = std::chrono::high_resolution_clock::now(); + } + /* Initialization */ + # pragma omp parallel for + for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { + for (cs_lnum_t i = 0; i < stride; i++) { + for (cs_lnum_t j = 0; j < 3; j++) + grad_cpu[c_id][i][j] = 0.0; + } + } cs_lnum_t n_i_groups, n_i_threads; cs_mesh_i_faces_thread_block_count(m, CS_E2N_SUM_SCATTER, 0, &n_i_groups, &n_i_threads); - for (int g_id = 0; g_id < n_i_groups; g_id++) { + /* Interior faces contribution */ -# pragma omp parallel for - for (int t_id = 0; t_id < n_i_threads; t_id++) { + for (int g_id = 0; g_id < n_i_groups; g_id++) { + # pragma omp parallel for + for (int t_id = 0; t_id < n_i_threads; t_id++) { + cs_lnum_t s_id, e_id; cs_mesh_i_faces_thread_block_range(m, CS_E2N_SUM_SCATTER, g_id, t_id, n_i_threads, 0, &s_id, &e_id); @@ -5522,12 +5644,12 @@ _reconstruct_strided_gradient(const cs_mesh_t *m, + (1.0-pond)* c_weight[c_id2]); /* - Remark: \f$ \varia_\face = \alpha_\ij \varia_\celli + Remark: \f$ \varia_\face = \alpha_\ij \varia_\celli + (1-\alpha_\ij) \varia_\cellj\f$ - but for the cell \f$ \celli \f$ we remove - \f$ \varia_\celli \sum_\face \vect{S}_\face = \vect{0} \f$ - and for the cell \f$ \cellj \f$ we remove - \f$ \varia_\cellj \sum_\face \vect{S}_\face = \vect{0} \f$ + but for the cell \f$ \celli \f$ we remove + \f$ \varia_\celli \sum_\face \vect{S}_\face = \vect{0} \f$ + and for the cell \f$ \cellj \f$ we remove + \f$ \varia_\cellj \sum_\face \vect{S}_\face = \vect{0} \f$ */ for (cs_lnum_t i = 0; i < stride; i++) { @@ -5544,10 +5666,9 @@ _reconstruct_strided_gradient(const cs_mesh_t *m, + r_grad[c_id2][i][2])); for (cs_lnum_t j = 0; j < 3; j++) { - grad[c_id1][i][j] += (pfaci + rfac) * i_f_face_normal[f_id][j]; - grad[c_id2][i][j] -= (pfacj + rfac) * i_f_face_normal[f_id][j]; + grad_cpu[c_id1][i][j] += (pfaci + rfac) * i_f_face_normal[f_id][j]; + grad_cpu[c_id2][i][j] -= (pfacj + rfac) * i_f_face_normal[f_id][j]; } - } } /* End of loop on faces */ @@ -5570,10 +5691,10 @@ _reconstruct_strided_gradient(const cs_mesh_t *m, cs_lnum_t f_id = cell_b_faces[fidx]; - /* - Remark: for the cell \f$ \celli \f$ we remove - \f$ \varia_\celli \sum_\face \vect{S}_\face = \vect{0} \f$ - */ + /* + Remark: for the cell \f$ \celli \f$ we remove + \f$ 
\varia_\celli \sum_\face \vect{S}_\face = \vect{0} \f$ + */ for (cs_lnum_t i = 0; i < stride; i++) { @@ -5588,13 +5709,13 @@ _reconstruct_strided_gradient(const cs_mesh_t *m, cs_real_t rfac = 0.; for (cs_lnum_t k = 0; k < stride; k++) { cs_real_t vecfac = r_grad[c_id][k][0] * diipb[f_id][0] - + r_grad[c_id][k][1] * diipb[f_id][1] - + r_grad[c_id][k][2] * diipb[f_id][2]; + + r_grad[c_id][k][1] * diipb[f_id][1] + + r_grad[c_id][k][2] * diipb[f_id][2]; rfac += coefbv[f_id][i][k] * vecfac; } for (cs_lnum_t j = 0; j < 3; j++) { - grad[c_id][i][j] += (pfac + rfac) * b_f_face_normal[f_id][j]; + grad_cpu[c_id][i][j] += (pfac + rfac) * b_f_face_normal[f_id][j]; } } @@ -5602,36 +5723,93 @@ _reconstruct_strided_gradient(const cs_mesh_t *m, } /* loop on boundary cells */ -# pragma omp parallel for - for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { - cs_real_t dvol; - /* Is the cell disabled (for solid or porous)? Not the case if coupled */ - if (has_dc * c_disable_flag[has_dc * c_id] == 0) - dvol = 1. / cell_f_vol[c_id]; - else - dvol = 0.; + # pragma omp parallel for + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + cs_real_t dvol; + /* Is the cell disabled (for solid or porous)? Not the case if coupled */ + if (has_dc * c_disable_flag[has_dc * c_id] == 0) + dvol = 1. / cell_f_vol[c_id]; + else + dvol = 0.; - for (cs_lnum_t i = 0; i < stride; i++) { - for (cs_lnum_t j = 0; j < 3; j++) - grad[c_id][i][j] *= dvol; - } + for (cs_lnum_t i = 0; i < stride; i++) { + for (cs_lnum_t j = 0; j < 3; j++) + grad_cpu[c_id][i][j] *= dvol; + } - if (cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION) { - cs_real_t gradpa[3]; - for (cs_lnum_t i = 0; i < stride; i++) { - for (cs_lnum_t j = 0; j < 3; j++) { - gradpa[j] = grad[c_id][i][j]; - grad[c_id][i][j] = 0.; + if (cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION) { + cs_real_t gradpa[3]; + for (cs_lnum_t i = 0; i < stride; i++) { + for (cs_lnum_t j = 0; j < 3; j++) { + gradpa[j] = grad_cpu[c_id][i][j]; + grad_cpu[c_id][i][j] = 0.; + } + + for (cs_lnum_t j = 0; j < 3; j++) + for (cs_lnum_t k = 0; k < 3; k++) + grad_cpu[c_id][i][j] += corr_grad_lin[c_id][j][k] * gradpa[k]; + } } + } + + if(perf){ + stop = std::chrono::high_resolution_clock::now(); + elapsed = std::chrono::duration_cast(stop - start); + } + } - for (cs_lnum_t j = 0; j < 3; j++) - for (cs_lnum_t k = 0; k < 3; k++) - grad[c_id][i][j] += corr_grad_lin[c_id][j][k] * gradpa[k]; + /* Performances */ + if(perf){ + #if defined(HAVE_CUDA) + if(compute_cuda){ + printf("reconstruct Compute and tranferts time in us: CUDA = %ld\n", elapsed_cuda.count()); } + #endif + + if(compute_cpu){ + printf("reconstruct Compute and tranferts time in us: CPU = %ld\n", elapsed.count()); } } - /* Periodicity and parallelism treatment */ + /* Accuracy grad_cpu and grad_gpu */ + if(accuracy){ + #if defined(HAVE_CUDA) + if(compute_cuda){ + if(compute_cpu){ + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + for (cs_lnum_t i = 0; i < stride; i++) { + for (int j =0; j < 3; ++j) { + auto cpu = grad_cpu[c_id][i][j]; + auto cuda = grad_gpu[c_id][i][j]; + double err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); + if (err> 1e-6) { + printf("reconstruct DIFFERENCE @%d-%d-%d: CPU = %.17f\tCUDA = %.17f\tdiff = %.17f\tdiff relative = %.17f\tulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), err, cs_diff_ulp(cpu, cuda)); + } + } + } + } + } + } + #endif + } + +// Free memory +#if defined(HAVE_CUDA) + if(compute_cuda){ + if(res_cpu){ + BFT_FREE(grad_gpu); + } + } +#endif + +// Free memory + 
if(compute_cpu){ + if(!res_cpu){ + BFT_FREE(grad_cpu); + } + } + + /* Periodicity and parallelism treatment */ if (m->halo != NULL) { cs_halo_sync_var_strided(m->halo, halo_type, (cs_real_t *)grad, stride*3); @@ -5644,6 +5822,7 @@ _reconstruct_strided_gradient(const cs_mesh_t *m, (cs_real_t *)grad); } } + } /*---------------------------------------------------------------------------- @@ -6653,6 +6832,351 @@ _find_bc_coeffs(const char *var_name, * gradv --> gradient of pvar (du_i/dx_j : gradv[][i][j]) *----------------------------------------------------------------------------*/ +BEGIN_C_DECLS +#if defined(HAVE_OPENMP_TARGET) + +void +_lsq_vector_gradient_target(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_halo_type_t halo_type, + const int inc, + const cs_real_3_t *restrict coefav, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict c_weight, + cs_real_33_t *restrict gradv, + cs_cocg_6_t *restrict cocg, + cs_real_33_t *restrict rhs) +{ + const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_b_cells = m->n_b_cells; + const cs_lnum_t n_i_faces = m->n_i_faces; + const cs_lnum_t n_b_faces = m->n_b_faces; + const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; + const int n_i_groups = m->i_face_numbering->n_groups; + const int n_i_threads = m->i_face_numbering->n_threads; + const int n_b_threads = m->b_face_numbering->n_threads; + const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index; + const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index; + + const cs_lnum_2_t *restrict i_face_cells + = (const cs_lnum_2_t *restrict)m->i_face_cells; + const cs_lnum_t *restrict b_face_cells + = (const cs_lnum_t *restrict)m->b_face_cells; + const cs_lnum_t *restrict cell_cells_idx + = (const cs_lnum_t *restrict)madj->cell_cells_idx; + const cs_lnum_t *restrict cell_b_faces_idx + = (const cs_lnum_t *restrict)madj->cell_b_faces_idx; + const cs_lnum_t *restrict cell_cells_lst + = (const cs_lnum_t *restrict)m->cell_cells_lst; + const cs_lnum_t *restrict b_cells + = (const cs_lnum_t *restrict)m->b_cells; + + const cs_lnum_t *restrict cell_cells + = (const cs_lnum_t *restrict)madj->cell_cells; + const short int *restrict cell_i_faces_sgn + = (const short int *restrict)madj->cell_i_faces_sgn; + const cs_lnum_t *restrict cell_i_faces + = (const cs_lnum_t *restrict)madj->cell_i_faces; + const cs_lnum_t *restrict cell_b_faces + = (const cs_lnum_t *restrict)madj->cell_b_faces; + + const cs_real_3_t *restrict cell_f_cen + = (const cs_real_3_t *restrict)fvq->cell_f_cen; + const cs_real_t *restrict weight = fvq->weight; + const cs_real_t *restrict b_dist = fvq->b_dist; + const cs_real_3_t *restrict b_face_normal + = (const cs_real_3_t *restrict)fvq->b_face_normal; + + /* Timing the computation */ + + double t_kernel = 0.0; + double t_begin, t_end; + + bool scatter = true; + + /* Contribution from interior faces */ + int num_device = omp_get_num_devices(); + printf("OMP supported devices %d\n", num_device); + t_begin = omp_get_wtime(); +#pragma omp target data map(tofrom: rhs[0:n_cells_ext]) \ + map(from: gradv[0:n_cells_ext]) \ + map(to: i_face_cells[0:n_i_faces], b_face_normal[0:n_b_faces], \ + coefav[0:n_b_faces], coefbv[0:n_b_faces], b_dist[0:n_b_faces],\ + cell_f_cen[0:n_cells_ext], pvar[0:n_cells_ext],\ + cell_cells_idx[0:n_cells_ext], \ + cell_cells_lst[0:n_cells_ext], \ + cell_b_faces_idx[0:n_cells+1], \ + b_face_cells[0:n_b_faces], \ + b_cells[0:n_b_cells], \ + 
cocg[0:n_cells_ext]) +{ + #pragma omp target teams distribute parallel for \ + schedule(static,1) + for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { + for (cs_lnum_t i = 0; i < 3; i++){ + for (cs_lnum_t j = 0; j < 3; j++){ + rhs[c_id][i][j] = 0.0; + } + } + } + if(scatter){ + #pragma omp target teams distribute parallel for \ + schedule(static,1) + for (cs_lnum_t f_id = 0; f_id < n_i_faces; f_id++) { + + cs_lnum_t c_id1 = i_face_cells[f_id][0]; + cs_lnum_t c_id2 = i_face_cells[f_id][1]; + + cs_real_t dc[3], fctb[3],_weight1, _weight2, _denom, _pond, pfac; + + for (cs_lnum_t i = 0; i < 3; i++){ + dc[i] = cell_f_cen[c_id2][i] - cell_f_cen[c_id1][i]; + } + + cs_real_t ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + + for (cs_lnum_t j = 0; j < 3; j++) { + fctb[j] = dc[j] * pfac; + #pragma omp atomic + rhs[c_id1][i][j] += _weight2 * fctb[j]; + #pragma omp atomic + rhs[c_id2][i][j] += _weight1 * fctb[j]; + } + } + + } + } + else{ + #pragma omp target teams distribute parallel for \ + schedule(static,1) + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + + cs_lnum_t s_id = cell_cells_idx[c_id]; + cs_lnum_t e_id = cell_cells_idx[c_id+1]; + + cs_lnum_t c_id2, f_id; + + // cs_real_t _rhs[64][3][3]; + // cs_lnum_t tid = omp_get_thread_num(); + + // for(cs_lnum_t i = 0; i < 3; i++){ + // for(cs_lnum_t j = 0; j < 3; j++){ + // _rhs[tid][i][j] = 0.0; + // } + // } + + cs_real_t dc[3], fctb[3], _weight, _denom, _pond, pfac; + for(cs_lnum_t index = s_id; index < e_id; index++){ + + c_id2 = cell_cells[index]; + + for (cs_lnum_t i = 0; i < 3; i++){ + dc[i] = cell_f_cen[c_id2][i] - cell_f_cen[c_id][i]; + } + + cs_real_t ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight = 1.; + } + else{ + f_id = cell_i_faces[index]; + _pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id] + + (1. 
- _pond)*c_weight[c_id2]); + _weight = c_weight[c_id2] * _denom; + } + + + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = (pvar[c_id2][i] - pvar[c_id][i]) * ddc; + + for (cs_lnum_t j = 0; j < 3; j++) { + fctb[j] = dc[j] * pfac; + rhs[c_id][i][j] += _weight * fctb[j]; + } + } + } + + // for(cs_lnum_t i = 0; i < 3; i++){ + // for(cs_lnum_t j = 0; j < 3; j++){ + // rhs[c_id][i][j] = _rhs[tid][i][j]; + // } + // } + + } + } + + if (halo_type == CS_HALO_EXTENDED) { + + #pragma omp target teams distribute parallel for \ + schedule(static,1) + for (cs_lnum_t c_id1 = 0; c_id1 < n_cells; c_id1++) { + for (cs_lnum_t cidx = cell_cells_idx[c_id1]; + cidx < cell_cells_idx[c_id1+1]; + cidx++) { + + cs_lnum_t c_id2 = cell_cells_lst[cidx]; + + cs_real_t dc[3]; + + for (cs_lnum_t i = 0; i < 3; i++){ + dc[i] = cell_f_cen[c_id2][i] - cell_f_cen[c_id1][i]; + } + + cs_real_t ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + for (cs_lnum_t i = 0; i < 3; i++) { + + cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + + for (cs_lnum_t j = 0; j < 3; j++) { + rhs[c_id1][i][j] += dc[j] * pfac; + } + } + } + } + + } + + if(scatter){ + #pragma omp target teams distribute parallel for \ + firstprivate(cs_math_zero_threshold) schedule(static,1) + for (cs_lnum_t f_id = 0; f_id < n_b_faces; f_id++) { + + cs_lnum_t c_id1 = b_face_cells[f_id]; + + cs_real_t n_d_dist[3]; + // /* Normal is vector 0 if the b_face_normal norm is too small */ + cs_math_3_normalize(b_face_normal[f_id], n_d_dist); + + cs_real_t d_b_dist = 1. / b_dist[f_id]; + + // /* Normal divided by b_dist */ + for (cs_lnum_t i = 0; i < 3; i++){ + n_d_dist[i] *= d_b_dist; + } + + for (cs_lnum_t i = 0; i < 3; i++) { + cs_real_t pfac = coefav[f_id][i]*inc + + ( coefbv[f_id][0][i] * pvar[c_id1][0] + + coefbv[f_id][1][i] * pvar[c_id1][1] + + coefbv[f_id][2][i] * pvar[c_id1][2] + - pvar[c_id1][i]); + + for (cs_lnum_t j = 0; j < 3; j++){ + #pragma omp atomic + rhs[c_id1][i][j] += n_d_dist[j] * pfac; + } + } + + } + } + else{ + #pragma omp target teams distribute parallel for \ + firstprivate(cs_math_zero_threshold) schedule(static,1) + for (cs_lnum_t c_idx = 0; c_idx < n_b_cells; c_idx++) { + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id+1]; + + cs_lnum_t f_id; + + cs_real_t n_d_dist[3]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + f_id = cell_b_faces[index]; + + cs_math_3_normalize(b_face_normal[f_id], n_d_dist); + + cs_real_t d_b_dist = 1. 
/ b_dist[f_id]; + + // /* Normal divided by b_dist */ + for (cs_lnum_t i = 0; i < 3; i++){ + n_d_dist[i] *= d_b_dist; + } + + for (cs_lnum_t i = 0; i < 3; i++) { + cs_real_t pfac = coefav[f_id][i]*inc + + ( coefbv[f_id][0][i] * pvar[c_id][0] + + coefbv[f_id][1][i] * pvar[c_id][1] + + coefbv[f_id][2][i] * pvar[c_id][2] + - pvar[c_id][i]); + + for (cs_lnum_t j = 0; j < 3; j++){ + rhs[c_id][i][j] += n_d_dist[j] * pfac; + } + } + } + + } + } + + + #pragma omp target teams distribute parallel for \ + schedule(static,1) + for (cs_lnum_t c_idx = 0; c_idx < n_cells*3*3; c_idx++) { + + size_t c_id = c_idx / (3*3); + size_t i = (c_idx / 3) % 3; + size_t j = c_idx % 3; + + auto cocg_temp = cocg[c_id]; + cs_real_t _cocg[3]; + + _cocg[0] = cocg_temp[5]; + _cocg[1] = cocg_temp[4]; + _cocg[2] = cocg_temp[2]; + + if(j == 0){ + _cocg[0] = cocg_temp[0]; + _cocg[1] = cocg_temp[3]; + _cocg[2] = cocg_temp[5]; + } + + if(j == 1){ + _cocg[0] = cocg_temp[3]; + _cocg[1] = cocg_temp[1]; + _cocg[2] = cocg_temp[4]; + } + + gradv[c_id][i][j] = rhs[c_id][i][0] * _cocg[0] + + rhs[c_id][i][1] * _cocg[1] + + rhs[c_id][i][2] * _cocg[2]; + } + +} // end omp data + +t_end = omp_get_wtime(); + +t_kernel = t_end - t_begin; +printf("Time of kernel: %lf\n", t_kernel); + +} + +#endif +END_C_DECLS + static void _lsq_vector_gradient(const cs_mesh_t *m, const cs_mesh_adjacencies_t *madj, @@ -6665,7 +7189,7 @@ _lsq_vector_gradient(const cs_mesh_t *m, const cs_real_t *restrict c_weight, cs_real_33_t *restrict gradv) { - const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_cells = m->n_cells; const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; const int n_i_groups = m->i_face_numbering->n_groups; const int n_i_threads = m->i_face_numbering->n_threads; @@ -6691,16 +7215,113 @@ _lsq_vector_gradient(const cs_mesh_t *m, cs_cocg_6_t *restrict cocgb_s = NULL; cs_cocg_6_t *restrict cocg = NULL; - _get_cell_cocg_lsq(m, halo_type, false, fvq, &cocg, &cocgb_s); - cs_real_33_t *rhs; + /* Timing the computation */ - BFT_MALLOC(rhs, n_cells_ext, cs_real_33_t); + std::chrono::high_resolution_clock::time_point start, stop; + std::chrono::microseconds elapsed, elapsed_cuda, elapsed_target; + +#if defined(HAVE_CUDA) + bool accel = (cs_get_device_id() > -1) ? 
true : false; +#else + bool accel = false; +#endif + + _get_cell_cocg_lsq(m, halo_type, accel, fvq, &cocg, &cocgb_s); + + cs_real_33_t *rhs, *rhs_cuda, *rhs_target, *gradv_cuda, *gradv_cpu, *gradv_target; + bool compute_cuda, compute_cpu, res_cpu, perf, accuracy; + + compute_cuda = accel; + res_cpu = !accel; + +#if defined(DEBUG) + compute_cpu = true; + perf = true; + accuracy = true; +#elif defined(NDEBUG) + compute_cpu = true; + res_cpu = true; + perf = false; + accuracy = false; +#else + compute_cpu = false; + perf = false; + accuracy = false; +#endif + + // Pour l'instant ces lignes sont pour moi + // Elles seront à enlever + // compute_cuda = true; + compute_cpu = true; + // res_cpu = false; + perf = true; + // accuracy = true; + +BFT_MALLOC(rhs, n_cells_ext, cs_real_33_t); +BFT_MALLOC(rhs_cuda, n_cells_ext, cs_real_33_t); +BFT_MALLOC(rhs_target, n_cells_ext, cs_real_33_t); +BFT_MALLOC(gradv_cuda, n_cells_ext, cs_real_33_t); +BFT_MALLOC(gradv_cpu, n_cells_ext, cs_real_33_t); +BFT_MALLOC(gradv_target, n_cells_ext, cs_real_33_t); /* Compute Right-Hand Side */ /*-------------------------*/ +#if defined(HAVE_CUDA) + if(compute_cuda){ + if(perf){ + start = std::chrono::high_resolution_clock::now(); + } + cs_lsq_vector_gradient_cuda( + m, + madj, + fvq, + halo_type, + inc, + coefav, + coefbv, + pvar, + c_weight, + cocg, + cocgb_s, + gradv, + rhs_cuda); + + if(perf){ + stop = std::chrono::high_resolution_clock::now(); + elapsed_cuda = std::chrono::duration_cast(stop - start); + } + } // end if compute_cuda +#endif -# pragma omp parallel for +#if defined(HAVE_OPENMP_TARGET) +if(perf){ + start = std::chrono::high_resolution_clock::now(); +} +_lsq_vector_gradient_target(m, + madj, + fvq, + halo_type, + inc, + coefav, + coefbv, + pvar, + c_weight, + gradv_target, + cocg, + rhs_target); +if(perf){ + stop = std::chrono::high_resolution_clock::now(); + elapsed_target = std::chrono::duration_cast(stop - start); + printf("OMP target lsq %ld\n", elapsed_target.count()); +} +#endif + +if(compute_cpu){ + if(perf){ + start = std::chrono::high_resolution_clock::now(); + } + # pragma omp parallel for for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { for (cs_lnum_t i = 0; i < 3; i++) for (cs_lnum_t j = 0; j < 3; j++) @@ -6708,7 +7329,6 @@ _lsq_vector_gradient(const cs_mesh_t *m, } /* Contribution from interior faces */ - for (int g_id = 0; g_id < n_i_groups; g_id++) { # pragma omp parallel for @@ -6832,17 +7452,18 @@ _lsq_vector_gradient(const cs_mesh_t *m, /* Compute gradient */ /*------------------*/ + #pragma omp parallel for for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { for (cs_lnum_t i = 0; i < 3; i++) { - gradv[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] + gradv_cpu[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] + rhs[c_id][i][1] * cocg[c_id][3] + rhs[c_id][i][2] * cocg[c_id][5]; - gradv[c_id][i][1] = rhs[c_id][i][0] * cocg[c_id][3] + gradv_cpu[c_id][i][1] = rhs[c_id][i][0] * cocg[c_id][3] + rhs[c_id][i][1] * cocg[c_id][1] + rhs[c_id][i][2] * cocg[c_id][4]; - gradv[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] + gradv_cpu[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] + rhs[c_id][i][1] * cocg[c_id][4] + rhs[c_id][i][2] * cocg[c_id][2]; } @@ -6900,12 +7521,38 @@ _lsq_vector_gradient(const cs_mesh_t *m, for (int kk = 0; kk < 9; kk++) { int ii = _33_9_idx[kk][0]; int jj = _33_9_idx[kk][1]; - gradv[c_id][ii][jj] = x[kk]; + gradv_cpu[c_id][ii][jj] = x[kk]; } } } + stop = std::chrono::high_resolution_clock::now(); + elapsed = std::chrono::duration_cast(stop - start); +} // end if COMPUTE_CPU + 
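  /* Illustrative sketch (not part of the patch), added only to make the
     accuracy check below easier to read: a worked example of the
     cs_diff_ulp() helper defined earlier in this file.  cs_diff_ulp(x, y)
     returns |x - y| divided by the size of one unit in the last place (ULP)
     at the smaller exponent of x and y, so two adjacent doubles are exactly
     1 ULP apart at any magnitude.  Assumes <cassert> and <cfloat> are
     visible in this translation unit. */
  {
    const double u0 = 1.0;
    const double u1 = 1.0 + DBL_EPSILON;  /* next representable double above 1.0 */

    /* Adjacent values differ by exactly 1 ULP... */
    assert(cs_diff_ulp(u0, u1) == 1.0);

    /* ...and the measure is scale-invariant: still 1 ULP at 2^10. */
    assert(cs_diff_ulp(1024.0, 1024.0 + 1024.0*DBL_EPSILON) == 1.0);
  }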
+if(accuracy){ + #pragma omp parallel for + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { + for (int j = 0; j < 3; ++j) { + auto cpu = gradv_cpu[c_id][i][j]; + auto cuda = gradv[c_id][i][j]; + + if (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) > 1e-12) { + printf("lsq DIFFERENCE @%d-%d-%d: CPU = %.17f\tCUDA = %.17f\t|CPU - CUDA| = %.17f\t|CPU - CUDA|ulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), cs_diff_ulp(cpu, cuda)); + } + } + } + } +} + +if(perf) + printf("lsq Compute time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); + +if(res_cpu){ + memcpy(gradv, gradv_cpu, sizeof(cs_real_33_t) * n_cells_ext); +} /* Periodicity and parallelism treatment */ @@ -6916,6 +7563,11 @@ _lsq_vector_gradient(const cs_mesh_t *m, } BFT_FREE(rhs); + BFT_FREE(rhs_cuda); + BFT_FREE(rhs_target); + BFT_FREE(gradv_cuda); + BFT_FREE(gradv_cpu); + BFT_FREE(gradv_target); } /*----------------------------------------------------------------------------*/ @@ -7022,10 +7674,20 @@ _lsq_strided_gradient(const cs_mesh_t *m, BFT_MALLOC(rhs, n_cells_ext, grad_t); cs_array_real_fill_zero(n_cells_ext*stride*3, (cs_real_t *)rhs); + grad_t *gradv_cpu; + BFT_MALLOC(gradv_cpu, n_cells_ext*stride*3, grad_t); + + +#if defined(HAVE_CUDA) + bool accel = (cs_get_device_id() > -1) ? true : false; +#else + bool accel = false; +#endif + cs_cocg_6_t *restrict cocgb = NULL; cs_cocg_6_t *restrict cocg = NULL; - _get_cell_cocg_lsq(m, halo_type, false, fvq, &cocg, &cocgb); + _get_cell_cocg_lsq(m, halo_type, accel, fvq, &cocg, &cocgb); /* Contribution from interior faces -------------------------------- */ @@ -7295,24 +7957,45 @@ _lsq_strided_gradient(const cs_mesh_t *m, #pragma omp parallel for if(n_cells >= CS_THR_MIN) for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { for (cs_lnum_t i = 0; i < stride; i++) { - grad[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] - + rhs[c_id][i][1] * cocg[c_id][3] - + rhs[c_id][i][2] * cocg[c_id][5]; + gradv_cpu[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] + + rhs[c_id][i][1] * cocg[c_id][3] + + rhs[c_id][i][2] * cocg[c_id][5]; - grad[c_id][i][1] = rhs[c_id][i][0] * cocg[c_id][3] - + rhs[c_id][i][1] * cocg[c_id][1] - + rhs[c_id][i][2] * cocg[c_id][4]; + gradv_cpu[c_id][i][1] = rhs[c_id][i][0] * cocg[c_id][3] + + rhs[c_id][i][1] * cocg[c_id][1] + + rhs[c_id][i][2] * cocg[c_id][4]; - grad[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] - + rhs[c_id][i][1] * cocg[c_id][4] - + rhs[c_id][i][2] * cocg[c_id][2]; + gradv_cpu[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] + + rhs[c_id][i][1] * cocg[c_id][4] + + rhs[c_id][i][2] * cocg[c_id][2]; } } + memcpy(grad, gradv_cpu, sizeof(cs_real_t) * n_cells_ext * stride * 3); /* Correct gradient on boundary cells */ /*------------------------------------*/ - +cs_real_t c_norm, ref_norm; + +// #if defined(HAVE_CUDA) + // cs_lsq_vector_gradient_strided_cuda + // ( + // m, + // madj, + // fvq, + // halo_type, + // inc, + // coefav, + // coefbv, + // pvar, + // c_weight, + // cocg, + // cocgb, + // gradv, + // rhs, + // n_c_iter_max, + // c_eps); +// #else #pragma omp parallel for schedule(dynamic, CS_THR_MIN) for (cs_lnum_t c_idx = 0; c_idx < n_b_cells; c_idx++) { @@ -7321,7 +8004,7 @@ _lsq_strided_gradient(const cs_mesh_t *m, cs_lnum_t s_id = cell_b_faces_idx[c_id]; cs_lnum_t e_id = cell_b_faces_idx[c_id+1]; - cs_real_3_t *c_grad = grad[c_id]; + cs_real_3_t *c_grad = gradv_cpu[c_id]; cs_real_t grad_0[stride][3], grad_i[stride][3]; @@ -7330,7 +8013,7 @@ _lsq_strided_gradient(const cs_mesh_t *m, /* Compute norm 
for convergence testing. */ - cs_real_t ref_norm = 0; + ref_norm = 0; for (cs_lnum_t kk = 0; kk < stride; kk++) { for (cs_lnum_t ll = 0; ll < 3; ll++) ref_norm += abs(c_grad[kk][ll]); @@ -7338,7 +8021,7 @@ _lsq_strided_gradient(const cs_mesh_t *m, /* Iterate over boundary condition contributions. */ - cs_real_t c_norm = 0; + c_norm = 0; int n_c_it; for (n_c_it = 0; n_c_it < n_c_iter_max; n_c_it++) { @@ -7453,6 +8136,7 @@ _lsq_strided_gradient(const cs_mesh_t *m, #endif n_c_it *= -1; } +// #endif /* Optional postprocessing */ @@ -7464,12 +8148,27 @@ _lsq_strided_gradient(const cs_mesh_t *m, } } /* End of correction for BC coeffs */ + #pragma omp parallel for + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { + for (int j = 0; j < 3; ++j) { + auto cpu = gradv_cpu[c_id][i][j]; + auto cuda = grad[c_id][i][j]; + + if (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) > 1e-6) { + printf("lsq_strided DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\t|CPU - CUDA| = %a\t|CPU - CUDA|ulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), cs_diff_ulp(cpu, cuda)); + } + } + } + } + /* Synchronize halos */ _sync_strided_gradient_halo(m, halo_type, grad); BFT_FREE(rhs); + BFT_FREE(gradv_cpu); } /*---------------------------------------------------------------------------- @@ -8741,27 +9440,184 @@ _gradient_vector(const char *var_name, /* Use Neumann BC's as default if not provided */ + cs_real_3_t *_bc_coeff_a = NULL; cs_real_33_t *_bc_coeff_b = NULL; + /* Timing the computation */ + + std::chrono::high_resolution_clock::time_point start, stop; + std::chrono::microseconds elapsed, elapsed_cuda; + + cs_real_3_t *_bc_coeff_a_gpu = NULL; + cs_real_3_t *_bc_coeff_a_cpu = NULL; + cs_real_33_t *_bc_coeff_b_gpu = NULL; + cs_real_33_t *_bc_coeff_b_cpu = NULL; + + bool compute_cuda; + bool compute_cpu; + bool res_cpu; + bool perf; + bool accuracy; + +#if defined(HAVE_CUDA) + compute_cuda = (cs_get_device_id() > -1) ? 
true : false; +#else + compute_cuda = false; +#endif + + +res_cpu = !compute_cuda; + +#if defined(DEBUG) + compute_cpu = true; + perf = true; + accuracy = true; +#elif defined(NDEBUG) + compute_cpu = true; + perf = false; + accuracy = false; +#else + compute_cpu = false; + perf = false; + accuracy = false; +#endif + + // Pour l'instant ces lignes sont pour moi + // Elles seront à enlever + compute_cuda = false; + compute_cpu = true; + res_cpu = true; + + // A ne pas garder dans la version finale + // perf = false; + // accuracy = false; + +// Compute on GPU +#if defined(HAVE_CUDA) + if(compute_cuda){ + BFT_MALLOC(_bc_coeff_a_gpu, n_b_faces, cs_real_3_t); + BFT_MALLOC(_bc_coeff_b_gpu, n_b_faces, cs_real_33_t); + if(perf){ + start = std::chrono::high_resolution_clock::now(); + } + _gradient_vector_cuda(mesh, _bc_coeff_a_gpu, _bc_coeff_b_gpu, (bc_coeff_a == NULL), (bc_coeff_b == NULL), perf); + if(perf){ + stop = std::chrono::high_resolution_clock::now(); + elapsed_cuda = std::chrono::duration_cast(stop - start); + } + } +#endif + +// Compute on CPU + if(compute_cpu){ + BFT_MALLOC(_bc_coeff_a_cpu, n_b_faces, cs_real_3_t); + BFT_MALLOC(_bc_coeff_b_cpu, n_b_faces, cs_real_33_t); + + if(perf){ + start = std::chrono::high_resolution_clock::now(); + } + + if (bc_coeff_a == NULL) { + for (cs_lnum_t i = 0; i < n_b_faces; i++) { + for (cs_lnum_t j = 0; j < 3; j++) + _bc_coeff_a_cpu[i][j] = 0; + } + } + if (bc_coeff_b == NULL) { + for (cs_lnum_t i = 0; i < n_b_faces; i++) { + for (cs_lnum_t j = 0; j < 3; j++) { + for (cs_lnum_t k = 0; k < 3; k++) + _bc_coeff_b_cpu[i][j][k] = 0; + _bc_coeff_b_cpu[i][j][j] = 1; + } + } + } + + if(perf){ + stop = std::chrono::high_resolution_clock::now(); + elapsed = std::chrono::duration_cast(stop - start); + } + } + +// selected the result of the computation on CPU or GPU if (bc_coeff_a == NULL) { - BFT_MALLOC(_bc_coeff_a, n_b_faces, cs_real_3_t); - for (cs_lnum_t i = 0; i < n_b_faces; i++) { - for (cs_lnum_t j = 0; j < 3; j++) - _bc_coeff_a[i][j] = 0; + if(res_cpu){ + bc_coeff_a = (const cs_real_3_t *)_bc_coeff_a_cpu; + } else { + bc_coeff_a = (const cs_real_3_t *)_bc_coeff_a_gpu; } - bc_coeff_a = (const cs_real_3_t *)_bc_coeff_a; } if (bc_coeff_b == NULL) { - BFT_MALLOC(_bc_coeff_b, n_b_faces, cs_real_33_t); - for (cs_lnum_t i = 0; i < n_b_faces; i++) { - for (cs_lnum_t j = 0; j < 3; j++) { - for (cs_lnum_t k = 0; k < 3; k++) - _bc_coeff_b[i][j][k] = 0; - _bc_coeff_b[i][j][j] = 1; + if(res_cpu){ + bc_coeff_b = (const cs_real_33_t *)_bc_coeff_b_cpu; + } else { + bc_coeff_b = (const cs_real_33_t *)_bc_coeff_b_gpu; + } + } + + /* Performances */ + if(perf){ + #if defined(HAVE_CUDA) + if(compute_cuda){ + printf("_gradient_vector Compute and tranferts time in us: CUDA = %ld\n", elapsed_cuda.count()); } + #endif + + if(compute_cpu){ + printf("_gradient_vector Compute and tranferts time in us: CPU = %ld\n", elapsed.count()); + } + } + + /* Accuracy grad_cpu and grad_gpu */ + if(accuracy){ + #if defined(HAVE_CUDA) + if(compute_cuda){ + if(compute_cpu){ + for (cs_lnum_t f_id = 0; f_id < n_b_faces; f_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { + auto cpu = _bc_coeff_a_cpu[f_id][i]; + auto cuda = _bc_coeff_a_gpu[f_id][i]; + double err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); + if (err> 1e-12) { + printf("_gradient_vector_a DIFFERENCE @%d-%d: CPU = %.17f\tCUDA = %.17f\tdiff = %.17f\tdiff relative = %.17f\tulp = %a\n", f_id, i, cpu, cuda, fabs(cpu - cuda), err, cs_diff_ulp(cpu, cuda)); + } + } + } + + for (cs_lnum_t f_id = 0; f_id < n_b_faces; f_id++) { + for (cs_lnum_t i 
= 0; i < 3; i++) { + for (int j =0; j < 3; ++j) { + auto cpu = _bc_coeff_b_cpu[f_id][i][j]; + auto cuda = _bc_coeff_b_gpu[f_id][i][j]; + double err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); + if (err> 1e-12) { + printf("_gradient_vector_b DIFFERENCE @%d-%d-%d: CPU = %.17f\tCUDA = %.17f\tdiff = %.17f\tdiff relative = %.17f\tulp = %a\n", f_id, i, j, cpu, cuda, fabs(cpu - cuda), err, cs_diff_ulp(cpu, cuda)); + } + } + } + } + } + } + #endif + } + +// Free memory +#if defined(HAVE_CUDA) + if(compute_cuda){ + if(res_cpu){ + BFT_FREE(_bc_coeff_a_gpu); + BFT_FREE(_bc_coeff_b_gpu); + } + } +#endif + +// Free memory + if(compute_cpu){ + if(!res_cpu){ + BFT_FREE(_bc_coeff_a_cpu); + BFT_FREE(_bc_coeff_b_cpu); } - bc_coeff_b = (const cs_real_33_t *)_bc_coeff_b; } /* Update of local BC. coefficients for internal coupling */ diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index b8f16e2e79..df08ace6a0 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -23,53 +23,25 @@ */ /*----------------------------------------------------------------------------*/ +#include "cs_alge_cuda.cuh" -#include "cs_defs.h" - -/*---------------------------------------------------------------------------- - * Standard C library headers - *----------------------------------------------------------------------------*/ - -#include -#include -#include -#include -#include -#include -#include - -#if defined(HAVE_MPI) -#include -#endif - -#include - -/*---------------------------------------------------------------------------- - * Local headers - *----------------------------------------------------------------------------*/ - -#include "bft_error.h" -#include "bft_mem.h" - -#include "cs_base_accel.h" -#include "cs_base_cuda.h" -#include "cs_blas.h" -#include "cs_cell_to_vertex.h" -#include "cs_ext_neighborhood.h" -#include "cs_field.h" -#include "cs_field_pointer.h" -#include "cs_halo.h" -#include "cs_halo_perio.h" -#include "cs_log.h" -#include "cs_math.h" -#include "cs_mesh.h" -#include "cs_mesh_adjacencies.h" -#include "cs_mesh_quantities.h" -#include "cs_parall.h" -#include "cs_porous_model.h" -#include "cs_prototypes.h" -#include "cs_timer.h" -#include "cs_timer_stats.h" +#include "cs_gradient.h" +#include "cs_gradient_lsq_vector.cuh" +#include "cs_gradient_lsq_vector_gather.cuh" +#include "cs_gradient_lsq_vector_gather_v2.cuh" +#include "cs_gradient_lsq_vector_gather_v3.cuh" +#include "cs_gradient_lsq_vector_v2.cuh" +#include "cs_gradient_lsq_vector_v3.cuh" +#include "cs_gradient_priv.h" +#include "cs_reconstruct_vector_gradient_gather.cuh" +#include "cs_reconstruct_vector_gradient_gather_v2.cuh" +#include "cs_reconstruct_vector_gradient_gather_v3.cuh" +#include "cs_reconstruct_vector_gradient_gather_v4.cuh" +#include "cs_reconstruct_vector_gradient_gather_v5.cuh" +#include "cs_reconstruct_vector_gradient_scatter.cuh" +#include "cs_reconstruct_vector_gradient_scatter_cf.cuh" +#include "cs_reconstruct_vector_gradient_scatter_v2.cuh" +#include "cs_reconstruct_vector_gradient_scatter_v2_cf.cuh" /*---------------------------------------------------------------------------- * Header for the current file @@ -98,6 +70,36 @@ * Recompute cocg at boundaries, using saved cocgb *----------------------------------------------------------------------------*/ +#define INSTANTIATE_LSQ(name, stride) template void name (const cs_mesh_t *m,\ + const cs_mesh_adjacencies_t *madj,\ + const cs_mesh_quantities_t *fvq,\ + const cs_halo_type_t halo_type,\ + const int inc,\ + const cs_real_t 
(*restrict coefav)[stride],\ + const cs_real_t (*restrict coefbv)[stride][stride],\ + const cs_real_t (*restrict pvar)[stride],\ + const cs_real_t *restrict c_weight,\ + cs_cocg_6_t *restrict cocg,\ + cs_cocg_6_t *restrict cocgb,\ + cs_real_t (*restrict gradv)[stride][3],\ + cs_real_t (*restrict rhs)[stride][3],\ + cs_lnum_t n_c_iter_max,\ + cs_real_t c_eps) + +#define INSTANTIATE_RECONSTRUCT(name, stride) template void name (const cs_mesh_t *m, \ + const cs_mesh_adjacencies_t *madj, \ + const cs_mesh_quantities_t *fvq, \ + cs_halo_type_t halo_type, \ + int inc, \ + const cs_real_t (*restrict coefav)[stride], \ + const cs_real_t (*restrict coefbv)[stride][stride], \ + const cs_real_t (*restrict pvar)[stride], \ + const cs_real_t *restrict c_weight, \ + const cs_real_t (*restrict r_grad)[stride][3], \ + cs_real_t (*restrict grad)[stride][3], \ + bool test_bool, \ + bool perf) + template __global__ static void _compute_cocg_from_cocgb(cs_lnum_t n_b_cells, @@ -430,8 +432,115 @@ _init_rhsv(cs_lnum_t size, } } +__global__ static void +_init_rhs_v3(cs_lnum_t size, + double3 *restrict rhs) +{ + cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= size) + return; + + rhs[c_id] = make_double3(0.0, 0.0, 0.0); +} + +__global__ static void +_compute_gradient_lsq_v_v3(cs_lnum_t size, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs, + cs_cocg_6_t *restrict cocg) +{ + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= size) + return; + auto& gradc = gradv[c_id]; + auto& rhsc = rhs[c_id]; + auto cocgc = cocg[c_id]; + for(cs_lnum_t i = 0; i < 3; i++){ + auto& gradci = gradc[i]; + auto rhsci = rhsc[i]; + gradci[0] = rhsci[0] * cocgc[0] + + rhsci[1] * cocgc[3] + + rhsci[2] * cocgc[5]; + + gradci[1] = rhsci[0] * cocgc[3] + + rhsci[1] * cocgc[1] + + rhsci[2] * cocgc[4]; + + gradci[2] = rhsci[0] * cocgc[5] + + rhsci[1] * cocgc[4] + + rhsci[2] * cocgc[2]; + } +} + +__global__ static void +_compute_gradient_lsq_b_v(cs_lnum_t size, + cs_lnum_t n_b_cells, + cs_lnum_t *restrict b_cells, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs, + cs_cocg_6_t *restrict cocg, + cs_real_3_t *restrict b_face_normal, + cs_lnum_t *restrict cell_b_faces, + cs_lnum_t *restrict cell_b_faces_idx) +{ + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + + cs_lnum_t _33_9_idx[9][2]; + int nn = 0; + for (int ll = 0; ll < 3; ll++) { + for (int mm = 0; mm < 3; mm++) { + _33_9_idx[nn][0] = ll; + _33_9_idx[nn][1] = mm; + nn++; + } + } + + /* Loop on boundary cells */ + cs_lnum_t c_id1 = b_cells[c_id]; + cs_real_t cocgb[3][3], cocgb_v[45], rhsb_v[9], x[9]; + + cocgb[0][0] = cocg[c_id][0]; + cocgb[0][1] = cocg[c_id][3]; + cocgb[0][2] = cocg[c_id][5]; + cocgb[1][0] = cocg[c_id][3]; + cocgb[1][1] = cocg[c_id][1]; + cocgb[1][2] = cocg[c_id][4]; + cocgb[2][0] = cocg[c_id][5]; + cocgb[2][1] = cocg[c_id][4]; + cocgb[2][2] = cocg[c_id][2]; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id+1]; + cs_lnum_t f_id; + cs_real_3_t normal; + cs_real_t norm, inverse_norm; + + for (cs_lnum_t index = s_id; index < e_id; index++) { + + f_id = cell_b_faces[index]; + + /* Normal is vector 0 if the b_face_normal norm is too small */ + norm = sqrt(b_face_normal[index][0]*b_face_normal[index][0] + + b_face_normal[index][1]*b_face_normal[index][1] + + b_face_normal[index][2]*b_face_normal[index][2]); + + inverse_norm = 1. 
/ norm; + + normal[0] = inverse_norm * b_face_normal[index][0]; + normal[1] = inverse_norm * b_face_normal[index][1]; + normal[2] = inverse_norm * b_face_normal[index][2]; + + for (cs_lnum_t ii = 0; ii < 3; ii++) { + for (cs_lnum_t jj = 0; jj < 3; jj++) + cocgb[ii][jj] += normal[ii] * normal[jj]; + } + + } + +} + /*---------------------------------------------------------------------------- - * Synchronize of copy a cs_real_t type array from the host to a device. + * Synchronize of copy a T type array from the host to a device. * * parameters: * val_h <-- pointer to host data @@ -443,38 +552,6 @@ _init_rhsv(cs_lnum_t size, * after use if non-NULL) *----------------------------------------------------------------------------*/ -static void -_sync_or_copy_real_h2d(const cs_real_t *val_h, - cs_lnum_t n_vals, - int device_id, - cudaStream_t stream, - const cs_real_t **val_d, - void **buf_d) -{ - const cs_real_t *_val_d = NULL; - void *_buf_d = NULL; - - cs_alloc_mode_t alloc_mode = cs_check_device_ptr(val_h); - size_t size = n_vals * sizeof(cs_real_t); - - if (alloc_mode == CS_ALLOC_HOST) { - CS_CUDA_CHECK(cudaMalloc(&_buf_d, size)); - cs_cuda_copy_h2d(_buf_d, val_h, size); - _val_d = (const cs_real_t *)_buf_d; - } - else { - _val_d = (const cs_real_t *)cs_get_device_ptr((void *)val_h); - - if (alloc_mode == CS_ALLOC_HOST_DEVICE_SHARED) - cudaMemPrefetchAsync(val_h, size, device_id, stream); - else - cs_sync_h2d(val_h); - } - - *val_d = _val_d; - *buf_d = _buf_d; -} - /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */ /*============================================================================= @@ -749,3 +826,1362 @@ cs_gradient_scalar_lsq_cuda(const cs_mesh_t *m, } /*----------------------------------------------------------------------------*/ +/*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */ + +/*============================================================================= + * Semi-private function definitions + *============================================================================*/ + +/*---------------------------------------------------------------------------- + * Compute cell gradient using least-squares reconstruction for non-orthogonal + * meshes (nswrgp > 1). + * + * Optionally, a volume force generating a hydrostatic pressure component + * may be accounted for. + * + * cocg is computed to account for variable B.C.'s (flux). + * + * parameters: + * m <-- pointer to associated mesh structure + * madj <-- pointer to mesh adjacencies structure + * fvq <-- pointer to associated finite volume quantities + * halo_type <-- halo type (extended or not) + * inc <-- if 0, solve on increment; 1 otherwise + * coefav <-- B.C. coefficients for boundary face normals + * coefbv <-- B.C. 
coefficients for boundary face normals + * pvar <-- variable + * gradv --> gradient of pvar (du_i/dx_j : gradv[][i][j]) + *----------------------------------------------------------------------------*/ +extern "C" void +cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_halo_type_t halo_type, + const int inc, + const cs_real_3_t *restrict coefav, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict c_weight, + cs_cocg_6_t *restrict cocg, + cs_cocg_6_t *restrict cocgb, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs) +{ + const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; + const cs_lnum_t n_b_faces = m->n_b_faces; + const cs_lnum_t n_i_faces = m->n_i_faces; + + + int device_id; + cudaGetDevice(&device_id); + + cudaStream_t stream; + cudaStreamCreate(&stream); + + cudaEvent_t start, mem_h2d, init, i_faces, halo, b_faces, gradient, gradient_b, stop; + float msec = 0.0f, msecTotal = 0.0f; + CS_CUDA_CHECK(cudaEventCreate(&start)); + CS_CUDA_CHECK(cudaEventCreate(&mem_h2d)); + CS_CUDA_CHECK(cudaEventCreate(&init)); + CS_CUDA_CHECK(cudaEventCreate(&i_faces)); + CS_CUDA_CHECK(cudaEventCreate(&halo)); + CS_CUDA_CHECK(cudaEventCreate(&b_faces)); + CS_CUDA_CHECK(cudaEventCreate(&gradient)); + CS_CUDA_CHECK(cudaEventCreate(&gradient_b)); + CS_CUDA_CHECK(cudaEventCreate(&stop)); + + // Record the start event + CS_CUDA_CHECK(cudaEventRecord(start, stream)); + + cs_real_33_t *rhs_d; + CS_CUDA_CHECK(cudaMalloc(&rhs_d, n_cells_ext * sizeof(cs_real_33_t))); + + + cs_real_33_t *grad_d = NULL; + CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells * sizeof(cs_real_33_t))); + + void *_pvar_d = NULL, *_coefa_d = NULL, *_coefb_d = NULL, + *_cell_cells_idx_d = NULL; + const cs_real_3_t *pvar_d = NULL, *coefa_d = NULL; + const cs_real_33_t *coefb_d = NULL; + const cs_lnum_t *cell_cells_idx_d = NULL; + + // cs_cuda_copy_h2d(rhs_d, rhs, n_cells * sizeof(cs_real_33_t)); + + unsigned int blocksize = 256; + + const cs_lnum_2_t *restrict i_face_cells + = (const cs_lnum_2_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); + const cs_lnum_t *restrict b_face_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_face_cells); + const cs_lnum_t *restrict b_cells + = (cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_cells); + const cs_lnum_t *restrict cell_cells_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells_idx); + const cs_lnum_t *restrict cell_cells_lst + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->cell_cells_lst); + const cs_lnum_t *restrict cell_b_faces_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces_idx); + const cs_lnum_t *restrict cell_b_faces + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces); + const cs_lnum_t *restrict cell_i_faces + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_i_faces); + const short int *restrict cell_i_faces_sgn + = (const short int *restrict)cs_get_device_ptr_const_pf(madj->cell_i_faces_sgn); + const int n_i_groups = m->i_face_numbering->n_groups; + const int n_i_threads = m->i_face_numbering->n_threads; + const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index; + const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index; + + const cs_lnum_t *restrict cell_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells); 
+ const cs_real_3_t *restrict cell_cen + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_cen); + const cs_real_3_t *restrict cell_f_cen + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_cen); + const cs_real_t *restrict weight + = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->weight); + const cs_real_t *restrict b_dist + = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->b_dist); + const cs_real_3_t *restrict b_face_normal + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_face_normal); + const cs_real_3_t *restrict b_face_cog + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_f_face_cog); + + const cs_real_t *restrict cell_f_cen_1d + = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_cen); + const cs_real_3_t *restrict diipb + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->diipb); + + cs_lnum_t stride = 3; + + // printf("n_i_thread:%d\tn_i_groups:%d\tn_cells%d\n", n_i_threads, n_i_groups, n_cells); + + _sync_or_copy_real_h2d(pvar, n_cells_ext, device_id, stream, + &pvar_d, &_pvar_d); + + _sync_or_copy_real_h2d(coefav, n_b_faces, device_id, stream, + &coefa_d, &_coefa_d); + _sync_or_copy_real_h2d(coefbv, n_b_faces, device_id, stream, + &coefb_d, &_coefb_d); + + CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); + + // _init_rhs<<>> + // (n_cells_ext, + // rhs_d); + cudaMemset(rhs_d, 0, n_cells_ext*sizeof(cs_real_33_t)); + + // _init_rhs_v2<<>> + // (n_cells_ext*3*3, + // rhs_d); + + // _init_rhs_v3<<>> + // (n_cells_ext*3, + // rhs_d); + + CS_CUDA_CHECK(cudaEventRecord(init, stream)); + + + // _compute_rhs_lsq_v_i_face_v0<<>> + // (n_i_faces, + // i_face_cells, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); + + // _compute_rhs_lsq_v_i_face_cf<<>> + // (n_i_faces, + // i_face_cells, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); + // _compute_rhs_lsq_v_i_face<<>> + // (n_i_faces, + // i_face_cells, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); + + _compute_rhs_lsq_v_i_face_v2cf<<>> + (n_i_faces, + i_face_cells, + cell_f_cen, + rhs_d, + pvar_d, + weight, + c_weight); + + // _compute_rhs_lsq_v_i_face_v3<<>> + // (n_i_faces*3*3, + // i_face_cells, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); + // assert(cell_cells_idx); + // assert(cell_cells); + // assert(cell_f_cen); + // assert(rhs_d); + // assert(pvar_d); + // assert(weight); + // _compute_rhs_lsq_v_i_face_gather<<>> + // (n_cells, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); + + // _compute_rhs_lsq_v_i_face_gather_v2<<>> + // (n_cells, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); + + // _compute_rhs_lsq_v_i_face_gather_v4<<>> + // (n_cells, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); + + CS_CUDA_CHECK(cudaEventRecord(i_faces, stream)); + + if(halo_type == CS_HALO_EXTENDED && cell_cells_idx != NULL){ + + _compute_rhs_lsq_v_b_neighbor<<>> + (n_cells, + cell_cells_idx, + cell_cells, + cell_f_cen, + rhs_d, + pvar_d); + } + CS_CUDA_CHECK(cudaEventRecord(halo, stream)); + + // _compute_rhs_lsq_v_b_face<<n_b_cells, blocksize), blocksize, 0, stream>>> + // (m->n_b_faces, + // b_face_cells, + // cell_f_cen, + // 
b_face_normal, + // rhs_d, + // pvar_d, + // b_dist, + // coefb_d, + // coefa_d, + // inc); + + // _compute_rhs_lsq_v_b_face_gather_stride_v2<3, cs_real_3_t, cs_real_33_t><<n_b_cells, blocksize), blocksize, 0, stream>>> + // (m->n_b_cells, + // cell_b_faces_idx, + // cell_b_faces, + // b_cells, + // b_face_cog, + // cell_cen, + // rhs_d, + // pvar_d, + // coefb_d, + // coefa_d, + // cocg, + // cocgb, + // inc); + + _compute_rhs_lsq_v_b_face_gather_v3<<n_b_cells, blocksize), blocksize, 0, stream>>> + (m->n_b_cells, + cell_b_faces_idx, + cell_b_faces, + b_cells, + b_face_normal, + rhs_d, + pvar_d, + b_dist, + coefb_d, + coefa_d, + inc); + + // _compute_rhs_lsq_v_b_face_v2<<n_b_cells, blocksize), blocksize, 0, stream>>> + // (m->n_b_faces, + // b_face_cells, + // cell_f_cen, + // b_face_normal, + // rhs_d, + // pvar_d, + // b_dist, + // coefb_d, + // coefa_d, + // inc); + + CS_CUDA_CHECK(cudaEventRecord(b_faces, stream)); + + + // if (rhs_d != NULL) { + // size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + // cs_cuda_copy_d2h(rhs, rhs_d, size); + // } + // else + // cs_sync_d2h(rhs); + + // /* Compute gradient */ + // /*------------------*/ + + // _compute_gradient_lsq_v<<>> + // (n_cells, + // grad_d, + // rhs_d, + // cocg); + + // _compute_gradient_lsq_v_v4<<>> + // (n_cells, + // grad_d, + // rhs_d, + // cocg); + + + // _compute_gradient_lsq_v_v5<<>> + // (n_cells*3*3, + // gradv_d, + // rhs_d, + // cocg); + + _compute_gradient_lsq_v_v6<<>> + (n_cells*3*3, + grad_d, + rhs_d, + cocg); + + CS_CUDA_CHECK(cudaEventRecord(gradient, stream)); + + _compute_gradient_lsq_b_v<<n_b_cells, blocksize), blocksize, 0, stream>>> + (m->n_b_cells, + b_cells, + cell_b_faces_idx, + cell_b_faces, + b_face_normal, + diipb, + pvar_d, + b_dist, + coefb_d, + coefa_d, + grad_d, + rhs_d, + cocgb, + inc); + + CS_CUDA_CHECK(cudaEventRecord(gradient_b, stream)); + + // /* Sync to host */ + if (grad_d != NULL) { + size_t size = n_cells * sizeof(cs_real_t) * 3 * 3; + cs_cuda_copy_d2h(gradv, grad_d, size); + } + else + cs_sync_d2h(gradv); + + CS_CUDA_CHECK(cudaEventRecord(stop, stream)); + CS_CUDA_CHECK(cudaEventSynchronize(stop)); + + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); + + printf("lsq Kernels :"); + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); + printf("Kernels execution time in us: \t"); + printf("Init = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, i_faces)); + printf("I_faces = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, i_faces, halo)); + printf("Halo = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, halo, b_faces)); + printf("B_faces = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces, gradient)); + printf("Gradient = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, gradient, gradient_b)); + printf("Gradient_b = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, gradient_b)); + printf("Total kernel = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); + printf("Total = %f\t", msec*1000.f); + + printf("\n"); + + + if (_pvar_d != NULL) + CS_CUDA_CHECK(cudaFree(_pvar_d)); + if (_coefa_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefa_d)); + if (_coefb_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefb_d)); + + CS_CUDA_CHECK(cudaFree(rhs_d)); + CS_CUDA_CHECK(cudaFree(grad_d)); + +} + 
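/*----------------------------------------------------------------------------
 * Illustrative sketch (not part of the patch): the launch-configuration and
 * cudaEvent timing pattern used by the CUDA wrappers in this file.  The
 * kernel launch parameters are garbled in the hunk above, so the grid-size
 * helper ("_demo_grid_size"), the kernel ("_demo_set_zero") and the wrapper
 * ("_demo_timed_launch") below are hypothetical names introduced only for
 * this example.  The event-pair timing and blocksize = 256 match the code
 * above; the ceil-division grid size is an assumption, since the original
 * launch parameters are not legible here.
 *----------------------------------------------------------------------------*/

static unsigned int
_demo_grid_size(cs_lnum_t n, unsigned int block_size)
{
  return (n % block_size) ? n/block_size + 1 : n/block_size;
}

__global__ static void
_demo_set_zero(cs_lnum_t n, cs_real_t *x)
{
  cs_lnum_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    x[i] = 0.;
}

static void
_demo_timed_launch(cs_lnum_t n, cs_real_t *x_d, cudaStream_t stream)
{
  const unsigned int blocksize = 256;

  cudaEvent_t e_start, e_stop;
  CS_CUDA_CHECK(cudaEventCreate(&e_start));
  CS_CUDA_CHECK(cudaEventCreate(&e_stop));

  CS_CUDA_CHECK(cudaEventRecord(e_start, stream));

  _demo_set_zero<<<_demo_grid_size(n, blocksize), blocksize, 0, stream>>>
    (n, x_d);

  CS_CUDA_CHECK(cudaEventRecord(e_stop, stream));
  CS_CUDA_CHECK(cudaEventSynchronize(e_stop));

  /* cudaEventElapsedTime reports milliseconds; the wrappers above print
     microseconds, hence the factor 1000. */
  float msec = 0.f;
  CS_CUDA_CHECK(cudaEventElapsedTime(&msec, e_start, e_stop));
  printf("demo kernel = %f us\n", msec*1000.f);

  CS_CUDA_CHECK(cudaEventDestroy(e_start));
  CS_CUDA_CHECK(cudaEventDestroy(e_stop));
}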
+/*----------------------------------------------------------------------------*/ +/*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */ + +/*============================================================================= + * Semi-private function definitions + *============================================================================*/ + +/*---------------------------------------------------------------------------- + * Compute cell gradient using least-squares reconstruction for non-orthogonal + * meshes (nswrgp > 1). + * + * Optionally, a volume force generating a hydrostatic pressure component + * may be accounted for. + * + * cocg is computed to account for variable B.C.'s (flux). + * + * parameters: + * m <-- pointer to associated mesh structure + * madj <-- pointer to mesh adjacencies structure + * fvq <-- pointer to associated finite volume quantities + * halo_type <-- halo type (extended or not) + * inc <-- if 0, solve on increment; 1 otherwise + * coefav <-- B.C. coefficients for boundary face normals + * coefbv <-- B.C. coefficients for boundary face normals + * pvar <-- variable + * gradv --> gradient of pvar (du_i/dx_j : gradv[][i][j]) + *----------------------------------------------------------------------------*/ +template +void +cs_lsq_vector_gradient_strided_cuda(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_halo_type_t halo_type, + const int inc, + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *restrict c_weight, + cs_cocg_6_t *restrict cocg, + cs_cocg_6_t *restrict cocgb, + cs_real_t (*restrict gradv)[stride][3], + cs_real_t (*restrict rhs)[stride][3], + cs_lnum_t n_c_iter_max, + cs_real_t c_eps) +{ + const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; + const cs_lnum_t n_b_faces = m->n_b_faces; + const cs_lnum_t n_i_faces = m->n_i_faces; + + + int device_id; + cudaGetDevice(&device_id); + + cudaStream_t stream; + cudaStreamCreate(&stream); + + cudaEvent_t start, mem_h2d, init, i_faces, halo, b_faces, gradient, stop; + float msec = 0.0f, msecTotal = 0.0f; + CS_CUDA_CHECK(cudaEventCreate(&start)); + CS_CUDA_CHECK(cudaEventCreate(&mem_h2d)); + CS_CUDA_CHECK(cudaEventCreate(&init)); + CS_CUDA_CHECK(cudaEventCreate(&i_faces)); + CS_CUDA_CHECK(cudaEventCreate(&halo)); + CS_CUDA_CHECK(cudaEventCreate(&b_faces)); + CS_CUDA_CHECK(cudaEventCreate(&gradient)); + CS_CUDA_CHECK(cudaEventCreate(&stop)); + + // Record the start event + CS_CUDA_CHECK(cudaEventRecord(start, stream)); + + decltype(rhs) rhs_d; + CS_CUDA_CHECK(cudaMalloc(&rhs_d, n_cells_ext * sizeof(cs_real_t)*stride*3)); + + + decltype(gradv) grad_d = NULL; + CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells * sizeof(cs_real_t)*stride*3)); + + void *_pvar_d = NULL, *_coefa_d = NULL, *_coefb_d = NULL, + *_cell_cells_idx_d = NULL; + decltype(pvar) pvar_d = NULL, coefa_d = NULL; + decltype(coefbv) coefb_d = NULL; + const cs_lnum_t *cell_cells_idx_d = NULL; + + unsigned int blocksize = 256; + + const cs_lnum_2_t *restrict i_face_cells + = (const cs_lnum_2_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); + const cs_lnum_t *restrict b_face_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_face_cells); + const cs_lnum_t *restrict b_cells + = (cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_cells); + const cs_lnum_t *restrict cell_cells_idx + = (const cs_lnum_t 
*restrict)cs_get_device_ptr_const_pf(madj->cell_cells_idx); + const cs_lnum_t *restrict cell_cells_lst + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->cell_cells_lst); + const cs_lnum_t *restrict cell_b_faces_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces_idx); + const cs_lnum_t *restrict cell_b_faces + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces); + const cs_lnum_t *restrict cell_i_faces + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_i_faces); + const short int *restrict cell_i_faces_sgn + = (const short int *restrict)cs_get_device_ptr_const_pf(madj->cell_i_faces_sgn); + const int n_i_groups = m->i_face_numbering->n_groups; + const int n_i_threads = m->i_face_numbering->n_threads; + const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index; + const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index; + + const cs_lnum_t *restrict cell_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells); + const cs_real_3_t *restrict cell_cen + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_cen); + const cs_real_3_t *restrict cell_f_cen + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_cen); + const cs_real_t *restrict weight + = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->weight); + const cs_real_t *restrict b_dist + = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->b_dist); + const cs_real_3_t *restrict b_face_normal + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_face_normal); + const cs_real_3_t *restrict b_face_cog + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_f_face_cog); + const cs_real_3_t *restrict diipb + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->diipb); + + _sync_or_copy_real_h2d(pvar, n_cells_ext, device_id, stream, + &pvar_d, &_pvar_d); + + _sync_or_copy_real_h2d(coefav, n_b_faces, device_id, stream, + &coefa_d, &_coefa_d); + _sync_or_copy_real_h2d(coefbv, n_b_faces, device_id, stream, + &coefb_d, &_coefb_d); + + cs_cuda_copy_h2d(grad_d, gradv, sizeof(cs_real_t) * n_cells * stride * 3); + + CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); + + CS_CUDA_CHECK(cudaEventRecord(init, stream)); + + CS_CUDA_CHECK(cudaEventRecord(i_faces, stream)); + + CS_CUDA_CHECK(cudaEventRecord(halo, stream)); + + // assert(b_cells); + // assert(cell_b_faces_idx); + // assert(cell_b_faces); + // assert(b_face_cog); + // assert(cell_cen); + // assert(diipb); + // assert(grad_d); + // assert(coefb_d); + // assert(cocg); + + // for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + // for (cs_lnum_t i = 0; i < stride; i++) { + // for (int j = 0; j < 3; ++j) { + // // if(fabs(gradv[c_id][i][j]) != 0.0) + // // printf("grad = %f\t", gradv[c_id][i][j]); + // } + // } + // } + + _compute_gradient_lsq_b_strided_v<<n_b_cells, blocksize), blocksize, 0, stream>>> + (m->n_b_cells, + b_cells, + cell_b_faces_idx, + cell_b_faces, + b_face_cog, + cell_cen, + diipb, + grad_d, + coefb_d, + cocg, + n_c_iter_max, + c_eps); + + CS_CUDA_CHECK(cudaEventRecord(b_faces, stream)); + + CS_CUDA_CHECK(cudaEventRecord(gradient, stream)); + + // /* Sync to host */ + if (grad_d != NULL) { + size_t size = n_cells * sizeof(cs_real_t) * stride * 3; + cs_cuda_copy_d2h(gradv, grad_d, size); + } + else + cs_sync_d2h(gradv); + + CS_CUDA_CHECK(cudaEventRecord(stop, stream)); + CS_CUDA_CHECK(cudaEventSynchronize(stop)); + + cudaStreamSynchronize(stream); + 
cudaStreamDestroy(stream); + + printf("lsq Kernels :"); + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); + // printf("Kernels execution time in us: \t"); + // printf("Init = %f\t", msec*1000.f); + + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, i_faces)); + // printf("I_faces = %f\t", msec*1000.f); + + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, i_faces, halo)); + // printf("Halo = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, halo, b_faces)); + printf("B_faces = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces, gradient)); + printf("Gradient = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, gradient)); + printf("Total kernel = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); + printf("Total = %f\t", msec*1000.f); + + printf("\n"); + + + if (_pvar_d != NULL) + CS_CUDA_CHECK(cudaFree(_pvar_d)); + if (_coefa_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefa_d)); + if (_coefb_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefb_d)); + + CS_CUDA_CHECK(cudaFree(rhs_d)); + CS_CUDA_CHECK(cudaFree(grad_d)); + +} + +INSTANTIATE_LSQ(cs_lsq_vector_gradient_strided_cuda, 1); +INSTANTIATE_LSQ(cs_lsq_vector_gradient_strided_cuda, 3); +INSTANTIATE_LSQ(cs_lsq_vector_gradient_strided_cuda, 6); + + + + +/*---------------------------------------------------------------------------- + * Reconstruct the gradient of a vector using a given gradient of + * this vector (typically lsq). + * + * parameters: + * m <-- pointer to associated mesh structure + * fvq <-- pointer to associated finite volume quantities + * cpl <-- structure associated with internal coupling, or NULL + * inc <-- if 0, solve on increment; 1 otherwise + * coefav <-- B.C. coefficients for boundary face normals + * coefbv <-- B.C. 
coefficients for boundary face normals + * pvar <-- variable + * c_weight <-- weighted gradient coefficient variable + * r_grad --> gradient used for reconstruction + * grad --> gradient of pvar (du_i/dx_j : grad[][i][j]) + *----------------------------------------------------------------------------*/ +template +void +cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + cs_halo_type_t halo_type, + int inc, + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *restrict c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + bool test_bool, + bool perf) +{ + const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_b_cells = m->n_b_cells; + const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; + const cs_lnum_t n_b_faces = m->n_b_faces; + const cs_lnum_t n_i_faces = m->n_i_faces; + + int device_id; + cudaGetDevice(&device_id); + + + cudaStream_t stream; + cudaStreamCreate(&stream); + + cudaEvent_t start, mem_h2d, init, i_faces, b_faces_1, b_faces_2, b_faces_3, stop; + float msec = 0.0f, msec_tot; + CS_CUDA_CHECK(cudaEventCreate(&start)); + CS_CUDA_CHECK(cudaEventCreate(&mem_h2d)); + CS_CUDA_CHECK(cudaEventCreate(&init)); + CS_CUDA_CHECK(cudaEventCreate(&i_faces)); + CS_CUDA_CHECK(cudaEventCreate(&b_faces_1)); + CS_CUDA_CHECK(cudaEventCreate(&b_faces_2)); + CS_CUDA_CHECK(cudaEventCreate(&b_faces_3)); + CS_CUDA_CHECK(cudaEventCreate(&stop)); + + + // Record the start event + CS_CUDA_CHECK(cudaEventRecord(start, stream)); + + decltype(grad) grad_d; + CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells_ext * sizeof(cs_real_t)*stride*3)); + + void *_pvar_d = NULL, *_coefa_d = NULL, *_coefb_d = NULL, + *_cell_cells_idx_d = NULL, *_r_grad_d = NULL; + decltype(pvar) pvar_d = NULL, coefa_d = NULL; + decltype(coefbv) coefb_d = NULL; + decltype(r_grad) r_grad_d = NULL; + const cs_lnum_t *cell_cells_idx_d = NULL; + + + unsigned int blocksize = 256; + + const cs_lnum_2_t *restrict i_face_cells + = (const cs_lnum_2_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); + const cs_lnum_t *restrict b_face_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_face_cells); + const cs_lnum_t *restrict cell_b_faces_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces_idx); + const int n_i_groups + = m->i_face_numbering->n_groups; + const int n_i_threads + = m->i_face_numbering->n_threads; + cs_lnum_t *restrict i_group_index; + CS_CUDA_CHECK(cudaMalloc(&i_group_index, sizeof(int)*n_i_groups * n_i_threads * 2)); + cs_cuda_copy_h2d(i_group_index, (void *)m->i_face_numbering->group_index, sizeof(int)*n_i_groups * n_i_threads * 2); + + const int n_b_groups + = m->b_face_numbering->n_groups; + const int n_b_threads + = m->b_face_numbering->n_threads; + cs_lnum_t *restrict b_group_index; + CS_CUDA_CHECK(cudaMalloc(&b_group_index, sizeof(int)*n_i_groups * n_i_threads * 2)); + cs_cuda_copy_h2d(b_group_index, (void *)m->b_face_numbering->group_index, sizeof(int)*n_b_groups * n_b_threads * 2); + const cs_lnum_t *restrict cell_cells_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells_idx); + const cs_lnum_t *restrict cell_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells); + // if (madj->cell_i_faces == NULL) { + cs_mesh_adjacencies_update_cell_i_faces(); + // } + assert(madj->cell_i_faces); + const 
cs_lnum_t n_cells_i_face = (madj->cell_cells_idx[n_cells]); + cs_lnum_t *restrict cell_i_faces; + CS_CUDA_CHECK(cudaMalloc(&cell_i_faces, sizeof(cs_lnum_t)*n_cells_i_face)); + cs_cuda_copy_h2d(cell_i_faces, madj->cell_i_faces, sizeof(cs_lnum_t)*n_cells_i_face); + assert(cell_i_faces); + + short int *restrict cell_i_faces_sgn; + CS_CUDA_CHECK(cudaMalloc(&cell_i_faces_sgn, sizeof(short int)*n_cells_i_face)); + cs_cuda_copy_h2d(cell_i_faces_sgn, madj->cell_i_faces_sgn, sizeof(short int)*n_cells_i_face); + + const cs_lnum_t *restrict b_cells + = (cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_cells); + const cs_lnum_t *restrict cell_b_faces + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces); + + cs_real_t *restrict cell_f_vol; + CS_CUDA_CHECK(cudaMalloc(&cell_f_vol, n_cells * sizeof(cs_real_t))); + cs_cuda_copy_h2d(cell_f_vol, (void *)fvq->cell_f_vol, sizeof(cs_real_t)*n_cells); + if (cs_glob_porous_model == 1 || cs_glob_porous_model == 2) + cell_f_vol = fvq->cell_vol; + const cs_real_3_t *restrict cell_f_cen + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_cen); + const cs_real_t *restrict weight + = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->weight); + cs_real_3_t *restrict i_f_face_normal; + CS_CUDA_CHECK(cudaMalloc(&i_f_face_normal, sizeof(cs_real_3_t)*n_i_faces)); + cs_cuda_copy_h2d(i_f_face_normal, (void *)fvq->i_f_face_normal, sizeof(cs_real_3_t)*n_i_faces); + + const cs_real_3_t *restrict b_f_face_normal + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_f_face_normal); + cs_real_3_t *restrict dofij; + CS_CUDA_CHECK(cudaMalloc(&dofij, sizeof(cs_real_3_t)*n_i_faces)); + cs_cuda_copy_h2d(dofij, (void *)fvq->dofij, sizeof(cs_real_3_t)*n_i_faces); + const cs_real_3_t *restrict diipb + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->diipb); + cs_real_33_t *restrict corr_grad_lin; + CS_CUDA_CHECK(cudaMalloc(&corr_grad_lin, n_cells * sizeof(cs_real_33_t))); + cs_cuda_copy_h2d(corr_grad_lin, (void *)fvq->corr_grad_lin, sizeof(cs_real_33_t)*n_cells); + const cs_lnum_t has_dc + = fvq->has_disable_flag; + int *restrict c_disable_flag; + CS_CUDA_CHECK(cudaMalloc(&c_disable_flag, n_cells * sizeof(int))); + cs_cuda_copy_h2d(c_disable_flag, (void *)fvq->c_disable_flag, sizeof(int)*n_cells); + + + _sync_or_copy_real_h2d(pvar, n_cells_ext, device_id, stream, + &pvar_d, &_pvar_d); + + _sync_or_copy_real_h2d(r_grad, n_cells_ext, device_id, stream, + &r_grad_d, &_r_grad_d); + + _sync_or_copy_real_h2d(coefav, n_b_faces, device_id, stream, + &coefa_d, &_coefa_d); + _sync_or_copy_real_h2d(coefbv, n_b_faces, device_id, stream, + &coefb_d, &_coefb_d); + + + // ----------------------------Begin of Kernels part 1------------------------------------------- + + CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); + + /* Initialization */ + + cudaMemset(grad_d, 0, n_cells * sizeof(cs_real_t)*stride*3); + + CS_CUDA_CHECK(cudaEventRecord(init, stream)); + + + /* Interior faces contribution */ + + /*************************************Kernels Scatter**************************************************/ + _compute_reconstruct_v_i_face<<>> + (n_i_faces, + i_face_cells, + pvar_d, + weight, + c_weight, + r_grad_d, + grad_d, + dofij, + i_f_face_normal); + + // _compute_reconstruct_v_i_face_v2<<>> + // (n_i_faces * 3, + // i_face_cells, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal); + + /*************************************Kernels Scatter conflict 
free**************************************/ + // _compute_reconstruct_v_i_face_cf<<>> + // (n_i_faces, + // i_face_cells, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal); + + // _compute_reconstruct_v_i_face_v2_cf<<>> + // (n_i_faces * 3, + // i_face_cells, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal); + + /*************************************Kernels Gather**************************************************/ + // _compute_reconstruct_v_i_face_gather<<>> + // ( n_cells, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn); + + + // _compute_reconstruct_v_i_face_gather_v2<<>> + // ( n_cells * 3 * 3, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn); + + + + /*************************************Kernels Gather registers memory************************************/ + // _compute_reconstruct_v_i_face_gather_v3<<>> + // ( n_cells, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn); + + + // _compute_reconstruct_v_i_face_gather_v4<<>> + // ( n_cells * 3 * 3, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn); + + + + + /*************************************Kernels Gather shared memory***************************************/ + // _compute_reconstruct_v_i_face_gather_v5<<>> + // ( n_cells, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn); + + CS_CUDA_CHECK(cudaEventRecord(i_faces, stream)); + + // ----------------------------End of Kernels part 1------------------------------------------- + + // if (grad_d != NULL) { + // size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + // cs_cuda_copy_d2h(grad, grad_d, size); + // } + // else + // cs_sync_d2h(grad); + + // size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + // cs_cuda_copy_d2h(r_grad, r_grad_d, size); + + + /* Contribution from coupled faces */ + // if (cpl != NULL) { + // cs_internal_coupling_initialize_vector_gradient(cpl, c_weight, pvar, grad); + // cs_internal_coupling_reconstruct_vector_gradient(cpl, r_grad, grad); + // } + + // cs_cuda_copy_h2d(grad_d, grad, n_cells_ext * sizeof(cs_real_33_t)); + + CS_CUDA_CHECK(cudaEventRecord(b_faces_1, stream)); + + // ----------------------------Begin of Kernels part 2------------------------------------------- + + + /*************************************Kernels Scatter**************************************************/ + _compute_reconstruct_v_b_face<<>> + ( n_b_faces, + coefb_d, + coefa_d, + pvar_d, + inc, + diipb, + r_grad_d, + grad_d, + b_f_face_normal, + b_face_cells); + + + // _compute_reconstruct_v_b_face_v2<<>> + // ( n_b_faces * 3, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_face_cells); + + /*************************************Kernels Scatter conflict free************************************/ + // _compute_reconstruct_v_b_face_cf<<>> + // ( 
n_b_faces, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_face_cells); + + // _compute_reconstruct_v_b_face_v2_cf<<>> + // ( n_b_faces * 3, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_face_cells); + + /*************************************Kernels Gather**************************************************/ + // _compute_reconstruct_v_b_face_gather<<>> + // ( n_b_cells, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_cells, + // cell_b_faces, + // cell_b_faces_idx); + + + // _compute_reconstruct_v_b_face_gather_v2<<>> + // ( n_b_cells * 3, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_cells, + // cell_b_faces, + // cell_b_faces_idx); + + /*************************************Kernels Gather registers memory***************************************/ + // _compute_reconstruct_v_b_face_gather_v3<<>> + // ( n_b_cells, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_cells, + // cell_b_faces, + // cell_b_faces_idx); + + + // _compute_reconstruct_v_b_face_gather_v4<<>> + // ( n_b_cells * 3, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_cells, + // cell_b_faces, + // cell_b_faces_idx); + + + + /*************************************Kernels Gather shared memory***************************************/ + // _compute_reconstruct_v_b_face_gather_v5<<>> + // ( n_b_cells, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_cells, + // cell_b_faces, + // cell_b_faces_idx); + + + CS_CUDA_CHECK(cudaEventRecord(b_faces_2, stream)); + + // _compute_reconstruct_correction<<>> + // ( n_cells, + // has_dc, + // c_disable_flag, + // cell_f_vol, + // grad_d, + // corr_grad_lin, + // test_bool + // ); + + _compute_reconstruct_correction_v2<<>> + ( n_cells * 3, + has_dc, + c_disable_flag, + cell_f_vol, + grad_d, + corr_grad_lin, + test_bool + ); + CS_CUDA_CHECK(cudaEventRecord(b_faces_3, stream)); + + // ----------------------------End of Kernels part 2------------------------------------------- + + /* Sync to host */ + if (grad_d != NULL) { + size_t size = n_cells_ext * sizeof(cs_real_t) * stride * 3; + cs_cuda_copy_d2h(grad, grad_d, size); + } + else + cs_sync_d2h(grad); + + + CS_CUDA_CHECK(cudaEventRecord(stop, stream)); + CS_CUDA_CHECK(cudaEventSynchronize(stop)); + + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); + + if(perf){ + printf("reconstruct Kernels times:\t"); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); + printf("Kernels execution time in us: \t"); + printf("Init = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, i_faces)); + printf("I_faces = %f\t", msec*1000.f); + + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, i_faces, b_faces_1)); + // printf("CPU part = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_1, b_faces_2)); + printf("B_faces = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_2, b_faces_3)); + printf("Correction = %f\t", msec*1000.f); + + printf("\n"); + + msec_tot = 0.0f; + msec = 0.0f; + 
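/* All of the timings printed in this block follow the same CUDA event
   pattern: events are recorded on `stream` around each kernel group,
   cudaEventElapsedTime() returns the gap in milliseconds, and the printf()s
   convert it to microseconds with msec * 1000.f. A minimal self-contained
   sketch of that pattern (illustration only; my_kernel, ev_before, ev_after,
   n and data are placeholder names, not part of this patch):

     cudaEvent_t ev_before, ev_after;
     CS_CUDA_CHECK(cudaEventCreate(&ev_before));
     CS_CUDA_CHECK(cudaEventCreate(&ev_after));

     CS_CUDA_CHECK(cudaEventRecord(ev_before, stream));
     my_kernel<<<get_gridsize(n, blocksize), blocksize, 0, stream>>>(n, data);
     CS_CUDA_CHECK(cudaEventRecord(ev_after, stream));

     CS_CUDA_CHECK(cudaEventSynchronize(ev_after));
     float ms = 0.f;
     CS_CUDA_CHECK(cudaEventElapsedTime(&ms, ev_before, ev_after));
     printf("my_kernel = %f us\n", ms * 1000.f);

     CS_CUDA_CHECK(cudaEventDestroy(ev_before));
     CS_CUDA_CHECK(cudaEventDestroy(ev_after));

   In the code shown here the events created at the top of the function are
   not destroyed before returning; a matching cudaEventDestroy() per event
   would avoid leaking them on repeated calls. */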
CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, i_faces)); + printf("reconstruct Total kernel part 1= %f\t", msec*1000.f); + msec_tot = msec; + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_1, b_faces_3)); + printf("Total kernel part 2= %f\t", msec*1000.f); + msec_tot += msec; + + printf("Total kernel 1 and 2= %f\t", msec_tot*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); + printf("Total = %f\t", msec*1000.f); + + printf("\n"); + } + + if (_pvar_d != NULL) + CS_CUDA_CHECK(cudaFree(_pvar_d)); + if (_coefa_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefa_d)); + if (_coefb_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefb_d)); + if (_r_grad_d != NULL) + CS_CUDA_CHECK(cudaFree(_r_grad_d)); + + CS_CUDA_CHECK(cudaFree(cell_i_faces)); + CS_CUDA_CHECK(cudaFree(cell_i_faces_sgn)); + + CS_CUDA_CHECK(cudaFree(i_group_index)); + CS_CUDA_CHECK(cudaFree(b_group_index)); + CS_CUDA_CHECK(cudaFree(cell_f_vol)); + CS_CUDA_CHECK(cudaFree(i_f_face_normal)); + CS_CUDA_CHECK(cudaFree(dofij)); + CS_CUDA_CHECK(cudaFree(corr_grad_lin)); + CS_CUDA_CHECK(cudaFree(c_disable_flag)); + CS_CUDA_CHECK(cudaFree(grad_d)); +} + + +/*---------------------------------------------------------------------------- + * _gradient_vector the gradient of a vector using a given gradient of + * this vector (typically lsq). + * + * parameters: + * m <-- pointer to associated mesh structure + * fvq <-- pointer to associated finite volume quantities + * cpl <-- structure associated with internal coupling, or NULL + * inc <-- if 0, solve on increment; 1 otherwise + * coefav <-- B.C. coefficients for boundary face normals + * coefbv <-- B.C. coefficients for boundary face normals + * pvar <-- variable + * c_weight <-- weighted gradient coefficient variable + * r_grad --> gradient used for reconstruction + * grad --> gradient of pvar (du_i/dx_j : grad[][i][j]) + *----------------------------------------------------------------------------*/ +extern "C" void +_gradient_vector_cuda(const cs_mesh_t *mesh, + cs_real_3_t *_bc_coeff_a, + cs_real_33_t *_bc_coeff_b, + bool a_null, + bool b_null, + bool perf) +{ + const cs_lnum_t n_b_faces = mesh->n_b_faces; + + int device_id; + cudaGetDevice(&device_id); + + cudaStream_t stream; + cudaStreamCreate(&stream); + + cudaEvent_t start, mem_h2d, init1, init2, stop; + float msec = 0.0f; + CS_CUDA_CHECK(cudaEventCreate(&start)); + CS_CUDA_CHECK(cudaEventCreate(&mem_h2d)); + CS_CUDA_CHECK(cudaEventCreate(&init1)); + CS_CUDA_CHECK(cudaEventCreate(&init2)); + CS_CUDA_CHECK(cudaEventCreate(&stop)); + + + // Record the start event + CS_CUDA_CHECK(cudaEventRecord(start, stream)); + + unsigned int blocksize = 256; + + cs_real_3_t *_bc_coeff_a_d; + CS_CUDA_CHECK(cudaMalloc(&_bc_coeff_a_d, n_b_faces * sizeof(cs_real_3_t))); + cs_real_33_t *_bc_coeff_b_d; + CS_CUDA_CHECK(cudaMalloc(&_bc_coeff_b_d, n_b_faces * sizeof(cs_real_33_t))); + + + /* Initialization */ + + CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); + + if(a_null){ + cudaMemset(_bc_coeff_a_d, 0, n_b_faces * sizeof(cs_real_3_t)); + } + + CS_CUDA_CHECK(cudaEventRecord(init1, stream)); + + if(b_null){ + cudaMemset(_bc_coeff_b_d, 0, n_b_faces * sizeof(cs_real_33_t)); + _set_one_to_coeff_b<<< get_gridsize(n_b_faces * 3, blocksize), blocksize, 0, stream>>> + (n_b_faces * 3, _bc_coeff_b_d); + } + + CS_CUDA_CHECK(cudaEventRecord(init2, stream)); + + + /* Sync to host */ + if (_bc_coeff_a_d != NULL) { + size_t size = n_b_faces * sizeof(cs_real_t) * 3; + cs_cuda_copy_d2h(_bc_coeff_a, _bc_coeff_a_d, size); + } + else + 
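/* When b_null is set, the default implicit BC coefficient is built as the
   identity: the cudaMemset() above zeroes _bc_coeff_b_d and
   _set_one_to_coeff_b() (defined elsewhere in this patch) is launched over
   n_b_faces * 3 threads, so each thread presumably writes one diagonal
   entry. A sketch of what such a kernel could look like, as an assumption
   rather than the actual implementation:

     __global__ static void
     _set_one_to_coeff_b(cs_lnum_t n_b_faces_x3,
                         cs_real_33_t *restrict coefb)
     {
       cs_lnum_t t_id = blockIdx.x * blockDim.x + threadIdx.x;
       if (t_id >= n_b_faces_x3)
         return;

       cs_lnum_t f_id = t_id / 3;   // boundary face
       cs_lnum_t i    = t_id % 3;   // diagonal position

       coefb[f_id][i][i] = 1.;      // off-diagonal terms already zeroed
     }
*/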
cs_sync_d2h(_bc_coeff_a); + /* Sync to host */ + if (_bc_coeff_b_d != NULL) { + size_t size = n_b_faces * sizeof(cs_real_t) * 3 * 3; + cs_cuda_copy_d2h(_bc_coeff_b, _bc_coeff_b_d, size); + } + else + cs_sync_d2h(_bc_coeff_b); + + + CS_CUDA_CHECK(cudaEventRecord(stop, stream)); + CS_CUDA_CHECK(cudaEventSynchronize(stop)); + + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); + + if(perf){ + printf("reconstruct Kernels times:\t"); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init1)); + printf("Kernels execution time in us: \t"); + printf("Init1 = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init1, init2)); + printf("Init2 = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); + printf("Total = %f\t", msec*1000.f); + + printf("\n"); + } + CS_CUDA_CHECK(cudaFree(_bc_coeff_a_d)); + CS_CUDA_CHECK(cudaFree(_bc_coeff_b_d)); +} + +INSTANTIATE_RECONSTRUCT(cs_reconstruct_vector_gradient_cuda, 1); +INSTANTIATE_RECONSTRUCT(cs_reconstruct_vector_gradient_cuda, 3); +INSTANTIATE_RECONSTRUCT(cs_reconstruct_vector_gradient_cuda, 6); diff --git a/src/alge/cs_gradient_lsq_vector.cuh b/src/alge/cs_gradient_lsq_vector.cuh new file mode 100644 index 0000000000..0ecacd1d3d --- /dev/null +++ b/src/alge/cs_gradient_lsq_vector.cuh @@ -0,0 +1,595 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+*/ + +/*----------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------- + * Initialize RHS with null values + *----------------------------------------------------------------------------*/ + +__global__ static void +_init_rhs(cs_lnum_t n_cells_ext, + cs_real_33_t *restrict rhs) +{ + cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id < n_cells_ext) { + for (cs_lnum_t i = 0; i < 3; i++) + for (cs_lnum_t j = 0; j < 3; j++) + rhs[c_id][i][j] = 0.0; + } +} + +__global__ static void +_compute_rhs_lsq_v_i_face_v0(cs_lnum_t n_i_faces, + const cs_lnum_2_t *i_face_cells, + const cs_real_3_t *cell_f_cen, + cs_real_33_t *rhs, + const cs_real_3_t *pvar, + const cs_real_t *weight, + const cs_real_t *c_weight) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight != NULL){ + _pond = weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + atomicAdd(&rhs[c_id1][i][j], c_weight[c_id2] * _denom * fctb[j]); + atomicAdd(&rhs[c_id2][i][j], c_weight[c_id1] * _denom * fctb[j]); + } + } + } + else{ + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + atomicAdd(&rhs[c_id1][i][j], fctb[j]); + atomicAdd(&rhs[c_id2][i][j], fctb[j]); + } + } + } +} + +__global__ static void +_compute_rhs_lsq_v_i_face(cs_lnum_t n_i_faces, + const cs_lnum_2_t *restrict i_face_cells, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. 
- _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + atomicAdd(&rhs[c_id1][i][j], _weight2 * fctb[j]); + atomicAdd(&rhs[c_id2][i][j], _weight1 * fctb[j]); + } + } +} + +__global__ static void +_compute_rhs_lsq_v_i_face_cf(cs_lnum_t size, + const cs_lnum_2_t *restrict i_face_cells, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= size){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + using Cell = AtomicCell; + Cell _rhs1, _rhs2; + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + _rhs1[i][j].get() = _weight2 * fctb[j]; + _rhs2[i][j].get() = _weight1 * fctb[j]; + //atomicAdd(&rhs[c_id1][i][j], _weight2 * fctb[j]); + //atomicAdd(&rhs[c_id2][i][j], _weight1 * fctb[j]); + } + } + +#if 1 + Cell::ref(rhs[c_id1]).conflict_free_add(-1u, _rhs1); + Cell::ref(rhs[c_id2]).conflict_free_add(-1u, _rhs2); +#else + Cell::ref(rhs[c_id1]).atomic_add(_rhs1); + Cell::ref(rhs[c_id2]).atomic_add(_rhs2); +#endif +} + +__global__ static void +_compute_rhs_lsq_v_b_neighbor(cs_lnum_t n_cells, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + cs_real_t dc[3], ddc, pfac; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + cs_lnum_t c_id2 = cell_cells[index]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0] * dc[0] + dc[1] * dc[1] + dc[2] * dc[2]); + + for (cs_lnum_t i = 0; i < 3; i++) { + + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + + for (cs_lnum_t j = 0; j < 3; j++) { + rhs[c_id1][i][j] += dc[j] * pfac; + } + } + } + +} + +__global__ static void +_compute_rhs_lsq_v_b_face(cs_lnum_t n_b_faces, + const cs_lnum_t *restrict b_face_cells, + const cs_real_3_t *restrict cell_f_cen, + const cs_real_3_t *restrict b_face_normal, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict b_dist, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const int inc) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_b_faces){ + return; + } + + 
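/* The RHS assembly kernels come in two flavours. The "scatter" kernels
   above (_compute_rhs_lsq_v_i_face*) use one thread per interior face, so
   both adjacent cells must be updated with atomicAdd() (or with the
   AtomicCell conflict-free variant); the "gather" kernels in
   cs_gradient_lsq_vector_gather*.cuh use one thread per cell and loop over
   that cell's neighbours through cell_cells_idx/cell_cells, which removes
   the atomics at the cost of redundant per-face work. The host side picks
   one of them; a launch of each variant would look roughly like the
   following (sketch based on the signatures in these headers; the actual
   call sites live elsewhere in this patch and are not shown in this hunk):

     // scatter: one thread per interior face, atomic accumulation
     _compute_rhs_lsq_v_i_face<<<get_gridsize(n_i_faces, blocksize),
                                 blocksize, 0, stream>>>
       (n_i_faces, i_face_cells, cell_f_cen, rhs_d, pvar_d,
        weight, c_weight);

     // gather: one thread per cell, loops over its interior neighbours
     _compute_rhs_lsq_v_i_face_gather<<<get_gridsize(n_cells, blocksize),
                                        blocksize, 0, stream>>>
       (n_cells, cell_cells_idx, cell_cells, cell_i_faces,
        cell_i_faces_sgn, cell_f_cen, rhs_d, pvar_d, weight, c_weight);
*/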
cs_lnum_t c_id1; + cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; + + c_id1 = b_face_cells[f_id]; + + cs_math_3_normalize_cuda(b_face_normal[f_id], n_d_dist); + + d_b_dist = 1. / b_dist[f_id]; + + /* Normal divided by b_dist */ + n_d_dist[0] *= d_b_dist; + n_d_dist[1] *= d_b_dist; + n_d_dist[2] *= d_b_dist; + + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = coefav[f_id][i]*inc + + ( coefbv[f_id][0][i] * pvar[c_id1][0] + + coefbv[f_id][1][i] * pvar[c_id1][1] + + coefbv[f_id][2][i] * pvar[c_id1][2] + - pvar[c_id1][i]); + + atomicAdd(&rhs[c_id1][i][0], n_d_dist[0] * pfac); + atomicAdd(&rhs[c_id1][i][1], n_d_dist[1] * pfac); + atomicAdd(&rhs[c_id1][i][2], n_d_dist[2] * pfac); + } +} + +__global__ static void +_compute_gradient_lsq_v(cs_lnum_t n_cells, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs, + cs_cocg_6_t *restrict cocg) +{ + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= n_cells) + return; + + for(cs_lnum_t i = 0; i < 3; i++){ + gradv[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] + + rhs[c_id][i][1] * cocg[c_id][3] + + rhs[c_id][i][2] * cocg[c_id][5]; + + gradv[c_id][i][1] = rhs[c_id][i][0] * cocg[c_id][3] + + rhs[c_id][i][1] * cocg[c_id][1] + + rhs[c_id][i][2] * cocg[c_id][4]; + + gradv[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] + + rhs[c_id][i][1] * cocg[c_id][4] + + rhs[c_id][i][2] * cocg[c_id][2]; + } +} + +__global__ static void +_compute_gradient_lsq_b_v(cs_lnum_t n_b_cells, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_real_3_t *restrict b_face_normal, + const cs_real_3_t *restrict diipb, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict b_dist, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs, + cs_cocg_6_t *restrict cocgb_s, + const int inc) +{ + size_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (c_idx >= n_b_cells) + return; + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id+1]; + + cs_lnum_t f_id; + cs_real_t cocgb[3][3], cocgb_v[45], rhsb_v[9], x[9]; + cs_real_3_t normal; + + cs_lnum_t _33_9_idx[9][2]; + int nn = 0; + for (int ll = 0; ll < 3; ll++) { + for (int mm = 0; mm < 3; mm++) { + _33_9_idx[nn][0] = ll; + _33_9_idx[nn][1] = mm; + nn++; + } + } + + auto _cocg = cocgb_s[c_idx]; + auto _rhs = rhs[c_id]; + + cocgb[0][0] = _cocg[0]; + cocgb[0][1] = _cocg[3]; + cocgb[0][2] = _cocg[5]; + cocgb[1][0] = _cocg[3]; + cocgb[1][1] = _cocg[1]; + cocgb[1][2] = _cocg[4]; + cocgb[2][0] = _cocg[5]; + cocgb[2][1] = _cocg[4]; + cocgb[2][2] = _cocg[2]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + cs_math_3_normalize_cuda(b_face_normal[f_id], normal); + for (cs_lnum_t ii = 0; ii < 3; ii++) { + for (cs_lnum_t jj = 0; jj < 3; jj++) + cocgb[ii][jj] += normal[ii] * normal[jj]; + } + } + + for (int ll = 0; ll < 9; ll++) { + + int ll_9 = ll*(ll+1)/2; + + for (int mm = 0; mm <= ll; mm++) { + cocgb_v[ll_9+mm] = 0.; + + int pp = _33_9_idx[ll][0]; + int qq = _33_9_idx[ll][1]; + + int rr = _33_9_idx[mm][0]; + int ss = _33_9_idx[mm][1]; + + if (pp == rr) + cocgb_v[ll_9+mm] = cocgb[qq][ss]; + + rhsb_v[ll] = _rhs[pp][qq]; + } + } + + cs_real_3_t nb; + cs_real_t a[3], bt[3][3], db, db2; + for (cs_lnum_t i = s_id; i < e_id; i++) { + + f_id = cell_b_faces[i]; + + auto iipbf = diipb[f_id]; + + cs_math_3_normalize_cuda(b_face_normal[f_id], nb); + + 
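/* For reference, the 3x3 least-squares matrix is stored in symmetric packed
   form (cs_cocg_6_t) throughout these kernels; the cocgb unpacking at the
   top of this kernel and the products in _compute_gradient_lsq_v above both
   follow the same layout:

     // c[0]=xx  c[1]=yy  c[2]=zz  c[3]=xy  c[4]=yz  c[5]=xz
     //
     //       | c[0] c[3] c[5] |
     //   A = | c[3] c[1] c[4] |
     //       | c[5] c[4] c[2] |

   _compute_gradient_lsq_v multiplies the RHS rows directly by those packed
   entries, so the cocg array it receives is expected to already hold the
   inverse of the accumulated matrix (inverted beforehand, e.g. by
   _math_6_inv_cramer_sym_in_place_cuda as in the gather/stride kernels, or
   on the host). */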
db = 1./b_dist[f_id]; + db2 = db*db; + + for (int ll = 0; ll < 3; ll++) { + for (int pp = 0; pp < 3; pp++) + bt[ll][pp] = coefbv[f_id][ll][pp]; + } + for (int ll = 0; ll < 3; ll++) { + a[ll] = inc*coefav[f_id][ll]; + bt[ll][ll] -= 1; + } + + for (int ll = 0; ll < 9; ll++) { + + int kk = _33_9_idx[ll][0]; + int qq = _33_9_idx[ll][1]; + + int ll_9 = ll*(ll+1)/2; + for (int pp = 0; pp <= ll; pp++) { + + int rr = _33_9_idx[pp][0]; + int ss = _33_9_idx[pp][1]; + + cs_real_t cocgv = 0.; + for (int mm = 0; mm < 3; mm++) + cocgv += bt[mm][kk]*bt[mm][rr]; + cocgb_v[ll_9+pp] += cocgv*(iipbf[qq]*iipbf[ss])*db2; + + cocgb_v[ll_9+pp] -= ( nb[ss]*bt[rr][kk]*iipbf[qq] + + nb[qq]*bt[kk][rr]*iipbf[ss]) + *db; + } + } + + for (int ll = 0; ll < 9; ll++) { + int pp = _33_9_idx[ll][0]; + int qq = _33_9_idx[ll][1]; + + cs_real_t rhsv = 0.; + for (int rr = 0; rr < 3; rr++) { + rhsv += bt[rr][pp]*diipb[f_id][qq] + *(a[rr]+ bt[rr][0]*pvar[c_id][0] + + bt[rr][1]*pvar[c_id][1] + + bt[rr][2]*pvar[c_id][2]); + } + + rhsb_v[ll] -= rhsv*db2; + } + + } + _fact_crout_pp_cuda<9>(cocgb_v); + + _fw_and_bw_ldtl_pp_cuda<9>(cocgb_v, x, rhsb_v); + + for (int kk = 0; kk < 9; kk++) { + int ii = _33_9_idx[kk][0]; + int jj = _33_9_idx[kk][1]; + gradv[c_id][ii][jj] = x[kk]; + } +} + +template +__global__ static void +_compute_gradient_lsq_b_strided_v(const cs_lnum_t n_b_cells, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_real_3_t *restrict b_face_cog, + const cs_real_3_t *restrict cell_cen, + const cs_real_3_t *restrict diipb, + cs_real_t (*restrict gradv)[stride][3], + const cs_real_t (*restrict coefbv)[stride][stride], + cs_cocg_6_t *restrict cocg, + cs_lnum_t n_c_iter_max, + cs_real_t c_eps) +{ + size_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (c_idx >= n_b_cells) + return; + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id+1]; + + auto c_grad = gradv[c_id]; + auto _cocg = cocg[c_id]; + auto _cell_cen = cell_cen[c_id]; + + cs_real_t grad_0[stride][3], grad_i[stride][3], rhs_c[stride][3], dif[3], grad_c[stride][3], + var_ip_f[stride]; + + cs_real_t ref_norm = 0.0, ddif, c_norm = 0; + cs_lnum_t n_c_it, f_id; + cs_real_t eps_dvg = 1e-2; + cs_real_t cs_math_epzero = 1e-12; + + for(cs_lnum_t i = 0; i < stride; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + grad_0[i][j] = c_grad[i][j]; + grad_i[i][j] = c_grad[i][j]; + } + } + + ref_norm = 0; + for (cs_lnum_t kk = 0; kk < stride; kk++) { + for (cs_lnum_t ll = 0; ll < 3; ll++) + ref_norm += cs_math_fabs_cuda(c_grad[kk][ll]); + } + + c_norm = 0; + + for (n_c_it = 0; n_c_it < n_c_iter_max; n_c_it++) { + + for (cs_lnum_t ll = 0; ll < stride; ll++) { + rhs_c[ll][0] = 0; + rhs_c[ll][1] = 0; + rhs_c[ll][2] = 0; + } + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + for (cs_lnum_t ii = 0; ii < 3; ii++) + dif[ii] = b_face_cog[f_id][ii] - _cell_cen[ii]; + + ddif = 1. 
/ cs_math_3_square_norm_cuda(dif); + + for (cs_lnum_t ll = 0; ll < stride; ll++) { + var_ip_f[ll] = cs_math_3_dot_product_cuda(c_grad[ll], diipb[f_id]); + } + + auto b = coefbv[f_id]; + + for (cs_lnum_t kk = 0; kk < stride; kk++) { + cs_real_t pfac = 0; + for (cs_lnum_t ll = 0; ll < stride; ll++) { + pfac += b[kk][ll] * var_ip_f[ll] * ddif; + } + + for (cs_lnum_t ll = 0; ll < 3; ll++) + rhs_c[kk][ll] += dif[ll] * pfac; + } + + } + + for(cs_lnum_t i = 0; i < stride; i++){ + grad_c[i][0] = rhs_c[i][0] * _cocg[0] + + rhs_c[i][1] * _cocg[3] + + rhs_c[i][2] * _cocg[5]; + + grad_c[i][1] = rhs_c[i][0] * _cocg[3] + + rhs_c[i][1] * _cocg[1] + + rhs_c[i][2] * _cocg[4]; + + grad_c[i][2] = rhs_c[i][0] * _cocg[5] + + rhs_c[i][1] * _cocg[4] + + rhs_c[i][2] * _cocg[2]; + } + + c_norm = 0.0; + for (cs_lnum_t ii = 0; ii < stride; ii++) { + for (cs_lnum_t jj = 0; jj < 3; jj++) { + c_grad[ii][jj] = grad_0[ii][jj] + grad_c[ii][jj]; + c_norm += cs_math_fabs_cuda(c_grad[ii][jj] - grad_i[ii][jj]); + grad_i[ii][jj] = c_grad[ii][jj]; + } + } + + if (c_norm < ref_norm * c_eps || c_norm < cs_math_epzero) + break; + } + + for (cs_lnum_t ii = 0; ii < stride; ii++) { + for (cs_lnum_t jj = 0; jj < 3; jj++) { + gradv[c_id][ii][jj] = c_grad[ii][jj]; + } + } + + if (c_norm > eps_dvg * ref_norm) { + for (cs_lnum_t ii = 0; ii < stride; ii++) { + for (cs_lnum_t jj = 0; jj < 3; jj++) { + gradv[c_id][ii][jj] = grad_0[ii][jj]; + } + } + + n_c_it *= -1; + } +} diff --git a/src/alge/cs_gradient_lsq_vector_gather.cuh b/src/alge/cs_gradient_lsq_vector_gather.cuh new file mode 100644 index 0000000000..586764e259 --- /dev/null +++ b/src/alge/cs_gradient_lsq_vector_gather.cuh @@ -0,0 +1,294 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+*/ + +/*----------------------------------------------------------------------------*/ + +__global__ static void +_compute_rhs_lsq_v_i_face_gather(cs_lnum_t n_cells, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _denom, _pond, pfac; + cs_lnum_t c_id2, f_id; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight != NULL){ + f_id = cell_i_faces[index]; + _pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + rhs[c_id1][i][j] += c_weight[c_id2] * _denom * fctb[j]; + } + } + } + else{ + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + rhs[c_id1][i][j] += fctb[j]; + } + } + } +} +} + +__global__ static void +_compute_rhs_lsq_v_b_face_gather(cs_lnum_t n_b_cells, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict b_cells, + const cs_real_3_t *restrict b_face_normal, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict b_dist, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const int inc) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_idx >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t f_id; + cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + f_id = cell_b_faces[index]; + + cs_math_3_normalize_cuda(b_face_normal[f_id], n_d_dist); + + d_b_dist = 1. 
/ b_dist[f_id]; + + /* Normal divided by b_dist */ + n_d_dist[0] *= d_b_dist; + n_d_dist[1] *= d_b_dist; + n_d_dist[2] *= d_b_dist; + + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = coefav[f_id][i]*inc + + ( coefbv[f_id][0][i] * pvar[c_id][0] + + coefbv[f_id][1][i] * pvar[c_id][1] + + coefbv[f_id][2][i] * pvar[c_id][2] + - pvar[c_id][i]); + + rhs[c_id][i][0] += n_d_dist[0] * pfac; + rhs[c_id][i][1] += n_d_dist[1] * pfac; + rhs[c_id][i][2] += n_d_dist[2] * pfac; + } + } +} + +template +__global__ static void +_compute_rhs_lsq_v_b_face_gather_stride(cs_lnum_t n_b_cells, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict b_cells, + const cs_real_3_t *restrict b_face_cog, + const cs_real_3_t *restrict cell_cen, + cs_real_33_t *restrict rhs, + const val_t *restrict pvar, + const coefb_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + cs_cocg_6_t *restrict cocg, + const cs_cocg_6_t *restrict cocgb, + const int inc) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_idx >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t f_id; + cs_real_t dif[stride], ddif, pfac, norm, var_f[stride]; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + for(cs_lnum_t ll = 0; ll < 6; ll++) + cocg[c_id][ll] = cocgb[c_idx][ll]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + f_id = cell_b_faces[index]; + + for (cs_lnum_t ll = 0; ll < 3; ll++) + dif[ll] = b_face_cog[f_id][ll] - cell_cen[c_id][ll]; + + ddif = 1. / cs_math_3_square_norm_cuda(dif); + + cocg[c_id][0] += dif[0]*dif[0]*ddif; + cocg[c_id][1] += dif[1]*dif[1]*ddif; + cocg[c_id][2] += dif[2]*dif[2]*ddif; + cocg[c_id][3] += dif[0]*dif[1]*ddif; + cocg[c_id][4] += dif[1]*dif[2]*ddif; + cocg[c_id][5] += dif[0]*dif[2]*ddif; + + for (cs_lnum_t kk = 0; kk < stride; kk++) { + var_f[kk] = coefav[f_id][kk]*inc; + for (cs_lnum_t ll = 0; ll < stride; ll++) { + var_f[kk] += coefbv[f_id][ll][kk] * pvar[c_id][ll]; + } + + pfac = (var_f[kk] - pvar[c_id][kk]) * ddif; + + for (cs_lnum_t ll = 0; ll < 3; ll++) + rhs[c_id][kk][ll] += dif[ll] * pfac; + } + } + _math_6_inv_cramer_sym_in_place_cuda(cocg[c_id]); +} + + +template +__global__ static void +_compute_rhs_lsq_v_b_face_gather_stride_v2(cs_lnum_t n_b_cells, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict b_cells, + const cs_real_3_t *restrict b_face_cog, + const cs_real_3_t *restrict cell_cen, + cs_real_33_t *restrict rhs, + const val_t *restrict pvar, + const coefb_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + cs_cocg_6_t *restrict cocg, + const cs_cocg_6_t *restrict cocgb, + const int inc) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t lindex = threadIdx.x; + + if(c_idx >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t f_id; + cs_real_t dif[stride], ddif, pfac, norm, var_f[stride]; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + for(cs_lnum_t ll = 0; ll < 6; ll++) + cocg[c_id][ll] = cocgb[c_idx][ll]; + + __shared__ cs_real_t _rhs[256][3][3]; + + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + _rhs[lindex][i][j] = rhs[c_id][i][j]; + } + } + + auto _pvar = pvar[c_id]; + auto _cocg = cocg[c_id]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + f_id = cell_b_faces[index]; + auto _coefbv = coefbv[f_id]; + auto _coefav = coefav[f_id]; + + 
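/* The __shared__ staging buffer declared at the top of this kernel (and the
   similar buffers in the gather_v3/_v4 kernels of
   cs_gradient_lsq_vector_gather_v3.cuh further down) is dimensioned for a
   fixed block of 256 threads: with cs_real_t as a double-precision real
   (the usual default) that is 256 * 3 * 3 * 8 = 18432 bytes of shared
   memory per block, and the kernels implicitly assume blockDim.x <= 256,
   i.e. the blocksize chosen on the host. A block-size-agnostic alternative
   would be dynamic shared memory; a sketch of the idea only, with names
   invented for the illustration, not what this patch does:

     extern __shared__ cs_real_t _rhs_dyn[];          // blockDim.x * 9 reals
     cs_real_t *my_rhs = _rhs_dyn + threadIdx.x * 9;  // this thread's 3x3

     // accumulate into my_rhs[i*3 + j] instead of _rhs[lindex][i][j],
     // and launch with the byte count as the third launch parameter:
     //   kernel<<<grid, block, block * 9 * sizeof(cs_real_t), stream>>>(...);

   By contrast, _compute_rhs_lsq_v_i_face_gather_v2 in
   cs_gradient_lsq_vector_gather_v2.cuh below declares a single
   __shared__ cs_real_t _rhs[3][3] with no per-thread index, so every thread
   of a block accumulates into the same nine values; that looks like a race,
   which is presumably why the per-thread [256][3][3] layout is used here. */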
+ for (cs_lnum_t ll = 0; ll < 3; ll++) + dif[ll] = b_face_cog[f_id][ll] - cell_cen[c_id][ll]; + + ddif = 1. / cs_math_3_square_norm_cuda(dif); + + _cocg[0] += dif[0]*dif[0]*ddif; + _cocg[1] += dif[1]*dif[1]*ddif; + _cocg[2] += dif[2]*dif[2]*ddif; + _cocg[3] += dif[0]*dif[1]*ddif; + _cocg[4] += dif[1]*dif[2]*ddif; + _cocg[5] += dif[0]*dif[2]*ddif; + + for (cs_lnum_t kk = 0; kk < stride; kk++) { + var_f[kk] = _coefav[kk]*inc; + for (cs_lnum_t ll = 0; ll < stride; ll++) { + var_f[kk] += _coefbv[ll][kk] * _pvar[ll]; + } + + pfac = (var_f[kk] - _pvar[kk]) * ddif; + + for (cs_lnum_t ll = 0; ll < 3; ll++) + _rhs[lindex][kk][ll] += dif[ll] * pfac; + } + } + + cocg[c_id][0] += _cocg[0]; + cocg[c_id][1] += _cocg[1]; + cocg[c_id][2] += _cocg[2]; + cocg[c_id][3] += _cocg[3]; + cocg[c_id][4] += _cocg[4]; + cocg[c_id][5] += _cocg[5]; + + for(cs_lnum_t i = 0; i < stride; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + rhs[c_id][i][j] = _rhs[lindex][i][j]; + } + } + // _math_6_inv_cramer_sym_in_place_cuda(cocg[c_id]); +} diff --git a/src/alge/cs_gradient_lsq_vector_gather_v2.cuh b/src/alge/cs_gradient_lsq_vector_gather_v2.cuh new file mode 100644 index 0000000000..cdb831140c --- /dev/null +++ b/src/alge/cs_gradient_lsq_vector_gather_v2.cuh @@ -0,0 +1,168 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+*/ + +/*----------------------------------------------------------------------------*/ + +__global__ static void +_compute_rhs_lsq_v_i_face_gather_v2(cs_lnum_t n_cells, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _denom, _weight, _pond, pfac; + cs_lnum_t c_id2, f_id; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + __shared__ cs_real_t _rhs[3][3]; + + auto temp_rhs = rhs[c_id1]; + _rhs[0][0]= temp_rhs[0][0]; _rhs[0][1]= temp_rhs[0][1]; _rhs[0][2]= temp_rhs[0][2]; + _rhs[1][0]= temp_rhs[1][0]; _rhs[1][1]= temp_rhs[1][1]; _rhs[1][2]= temp_rhs[1][2]; + _rhs[2][0]= temp_rhs[2][0]; _rhs[2][1]= temp_rhs[2][1]; _rhs[2][2]= temp_rhs[2][2]; + + auto _pvar1 = pvar[c_id1]; + + auto _cell_f_cen1 = cell_f_cen[c_id1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + + auto _cell_f_cen2 = cell_f_cen[c_id2]; + + dc[0] = _cell_f_cen2[0] - _cell_f_cen1[0]; + dc[1] = _cell_f_cen2[1] - _cell_f_cen1[1]; + dc[2] = _cell_f_cen2[2] - _cell_f_cen1[2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight = 1.; + } + else{ + f_id = cell_i_faces[index]; + _pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight = c_weight[c_id2] * _denom; + } + + auto _pvar2 = pvar[c_id2]; + // _pvar2[0]= temp_pvar2[0]; _pvar2[1]= temp_pvar2[1]; _pvar2[2]= temp_pvar2[2]; + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (_pvar2[i] - _pvar1[i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + _rhs[i][j] += _weight * fctb[j]; + } + } + + } + rhs[c_id1][0][0] = _rhs[0][0]; rhs[c_id1][0][1] = _rhs[0][1]; rhs[c_id1][0][2] = _rhs[0][2]; + rhs[c_id1][1][0] = _rhs[1][0]; rhs[c_id1][1][1] = _rhs[1][1]; rhs[c_id1][1][2] = _rhs[1][2]; + rhs[c_id1][2][0] = _rhs[2][0]; rhs[c_id1][2][1] = _rhs[2][1]; rhs[c_id1][2][2] = _rhs[2][2]; +} + +__global__ static void +_compute_rhs_lsq_v_b_face_gather_v2(cs_lnum_t n_b_cells, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict b_cells, + const cs_real_3_t *restrict b_face_normal, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict b_dist, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const int inc) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_idx >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t f_id; + cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + cs_real_t _rhs[3][3]; + + auto temp_rhs = rhs[c_id]; + _rhs[0][0]= temp_rhs[0][0]; _rhs[0][1]= temp_rhs[0][1]; _rhs[0][2]= temp_rhs[0][2]; + _rhs[1][0]= temp_rhs[1][0]; _rhs[1][1]= temp_rhs[1][1]; _rhs[1][2]= temp_rhs[1][2]; + _rhs[2][0]= temp_rhs[2][0]; _rhs[2][1]= temp_rhs[2][1]; _rhs[2][2]= temp_rhs[2][2]; + + auto _pvar1 = pvar[c_id]; + + for(cs_lnum_t index = s_id; index < e_id; 
index++){ + + f_id = cell_b_faces[index]; + + auto _coefav = coefav[f_id]; + auto _coefbv = coefbv[f_id]; + + cs_math_3_normalize_cuda(b_face_normal[f_id], n_d_dist); + + d_b_dist = 1. / b_dist[f_id]; + + /* Normal divided by b_dist */ + n_d_dist[0] *= d_b_dist; + n_d_dist[1] *= d_b_dist; + n_d_dist[2] *= d_b_dist; + + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = _coefav[i]*inc + + ( _coefbv[0][i] * _pvar1[0] + + _coefbv[1][i] * _pvar1[1] + + _coefbv[2][i] * _pvar1[2] + - _pvar1[i]); + + _rhs[i][0] += n_d_dist[0] * pfac; + _rhs[i][1] += n_d_dist[1] * pfac; + _rhs[i][2] += n_d_dist[2] * pfac; + } + + } + rhs[c_id][0][0] = _rhs[0][0]; rhs[c_id][0][1] = _rhs[0][1]; rhs[c_id][0][2] = _rhs[0][2]; + rhs[c_id][1][0] = _rhs[1][0]; rhs[c_id][1][1] = _rhs[1][1]; rhs[c_id][1][2] = _rhs[1][2]; + rhs[c_id][2][0] = _rhs[2][0]; rhs[c_id][2][1] = _rhs[2][1]; rhs[c_id][2][2] = _rhs[2][2]; +} diff --git a/src/alge/cs_gradient_lsq_vector_gather_v3.cuh b/src/alge/cs_gradient_lsq_vector_gather_v3.cuh new file mode 100644 index 0000000000..37ce174c8d --- /dev/null +++ b/src/alge/cs_gradient_lsq_vector_gather_v3.cuh @@ -0,0 +1,264 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+*/ + +/*----------------------------------------------------------------------------*/ + +__global__ static void +_compute_rhs_lsq_v_i_face_gather_v3(cs_lnum_t n_cells, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t lindex = threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _denom, _weight, _pond, pfac; + cs_lnum_t c_id2, f_id; + + // size_t c_id1 = c_id / (3*3); + // size_t i = (c_id / 3) % 3; + // size_t j = c_id % 3; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + __shared__ cs_real_t _rhs[256][3][3]; + + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + _rhs[lindex][i][j] = rhs[c_id1][i][j]; + } + } + // __syncthreads(); + auto _pvar1 = pvar[c_id1]; + + auto _cell_f_cen1 = cell_f_cen[c_id1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + + auto _cell_f_cen2 = cell_f_cen[c_id2]; + + dc[0] = _cell_f_cen2[0] - _cell_f_cen1[0]; + dc[1] = _cell_f_cen2[1] - _cell_f_cen1[1]; + dc[2] = _cell_f_cen2[2] - _cell_f_cen1[2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight = 1.; + } + else{ + f_id = cell_i_faces[index]; + _pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight = c_weight[c_id2] * _denom; + } + + auto _pvar2 = pvar[c_id2]; + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (_pvar2[i] - _pvar1[i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + _rhs[lindex][i][j] += _weight * fctb[j]; + } + } + + } + // __syncthreads(); + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + rhs[c_id1][i][j] = _rhs[lindex][i][j]; + } + } +} + +__global__ static void +_compute_rhs_lsq_v_i_face_gather_v4(cs_lnum_t n_cells, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t lindex = threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _denom, _weight, _pond, pfac; + cs_lnum_t c_id2, f_id; + + // size_t c_id1 = c_id / (3*3); + // size_t i = (c_id / 3) % 3; + // size_t j = c_id % 3; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + __shared__ cs_real_t _rhs[256][3][3]; + + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + _rhs[lindex][i][j] = 0.0; + } + } + // __syncthreads(); + auto _pvar1 = pvar[c_id1]; + + auto _cell_f_cen1 = cell_f_cen[c_id1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + + auto _cell_f_cen2 = cell_f_cen[c_id2]; + + dc[0] = _cell_f_cen2[0] - _cell_f_cen1[0]; + dc[1] = _cell_f_cen2[1] - _cell_f_cen1[1]; + dc[2] = _cell_f_cen2[2] - _cell_f_cen1[2]; + + ddc = 1./(dc[0]*dc[0] + 
dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight = 1.; + } + else{ + f_id = cell_i_faces[index]; + _pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight = c_weight[c_id2] * _denom; + } + + auto _pvar2 = pvar[c_id2]; + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (_pvar2[i] - _pvar1[i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + _rhs[lindex][i][j] += _weight * fctb[j]; + } + } + + } + // __syncthreads(); + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + rhs[c_id1][i][j] = _rhs[lindex][i][j]; + } + } +} + +__global__ static void +_compute_rhs_lsq_v_b_face_gather_v3(cs_lnum_t n_b_cells, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict b_cells, + const cs_real_3_t *restrict b_face_normal, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict b_dist, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const int inc) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t lindex = threadIdx.x; + + if(c_idx >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t f_id; + cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + __shared__ cs_real_t _rhs[256][3][3]; + + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + _rhs[lindex][i][j] = rhs[c_id][i][j]; + } + } + + // __syncthreads(); + + auto _pvar1 = pvar[c_id]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + f_id = cell_b_faces[index]; + + auto _coefav = coefav[f_id]; + auto _coefbv = coefbv[f_id]; + + cs_math_3_normalize_cuda(b_face_normal[f_id], n_d_dist); + + d_b_dist = 1. / b_dist[f_id]; + + /* Normal divided by b_dist */ + n_d_dist[0] *= d_b_dist; + n_d_dist[1] *= d_b_dist; + n_d_dist[2] *= d_b_dist; + + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = _coefav[i]*inc + + ( _coefbv[0][i] * _pvar1[0] + + _coefbv[1][i] * _pvar1[1] + + _coefbv[2][i] * _pvar1[2] + - _pvar1[i]); + + _rhs[lindex][i][0] += n_d_dist[0] * pfac; + _rhs[lindex][i][1]+= n_d_dist[1] * pfac; + _rhs[lindex][i][2] += n_d_dist[2] * pfac; + } + + } + // __syncthreads(); + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + rhs[c_id][i][j] = _rhs[lindex][i][j]; + } + } +} diff --git a/src/alge/cs_gradient_lsq_vector_v2.cuh b/src/alge/cs_gradient_lsq_vector_v2.cuh new file mode 100644 index 0000000000..a342fd67f7 --- /dev/null +++ b/src/alge/cs_gradient_lsq_vector_v2.cuh @@ -0,0 +1,249 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. 
+ + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------- + * Initialize RHS with null values + *----------------------------------------------------------------------------*/ + +__global__ static void +_init_rhs_v2(cs_lnum_t n_cells_g, + cs_real_33_t *restrict _rhs) +{ + cs_real_t *rhs = (cs_real_t *) _rhs; + cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= n_cells_g) + return; + + rhs[c_id] = 0.0; +} + +__global__ static void +_compute_rhs_lsq_v_i_face_v2(cs_lnum_t n_i_faces, + const cs_lnum_t *restrict i_face_cells, + const cs_real_t *restrict cell_f_cen, + cs_real_33_t *restrict _rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_real_t *rhs = (cs_real_t *) _rhs; + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + c_id1 = i_face_cells[f_id*2]; + c_id2 = i_face_cells[f_id*2 + 1]; + + dc[0] = cell_f_cen[c_id2*3] - cell_f_cen[c_id1*3]; + dc[1] = cell_f_cen[c_id2*3 + 1] - cell_f_cen[c_id1*3 + 1]; + dc[2] = cell_f_cen[c_id2*3 + 2] - cell_f_cen[c_id1*3 + 2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + atomicAdd(&rhs[c_id1*3*3 + i*3 + j], _weight2 * fctb[j]); + atomicAdd(&rhs[c_id2*3*3 + i*3 + j], _weight1 * fctb[j]); + } + } +} + +__global__ static void +_compute_rhs_lsq_v_i_face_v2cf(cs_lnum_t size, + const cs_lnum_2_t *restrict _i_face_cells, + const cs_real_3_t *restrict _cell_f_cen, + cs_real_33_t *restrict _rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_real_t *rhs = (cs_real_t *) _rhs; + cs_lnum_t *i_face_cells = (cs_lnum_t *) _i_face_cells; + cs_real_t *cell_f_cen = (cs_real_t *) _cell_f_cen; + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= size){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + c_id1 = i_face_cells[f_id*2]; + c_id2 = i_face_cells[f_id*2 + 1]; + + dc[0] = cell_f_cen[c_id2*3] - cell_f_cen[c_id1*3]; + dc[1] = cell_f_cen[c_id2*3 + 1] - cell_f_cen[c_id1*3 + 1]; + dc[2] = cell_f_cen[c_id2*3 + 2] - cell_f_cen[c_id1*3 + 2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. 
- _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + using Cell = AtomicCell; + Cell _rhs1, _rhs2; + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + //_rhs1[i][j].get() += _weight2 * fctb[j]; + //_rhs2[i][j].get() += _weight1 * fctb[j]; + atomicAdd(&rhs[c_id1*3*3 + i*3 + j], _weight2 * fctb[j]); + atomicAdd(&rhs[c_id2*3*3 + i*3 + j], _weight1 * fctb[j]); + } + } + //reinterpret_cast(&rhs[c_id1*3*3][0][0])->atomic_add(_rhs1); + //reinterpret_cast(&rhs[c_id2*3*3][0][0])->atomic_add(_rhs2); +} + +__global__ static void +_compute_rhs_lsq_v_b_face_v2(cs_lnum_t n_b_faces, + const cs_lnum_t *restrict b_face_cells, + const cs_real_3_t *restrict cell_f_cen, + const cs_real_3_t *restrict b_face_normal, + cs_real_33_t *restrict _rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict b_dist, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const int inc) +{ + cs_real_t *rhs = (cs_real_t *) _rhs; + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_b_faces){ + return; + } + + cs_lnum_t c_id1; + cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; + + c_id1 = b_face_cells[f_id]; + + cs_math_3_normalize_cuda(b_face_normal[f_id], n_d_dist); + + d_b_dist = 1. / b_dist[f_id]; + + /* Normal divided by b_dist */ + n_d_dist[0] *= d_b_dist; + n_d_dist[1] *= d_b_dist; + n_d_dist[2] *= d_b_dist; + + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = coefav[f_id][i]*inc + + ( coefbv[f_id][0][i] * pvar[c_id1][0] + + coefbv[f_id][1][i] * pvar[c_id1][1] + + coefbv[f_id][2][i] * pvar[c_id1][2] + - pvar[c_id1][i]); + + atomicAdd(&rhs[c_id1*3*3 + i*3], n_d_dist[0] * pfac); + atomicAdd(&rhs[c_id1*3*3 + i*3 + 1], n_d_dist[1] * pfac); + atomicAdd(&rhs[c_id1*3*3 + i*3 + 2], n_d_dist[2] * pfac); + } +} + +__global__ static void +_compute_gradient_lsq_v_v2(cs_lnum_t n_cells_g, + cs_real_33_t *restrict _gradv, + cs_real_33_t *restrict _rhs, + cs_cocg_6_t *restrict cocg) +{ + cs_real_t *rhs = (cs_real_t *) _rhs; + cs_real_t *gradv = (cs_real_t *) _gradv; + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= n_cells_g) + return; + + for(cs_lnum_t i = 0; i < 3; i++){ + gradv[c_id*3*3 + i*3] = rhs[c_id*3*3 + i*3] * cocg[c_id][0] + + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][3] + + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][5]; + + gradv[c_id*3*3 + i*3 + 1] = rhs[c_id*3*3 + i*3] * cocg[c_id][3] + + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][1] + + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][4]; + + gradv[c_id*3*3 + i*3 + 2] = rhs[c_id*3*3 + i*3] * cocg[c_id][5] + + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][4] + + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][2]; + } +} + +__global__ static void +_compute_gradient_lsq_v_v4(cs_lnum_t n_cells, + cs_real_33_t *restrict gradv_m, + cs_real_33_t *restrict rhs_m, + cs_cocg_6_t *restrict cocg) +{ + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= n_cells) + return; + + cs_real_t *rhs = (cs_real_t *) rhs_m; + cs_real_t *gradv = (cs_real_t *) gradv_m; + + for(cs_lnum_t i = 0; i < 3; i++){ + gradv[c_id*3*3 + i*3] = rhs[c_id*3*3 + i*3] * cocg[c_id][0] + + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][3] + + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][5]; + + gradv[c_id*3*3 + i*3 + 1] = rhs[c_id*3*3 + i*3] * cocg[c_id][3] + + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][1] + + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][4]; + + gradv[c_id*3*3 + i*3 + 2] = rhs[c_id*3*3 + i*3] * cocg[c_id][5] + + rhs[c_id*3*3 
+ i*3 + 1] * cocg[c_id][4] + + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][2]; + } +} diff --git a/src/alge/cs_gradient_lsq_vector_v3.cuh b/src/alge/cs_gradient_lsq_vector_v3.cuh new file mode 100644 index 0000000000..135d0a2520 --- /dev/null +++ b/src/alge/cs_gradient_lsq_vector_v3.cuh @@ -0,0 +1,214 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + + +__global__ static void +_compute_rhs_lsq_v_i_face_v3(cs_lnum_t n_i_faces, + const cs_lnum_2_t *restrict i_face_cells, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + size_t f_id1 = f_id / (3*3); + size_t i = (f_id / 3) % 3; + size_t j = f_id % 3; + + c_id1 = i_face_cells[f_id1][0]; + c_id2 = i_face_cells[f_id1][1]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id1]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. 
- _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + //for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + //for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + atomicAdd(&rhs[c_id1][i][j], _weight2 * fctb[j]); + atomicAdd(&rhs[c_id2][i][j], _weight1 * fctb[j]); + //} + //} +} + +__global__ static void +_compute_rhs_lsq_v_i_face_v3cf(cs_lnum_t size, + const cs_lnum_2_t *restrict i_face_cells, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= size){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + size_t f_id1 = f_id / (3*3); + size_t i = (f_id / 3) % 3; + size_t j = f_id % 3; + + c_id1 = i_face_cells[f_id1][0]; + c_id2 = i_face_cells[f_id1][1]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id1]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + using Cell = AtomicCell; + + //for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + //for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + Cell::ref(rhs[c_id1][i][j]).conflict_free_add(-1u, Cell::ref(_weight2 * fctb[j])); + Cell::ref(rhs[c_id2][i][j]).conflict_free_add(-1u, Cell::ref(_weight1 * fctb[j])); + //atomicAdd(&rhs[c_id1][i][j], _weight2 * fctb[j]); + //atomicAdd(&rhs[c_id2][i][j], _weight1 * fctb[j]); + //} + //} +} + +__global__ static void +_compute_gradient_lsq_v_v5(cs_lnum_t n_cells, + cs_real_t *restrict _gradv, + cs_real_t *restrict _rhs, + cs_cocg_6_t *restrict cocg) +{ + cs_real_t *rhs = (cs_real_t *) _rhs; + cs_real_t *gradv = (cs_real_t *) _gradv; + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= n_cells) + return; + + size_t c_id1 = c_id / (3*3); + size_t i = (c_id / 3) % 3; + size_t j = c_id % 3; + + auto cocg_temp = cocg[c_id1]; + cs_real_t _cocg[3]; + + _cocg[0] = cocg_temp[5]; + _cocg[1] = cocg_temp[4]; + _cocg[2] = cocg_temp[2]; + + if(j == 0){ + _cocg[0] = cocg_temp[0]; + _cocg[1] = cocg_temp[3]; + _cocg[2] = cocg_temp[5]; + } + + if(j == 1){ + _cocg[0] = cocg_temp[3]; + _cocg[1] = cocg_temp[1]; + _cocg[2] = cocg_temp[4]; + } + + gradv[c_id] = rhs[c_id1*3*3 + i*3] * _cocg[0] + + rhs[c_id1*3*3 + i*3 + 1] * _cocg[1] + + rhs[c_id1*3*3 + i*3 + 2] * _cocg[2]; + +} + +__global__ static void +_compute_gradient_lsq_v_v6(cs_lnum_t n_cells, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs, + cs_cocg_6_t *restrict cocg) +{ + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= n_cells) + return; + + size_t c_id1 = c_id / (3*3); + size_t i = (c_id / 3) % 3; + size_t j = c_id % 3; + + auto cocg_temp = cocg[c_id1]; + cs_real_t _cocg[3]; + + _cocg[0] = cocg_temp[5]; + _cocg[1] = cocg_temp[4]; + _cocg[2] = cocg_temp[2]; + + if(j == 0){ + _cocg[0] = cocg_temp[0]; + _cocg[1] = cocg_temp[3]; + _cocg[2] = cocg_temp[5]; + } + + if(j == 1){ + _cocg[0] = cocg_temp[3]; + _cocg[1] = cocg_temp[1]; + _cocg[2] = 
cocg_temp[4]; + } + + gradv[c_id1][i][j] = rhs[c_id1][i][0] * _cocg[0] + + rhs[c_id1][i][1] * _cocg[1] + + rhs[c_id1][i][2] * _cocg[2]; + +} diff --git a/src/alge/cs_gradient_priv.h b/src/alge/cs_gradient_priv.h index 399a9a02e8..867db2cac8 100644 --- a/src/alge/cs_gradient_priv.h +++ b/src/alge/cs_gradient_priv.h @@ -109,12 +109,92 @@ cs_gradient_scalar_lsq_cuda(const cs_mesh_t *m, cs_cocg_6_t *restrict cocgb, cs_real_3_t *restrict grad); -#endif /* defined(HAVE_CUDA) */ +void +cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_halo_type_t halo_type, + const int inc, + const cs_real_3_t *restrict coefav, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict c_weight, + cs_cocg_6_t *restrict cocg, + cs_cocg_6_t *restrict cocgb, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs); + +void +cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_internal_coupling_t *cpl, + cs_halo_type_t halo_type, + int inc, + const cs_real_3_t *restrict coefav, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict c_weight, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const bool *coupled_faces, + cs_lnum_t cpl_stride, + bool test_bool, + bool perf); + +void +_gradient_vector_cuda(const cs_mesh_t *mesh, + cs_real_3_t *_bc_coeff_a, + cs_real_33_t *_bc_coeff_b, + bool a_null, + bool b_null, + bool perf); + +#endif + +/* defined(HAVE_CUDA) */ /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */ /*----------------------------------------------------------------------------*/ END_C_DECLS - +#ifdef __cplusplus +/** + * This template will be instantited with stride = 1, 3, 6, 9 +*/ +template +void +cs_lsq_vector_gradient_strided_cuda(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_halo_type_t halo_type, + const int inc, + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *restrict c_weight, + cs_cocg_6_t *restrict cocg, + cs_cocg_6_t *restrict cocgb, + cs_real_t (*restrict gradv)[stride][3], + cs_real_t (*restrict rhs)[stride][3], + cs_lnum_t n_c_iter_max, + cs_real_t c_eps); + +template +void +cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + cs_halo_type_t halo_type, + int inc, + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *restrict c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + bool test_bool, + bool perf); +#endif #endif /* __CS_GRADIENT_CUDA_H__ */ diff --git a/src/alge/cs_reconstruct_vector_gradient_gather.cuh b/src/alge/cs_reconstruct_vector_gradient_gather.cuh new file mode 100644 index 0000000000..c7262866b2 --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_gather.cuh @@ -0,0 +1,142 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. 
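The _compute_gradient_lsq_v_* kernels above multiply each right-hand-side row by the cell's least-squares matrix, stored as a packed 6-component symmetric array (cs_cocg_6_t), and pick one matrix row per component j. A minimal stand-alone sketch of that row selection follows; the packing order (0:xx, 1:yy, 2:zz, 3:xy, 4:yz, 5:xz) is inferred from the index pattern in the kernels, and the local typedefs are stand-ins so the fragment compiles on its own.

typedef double cs_real_t;              /* assumption: double precision */
typedef cs_real_t cs_cocg_6_t[6];      /* packed symmetric 3x3 matrix */

__host__ __device__ static void
lsq_apply_cocg_row(const cs_cocg_6_t cocg,
                   const cs_real_t   rhs_row[3],
                   int               j,
                   cs_real_t        *grad_ij)
{
  /* Row j of the symmetric matrix, reconstructed from its 6 coefficients
     exactly as in _compute_gradient_lsq_v_v5 / _v6 above. */
  cs_real_t row[3];
  if (j == 0)      { row[0] = cocg[0]; row[1] = cocg[3]; row[2] = cocg[5]; }
  else if (j == 1) { row[0] = cocg[3]; row[1] = cocg[1]; row[2] = cocg[4]; }
  else             { row[0] = cocg[5]; row[1] = cocg[4]; row[2] = cocg[2]; }

  *grad_ij = row[0]*rhs_row[0] + row[1]*rhs_row[1] + row[2]*rhs_row[2];
}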
+ + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + +template +__global__ static void +_compute_reconstruct_v_i_face_gather(cs_lnum_t n_cells, + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + + cs_lnum_t c_id2, f_id; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + f_id = cell_i_faces[index]; + + pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + ktpond = (c_weight == NULL) ? 
+ pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + for (cs_lnum_t i = 0; i < stride; i++) { + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_id][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_id][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + grad[c_id1][i][j] += cell_i_faces_sgn[index] * (pfaci + rfac) * i_f_face_normal[f_id][j]; + } + } + } +} + + + +template +__global__ static void +_compute_reconstruct_v_b_face_gather(cs_lnum_t n_b_cells, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict cell_b_faces_idx) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + + if(c_id1 >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_id1]; + + cs_real_t pfac, rfac, vecfac; + cs_lnum_t f_id; + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + for (cs_lnum_t i = 0; i < stride; i++) { + + pfac = inc*coefav[f_id][i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += coefbv[f_id][i][k] * pvar[c_id][k]; + } + + pfac -= pvar[c_id][i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < stride; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + + r_grad[c_id][k][1] * diipb[f_id][1] + + r_grad[c_id][k][2] * diipb[f_id][2]; + rfac += coefbv[f_id][i][k] * vecfac; + } + + for (cs_lnum_t j = 0; j < 3; j++){ + grad[c_id][i][j] += (pfac + rfac) * b_f_face_normal[f_id][j]; + } + } + } +} diff --git a/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh b/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh new file mode 100644 index 0000000000..ff1723ba0f --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh @@ -0,0 +1,144 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. 
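The gather variants above assign one thread per cell and walk that cell's adjacency through cell_cells_idx / cell_b_faces_idx, so each thread is the sole writer of its own grad row and no atomics are needed, unlike the scatter variants in this patch. A minimal sketch of that exclusive-ownership pattern follows; the names (cell_faces_idx, face_contrib, acc) and the typedefs are illustrative only.

typedef double cs_real_t;   /* assumption */
typedef int    cs_lnum_t;   /* assumption: local index type */

__global__ static void
gather_accumulate(cs_lnum_t        n_cells,
                  const cs_lnum_t *cell_faces_idx,  /* CSR index, size n_cells+1 */
                  const cs_real_t *face_contrib,    /* one value per adjacency entry */
                  cs_real_t       *acc)             /* one value per cell */
{
  cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x;
  if (c_id >= n_cells)
    return;

  cs_real_t sum = 0.;
  for (cs_lnum_t idx = cell_faces_idx[c_id]; idx < cell_faces_idx[c_id+1]; idx++)
    sum += face_contrib[idx];   /* each adjacency entry visited exactly once */

  acc[c_id] += sum;             /* exclusive writer: plain store, no atomicAdd */
}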
+*/ + +/*----------------------------------------------------------------------------*/ + +template +__global__ static void +_compute_reconstruct_v_i_face_gather_v2(cs_lnum_t n_cells, + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + + + cs_lnum_t c_id2, f_id; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + size_t c_idx = c_id1 / (stride*3); + size_t i = (c_id1 / 3) % stride; + size_t j = c_id1 % 3; + + cs_lnum_t s_id = cell_cells_idx[c_idx]; + cs_lnum_t e_id = cell_cells_idx[c_idx + 1]; + + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + f_id = cell_i_faces[index]; + + pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + ktpond = (c_weight == NULL) ? + pond : // no cell weighting + pond * c_weight[c_idx] // cell weighting active + / ( pond * c_weight[c_idx] + + (1.0-pond)* c_weight[c_id2]); + + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_idx][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_idx][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_id][1]*( r_grad[c_idx][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_id][2]*( r_grad[c_idx][i][2] + + r_grad[c_id2][i][2])); + + grad[c_idx][i][j] += cell_i_faces_sgn[index] * (pfaci + rfac) * i_f_face_normal[f_id][j]; + } +} + + + +template +__global__ static void +_compute_reconstruct_v_b_face_gather_v2(cs_lnum_t n_b_cells, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict cell_b_faces_idx) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + + + if(c_idx >= n_b_cells){ + return; + } + + size_t c_id1 = c_idx / stride; + size_t i = c_idx % stride; + + cs_lnum_t c_id = b_cells[c_id1]; + + cs_real_t pfac, rfac, vecfac; + cs_lnum_t f_id; + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + pfac = inc*coefav[f_id][i]; + + pfac += coefbv[f_id][i][0] * pvar[c_id][0] + + coefbv[f_id][i][1] * pvar[c_id][1] + + coefbv[f_id][i][2] * pvar[c_id][2]; + + pfac -= pvar[c_id][i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < stride; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + + r_grad[c_id][k][1] * diipb[f_id][1] + + r_grad[c_id][k][2] * diipb[f_id][2]; + rfac += coefbv[f_id][i][k] * vecfac; + } + + grad[c_id][i][0] += (pfac + rfac) * b_f_face_normal[f_id][0]; + grad[c_id][i][1] += (pfac + rfac) * b_f_face_normal[f_id][1]; + grad[c_id][i][2] += (pfac + rfac) * b_f_face_normal[f_id][2]; + + } +} diff --git a/src/alge/cs_reconstruct_vector_gradient_gather_v3.cuh b/src/alge/cs_reconstruct_vector_gradient_gather_v3.cuh new file mode 100644 
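The *_gather_v2 kernels flatten the (cell, i, j) component space onto the thread index via c_idx = tid / (stride*3), i = (tid / 3) % stride, j = tid % 3, and the bound check compares the raw thread index against the first kernel argument, so the caller appears to be expected to pass the total component count there. A hedged host-side sketch of the corresponding launch sizing follows; launch_per_component and the 256-thread block size are assumptions, not taken from the patch.

static void
launch_per_component(long n_cells, int stride)   /* illustrative helper */
{
  const unsigned block_size = 256;                     /* assumed block size */
  const long n_threads = n_cells * (long)stride * 3;   /* one thread per (cell, i, j) */
  const unsigned grid_size =
    (unsigned)((n_threads + block_size - 1) / block_size);

  /* _compute_reconstruct_v_i_face_gather_v2<3><<<grid_size, block_size>>>(n_threads, ...);
     the first argument is compared against the thread index, so it has to be
     the component count n_cells*stride*3, not the cell count. */
  (void)grid_size;
}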
index 0000000000..aa53aa9f9e --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_gather_v3.cuh @@ -0,0 +1,173 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + + +template +__global__ static void +_compute_reconstruct_v_i_face_gather_v3(cs_lnum_t n_cells, + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + + cs_lnum_t c_id2, f_id; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + auto _grad = grad[c_id1]; + auto _pvar1 = pvar[c_id1]; + auto _r_grad1 = r_grad[c_id1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + f_id = cell_i_faces[index]; + + auto _pvar2 = pvar[c_id2]; + auto _r_grad2 = r_grad[c_id2]; + auto _dofij = dofij[f_id]; + auto _i_f_face_normal = i_f_face_normal[f_id]; + auto _cell_i_faces_sgn = cell_i_faces_sgn[index]; + + pond = (_cell_i_faces_sgn > 0) ? weight[f_id] : 1. - weight[f_id]; + ktpond = (c_weight == NULL) ? 
+ pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + for (cs_lnum_t i = 0; i < stride; i++) { + pfaci = (1.0-ktpond) * (_pvar2[i] - _pvar1[i]); + + /* Reconstruction part */ + rfac = 0.5 * ( _dofij[0]*( _r_grad1[i][0] + + _r_grad2[i][0]) + + _dofij[1]*( _r_grad1[i][1] + + _r_grad2[i][1]) + + _dofij[2]*( _r_grad1[i][2] + + _r_grad2[i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + _grad[i][j] += _cell_i_faces_sgn * (pfaci + rfac) * _i_f_face_normal[j]; + } + } + } + for(cs_lnum_t i = 0; i < stride; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + grad[c_id1][i][j] = _grad[i][j]; + } + } +} + + + +template +__global__ static void +_compute_reconstruct_v_b_face_gather_v3(cs_lnum_t n_b_cells, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict cell_b_faces_idx) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + + if(c_id1 >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_id1]; + + cs_real_t pfac, rfac, vecfac; + cs_lnum_t f_id; + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + auto _grad = grad[c_id]; + auto _r_grad = r_grad[c_id]; + auto _pvar = pvar[c_id]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + auto _diipb = diipb[f_id]; + auto _coefav = coefav[f_id]; + auto _coefbv = coefbv[f_id]; + auto _b_f_face_normal = b_f_face_normal[f_id]; + + for (cs_lnum_t i = 0; i < stride; i++) { + + pfac = inc*_coefav[i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += _coefbv[i][k] * _pvar[k]; + } + + pfac -= _pvar[i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < stride; k++) { + vecfac = _r_grad[k][0] * _diipb[0] + + _r_grad[k][1] * _diipb[1] + + _r_grad[k][2] * _diipb[2]; + rfac += _coefbv[i][k] * vecfac; + } + + for (cs_lnum_t j = 0; j < 3; j++){ + _grad[i][j] += (pfac + rfac) * _b_f_face_normal[j]; + } + + } + } + for(cs_lnum_t i = 0; i < stride; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + grad[c_id][i][j] = _grad[i][j]; + } + } + +} diff --git a/src/alge/cs_reconstruct_vector_gradient_gather_v4.cuh b/src/alge/cs_reconstruct_vector_gradient_gather_v4.cuh new file mode 100644 index 0000000000..35eed09b76 --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_gather_v4.cuh @@ -0,0 +1,150 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. 
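The gather_v3 / gather_v4 variants aim to keep the running gradient out of global memory during the face loop and commit it once at the end. Note that with plain `auto`, an array expression such as grad[c_id1] decays to a pointer rather than producing a local copy, so the sketch below uses an explicit local array to show the intended accumulate-then-commit structure; the function name, the contrib input and the typedefs are illustrative only.

typedef double cs_real_t;   /* assumption */
typedef int    cs_lnum_t;   /* assumption */

template <cs_lnum_t stride>
__device__ static void
accumulate_then_commit(cs_real_t (*grad)[stride][3],
                       cs_lnum_t  c_id,
                       const cs_real_t contrib[stride][3])  /* stand-in for the face loop */
{
  cs_real_t loc[stride][3];

  for (int i = 0; i < stride; i++)
    for (int j = 0; j < 3; j++)
      loc[i][j] = grad[c_id][i][j];      /* load once into registers/local memory */

  for (int i = 0; i < stride; i++)
    for (int j = 0; j < 3; j++)
      loc[i][j] += contrib[i][j];        /* accumulation that the face loop would do */

  for (int i = 0; i < stride; i++)
    for (int j = 0; j < 3; j++)
      grad[c_id][i][j] = loc[i][j];      /* single write-back to global memory */
}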
+ + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + +template +__global__ static void +_compute_reconstruct_v_i_face_gather_v4(cs_lnum_t n_cells, + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + + + cs_lnum_t c_id2, f_id; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + size_t c_idx = c_id1 / (stride*3); + size_t i = (c_id1 / 3) % stride; + size_t j = c_id1 % 3; + + cs_lnum_t s_id = cell_cells_idx[c_idx]; + cs_lnum_t e_id = cell_cells_idx[c_idx + 1]; + + auto _grad = grad[c_idx][i][j]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + f_id = cell_i_faces[index]; + + pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + ktpond = (c_weight == NULL) ? + pond : // no cell weighting + pond * c_weight[c_idx] // cell weighting active + / ( pond * c_weight[c_idx] + + (1.0-pond)* c_weight[c_id2]); + + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_idx][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_idx][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_id][1]*( r_grad[c_idx][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_id][2]*( r_grad[c_idx][i][2] + + r_grad[c_id2][i][2])); + + _grad += cell_i_faces_sgn[index] * (pfaci + rfac) * i_f_face_normal[f_id][j]; + } + grad[c_idx][i][j] = _grad; +} + + + +template +__global__ static void +_compute_reconstruct_v_b_face_gather_v4(cs_lnum_t n_b_cells, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict cell_b_faces_idx) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + + + if(c_idx >= n_b_cells){ + return; + } + + size_t c_id1 = c_idx / stride; + size_t i = c_idx % stride; + + cs_lnum_t c_id = b_cells[c_id1]; + + cs_real_t pfac, rfac, vecfac; + cs_lnum_t f_id; + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + auto _grad = grad[c_id][i]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + pfac = inc*coefav[f_id][i]; + + pfac += coefbv[f_id][i][0] * pvar[c_id][0] + + coefbv[f_id][i][1] * pvar[c_id][1] + + coefbv[f_id][i][2] * pvar[c_id][2]; + + pfac -= pvar[c_id][i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < stride; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + + r_grad[c_id][k][1] * diipb[f_id][1] + + r_grad[c_id][k][2] * diipb[f_id][2]; + rfac += coefbv[f_id][i][k] * vecfac; + } + + _grad[0] += (pfac + rfac) * 
b_f_face_normal[f_id][0]; + _grad[1] += (pfac + rfac) * b_f_face_normal[f_id][1]; + _grad[2] += (pfac + rfac) * b_f_face_normal[f_id][2]; + } + grad[c_id][i][0] = _grad[0]; + grad[c_id][i][1] = _grad[1]; + grad[c_id][i][2] = _grad[2]; +} diff --git a/src/alge/cs_reconstruct_vector_gradient_gather_v5.cuh b/src/alge/cs_reconstruct_vector_gradient_gather_v5.cuh new file mode 100644 index 0000000000..cd7ebe49e1 --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_gather_v5.cuh @@ -0,0 +1,191 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + + + +template +__global__ static void +_compute_reconstruct_v_i_face_gather_v5(cs_lnum_t n_cells, + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t lindex = threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + + cs_lnum_t c_id2, f_id; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + __shared__ cs_real_t _grad[256][stride][3]; + + for(cs_lnum_t i = 0; i < stride; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + _grad[lindex][i][j] = grad[c_id1][i][j]; + } + } + + + auto _pvar1 = pvar[c_id1]; + auto _r_grad1 = r_grad[c_id1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + f_id = cell_i_faces[index]; + + auto _pvar2 = pvar[c_id2]; + auto _r_grad2 = r_grad[c_id2]; + auto _dofij = dofij[f_id]; + auto _i_f_face_normal = i_f_face_normal[f_id]; + auto _cell_i_faces_sgn = cell_i_faces_sgn[index]; + + pond = (_cell_i_faces_sgn > 0) ? weight[f_id] : 1. - weight[f_id]; + ktpond = (c_weight == NULL) ? 
+ pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + for (cs_lnum_t i = 0; i < stride; i++) { + pfaci = (1.0-ktpond) * (_pvar2[i] - _pvar1[i]); + + /* Reconstruction part */ + rfac = 0.5 * ( _dofij[0]*( _r_grad1[i][0] + + _r_grad2[i][0]) + + _dofij[1]*( _r_grad1[i][1] + + _r_grad2[i][1]) + + _dofij[2]*( _r_grad1[i][2] + + _r_grad2[i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + _grad[lindex][i][j] += _cell_i_faces_sgn * (pfaci + rfac) * _i_f_face_normal[j]; + } + } + } + + for(cs_lnum_t i = 0; i < stride; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + grad[c_id1][i][j] = _grad[lindex][i][j]; + } + } + +} + + + +template +__global__ static void +_compute_reconstruct_v_b_face_gather_v5(cs_lnum_t n_b_cells, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict cell_b_faces_idx) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t lindex = threadIdx.x; + + if(c_id1 >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_id1]; + + cs_real_t pfac, rfac, vecfac; + cs_lnum_t f_id; + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + __shared__ cs_real_t _grad[256][stride][3]; + + for(cs_lnum_t i = 0; i < stride; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + _grad[lindex][i][j] = grad[c_id][i][j]; + } + } + + auto _r_grad = r_grad[c_id]; + auto _pvar = pvar[c_id]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + auto _diipb = diipb[f_id]; + auto _coefav = coefav[f_id]; + auto _coefbv = coefbv[f_id]; + auto _b_f_face_normal = b_f_face_normal[f_id]; + + for (cs_lnum_t i = 0; i < stride; i++) { + + pfac = inc*_coefav[i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += _coefbv[i][k] * _pvar[k]; + } + + pfac -= _pvar[i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < stride; k++) { + vecfac = _r_grad[k][0] * _diipb[0] + + _r_grad[k][1] * _diipb[1] + + _r_grad[k][2] * _diipb[2]; + rfac += _coefbv[i][k] * vecfac; + } + + for (cs_lnum_t j = 0; j < 3; j++){ + _grad[lindex][i][j] += (pfac + rfac) * _b_f_face_normal[j]; + } + } + } + + for(cs_lnum_t i = 0; i < stride; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + grad[c_id][i][j] = _grad[lindex][i][j]; + } + } +} diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter.cuh new file mode 100644 index 0000000000..a0d0f2b000 --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_scatter.cuh @@ -0,0 +1,174 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. 
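The *_gather_v5 kernels stage the per-cell gradient in a shared-memory array indexed by threadIdx.x, which implicitly ties the launch to at most 256 threads per block (the static extent of _grad). A cut-down sketch of that staging pattern follows; the kernel name and scalar payload are illustrative.

typedef double cs_real_t;   /* assumption */
typedef int    cs_lnum_t;   /* assumption */

#define BLOCK_SIZE 256      /* must match the shared-memory extent below */

__global__ static void
staged_accumulate(cs_lnum_t n_cells, cs_real_t *out)   /* illustrative kernel */
{
  cs_lnum_t c_id   = blockIdx.x * blockDim.x + threadIdx.x;
  cs_lnum_t lindex = threadIdx.x;

  if (c_id >= n_cells)
    return;

  /* One slot per thread of the block; no __syncthreads() is required because
     each thread only ever reads and writes its own slot. */
  __shared__ cs_real_t _acc[BLOCK_SIZE];

  _acc[lindex]  = out[c_id];    /* load */
  _acc[lindex] += 1.0;          /* stand-in for the face-loop accumulation */
  out[c_id]     = _acc[lindex]; /* write back once */
}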
+ + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + +template +__global__ static void +_compute_reconstruct_v_i_face(cs_lnum_t n_i_faces, + const cs_lnum_2_t *i_face_cells, + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + cs_lnum_t c_id1, c_id2; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; + + pond = weight[f_id]; + ktpond = (c_weight == NULL) ? + pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + + for (cs_lnum_t i = 0; i < stride; i++) { + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_id][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_id][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + atomicAdd(&grad[c_id1][i][j],(pfaci + rfac) * i_f_face_normal[f_id][j]); + atomicAdd(&grad[c_id2][i][j], - ((pfacj + rfac) * i_f_face_normal[f_id][j])); + + } + } + +} + +template +__global__ static void +_compute_reconstruct_v_b_face(cs_lnum_t n_b_faces, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_face_cells) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_b_faces){ + return; + } + cs_lnum_t c_id; + cs_real_t pfac, rfac, vecfac; + + c_id = b_face_cells[f_id]; + + for (cs_lnum_t i = 0; i < stride; i++) { + + pfac = inc*coefav[f_id][i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += coefbv[f_id][i][k] * pvar[c_id][k]; + } + + pfac -= pvar[c_id][i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < 3; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + + r_grad[c_id][k][1] * diipb[f_id][1] + + r_grad[c_id][k][2] * diipb[f_id][2]; + rfac += coefbv[f_id][i][k] * vecfac; + } + + for (cs_lnum_t j = 0; j < 3; j++) + atomicAdd(&grad[c_id][i][j], (pfac + rfac) * b_f_face_normal[f_id][j]); + + } +} + +template +__global__ static void +_compute_reconstruct_correction(cs_lnum_t n_cells, + cs_lnum_t has_dc, + const int *restrict c_disable_flag, + const cs_real_t *restrict cell_f_vol, + cs_real_t (*restrict grad)[stride][3], + const cs_real_33_t *restrict corr_grad_lin, + bool test_bool + ) +{ + cs_lnum_t c_id = blockIdx.x * blockDim.x + 
threadIdx.x; + + if(c_id >= n_cells){ + return; + } + cs_real_t dvol; + /* Is the cell disabled (for solid or porous)? Not the case if coupled */ + if (has_dc * c_disable_flag[has_dc * c_id] == 0) + dvol = 1. / cell_f_vol[c_id]; + else + dvol = 0.; + + + for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t j = 0; j < 3; j++) + grad[c_id][i][j] *= dvol; + } + + + if (test_bool) { + cs_real_t gradpa[3]; + // printf("dvol = %.17lg\n", dvol); + for (cs_lnum_t i = 0; i < stride; i++) { + for (cs_lnum_t j = 0; j < 3; j++) { + gradpa[j] = grad[c_id][i][j]; + grad[c_id][i][j] = 0.; + } + + for (cs_lnum_t j = 0; j < 3; j++){ + for (cs_lnum_t k = 0; k < 3; k++){ + grad[c_id][i][j] += corr_grad_lin[c_id][j][k] * gradpa[k]; + } + } + } + } + +} diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh new file mode 100644 index 0000000000..f68189bfca --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh @@ -0,0 +1,134 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + +template +__global__ static void +_compute_reconstruct_v_i_face_cf(cs_lnum_t n_i_faces, + const cs_lnum_2_t *i_face_cells, + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + cs_lnum_t c_id1, c_id2; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; + + pond = weight[f_id]; + ktpond = (c_weight == NULL) ? 
+ pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + + using Cell = AtomicCell; + Cell grad_cf1, grad_cf2; + + + for (cs_lnum_t i = 0; i < stride; i++) { + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_id][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_id][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + grad_cf1[i][j].get() = (pfaci + rfac) * i_f_face_normal[f_id][j]; + grad_cf2[i][j].get() = - ((pfacj + rfac) * i_f_face_normal[f_id][j]); + } + } + Cell::ref(grad[c_id1]).conflict_free_add(-1u, grad_cf1); + Cell::ref(grad[c_id2]).conflict_free_add(-1u, grad_cf2); + +} + +template +__global__ static void +_compute_reconstruct_v_b_face_cf(cs_lnum_t n_b_faces, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_face_cells) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_b_faces){ + return; + } + cs_lnum_t c_id; + cs_real_t pfac, rfac, vecfac; + + c_id = b_face_cells[f_id]; + + using Cell = AtomicCell; + Cell grad_cf; + + for (cs_lnum_t i = 0; i < stride; i++) { + + pfac = inc*coefav[f_id][i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += coefbv[f_id][i][k] * pvar[c_id][k]; + } + + pfac -= pvar[c_id][i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < stride; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + + r_grad[c_id][k][1] * diipb[f_id][1] + + r_grad[c_id][k][2] * diipb[f_id][2]; + rfac += coefbv[f_id][i][k] * vecfac; + } + + for (cs_lnum_t j = 0; j < 3; j++){ + grad_cf[i][j].get() = (pfac + rfac) * b_f_face_normal[f_id][j]; + } + } + Cell::ref(grad[c_id]).conflict_free_add(-1u, grad_cf); +} diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh new file mode 100644 index 0000000000..a2dfcf3c4c --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh @@ -0,0 +1,182 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. 
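The *_cf kernels first build the per-face contribution blocks (grad_cf1, grad_cf2, grad_cf) in registers and then commit each block to its cell with a single AtomicCell::conflict_free_add call, rather than issuing one atomicAdd per component inside the loops. AtomicCell itself is not part of this patch, so the sketch below shows only the stage-then-commit structure, with plain atomicAdd standing in for the conflict-free commit (double atomicAdd needs an sm_60+ target, as elsewhere in this patch).

typedef double cs_real_t;   /* assumption */
typedef int    cs_lnum_t;   /* assumption */

template <cs_lnum_t stride>
__device__ static void
commit_face_contribution(cs_real_t (*grad)[stride][3],
                         cs_lnum_t  c_id,
                         const cs_real_t staged[stride][3])
{
  /* Stand-in for AtomicCell<...>::ref(grad[c_id]).conflict_free_add(...):
     the contribution is fully built in registers beforehand, and only this
     commit touches global memory atomically. */
  for (int i = 0; i < stride; i++)
    for (int j = 0; j < 3; j++)
      atomicAdd(&grad[c_id][i][j], staged[i][j]);
}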
+*/ + +/*----------------------------------------------------------------------------*/ + + + +template +__global__ static void +_compute_reconstruct_v_i_face_v2(cs_lnum_t n_i_faces, + const cs_lnum_2_t *i_face_cells, + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + + size_t f_idt = f_id / stride; + size_t i = f_id % stride; + + cs_lnum_t c_id1, c_id2; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + c_id1 = i_face_cells[f_idt][0]; + c_id2 = i_face_cells[f_idt][1]; + + pond = weight[f_idt]; + ktpond = (c_weight == NULL) ? + pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_idt][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_idt][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_idt][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + atomicAdd(&grad[c_id1][i][j],(pfaci + rfac) * i_f_face_normal[f_idt][j]); + atomicAdd(&grad[c_id2][i][j], - ((pfacj + rfac) * i_f_face_normal[f_idt][j])); + } + +} + +template +__global__ static void +_compute_reconstruct_v_b_face_v2(cs_lnum_t n_b_faces, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_face_cells) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_b_faces){ + return; + } + + size_t f_idt = f_id / stride; + size_t i = f_id % stride; + + cs_lnum_t c_id; + cs_real_t pond, ktpond, pfac, rfac, vecfac; + + // if (coupled_faces[f_idt * cpl_stride]) + // return; + + c_id = b_face_cells[f_idt]; + + pfac = inc*coefav[f_idt][i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += coefbv[f_idt][i][k] * pvar[c_id][k]; + } + + pfac -= pvar[c_id][i]; + +// /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < 3; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_idt][0] + + r_grad[c_id][k][1] * diipb[f_idt][1] + + r_grad[c_id][k][2] * diipb[f_idt][2]; + rfac += coefbv[f_idt][i][k] * vecfac; + } + + for (cs_lnum_t j = 0; j < 3; j++){ + atomicAdd(&grad[c_id][i][j], (pfac + rfac) * b_f_face_normal[f_idt][j]); + } + +} + + +template +__global__ static void +_compute_reconstruct_correction_v2( cs_lnum_t n_cells, + cs_lnum_t has_dc, + const int *restrict c_disable_flag, + const cs_real_t *restrict cell_f_vol, + cs_real_t (*restrict grad)[stride][3], + const cs_real_33_t *restrict corr_grad_lin, + bool test_bool + ) +{ + cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + + + if(c_id >= n_cells){ + return; + } + + size_t c_idt = c_id / stride; + size_t i = c_id % stride; + + cs_real_t dvol; + /* Is the cell disabled (for solid or porous)? Not the case if coupled */ + if (has_dc * c_disable_flag[has_dc * c_idt] == 0) + dvol = 1. 
/ cell_f_vol[c_idt]; + else + dvol = 0.; + + for (cs_lnum_t j = 0; j < 3; j++){ + grad[c_idt][i][j] *= dvol; + } + + + if (test_bool) { + cs_real_t gradpa[3]; + for (cs_lnum_t j = 0; j < 3; j++) { + gradpa[j] = grad[c_idt][i][j]; + } + + for (cs_lnum_t j = 0; j < 3; j++) { + grad[c_idt][i][j] = corr_grad_lin[c_idt][j][0] * gradpa[0] + + corr_grad_lin[c_idt][j][1] * gradpa[1] + + corr_grad_lin[c_idt][j][2] * gradpa[2]; + } + } + +} \ No newline at end of file diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh new file mode 100644 index 0000000000..ae4dbd5092 --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh @@ -0,0 +1,141 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + + +template +__global__ static void +_compute_reconstruct_v_i_face_v2_cf(cs_lnum_t n_i_faces, + const cs_lnum_2_t *i_face_cells, + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + + size_t f_idt = f_id / stride; + size_t i = f_id % stride; + + cs_lnum_t c_id1, c_id2; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + c_id1 = i_face_cells[f_idt][0]; + c_id2 = i_face_cells[f_idt][1]; + + pond = weight[f_idt]; + ktpond = (c_weight == NULL) ? 
+ pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_idt][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_idt][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_idt][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); + + using Cell = AtomicCell; + Cell grad_cf1, grad_cf2; + + for (cs_lnum_t j = 0; j < 3; j++) { + grad_cf1[j].get() = (pfaci + rfac) * i_f_face_normal[f_idt][j]; + grad_cf2[j].get() = - ((pfacj + rfac) * i_f_face_normal[f_idt][j]); + } + Cell::ref(grad[c_id1][i]).conflict_free_add(-1u, grad_cf1); + Cell::ref(grad[c_id2][i]).conflict_free_add(-1u, grad_cf2); + +} + + + + +template +__global__ static void +_compute_reconstruct_v_b_face_v2_cf(cs_lnum_t n_b_faces, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_face_cells) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_b_faces){ + return; + } + + size_t f_idt = f_id / stride; + size_t i = f_id % stride; + + cs_lnum_t c_id; + cs_real_t pond, ktpond, pfac, rfac, vecfac; + + c_id = b_face_cells[f_idt]; + + pfac = inc*coefav[f_idt][i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += coefbv[f_idt][i][k] * pvar[c_id][k]; + } + + pfac -= pvar[c_id][i]; + +// /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < stride; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_idt][0] + + r_grad[c_id][k][1] * diipb[f_idt][1] + + r_grad[c_id][k][2] * diipb[f_idt][2]; + rfac += coefbv[f_idt][i][k] * vecfac; + } + + using Cell = AtomicCell; + Cell grad_cf; + + for (cs_lnum_t j = 0; j < 3; j++){ + grad_cf[j].get() = (pfac + rfac) * b_f_face_normal[f_idt][j]; + } + Cell::ref(grad[c_id][i]).conflict_free_add(-1u, grad_cf); + +} diff --git a/src/alge/cs_slope_test_gradient_vector_cuda_gather.cuh b/src/alge/cs_slope_test_gradient_vector_cuda_gather.cuh new file mode 100644 index 0000000000..868a179d12 --- /dev/null +++ b/src/alge/cs_slope_test_gradient_vector_cuda_gather.cuh @@ -0,0 +1,109 @@ +__global__ static void +cs_slope_test_gradient_vector_cuda_i_gather( const cs_lnum_t n_cells, + const cs_real_3_t *restrict i_face_cog, + const cs_real_3_t *restrict cell_cen, + const cs_real_3_t *pvar, + const cs_real_t *restrict i_massflux, + const cs_real_3_t *restrict i_f_face_normal, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn, + cs_real_33_t *grad, + cs_real_33_t *grdpa) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + + cs_real_t difv[3], djfv[3], vfac[3]; + cs_real_t pif, pjf, pfac, face_sgn; + cs_lnum_t c_id2, f_id; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + f_id = cell_i_faces[index]; + face_sgn = cell_i_faces_sgn[index]; + + for (int jsou = 0; jsou < 3; jsou++) { + difv[jsou] = i_face_cog[f_id][jsou] - 
cell_cen[c_id1][jsou]; + djfv[jsou] = i_face_cog[f_id][jsou] - cell_cen[c_id2][jsou]; + } + + for (int isou = 0; isou < 3; isou++) { + pif = pvar[c_id1][isou]; + pjf = pvar[c_id2][isou]; + for (int jsou = 0; jsou < 3; jsou++) { + pif = pif + grad[c_id1][isou][jsou] * difv[jsou]; + pjf = pjf + grad[c_id2][isou][jsou] * djfv[jsou]; + } + + pfac = pjf; + if (i_massflux[f_id] * face_sgn > 0.) + pfac = pif; + pfac *= face_sgn; + + for (int jsou = 0; jsou < 3; jsou++) { + vfac[jsou] = pfac*i_f_face_normal[f_id][jsou]; + grdpa[c_id1][isou][jsou] += vfac[jsou]; + } + } + } +} + + +__global__ static void +cs_slope_test_gradient_vector_cuda_b_gather(const cs_lnum_t n_b_cells, + const cs_real_3_t *pvar, + const cs_real_3_t *restrict diipb, + const int inc, + const cs_real_3_t *coefa, + const cs_real_33_t *coefb, + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_real_33_t *grad, + cs_real_33_t *grdpa) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + + if(c_id1 >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_id1]; + + cs_real_t pfac, rfac, vecfac; + cs_real_t diipbv[3]; + cs_lnum_t f_id; + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + for (int jsou = 0; jsou < 3; jsou++) + diipbv[jsou] = diipb[f_id][jsou]; + + for (int isou = 0; isou < 3; isou++) { + pfac = inc*coefa[f_id][isou]; + /*coefu is a matrix */ + for (int jsou = 0; jsou < 3; jsou++) + pfac += coefb[f_id][jsou][isou]*( pvar[c_id][jsou] + + grad[c_id][jsou][0]*diipbv[0] + + grad[c_id][jsou][1]*diipbv[1] + + grad[c_id][jsou][2]*diipbv[2]); + + for (int jsou = 0; jsou < 3; jsou++) + grdpa[c_id][isou][jsou] += pfac*b_f_face_normal[f_id][jsou]; + } + } +} diff --git a/src/alge/cs_slope_test_gradient_vector_cuda_scatter.cuh b/src/alge/cs_slope_test_gradient_vector_cuda_scatter.cuh new file mode 100644 index 0000000000..80daba1938 --- /dev/null +++ b/src/alge/cs_slope_test_gradient_vector_cuda_scatter.cuh @@ -0,0 +1,113 @@ +__global__ static void +cs_slope_test_gradient_vector_cuda_i( const cs_lnum_t n_i_faces, + const cs_lnum_2_t *restrict i_face_cells, + const cs_real_3_t *restrict i_face_cog, + const cs_real_3_t *restrict cell_cen, + const cs_real_3_t *pvar, + const cs_real_t *restrict i_massflux, + const cs_real_3_t *restrict i_f_face_normal, + cs_real_33_t *grad, + cs_real_33_t *grdpa) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + cs_real_t difv[3], djfv[3], vfac[3]; + cs_real_t pif, pjf, pfac; + cs_lnum_t c_id1, c_id2; + + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; + + for (int jsou = 0; jsou < 3; jsou++) { + difv[jsou] = i_face_cog[f_id][jsou] - cell_cen[c_id1][jsou]; + djfv[jsou] = i_face_cog[f_id][jsou] - cell_cen[c_id2][jsou]; + } + + /* x-y-z component, p = u, v, w */ + + for (int isou = 0; isou < 3; isou++) { + pif = pvar[c_id1][isou]; + pjf = pvar[c_id2][isou]; + for (int jsou = 0; jsou < 3; jsou++) { + pif = pif + grad[c_id1][isou][jsou]*difv[jsou]; + pjf = pjf + grad[c_id2][isou][jsou]*djfv[jsou]; + } + + pfac = pjf; + if (i_massflux[f_id] > 0.) 
+      pfac = pif;
+
+    /* U gradient */
+
+    for (int jsou = 0; jsou < 3; jsou++) {
+      vfac[jsou] = pfac*i_f_face_normal[f_id][jsou];
+      atomicAdd(&grdpa[c_id1][isou][jsou],  vfac[jsou]);
+      atomicAdd(&grdpa[c_id2][isou][jsou], -vfac[jsou]);
+    }
+  }
+}
+
+
+__global__ static void
+cs_slope_test_gradient_vector_cuda_b(const cs_lnum_t              n_b_faces,
+                                     const cs_real_3_t           *pvar,
+                                     const cs_lnum_t *restrict    b_face_cells,
+                                     const cs_real_3_t *restrict  diipb,
+                                     const int                    inc,
+                                     const cs_real_3_t           *coefa,
+                                     const cs_real_33_t          *coefb,
+                                     const cs_real_3_t *restrict  b_f_face_normal,
+                                     const cs_real_33_t          *grad,
+                                     cs_real_33_t                *grdpa)
+{
+  cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (f_id >= n_b_faces) {
+    return;
+  }
+
+  cs_real_t diipbv[3];
+  cs_lnum_t c_id1 = b_face_cells[f_id];
+  cs_real_t pfac;
+
+  for (int jsou = 0; jsou < 3; jsou++)
+    diipbv[jsou] = diipb[f_id][jsou];
+
+  /* x-y-z components, p = u, v, w */
+
+  for (int isou = 0; isou < 3; isou++) {
+    pfac = inc*coefa[f_id][isou];
+    /* coefu is a matrix */
+    for (int jsou = 0; jsou < 3; jsou++)
+      pfac += coefb[f_id][jsou][isou]*(  pvar[c_id1][jsou]
+                                       + grad[c_id1][jsou][0]*diipbv[0]
+                                       + grad[c_id1][jsou][1]*diipbv[1]
+                                       + grad[c_id1][jsou][2]*diipbv[2]);
+
+    for (int jsou = 0; jsou < 3; jsou++)
+      atomicAdd(&grdpa[c_id1][isou][jsou], pfac*b_f_face_normal[f_id][jsou]);
+  }
+}
+
+
+__global__ static void
+cs_slope_test_gradient_vector_cuda_f(const cs_lnum_t  n_cells,
+                                     cs_real_t       *cell_vol,
+                                     cs_real_33_t    *grdpa)
+{
+  cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (c_idx >= n_cells) {
+    return;
+  }
+
+  /* one thread per tensor component: n_cells is expected to be
+     the number of cells multiplied by 3*3 */
+  size_t c_id = c_idx / (3*3);
+  size_t i = (c_idx / 3) % 3;
+  size_t j = c_idx % 3;
+
+  cs_real_t unsvol = 1./cell_vol[c_id];
+  grdpa[c_id][i][j] *= unsvol;
+}
diff --git a/src/base/cs_base_cuda.cu b/src/base/cs_base_cuda.cu
index 289a7deeea..21561ae5f8 100644
--- a/src/base/cs_base_cuda.cu
+++ b/src/base/cs_base_cuda.cu
@@ -224,7 +224,7 @@ cs_cuda_mem_free(void *p,
   CS_CUDA_CHECK_CALL(cudaFree(p), file_name, line_num);
 
 #if 0
-  CS_CUDA_CHECK_CALL((cudaDeviceSynchronize(), file_name, line_num);
+  CS_CUDA_CHECK_CALL(cudaDeviceSynchronize(), file_name, line_num);
 #endif
 }
 
@@ -257,7 +257,7 @@ cs_cuda_mem_free_host(void *p,
   CS_CUDA_CHECK_CALL(cudaFreeHost(p), file_name, line_num);
 
 #if 0
-  CS_CUDA_CHECK_CALL((cudaDeviceSynchronize(), file_name, line_num);
+  CS_CUDA_CHECK_CALL(cudaDeviceSynchronize(), file_name, line_num);
 #endif
 }
 
@@ -325,11 +325,13 @@ cs_cuda_copy_h2d_async(void *dst,
 /*----------------------------------------------------------------------------*/
 
 void
-cs_cuda_copy_d2h(void *dst,
+_cs_cuda_copy_d2h(void *dst,
                  const void *src,
-                 size_t size)
+                 size_t size,
+                 const char* filename,
+                 long line)
 {
-  CS_CUDA_CHECK(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost));
+  CS_CUDA_CHECK_CALL(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost), filename, line);
 }
 
 /*----------------------------------------------------------------------------*/
diff --git a/src/base/cs_base_cuda.h b/src/base/cs_base_cuda.h
index fdcf8fac52..88c6748943 100644
--- a/src/base/cs_base_cuda.h
+++ b/src/base/cs_base_cuda.h
@@ -286,9 +286,13 @@ cs_cuda_copy_h2d_async(void *dst,
 /*----------------------------------------------------------------------------*/
 
 void
-cs_cuda_copy_d2h(void *dst,
+_cs_cuda_copy_d2h(void *dst,
                  const void *src,
-                 size_t size);
+                 size_t size,
+                 const char* filename,
+                 long line);
+
+#define cs_cuda_copy_d2h(dst, src, size) _cs_cuda_copy_d2h(dst, src, size, __FILE__, __LINE__)
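For reference, the sketch below shows how a call site might combine one of the kernels above with the new cs_cuda_copy_d2h macro. It is an illustrative sketch, not part of the patch: the helper name _get_grdpa_from_device, the buffers cell_vol_d, grdpa_d and grdpa_h, and the launch sizing (which simply follows the kernel's one-thread-per-tensor-component indexing) are assumptions.

/* Hypothetical helper, for illustration only: normalize the slope-test
   gradient on the device, then copy it back to the host. */

static void
_get_grdpa_from_device(cs_lnum_t      n_cells,
                       cs_real_t     *cell_vol_d,   /* device cell volumes */
                       cs_real_33_t  *grdpa_d,      /* device gradient */
                       cs_real_33_t  *grdpa_h)      /* host destination */
{
  /* one thread per (cell, i, j) tensor component */
  cs_lnum_t n_comp = n_cells * 3 * 3;
  unsigned int blocksize = 256;
  unsigned int gridsize = (unsigned int)((n_comp + blocksize - 1) / blocksize);

  cs_slope_test_gradient_vector_cuda_f<<<gridsize, blocksize>>>
    (n_comp, cell_vol_d, grdpa_d);
  CS_CUDA_CHECK(cudaDeviceSynchronize());

  /* expands to _cs_cuda_copy_d2h(dst, src, size, __FILE__, __LINE__),
     so a failing copy is reported with this file and line */
  cs_cuda_copy_d2h(grdpa_h, grdpa_d, sizeof(cs_real_33_t)*n_cells);
}

Routing the copy through the macro keeps existing call sites unchanged while letting CS_CUDA_CHECK_CALL report the caller's location instead of a fixed line inside cs_base_cuda.cu.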
 
 /*----------------------------------------------------------------------------*/
 /*!