diff --git a/src/alge/Makefile.am b/src/alge/Makefile.am index 0118586293..d3660e1030 100644 --- a/src/alge/Makefile.am +++ b/src/alge/Makefile.am @@ -198,6 +198,7 @@ libcsalge_a_SOURCES += \ cs_benchmark_cuda.cu \ cs_blas_cuda.cu \ cs_gradient_cuda.cu \ +cs_convection_diffusion_cuda.cu \ cs_matrix_spmv_cuda.cu \ cs_sles_it_cuda.cu \ cs_sles_pc_cuda.cu diff --git a/src/alge/cs_alge_cuda.cuh b/src/alge/cs_alge_cuda.cuh new file mode 100644 index 0000000000..11c5123553 --- /dev/null +++ b/src/alge/cs_alge_cuda.cuh @@ -0,0 +1,428 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ +#pragma once + +#include "cs_defs.h" + +/*---------------------------------------------------------------------------- + * Standard C library headers + *----------------------------------------------------------------------------*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(HAVE_MPI) +#include +#endif + +#include + +/*---------------------------------------------------------------------------- + * Local headers + *----------------------------------------------------------------------------*/ + +#include "bft_error.h" +#include "bft_mem.h" + +#include "cs_base_accel.h" +#include "cs_base_cuda.h" +#include "cs_blas.h" +#include "cs_cell_to_vertex.h" +#include "cs_ext_neighborhood.h" +#include "cs_field.h" +#include "cs_field_pointer.h" +#include "cs_halo.h" +#include "cs_halo_perio.h" +#include "cs_log.h" +#include "cs_math.h" +#include "cs_mesh.h" +#include "cs_mesh_adjacencies.h" +#include "cs_mesh_quantities.h" +#include "cs_parall.h" +#include "cs_porous_model.h" +#include "cs_prototypes.h" +#include "cs_timer.h" +#include "cs_timer_stats.h" + +BEGIN_C_DECLS + + typedef cs_real_t cs_cocg_t; + typedef cs_real_t cs_cocg_6_t[6]; + typedef cs_real_t cs_cocg_33_t[3][3]; + +END_C_DECLS + +template +static void +_sync_or_copy_real_h2d(const T *val_h, + cs_lnum_t n_vals, + int device_id, + cudaStream_t stream, + const T **val_d, + void **buf_d) +{ + const T *_val_d = NULL; + void *_buf_d = NULL; + + cs_alloc_mode_t alloc_mode = cs_check_device_ptr(val_h); + size_t size = n_vals * sizeof(T); + + if (alloc_mode == CS_ALLOC_HOST) { + CS_CUDA_CHECK(cudaMalloc(&_buf_d, size)); + cs_cuda_copy_h2d(_buf_d, val_h, size); + _val_d = (const T *)_buf_d; + } + else { + _val_d = (const T *)cs_get_device_ptr((void *)val_h); + + if (alloc_mode == CS_ALLOC_HOST_DEVICE_SHARED) + cudaMemPrefetchAsync(val_h, size, device_id, stream); + else + 
cs_sync_h2d(val_h); + } + + *val_d = _val_d; + *buf_d = _buf_d; +} + +/* Compute gridsize*/ + +static unsigned int +get_gridsize(unsigned int size, unsigned int blocksize){ + unsigned int gridsize = (unsigned int)ceil((double)size / blocksize); + + return gridsize; +} + + +__device__ static cs_real_t +cs_math_fabs_cuda(cs_real_t x) +{ + cs_real_t ret = (x < 0) ? -x : x; + + return ret; +} + +__device__ static cs_real_t +cs_math_3_dot_product_cuda(const cs_real_t u[3], + const cs_real_t v[3]) +{ + cs_real_t prod = u[0]*v[0] + u[1]*v[1] + u[2]*v[2]; + + return prod; +} + +__global__ static void +_set_one_to_coeff_b(const cs_lnum_t n_b_faces, + cs_real_33_t *_bc_coeff_b) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_idx >= n_b_faces){ + return; + } + + cs_lnum_t f_id = c_idx / 3; + size_t i = c_idx % 3; + + _bc_coeff_b[f_id][i][i] = 1; +} + +__device__ static void cs_math_3_normalize_cuda(const cs_real_t in[3], + cs_real_t out[3]) +{ + cs_real_t norm = sqrt(in[0]*in[0] + + in[1]*in[1] + + in[2]*in[2]); + + cs_real_t inverse_norm = 1. / norm; + + out[0] = inverse_norm * in[0]; + out[1] = inverse_norm * in[1]; + out[2] = inverse_norm * in[2]; +} + +__device__ static cs_real_t cs_math_3_square_norm_cuda(const cs_real_t in[3]){ + cs_real_t norm = in[0]*in[0] + in[1]*in[1] + in[2]*in[2]; + return norm; +} + +__device__ static void _math_6_inv_cramer_sym_in_place_cuda(cs_cocg_t in[6]){ + cs_real_t in00 = in[1]*in[2] - in[4]*in[4]; + cs_real_t in01 = in[4]*in[5] - in[3]*in[2]; + cs_real_t in02 = in[3]*in[4] - in[1]*in[5]; + cs_real_t in11 = in[0]*in[2] - in[5]*in[5]; + cs_real_t in12 = in[3]*in[5] - in[0]*in[4]; + cs_real_t in22 = in[0]*in[1] - in[3]*in[3]; + + cs_real_t det_inv = 1. / (in[0]*in00 + in[3]*in01 + in[5]*in02); + + in[0] = in00 * det_inv; + in[1] = in11 * det_inv; + in[2] = in22 * det_inv; + in[3] = in01 * det_inv; + in[4] = in12 * det_inv; + in[5] = in02 * det_inv; +} + +template +__device__ static void +_fact_crout_pp_cuda(cs_real_t *ad) +{ + cs_real_t aux[d_size]; + for (int kk = 0; kk < d_size - 1; kk++) { + int kk_d_size = kk*(kk + 1)/2; + for (int ii = kk + 1; ii < d_size; ii++) { + int ii_d_size = ii*(ii + 1)/2; + aux[ii] = ad[ii_d_size + kk]; + ad[ii_d_size + kk] = ad[ii_d_size + kk] + / ad[kk_d_size + kk]; + for (int jj = kk + 1; jj < ii + 1; jj++) { + ad[ii_d_size + jj] = ad[ii_d_size + jj] - ad[ii_d_size + kk]*aux[jj]; + } + } + } +} + +template +__device__ static void +_fw_and_bw_ldtl_pp_cuda(const cs_real_t mat[], + cs_real_t x[], + const cs_real_t b[]) +{ + cs_real_t aux[d_size]; + + for (int ii = 0; ii < d_size; ii++) { + int ii_d_size = ii*(ii + 1)/2; + aux[ii] = b[ii]; + for (int jj = 0; jj < ii; jj++) { + aux[ii] -= aux[jj]*mat[ii_d_size + jj]; + } + } + + for (int ii = 0; ii < d_size; ii++) { + int ii_d_size = ii*(ii + 1)/2; + aux[ii] /= mat[ii_d_size + ii]; + } + + for (int ii = d_size - 1; ii >= 0; ii--) { + x[ii] = aux[ii]; + for (int jj = d_size - 1; jj > ii; jj--) { + int jj_d_size = jj*(jj + 1)/2; + x[ii] -= x[jj]*mat[jj_d_size + ii]; + } + } +} + +template +__device__ uint32_t _conflict_mask(uint32_t mask, V v) noexcept { +#if __CUDA_ARCH__ >= 700 + return __match_any_sync(mask, v); +#else + uint32_t lanemask_eq = 1u << (threadIdx.x % 32); + if (!(mask & lanemask_eq)) + return 0; + uint32_t ref, ballot; + int leader; + goto entry; +loop: + mask &= ~ballot; +entry: + leader = __ffs(mask) - 1; + ref = __shfl_sync(mask, v, leader); + ballot = __ballot_sync(mask, v == ref); + if (!(ballot & lanemask_eq)) + goto loop; + return ballot; 
+#endif +} + +template +__device__ bool _reduce_add(uint32_t mask, uint32_t peers, T& v) noexcept { + int laneid = threadIdx.x % 32; + uint32_t lanemask_lt = (1u << laneid) - 1; + uint32_t lanemask_gt = -2u << laneid; + int rank = __popc(peers & lanemask_lt); + bool is_leader = rank == 0; + + peers &= lanemask_gt; + while (__any_sync(mask, peers)) { + int next = __ffs(peers); + + auto tmp = v.shuffle(mask, next - 1); + if (next) { + v.add(tmp); + } + + peers &= __ballot_sync(mask, !(rank & 1)); + + rank >>= 1; + } + + return is_leader; +} + + +template +class AtomicCell { + private: + T value = {}; + public: + using inner_type = T; + public: + __device__ AtomicCell() noexcept = default; + __device__ AtomicCell(T value) noexcept : value(value) {} + __device__ void add(const AtomicCell&restrict other) restrict noexcept { + value += other.value; + } + __device__ void atomic_add(const AtomicCell&restrict other) restrict noexcept { + atomicAdd(&value, other.value); + } + __device__ AtomicCell exchange(const AtomicCell&restrict other) restrict noexcept { + AtomicCell previous = *this; + *this = other; + return previous; + } + __device__ AtomicCell atomic_exchange(const AtomicCell&restrict other) restrict noexcept { + return AtomicCell(atomicExch(&value, other.value)); + } + __device__ AtomicCell shuffle(uint32_t mask, unsigned laneid) const noexcept { + return AtomicCell(__shfl_sync(mask, value, laneid)); + } + __device__ uint32_t conflict_mask(uint32_t mask) const noexcept { + return _conflict_mask(mask, (uintptr_t)this); + } + __device__ bool reduce_add(uint32_t mask, uint32_t peers) noexcept { + return _reduce_add(mask, peers, *this); + } + __device__ void conflict_free_add(uint32_t mask, AtomicCell other) noexcept { + uint32_t peers = conflict_mask(mask); + if (other.reduce_add(mask, peers)) { + atomic_add(other); + } + } + __device__ inner_type& operator*() noexcept { + return value; + } + __device__ inner_type const& operator*() const noexcept { + return value; + } + __device__ inner_type* operator->() noexcept { + return &value; + } + __device__ inner_type const* operator->() const noexcept { + return &value; + } + __device__ inner_type& get() noexcept { + return value; + } + __device__ inner_type const& get() const noexcept { + return value; + } + static __device__ AtomicCell& ref(inner_type& r) noexcept { + return reinterpret_cast(r); + } + static __device__ AtomicCell const& ref(inner_type const& r) noexcept { + return reinterpret_cast(r); + } +}; + +template +class AtomicCell { + private: + AtomicCell data[Head]; + public: + using inner_type = typename AtomicCell::inner_type[Head]; + public: + __device__ AtomicCell() noexcept = default; + __device__ void add(const AtomicCell&restrict other) restrict noexcept { + for (size_t i = 0; i < Head; ++i) { + data[i].add(other.data[i]); + } + } + __device__ void atomic_add(const AtomicCell&restrict other) restrict noexcept { + for (size_t i = 0; i < Head; ++i) { + data[i].atomic_add(other.data[i]); + } + } + __device__ AtomicCell exchange(const AtomicCell&restrict other) restrict noexcept { + AtomicCell previous; + for (size_t i = 0; i < Head; ++i) { + previous.data[i] = data[i].exchange(other.data[i]); + } + return previous; + } + __device__ AtomicCell atomic_exchange(const AtomicCell&restrict other) restrict noexcept { + AtomicCell previous; + for (size_t i = 0; i < Head; ++i) { + previous.data[i] = data[i].atomic_exchange(other.data[i]); + } + return previous; + } + __device__ AtomicCell shuffle(uint32_t mask, unsigned laneid) const noexcept 
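Note: the `_conflict_mask`, `_reduce_add` and `AtomicCell` helpers above implement warp-aggregated atomics: lanes of a warp that target the same address first combine their contributions with register shuffles, and only a leader lane issues the global `atomicAdd`. This matters for face-to-cell scatters where many lanes of a warp share a destination cell. The following usage sketch is not part of the patch; the kernel name and the `face_val`/`c_sum` arrays are hypothetical, only `AtomicCell` and the mesh types come from the code above, and double-precision `atomicAdd` assumes compute capability 6.0 or newer.

/* Hypothetical sketch: warp-aggregated scatter of one value per face
 * into a cell-based accumulator. */
__global__ static void
_scatter_sum_example(cs_lnum_t          n_i_faces,
                     const cs_lnum_2_t *i_face_cells,
                     const cs_real_t   *face_val,  /* hypothetical input */
                     cs_real_t         *c_sum)     /* hypothetical output */
{
  cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x;

  /* Build the participation mask before any lane exits. */
  uint32_t mask = __ballot_sync(0xffffffff, f_id < n_i_faces);
  if (f_id >= n_i_faces)
    return;

  cs_lnum_t c_id = i_face_cells[f_id][0];

  /* Lanes writing to the same c_sum[c_id] are detected by
   * conflict_mask(), reduced with shuffles in reduce_add(), and only
   * the leader lane performs the atomic add. */
  AtomicCell<cs_real_t>::ref(c_sum[c_id])
    .conflict_free_add(mask, AtomicCell<cs_real_t>(face_val[f_id]));
}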
{ + AtomicCell shuffled; + for (size_t i = 0; i < Head; ++i) { + shuffled.data[i] = data[i].shuffle(mask, laneid); + } + return shuffled; + } + __device__ uint32_t conflict_mask(uint32_t mask) const noexcept { + return _conflict_mask(mask, (uintptr_t)this); + } + __device__ bool reduce_add(uint32_t mask, uint32_t peers) noexcept { + return _reduce_add(mask, peers, *this); + } + __device__ void conflict_free_add(uint32_t mask, AtomicCell other) noexcept { + uint32_t peers = conflict_mask(mask); + if (other.reduce_add(mask, peers)) { + atomic_add(other); + } + } + __device__ AtomicCell& operator[](size_t i) noexcept { + return data[i]; + } + __device__ AtomicCell const& operator[](size_t i) const noexcept { + return data[i]; + } + __device__ inner_type& get() noexcept { + return reinterpret_cast(*this); + } + __device__ inner_type const& get() const noexcept { + return reinterpret_cast(*this); + } + static __device__ AtomicCell& ref(inner_type& r) noexcept { + return reinterpret_cast(r); + } + static __device__ AtomicCell const& ref(inner_type const& r) noexcept { + return reinterpret_cast(r); + } +}; diff --git a/src/alge/cs_convection_diffusion.cxx b/src/alge/cs_convection_diffusion.cxx index c35c15bbf2..3ebbad926a 100644 --- a/src/alge/cs_convection_diffusion.cxx +++ b/src/alge/cs_convection_diffusion.cxx @@ -80,7 +80,7 @@ /*---------------------------------------------------------------------------- * Header for the current file *----------------------------------------------------------------------------*/ - +#include "time.h" #include "cs_convection_diffusion.h" #include "cs_convection_diffusion_priv.h" @@ -1211,6 +1211,271 @@ cs_slope_test_gradient(int f_id, } +#if defined(HAVE_OPENMP_TARGET) +// #pragma omp declare target +// const cs_real_t cs_math_zero_threshold = FLT_MIN; +// #pragma omp end declare target + +void +cs_slope_test_gradient_vector_target(const int inc, + const cs_halo_type_t halo_type, + const cs_real_33_t *grad, + cs_real_33_t *grdpa, + const cs_real_3_t *pvar, + const cs_real_3_t *coefa, + const cs_real_33_t *coefb, + const cs_real_t *i_massflux) +{ + const cs_mesh_t *m = cs_glob_mesh; + const cs_mesh_adjacencies_t *madj = cs_glob_mesh_adjacencies; + const cs_halo_t *halo = m->halo; + cs_mesh_quantities_t *fvq = cs_glob_mesh_quantities; + + const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_b_cells = m->n_b_cells; + const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; + const cs_lnum_t n_i_faces = m->n_i_faces; + const cs_lnum_t n_b_faces = m->n_b_faces; + + const cs_lnum_2_t *restrict i_face_cells + = (const cs_lnum_2_t *restrict)m->i_face_cells; + const cs_lnum_t *restrict b_face_cells + = (const cs_lnum_t *restrict)m->b_face_cells; + const cs_lnum_t *restrict b_cells + = (const cs_lnum_t *restrict)m->b_cells; + const cs_real_t *restrict cell_vol = fvq->cell_vol; + const cs_real_3_t *restrict cell_cen + = (const cs_real_3_t *restrict)fvq->cell_cen; + const cs_real_3_t *restrict i_f_face_normal + = (const cs_real_3_t *restrict)fvq->i_f_face_normal; + const cs_real_3_t *restrict b_f_face_normal + = (const cs_real_3_t *restrict)fvq->b_f_face_normal; + const cs_real_3_t *restrict i_face_cog + = (const cs_real_3_t *restrict)fvq->i_face_cog; + const cs_real_3_t *restrict diipb + = (const cs_real_3_t *restrict)fvq->diipb; + const cs_lnum_t *restrict cell_cells_idx + = (const cs_lnum_t *restrict)madj->cell_cells_idx; + const cs_lnum_t *restrict cell_b_faces_idx + = (const cs_lnum_t *restrict)madj->cell_b_faces_idx; + const cs_lnum_t *restrict cell_cells + = 
(const cs_lnum_t *restrict)madj->cell_cells; + const short int *restrict cell_i_faces_sgn + = (const short int *restrict)madj->cell_i_faces_sgn; + const cs_lnum_t *restrict cell_i_faces + = (const cs_lnum_t *restrict)madj->cell_i_faces; + const cs_lnum_t *restrict cell_b_faces + = (const cs_lnum_t *restrict)madj->cell_b_faces; + + const int n_i_groups = m->i_face_numbering->n_groups; + const int n_i_threads = m->i_face_numbering->n_threads; + const int n_b_threads = m->b_face_numbering->n_threads; + const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index; + const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index; + + bool scatter = true; + +#pragma omp target data map(tofrom: grdpa[0:n_cells_ext]) \ + map(to: grad[0:n_cells_ext], \ + i_face_cog[0:n_i_faces], \ + cell_i_faces_sgn[0:n_i_faces], \ + cell_i_faces[0:n_i_faces], \ + cell_cen[0:n_cells_ext], \ + pvar[0:n_cells_ext], \ + i_massflux[0:n_i_faces], \ + i_f_face_normal[0:n_i_faces], \ + b_face_cells[0:n_b_faces], \ + coefb[0:n_b_faces], \ + coefa[0:n_b_faces], \ + cell_cells_idx[0:n_cells_ext], \ + cell_cells[0:n_cells_ext], \ + b_cells[0:n_cells], \ + cell_b_faces_idx[0:n_cells+1], \ + cell_vol[0:n_cells_ext], \ + i_face_cells[0:n_i_faces]) +{ + if(scatter){ + #pragma omp target teams distribute parallel for \ + schedule(static,1) + for (cs_lnum_t face_id = 0; face_id < n_i_faces; face_id++){ + + cs_real_t difv[3], djfv[3]; + + cs_lnum_t ii = i_face_cells[face_id][0]; + cs_lnum_t jj = i_face_cells[face_id][1]; + + for (int jsou = 0; jsou < 3; jsou++) { + difv[jsou] = i_face_cog[face_id][jsou] - cell_cen[ii][jsou]; + djfv[jsou] = i_face_cog[face_id][jsou] - cell_cen[jj][jsou]; + } + + /* x-y-z component, p = u, v, w */ + + for (int isou = 0; isou < 3; isou++) { + cs_real_t pif = pvar[ii][isou]; + cs_real_t pjf = pvar[jj][isou]; + for (int jsou = 0; jsou < 3; jsou++) { + pif = pif + grad[ii][isou][jsou]*difv[jsou]; + pjf = pjf + grad[jj][isou][jsou]*djfv[jsou]; + } + + cs_real_t pfac = pjf; + if (i_massflux[face_id] > 0.) 
pfac = pif; + + /* U gradient */ + + cs_real_t vfac[3]; + + for (int jsou = 0; jsou < 3; jsou++) { + vfac[jsou] = pfac*i_f_face_normal[face_id][jsou]; + #pragma omp atomic + grdpa[ii][isou][jsou] = grdpa[ii][isou][jsou] + vfac[jsou]; + #pragma omp atomic + grdpa[jj][isou][jsou] = grdpa[jj][isou][jsou] - vfac[jsou]; + } + } + + } + + #pragma omp target teams distribute parallel for \ + schedule(static,1) if(m->n_b_faces > CS_THR_MIN) + for (cs_lnum_t face_id = 0; face_id < n_b_faces; face_id++) { + + cs_real_t diipbv[3]; + cs_lnum_t ii = b_face_cells[face_id]; + + for (int jsou = 0; jsou < 3; jsou++){ + diipbv[jsou] = diipb[face_id][jsou]; + } + + /* x-y-z components, p = u, v, w */ + + for (int isou = 0; isou < 3; isou++) { + cs_real_t pfac = inc*coefa[face_id][isou]; + /*coefu is a matrix */ + for (int jsou = 0; jsou < 3; jsou++) + pfac += coefb[face_id][jsou][isou]*( pvar[ii][jsou] + + grad[ii][jsou][0]*diipbv[0] + + grad[ii][jsou][1]*diipbv[1] + + grad[ii][jsou][2]*diipbv[2]); + + for (int jsou = 0; jsou < 3; jsou++){ + #pragma omp atomic + grdpa[ii][isou][jsou] += pfac*b_f_face_normal[face_id][jsou]; + } + } + + } + + } + else{ + #pragma omp target teams distribute parallel for \ + schedule(static,1) + for (cs_lnum_t ii = 0; ii < n_cells; ii++){ + + cs_lnum_t s_id = cell_cells_idx[ii]; + cs_lnum_t e_id = cell_cells_idx[ii+1]; + + cs_real_t difv[3], djfv[3]; + + cs_lnum_t jj, face_id, face_sgn; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + jj = cell_cells[index]; + face_id = cell_i_faces[index]; + face_sgn = cell_i_faces_sgn[index]; + + for (int jsou = 0; jsou < 3; jsou++) { + difv[jsou] = i_face_cog[face_id][jsou] - cell_cen[ii][jsou]; + djfv[jsou] = i_face_cog[face_id][jsou] - cell_cen[jj][jsou]; + } + + for (int isou = 0; isou < 3; isou++) { + cs_real_t pif = pvar[ii][isou]; + cs_real_t pjf = pvar[jj][isou]; + for (int jsou = 0; jsou < 3; jsou++) { + pif = pif + grad[ii][isou][jsou]*difv[jsou]; + pjf = pjf + grad[jj][isou][jsou]*djfv[jsou]; + } + + cs_real_t pfac = pjf; + if (i_massflux[face_id]*face_sgn > 0.) 
pfac = pif; + + pfac *= face_sgn; + + cs_real_t vfac[3]; + + for (int jsou = 0; jsou < 3; jsou++) { + vfac[jsou] = pfac*i_f_face_normal[face_id][jsou]; + grdpa[ii][isou][jsou] = grdpa[ii][isou][jsou] + vfac[jsou]; + } + } + } + + } + + #pragma omp target teams distribute parallel for \ + schedule(static,1) if(m->n_b_faces > CS_THR_MIN) + for (cs_lnum_t c_idx = 0; c_idx < n_b_cells; c_idx++) { + + cs_lnum_t ii = b_cells[c_idx]; + + cs_lnum_t s_id = cell_b_faces_idx[ii]; + cs_lnum_t e_id = cell_b_faces_idx[ii+1]; + + cs_lnum_t face_id; + + cs_real_t diipbv[3]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + face_id = cell_b_faces[index]; + + for (int jsou = 0; jsou < 3; jsou++){ + diipbv[jsou] = diipb[face_id][jsou]; + } + + /* x-y-z components, p = u, v, w */ + + for (int isou = 0; isou < 3; isou++) { + cs_real_t pfac = inc*coefa[face_id][isou]; + /*coefu is a matrix */ + for (int jsou = 0; jsou < 3; jsou++) + pfac += coefb[face_id][jsou][isou]*( pvar[ii][jsou] + + grad[ii][jsou][0]*diipbv[0] + + grad[ii][jsou][1]*diipbv[1] + + grad[ii][jsou][2]*diipbv[2]); + + for (int jsou = 0; jsou < 3; jsou++){ + grdpa[ii][isou][jsou] += pfac*b_f_face_normal[face_id][jsou]; + } + } + } + + } + } + + #pragma omp target teams distribute parallel for + for (cs_lnum_t cell_id = 0; cell_id < n_cells; cell_id++) { + cs_real_t unsvol = 1./cell_vol[cell_id]; + for (int isou = 0; isou < 3; isou++) { + for (int jsou = 0; jsou < 3; jsou++){ + grdpa[cell_id][isou][jsou] = grdpa[cell_id][isou][jsou]*unsvol; + } + } + } +} + /* Handle parallelism and periodicity */ + + if (halo != NULL) { + cs_halo_sync_var_strided(halo, halo_type, (cs_real_t *)grdpa, 9); + if (m->n_init_perio > 0) + cs_halo_perio_sync_var_sym_tens(halo, halo_type, (cs_real_t *)grdpa); + } +} +#endif + /*----------------------------------------------------------------------------*/ /*! * \brief Compute the upwind gradient used in the pure SOLU schemes @@ -3996,6 +4261,20 @@ cs_face_convection_scalar(int idtvar, BFT_FREE(courant); } +void cs_math_3_normalize_target_cd(const cs_real_t in[3], + cs_real_t out[3]) +{ + cs_real_t norm = sqrt(in[0]*in[0] + + in[1]*in[1] + + in[2]*in[2]); + + cs_real_t inverse_norm = 1. / norm; + + out[0] = inverse_norm * in[0]; + out[1] = inverse_norm * in[1]; + out[2] = inverse_norm * in[2]; +} + /*----------------------------------------------------------------------------*/ /*! * \brief Add the explicit part of the convection/diffusion terms of a transport @@ -4107,7 +4386,10 @@ cs_convection_diffusion_vector(int idtvar, cs_mesh_quantities_t *fvq = cs_glob_mesh_quantities; const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_b_cells = m->n_b_cells; const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; + const cs_lnum_t n_i_faces = m->n_i_faces; + const cs_lnum_t n_b_faces = m->n_b_faces; const int n_i_groups = m->i_face_numbering->n_groups; const int n_i_threads = m->i_face_numbering->n_threads; const int n_b_threads = m->b_face_numbering->n_threads; @@ -4270,67 +4552,216 @@ cs_convection_diffusion_vector(int idtvar, - when we have convection, we are not in pure upwind and we have not shunted the slope test. 
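Note: cs_slope_test_gradient_vector_target above keeps two accumulation strategies behind the `scatter` flag. The face-based (scatter) loops need `#pragma omp atomic` because each face updates both adjacent cells, while the cell-based (gather) loops walk the cell-to-face adjacency (`cell_cells_idx`, `cell_i_faces`, `cell_i_faces_sgn`) and update only the current cell, at the price of reading every interior face twice. The reduced scalar sketch below illustrates the trade-off; it is not part of the patch, the `face_val`/`c_sum` arrays are hypothetical, and only the adjacency arrays and types come from the function above.

/* Hypothetical sketch: the same face-to-cell sum written in scatter
 * and in gather form, for a scalar quantity. */
static void
_face_to_cell_sum_sketch(cs_lnum_t          n_cells,
                         cs_lnum_t          n_i_faces,
                         bool               scatter,
                         const cs_lnum_2_t *i_face_cells,
                         const cs_lnum_t   *cell_cells_idx,
                         const cs_lnum_t   *cell_i_faces,
                         const short int   *cell_i_faces_sgn,
                         const cs_real_t   *face_val,  /* hypothetical */
                         cs_real_t         *c_sum)     /* hypothetical */
{
  const cs_lnum_t n_cell_faces = cell_cells_idx[n_cells];

  #pragma omp target data map(to: i_face_cells[0:n_i_faces],        \
                                  face_val[0:n_i_faces],            \
                                  cell_cells_idx[0:n_cells+1],      \
                                  cell_i_faces[0:n_cell_faces],     \
                                  cell_i_faces_sgn[0:n_cell_faces]) \
                          map(tofrom: c_sum[0:n_cells])
  {
    if (scatter) {
      /* Scatter: one iteration per face; atomics are required because
         both adjacent cells are updated concurrently. */
      #pragma omp target teams distribute parallel for
      for (cs_lnum_t f_id = 0; f_id < n_i_faces; f_id++) {
        cs_lnum_t ii = i_face_cells[f_id][0];
        cs_lnum_t jj = i_face_cells[f_id][1];
        #pragma omp atomic
        c_sum[ii] += face_val[f_id];
        #pragma omp atomic
        c_sum[jj] -= face_val[f_id];
      }
    }
    else {
      /* Gather: one iteration per cell; no atomics, but each interior
         face is read twice, and the face sign replaces the +/- of the
         scatter form. */
      #pragma omp target teams distribute parallel for
      for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) {
        for (cs_lnum_t i = cell_cells_idx[c_id];
             i < cell_cells_idx[c_id+1];
             i++)
          c_sum[c_id] += cell_i_faces_sgn[i] * face_val[cell_i_faces[i]];
      }
    }
  }
}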
*/ - if ( (idiffp != 0 && ircflp == 1) || ivisep == 1 - || ( iconvp != 0 && iupwin == 0 - && (ischcp == 0 || ircflp == 1 || isstpp == 0))) { - if (f_id != -1) { - /* Get the calculation option from the field */ - if (f->type & CS_FIELD_VARIABLE && eqp.iwgrec == 1) { - if (eqp.idiff > 0) { - int key_id = cs_field_key_id("gradient_weighting_id"); - int diff_id = cs_field_get_key_int(f, key_id); - if (diff_id > -1) { - cs_field_t *weight_f = cs_field_by_id(diff_id); - gweight = weight_f->val; - cs_field_synchronize(weight_f, halo_type); + + /* Timing the computation */ + + clock_t start, stop, start_slope, stop_slope; + unsigned long elapsed, elapsed_cuda, elapsed_slope; + + cs_real_33_t *grad_cpu, *grad_gpu; + cs_real_33_t *grdpa_cpu, *grdpa_gpu; + + bool compute_cuda; + bool compute_cpu; + bool res_cpu; + bool perf; + bool accuracy; + +#if defined(HAVE_CUDA) + compute_cuda = (cs_get_device_id() > -1) ? true : false; +#else + compute_cuda = false; +#endif + +res_cpu = !compute_cuda; + +#if defined(DEBUG) + compute_cpu = true; + perf = true; + accuracy = true; +#elif defined(NDEBUG) + compute_cpu = true; + perf = false; + accuracy = false; +#else + compute_cpu = false; + perf = false; + accuracy = false; +#endif + + + // Pour l'instant ces lignes sont pour moi + // Elles seront à enlever + // compute_cuda = true; + compute_cpu = true; + // res_cpu = false; + + // A ne pas garder dans la version finale + perf = true; + // accuracy = false; + +#if defined(HAVE_CUDA) + if(compute_cuda){ + if(!res_cpu){ + grad_gpu = grad; + grdpa_gpu = grdpa; + } else { + BFT_MALLOC(grad_gpu, n_cells_ext, cs_real_33_t); + BFT_MALLOC(grdpa_gpu, n_cells_ext, cs_real_33_t); + } + if(perf){ + start = clock(); + } + + bool flag1 = ( (idiffp != 0 && ircflp == 1) || ivisep == 1 + || ( iconvp != 0 && iupwin == 0 + && (ischcp == 0 || ircflp == 1 || isstpp == 0))); + + if (flag1) { + + if (f_id != -1) { + /* Get the calculation option from the field */ + if (f->type & CS_FIELD_VARIABLE && eqp.iwgrec == 1) { + if (eqp.idiff > 0) { + int key_id = cs_field_key_id("gradient_weighting_id"); + int diff_id = cs_field_get_key_int(f, key_id); + if (diff_id > -1) { + cs_field_t *weight_f = cs_field_by_id(diff_id); + gweight = weight_f->val; + cs_field_synchronize(weight_f, halo_type); + } } } } - } - cs_gradient_vector_synced_input(var_name, - gradient_type, - halo_type, - inc, - nswrgp, - iwarnp, - imligp, - epsrgp, - climgp, - coefav, - coefbv, - _pvar, - gweight, /* weighted gradient */ - cpl, - grad); + cs_gradient_vector_synced_input(var_name, + gradient_type, + halo_type, + inc, + nswrgp, + iwarnp, + imligp, + epsrgp, + climgp, + coefav, + coefbv, + _pvar, + gweight, /* weighted gradient */ + cpl, + grad_gpu); + } + + bool flag2 = (iconvp > 0 && iupwin == 0 && isstpp == 0); + + cs_convection_diffusion_vector_cuda(m, + cs_glob_mesh_adjacencies, + fvq, + _pvar, + i_massflux, + grad_gpu, + grdpa_gpu, + coefav, + coefbv, + inc, + flag1, + flag2, + perf); - } - else { -# pragma omp parallel for - for (cs_lnum_t cell_id = 0; cell_id < n_cells_ext; cell_id++) { - for (int isou = 0; isou < 3; isou++) { - for (int jsou = 0; jsou < 3; jsou++) - grad[cell_id][isou][jsou] = 0.; + /* Handle parallelism and periodicity */ + if (flag2){ + if (halo != NULL) { + cs_halo_sync_var_strided(halo, halo_type, (cs_real_t *)grdpa_gpu, 9); + if (m->n_init_perio > 0) + cs_halo_perio_sync_var_sym_tens(halo, halo_type, (cs_real_t *)grdpa_gpu); } } + + if(perf){ + stop = clock(); + elapsed_cuda = (stop - start) * 1e6 / CLOCKS_PER_SEC; + } } +#endif - /* 
====================================================================== - ---> Compute uncentered gradient grdpa for the slope test - ======================================================================*/ + if(compute_cpu){ + if(res_cpu){ + grad_cpu = grad; + grdpa_cpu = grdpa; + } else { + BFT_MALLOC(grad_cpu, n_cells_ext, cs_real_33_t); + BFT_MALLOC(grdpa_cpu, n_cells_ext, cs_real_33_t); + } -# pragma omp parallel for - for (cs_lnum_t cell_id = 0; cell_id < n_cells_ext; cell_id++) { - for (int jsou = 0; jsou < 3; jsou++) { - for (int isou = 0; isou < 3; isou++) - grdpa[cell_id][isou][jsou] = 0.; + if(perf){ + start = clock(); } - } - if (iconvp > 0 && iupwin == 0 && isstpp == 0) { + if ( (idiffp != 0 && ircflp == 1) || ivisep == 1 + || ( iconvp != 0 && iupwin == 0 + && (ischcp == 0 || ircflp == 1 || isstpp == 0))) { + + if (f_id != -1) { + /* Get the calculation option from the field */ + if (f->type & CS_FIELD_VARIABLE && eqp.iwgrec == 1) { + if (eqp.idiff > 0) { + int key_id = cs_field_key_id("gradient_weighting_id"); + int diff_id = cs_field_get_key_int(f, key_id); + if (diff_id > -1) { + cs_field_t *weight_f = cs_field_by_id(diff_id); + gweight = weight_f->val; + cs_field_synchronize(weight_f, halo_type); + } + } + } + } + + cs_gradient_vector_synced_input(var_name, + gradient_type, + halo_type, + inc, + nswrgp, + iwarnp, + imligp, + epsrgp, + climgp, + coefav, + coefbv, + _pvar, + gweight, /* weighted gradient */ + cpl, + grad_cpu); + } + else { + # pragma omp parallel for + for (cs_lnum_t cell_id = 0; cell_id < n_cells_ext; cell_id++) { + for (int isou = 0; isou < 3; isou++) { + for (int jsou = 0; jsou < 3; jsou++) + grad_cpu[cell_id][isou][jsou] = 0.; + } + } + } + +/* ====================================================================== + ---> Compute uncentered gradient grdpa for the slope test + ======================================================================*/ - _slope_test_gradient_strided<3>(inc, + # pragma omp parallel for + for (cs_lnum_t cell_id = 0; cell_id < n_cells_ext; cell_id++) { + for (int jsou = 0; jsou < 3; jsou++) { + for (int isou = 0; isou < 3; isou++) + grdpa_cpu[cell_id][isou][jsou] = 0.; + } + } + + if (iconvp > 0 && iupwin == 0 && isstpp == 0) { + + + if(compute_cpu){ + if(perf){ + start_slope = clock(); + } + _slope_test_gradient_strided<3>(inc, halo_type, (const cs_real_33_t *)grad, grdpa, @@ -4339,22 +4770,110 @@ cs_convection_diffusion_vector(int idtvar, coefbv, i_massflux); - } - - /* ====================================================================== - ---> Contribution from interior faces - ======================================================================*/ + if(perf){ + stop_slope = clock(); + elapsed_slope = (stop_slope - start_slope) * 1e6 / CLOCKS_PER_SEC; + printf("convection compute Slope time in us: CPU = %ld\n", elapsed_slope); + } + } - n_upwind = 0; + #if defined(HAVE_OPENMP_TARGET) + if(compute_cuda){ + if(perf){ + start_slope = clock(); + } + cs_slope_test_gradient_vector_target(inc, + halo_type, + (const cs_real_33_t *)grad_cpu, + grdpa_cpu, + _pvar, + coefav, + coefbv, + i_massflux); + if(perf){ + stop_slope = clock(); + elapsed_slope = (stop_slope - start_slope) * 1e6 / CLOCKS_PER_SEC; + printf("convection compute Slope time in us: OMP = %ld\n", elapsed_slope); + } + } + #endif + } - if (n_cells_ext > n_cells) { -# pragma omp parallel for if(n_cells_ext -n_cells > CS_THR_MIN) - for (cs_lnum_t cell_id = n_cells; cell_id < n_cells_ext; cell_id++) { - for (int isou = 0; isou < 3; isou++) - rhs[cell_id][isou] = 0.; + 
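Note: the CPU and OpenMP slope-test paths above are timed with clock(), while the companion changes in cs_gradient.cxx use std::chrono; clock() reports processor time consumed by the process rather than elapsed wall-clock time, so it can overstate multi-threaded regions and misrepresent time spent waiting on a device. The sketch below shows one way the repeated start/stop blocks could be expressed with a wall-clock helper; the helper name is hypothetical and it is not part of the patch.

/* Hypothetical helper: wall-clock timing of an arbitrary code block,
 * mirroring the std::chrono usage already present in cs_gradient.cxx. */
#include <chrono>
#include <cstdio>

template <typename F>
static long
_time_us(const char  *label,
         F          &&body)
{
  auto t0 = std::chrono::high_resolution_clock::now();
  body();
  auto t1 = std::chrono::high_resolution_clock::now();
  long us = (long)std::chrono::duration_cast<std::chrono::microseconds>
              (t1 - t0).count();
  printf("%s time in us: %ld\n", label, us);
  return us;
}

/* Possible use, replacing one of the perf blocks above:
 *
 *   _time_us("convection compute Slope (CPU)", [&]() {
 *     _slope_test_gradient_strided<3>(inc, halo_type,
 *                                     (const cs_real_33_t *)grad_cpu,
 *                                     grdpa_cpu, _pvar,
 *                                     coefav, coefbv, i_massflux);
 *   });
 */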
if(perf){ + stop = clock(); + elapsed = (stop - start) * 1e6 / CLOCKS_PER_SEC; } } + /* Performances */ + if(perf){ + #if defined(HAVE_CUDA) + if(compute_cuda){ + printf("convection Compute and tranferts time in us: CUDA = %ld\n", elapsed_cuda); + } + #endif + + if(compute_cpu){ + printf("convection compute time in us: CPU = %ld\n", elapsed); + } + } + + /* Accuracy grad_cpu and grad_gpu */ + if(accuracy){ + #if defined(HAVE_CUDA) + if(compute_cuda){ + if(compute_cpu){ + cs_real_t cpu, cuda; + double err; + for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { + for (int j =0; j < 3; ++j) { + cpu = grdpa_cpu[c_id][i][j]; + cuda = grdpa_gpu[c_id][i][j]; + err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); + if (err> 1e-6) { + printf("slop_test DIFFERENCE @%d-%d-%d: CPU = %.17f\tCUDA = %.17f\tdiff = %.17f\tdiff relative = %.17f\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), err); + } + } + } + } + } + } + #endif + } + +// Free memory +#if defined(HAVE_CUDA) + if(compute_cuda){ + if(res_cpu){ + BFT_FREE(grad_gpu); + BFT_FREE(grdpa_gpu); + } + } +#endif + +// Free memory + if(compute_cpu){ + if(!res_cpu){ + BFT_FREE(grad_cpu); + BFT_FREE(grdpa_cpu); + } + } + + + /* ====================================================================== + ---> Contribution from interior faces + ======================================================================*/ + + n_upwind = 0; + + if (n_cells_ext > n_cells) { +# pragma omp parallel for if(n_cells_ext -n_cells > CS_THR_MIN) + for (cs_lnum_t cell_id = n_cells; cell_id < n_cells_ext; cell_id++) { + for (int isou = 0; isou < 3; isou++) + rhs[cell_id][isou] = 0.; + } + } /* --> Pure upwind flux =====================*/ @@ -4925,7 +5444,11 @@ cs_convection_diffusion_vector(int idtvar, /* Unsteady */ } else { - + // ---------------OMP and CUDA here --------------------- +if(compute_cpu){ + if(perf){ + start_slope = clock(); + } for (int g_id = 0; g_id < n_i_groups; g_id++) { # pragma omp parallel for reduction(+:n_upwind) for (int t_id = 0; t_id < n_i_threads; t_id++) { @@ -5047,6 +5570,164 @@ cs_convection_diffusion_vector(int idtvar, } } } + if(perf){ + stop_slope = clock(); + elapsed_slope = (stop_slope - start_slope) * 1e6 / CLOCKS_PER_SEC; + printf("idtvar => 0 unsteady i_faces time in us: CPU = %ld\n", elapsed_slope); + } +} //compute_cpu + + #if defined(HAVE_OPENMP_TARGET) + if(compute_cuda){ + if(perf){ + start_slope = clock(); + } + #pragma omp target data map(tofrom: rhs[0:n_cells_ext]) \ + map(to: i_face_cells[0:n_i_faces], \ + i_massflux[0:n_i_faces], \ + i_f_face_factor[0:n_i_faces], \ + i_face_u_normal[0:n_i_faces], \ + i_visc[0:n_i_faces], \ + i_face_cog[0:n_i_faces], \ + i_dist[0:n_i_faces], \ + weight[0:n_i_faces], \ + diipf[0:n_i_faces], \ + djjpf[0:n_i_faces], \ + i_pvar[0:n_i_faces], \ + grad[0:n_cells_ext], \ + grdpa[0:n_cells_ext], \ + cell_cen[0:n_cells_ext], \ + _pvar[0:n_cells_ext]) + { + #pragma omp target teams distribute parallel for reduction(+:n_upwind) \ + firstprivate(cs_math_zero_threshold, \ + iconvp, thetap, ischcp, blencp, blend_st, \ + imasac, idiffp, ircflp) \ + schedule(static,1) + for (cs_lnum_t face_id = 0; face_id < n_i_faces; face_id++) { + + cs_lnum_t ii = i_face_cells[face_id][0]; + cs_lnum_t jj = i_face_cells[face_id][1]; + + cs_real_t fluxi[3], fluxj[3] ; + for (int isou = 0; isou < 3; isou++) { + fluxi[isou] = 0; + fluxj[isou] = 0; + } + cs_real_3_t pip, pjp; + cs_real_3_t pif, pjf; + bool upwind_switch = false; + cs_real_3_t _pi, _pj; + + for (int i = 0; i < 3; i++) { + _pi[i] 
= _pvar[ii][i]; + _pj[i] = _pvar[jj][i]; + } + + /* Scaling due to mass balance in porous modelling */ + if (i_f_face_factor != NULL) { + const cs_real_t *n = i_face_u_normal[face_id]; + cs_math_3_normal_scaling(n, i_f_face_factor[face_id][0], _pi); + cs_math_3_normal_scaling(n, i_f_face_factor[face_id][1], _pj); + } + + cs_real_t bldfrp = (cs_real_t) ircflp; + /* Local limitation of the reconstruction */ + if (df_limiter != NULL && ircflp > 0) + bldfrp = cs_math_fmax(cs_math_fmin(df_limiter[ii], df_limiter[jj]), + 0.); + + cs_i_cd_unsteady_slope_test_strided<3>(&upwind_switch, + iconvp, + bldfrp, + ischcp, + blencp, + blend_st, + weight[face_id], + i_dist[face_id], + cell_cen[ii], + cell_cen[jj], + i_face_u_normal[face_id], + i_face_cog[face_id], + diipf[face_id], + djjpf[face_id], + i_massflux[face_id], + grad[ii], + grad[jj], + grdpa[ii], + grdpa[jj], + _pi, + _pj, + pif, + pjf, + pip, + pjp); + + cs_i_conv_flux_strided<3>(iconvp, + thetap, + imasac, + _pvar[ii], + _pvar[jj], + pif, + pif, /* no relaxation */ + pjf, + pjf, /* no relaxation */ + i_massflux[face_id], + fluxi, + fluxj); + + + cs_i_diff_flux_strided<3>(idiffp, + thetap, + pip, + pjp, + pip, /* no relaxation */ + pjp, /* no relaxation */ + i_visc[face_id], + fluxi, + fluxj); + + if (upwind_switch) { + + /* in parallel, face will be counted by one and only one rank */ + if (ii < n_cells) + n_upwind++; + + if (v_slope_test != NULL) { + v_slope_test[ii] += fabs(i_massflux[face_id]) / cell_vol[ii]; + v_slope_test[jj] += fabs(i_massflux[face_id]) / cell_vol[jj]; + } + } + /* Saving velocity at internal faces, if needed */ + if (i_pvar != NULL) { + if (i_massflux[face_id] >= 0.) { + for (cs_lnum_t i = 0; i < 3; i++) + i_pvar[face_id][i] += thetap * pif[i]; + } + else { + for (cs_lnum_t i = 0; i < 3; i++) + i_pvar[face_id][i] += thetap * pjf[i]; + } + } + + for (int isou = 0; isou < 3; isou++) { + + #pragma omp atomic + rhs[ii][isou] -= fluxi[isou]; + #pragma omp atomic + rhs[jj][isou] += fluxj[isou]; + + } /* isou */ + + } + } // target data + if(perf){ + stop_slope = clock(); + elapsed_slope = (stop_slope - start_slope) * 1e6 / CLOCKS_PER_SEC; + printf("idtvar => 0 unsteady i_faces time in us: OMP = %ld\n", elapsed_slope); + } + } // compute_cuda + #endif } /* idtvar */ @@ -5266,7 +5947,11 @@ cs_convection_diffusion_vector(int idtvar, /* Unsteady */ } else { - + // ---------------OMP and CUDA here --------------------- +if(compute_cpu){ + if(perf){ + start_slope = clock(); + } # pragma omp parallel for if(m->n_b_faces > CS_THR_MIN) for (int t_id = 0; t_id < n_b_threads; t_id++) { for (cs_lnum_t face_id = b_group_index[t_id*2]; @@ -5460,6 +6145,240 @@ cs_convection_diffusion_vector(int idtvar, BFT_FREE(df_limiter_local); } } + if(perf){ + stop_slope = clock(); + elapsed_slope = (stop_slope - start_slope) * 1e6 / CLOCKS_PER_SEC; + printf("idtvar => 0 unsteady b_faces time in us: CPU = %ld\n", elapsed_slope); + } +} // compute_cpu + +#if defined(HAVE_OPENMP_TARGET) +if(compute_cuda){ + if(perf){ + start_slope = clock(); + } +#pragma omp target data map(tofrom: rhs[0:n_cells_ext]) \ + map(to: b_face_cells[0:n_b_faces], \ + b_massflux[0:n_b_faces], \ + b_f_face_factor[0:n_b_faces], \ + b_face_u_normal[0:n_b_faces], \ + bc_type[0:n_b_faces], \ + b_visc[0:n_b_faces], \ + b_face_cells[0:n_b_faces], \ + b_face_surf[0:n_b_faces], \ + coefav[0:n_b_faces], \ + coefbv[0:n_b_faces], \ + cofafv[0:n_b_faces], \ + cofbfv[0:n_b_faces], \ + diipb[0:n_b_faces], \ + b_pvar[0:n_b_faces], \ + grad[0:n_cells_ext], \ + grdpa[0:n_cells_ext], \ + 
_pvar[0:n_cells_ext]) +{ + #pragma omp target teams distribute parallel for \ + private(pvar_distant, pvar_local, df_limiter_local) \ + firstprivate(cs_math_zero_threshold, iconvp, thetap, ischcp, blencp, blend_st, \ + imasac, idiffp, ircflp, inc, n_local, n_distant) \ + schedule(static,1) if(m->n_b_faces > CS_THR_MIN) + for (cs_lnum_t face_id = 0; face_id < n_b_faces; face_id++) { + + cs_lnum_t ii = b_face_cells[face_id]; + + cs_real_t fluxi[3]; + for (int isou = 0; isou < 3; isou++) { + fluxi[isou] = 0; + } + cs_real_3_t pip; + cs_real_3_t _pi; + cs_real_t pfac[3]; + + for (int i = 0; i < 3; i++) { + _pi[i] = _pvar[ii][i]; + } + + /* Scaling due to mass balance in porous modelling */ + if (b_f_face_factor != NULL) { + const cs_real_t *n = b_face_u_normal[face_id]; + cs_math_3_normal_scaling(n, b_f_face_factor[face_id], _pi); + } + + cs_real_t bldfrp = (cs_real_t) ircflp; + /* Local limitation of the reconstruction */ + if (df_limiter != NULL && ircflp > 0) + bldfrp = cs_math_fmax(df_limiter[ii], 0.); + + cs_b_cd_unsteady_strided<3>(bldfrp, + diipb[face_id], + grad[ii], + _pi, + pip); + cs_b_upwind_flux_strided<3>(iconvp, + thetap, + imasac, + inc, + bc_type[face_id], + _pi, + _pi, /* no relaxation */ + pip, + coefav[face_id], + coefbv[face_id], + b_massflux[face_id], + pfac, + fluxi); + + /* Saving velocity on boundary faces */ + if (b_pvar != NULL) { + if (b_massflux[face_id] >= 0.) { + for (cs_lnum_t i = 0; i < 3; i++) + b_pvar[face_id][i] += thetap * _pi[i]; + } + else { + for (cs_lnum_t i = 0; i < 3; i++) { + b_pvar[face_id][i] += thetap * pfac[i]; + } + } + } + + cs_b_diff_flux_strided<3>(idiffp, + thetap, + inc, + pip, + cofafv[face_id], + cofbfv[face_id], + b_visc[face_id], + fluxi); + + for(int isou = 0; isou < 3; isou++) { + #pragma omp atomic + rhs[ii][isou] -= fluxi[isou]; + } + + } + } + + /* The variable is internally coupled and an implicit contribution + * is required */ + if (icoupl > 0) { + /* Prepare data for sending */ + BFT_MALLOC(pvar_distant, n_distant, cs_real_3_t); + + for (cs_lnum_t ii = 0; ii < n_distant; ii++) { + cs_lnum_t face_id = faces_distant[ii]; + cs_lnum_t jj = b_face_cells[face_id]; + + cs_real_3_t pip; + cs_real_3_t _pj; + + for (int i = 0; i < 3; i++) { + _pj[i] = _pvar[jj][i]; + } + + cs_real_t bldfrp = (cs_real_t) ircflp; + /* Local limitation of the reconstruction */ + /* Note: to be treated exactly as a internal face, should be a bending + * between the two cells... 
*/ + if (df_limiter != NULL && ircflp > 0) + bldfrp = cs_math_fmax(df_limiter[jj], 0.); + + /* Scaling due to mass balance in porous modelling */ + if (b_f_face_factor != NULL) { + const cs_real_t *n = b_face_u_normal[face_id]; + cs_math_3_normal_scaling(n, b_f_face_factor[face_id], _pj); + } + + cs_b_cd_unsteady_strided<3>(bldfrp, + diipb[face_id], + grad[jj], + _pj, + pip); + + for (int k = 0; k < 3; k++) + pvar_distant[ii][k] = pip[k]; + } + + /* Receive data */ + BFT_MALLOC(pvar_local, n_local, cs_real_3_t); + cs_internal_coupling_exchange_var(cpl, + 3, /* Dimension */ + (cs_real_t *)pvar_distant, + (cs_real_t *)pvar_local); + + if (df_limiter != NULL) { + BFT_MALLOC(df_limiter_local, n_local, cs_real_t); + cs_internal_coupling_exchange_var(cpl, + 1, /* Dimension */ + df_limiter, + df_limiter_local); + } + + /* Flux contribution */ + assert(f != NULL); + cs_real_t *hintp = f->bc_coeffs->hint; + cs_real_t *hextp = f->bc_coeffs->rcodcl2; + for (cs_lnum_t ii = 0; ii < n_local; ii++) { + cs_lnum_t face_id = faces_local[ii]; + cs_lnum_t jj = b_face_cells[face_id]; + cs_real_t surf = b_face_surf[face_id]; + cs_real_t pip[3], pjp[3]; + cs_real_t fluxi[3] = {0., 0., 0.}; + cs_real_3_t _pj; + + for (int i = 0; i < 3; i++) { + _pj[i] = _pvar[jj][i]; + } + + /* Scaling due to mass balance in porous modelling */ + if (b_f_face_factor != NULL) { + const cs_real_t *n = b_face_u_normal[face_id]; + cs_math_3_normal_scaling(n, b_f_face_factor[face_id], _pj); + } + + cs_real_t bldfrp = (cs_real_t) ircflp; + /* Local limitation of the reconstruction */ + if (df_limiter != NULL && ircflp > 0) + bldfrp = cs_math_fmax(cs_math_fmin(df_limiter_local[ii], + df_limiter[jj]), + 0.); + + cs_b_cd_unsteady_strided<3>(bldfrp, + diipb[face_id], + grad[jj], + _pj, + pip); + + for (int k = 0; k < 3; k++) + pjp[k] = pvar_local[ii][k]; + + cs_real_t hint = hintp[face_id]; + cs_real_t hext = hextp[face_id]; + cs_real_t heq = _calc_heq(hint, hext)*surf; + + cs_b_diff_flux_coupling_strided<3>(idiffp, + pip, + pjp, + heq, + fluxi); + + for (int k = 0; k < 3; k++) + #pragma omp atomic + rhs[jj][k] -= thetap * fluxi[k]; + } + + BFT_FREE(pvar_local); + /* Sending structures are no longer needed */ + BFT_FREE(pvar_distant); + if (df_limiter != NULL) { + BFT_FREE(df_limiter_local); + } + } // target data + if(perf){ + stop_slope = clock(); + elapsed_slope = (stop_slope - start_slope) * 1e6 / CLOCKS_PER_SEC; + printf("idtvar => 0 unsteady b_faces time in us: OMP = %ld\n", elapsed_slope); + } +} // compute_cuda +#endif } /* idtvar */ /* Boundary convective flux imposed at some faces (tags in icvfli array) */ diff --git a/src/alge/cs_convection_diffusion_cuda.cu b/src/alge/cs_convection_diffusion_cuda.cu new file mode 100644 index 0000000000..c110ababfd --- /dev/null +++ b/src/alge/cs_convection_diffusion_cuda.cu @@ -0,0 +1,291 @@ +#include "cs_alge_cuda.cuh" + +#include "cs_convection_diffusion.h" +#include "cs_convection_diffusion_priv.h" + +#include "cs_slope_test_gradient_vector_cuda_scatter.cuh" +#include "cs_slope_test_gradient_vector_cuda_gather.cuh" + +/*---------------------------------------------------------------------------- + * _gradient_vector the gradient of a vector using a given gradient of + * this vector (typically lsq). + * + * parameters: + * m <-- pointer to associated mesh structure + * fvq <-- pointer to associated finite volume quantities + * cpl <-- structure associated with internal coupling, or NULL + * inc <-- if 0, solve on increment; 1 otherwise + * coefav <-- B.C. 
coefficients for boundary face normals + * coefbv <-- B.C. coefficients for boundary face normals + * pvar <-- variable + * c_weight <-- weighted gradient coefficient variable + * r_grad --> gradient used for reconstruction + * grad --> gradient of pvar (du_i/dx_j : grad[][i][j]) + *----------------------------------------------------------------------------*/ +extern "C" void +cs_convection_diffusion_vector_cuda(const cs_mesh_t *mesh, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_real_3_t *restrict pvar, + const cs_real_t i_massflux[], + const cs_real_33_t *grad, + cs_real_33_t *grdpa, + const cs_real_3_t *restrict coefav, + const cs_real_33_t *restrict coefbv, + const int inc, + const bool flag1, + const bool flag2, + const bool perf) +{ + const cs_lnum_t n_cells = mesh->n_cells; + const cs_lnum_t n_b_cells = mesh->n_b_cells; + const cs_lnum_t n_cells_ext = mesh->n_cells_with_ghosts; + const cs_lnum_t n_i_faces = mesh->n_i_faces; + const cs_lnum_t n_b_faces = mesh->n_b_faces; + + int device_id; + cudaGetDevice(&device_id); + + cudaStream_t stream; + cudaStreamCreate(&stream); + + cudaEvent_t start, mem_h2d, init, f_i, f_b, f_f, stop; + float msec = 0.0f; + CS_CUDA_CHECK(cudaEventCreate(&start)); + CS_CUDA_CHECK(cudaEventCreate(&mem_h2d)); + CS_CUDA_CHECK(cudaEventCreate(&init)); + CS_CUDA_CHECK(cudaEventCreate(&f_i)); + CS_CUDA_CHECK(cudaEventCreate(&f_b)); + CS_CUDA_CHECK(cudaEventCreate(&f_f)); + CS_CUDA_CHECK(cudaEventCreate(&stop)); + + + // Record the start event + CS_CUDA_CHECK(cudaEventRecord(start, stream)); + + unsigned int blocksize = 256; + + cs_real_33_t *grad_d = NULL; + CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells_ext * sizeof(cs_real_33_t))); + + cs_real_33_t *grdpa_d; + CS_CUDA_CHECK(cudaMalloc(&grdpa_d, n_cells_ext * sizeof(cs_real_33_t))); + + cs_gnum_t n_upwind; + const cs_lnum_2_t *restrict i_face_cells + = (const cs_lnum_2_t *restrict)cs_get_device_ptr_const_pf(mesh->i_face_cells); + + const cs_lnum_t *restrict b_face_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(mesh->b_face_cells); + + const cs_real_3_t *restrict cell_cen + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_cen); + + const cs_real_3_t *restrict diipb + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->diipb); + + const cs_real_3_t *restrict b_f_face_normal + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_f_face_normal); + + const cs_lnum_t *restrict cell_cells_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells_idx); + + const cs_lnum_t *restrict cell_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells); + + const cs_lnum_t *restrict b_cells + = (cs_lnum_t *restrict)cs_get_device_ptr_const_pf(mesh->b_cells); + + const cs_lnum_t *restrict cell_b_faces + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces); + + const cs_lnum_t *restrict cell_b_faces_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces_idx); + + cs_real_t *restrict i_massflux_d; + CS_CUDA_CHECK(cudaMalloc(&i_massflux_d, sizeof(cs_real_t)*n_i_faces)); + cs_cuda_copy_h2d(i_massflux_d, (void *)i_massflux, sizeof(cs_real_t)*n_i_faces); + + cs_real_3_t *restrict i_face_cog; + CS_CUDA_CHECK(cudaMalloc(&i_face_cog, sizeof(cs_real_3_t)*n_i_faces)); + cs_cuda_copy_h2d(i_face_cog, (void *)fvq->i_face_cog, sizeof(cs_real_3_t)*n_i_faces); + + cs_real_3_t *restrict i_f_face_normal; + CS_CUDA_CHECK(cudaMalloc(&i_f_face_normal, 
sizeof(cs_real_3_t)*n_i_faces)); + cs_cuda_copy_h2d(i_f_face_normal, (void *)fvq->i_f_face_normal, sizeof(cs_real_3_t)*n_i_faces); + + cs_real_t *restrict cell_vol; + CS_CUDA_CHECK(cudaMalloc(&cell_vol, sizeof(cs_real_t)*n_cells)); + cs_cuda_copy_h2d(cell_vol, (void *)fvq->cell_vol, sizeof(cs_real_t)*n_cells); + + cs_mesh_adjacencies_update_cell_i_faces(); + const cs_lnum_t n_cells_i_face = (madj->cell_cells_idx[n_cells]); + + cs_lnum_t *restrict cell_i_faces; + CS_CUDA_CHECK(cudaMalloc(&cell_i_faces, sizeof(cs_lnum_t)*n_cells_i_face)); + cs_cuda_copy_h2d(cell_i_faces, madj->cell_i_faces, sizeof(cs_lnum_t)*n_cells_i_face); + + short int *restrict cell_i_faces_sgn; + CS_CUDA_CHECK(cudaMalloc(&cell_i_faces_sgn, sizeof(short int)*n_cells_i_face)); + cs_cuda_copy_h2d(cell_i_faces_sgn, madj->cell_i_faces_sgn, sizeof(short int)*n_cells_i_face); + + + void *_coefb_d, *_coefa_d, *_pvar_d; + + const cs_real_3_t * coefa_d = NULL; + const cs_real_3_t * pvar_d = NULL; + const cs_real_33_t * coefb_d = NULL; + + /* Initialization */ + + _sync_or_copy_real_h2d(pvar, n_cells_ext, device_id, stream, + &pvar_d, &_pvar_d); + _sync_or_copy_real_h2d(coefav, n_b_faces, device_id, stream, + &coefa_d, &_coefa_d); + _sync_or_copy_real_h2d(coefbv, n_b_faces, device_id, stream, + &coefb_d, &_coefb_d); + + if(flag1){ + cs_cuda_copy_h2d(grad_d, grad, sizeof(cs_real_33_t)*n_cells_ext); + } + else{ + cudaMemset(grad_d, 0, n_cells_ext * sizeof(cs_real_33_t)); + } + + CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); + + cudaMemset(grdpa_d, 0, n_cells_ext * sizeof(cs_real_33_t)); + + CS_CUDA_CHECK(cudaEventRecord(init, stream)); + + if (flag2) { + cs_slope_test_gradient_vector_cuda_i<<<(unsigned int)ceil((double)n_i_faces / blocksize), blocksize, 0, stream>>> + (n_i_faces, + i_face_cells, + i_face_cog, + cell_cen, + pvar_d, + i_massflux_d, + i_f_face_normal, + grad_d, + grdpa_d); + + + // cs_slope_test_gradient_vector_cuda_i_gather<<<(unsigned int)ceil((double)n_cells / blocksize), blocksize, 0, stream>>> + // (n_cells, + // i_face_cog, + // cell_cen, + // pvar_d, + // i_massflux_d, + // i_f_face_normal, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn, + // grad_d, + // grdpa_d); + + CS_CUDA_CHECK(cudaEventRecord(f_i, stream)); + + cs_slope_test_gradient_vector_cuda_b<<>> + (n_b_faces, + pvar_d, + b_face_cells, + diipb, + inc, + coefa_d, + coefb_d, + b_f_face_normal, + grad_d, + grdpa_d); + + + // cs_slope_test_gradient_vector_cuda_b_gather<<>> + // (n_b_cells, + // pvar_d, + // diipb, + // inc, + // coefa_d, + // coefb_d, + // b_f_face_normal, + // b_cells, + // cell_b_faces, + // cell_b_faces_idx, + // grad_d, + // grdpa_d); + + CS_CUDA_CHECK(cudaEventRecord(f_b, stream)); + + cs_slope_test_gradient_vector_cuda_f<<>> + (n_cells * 3 * 3, + cell_vol, + grdpa_d); + + CS_CUDA_CHECK(cudaEventRecord(f_f, stream)); + + } + + n_upwind = 0; + + /* Sync to host */ + if (grdpa_d != NULL) { + size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + cs_cuda_copy_d2h(grdpa, grdpa_d, size); + } + else + cs_sync_d2h(grdpa); + + CS_CUDA_CHECK(cudaEventRecord(stop, stream)); + CS_CUDA_CHECK(cudaEventSynchronize(stop)); + + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); + + if(perf){ + printf("convection_diffusion Kernels:\n"); + printf("Execution time in us: \t"); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); + printf("Init = %f\t", msec*1000.f); + + if (flag2) { + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, f_i)); + printf("f_i = %f\t", 
msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, f_i, f_b)); + printf("f_b = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, f_b, f_f)); + printf("f_f = %f\t", msec*1000.f); + + } + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); + printf("Total = %f\n", msec*1000.f); + } + + if (!flag1){ + CS_CUDA_CHECK(cudaFree(grad_d)); + } + + if (_pvar_d != NULL) + CS_CUDA_CHECK(cudaFree(_pvar_d)); + if (_coefa_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefa_d)); + if (_coefb_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefb_d)); + + CS_CUDA_CHECK(cudaFree(grad_d)); + CS_CUDA_CHECK(cudaFree(grdpa_d)); + CS_CUDA_CHECK(cudaFree(i_massflux_d)); + CS_CUDA_CHECK(cudaFree(i_f_face_normal)); + CS_CUDA_CHECK(cudaFree(cell_vol)); + CS_CUDA_CHECK(cudaFree(cell_i_faces)); + CS_CUDA_CHECK(cudaFree(cell_i_faces_sgn)); + CS_CUDA_CHECK(cudaFree(i_face_cog)); +} diff --git a/src/alge/cs_convection_diffusion_priv.h b/src/alge/cs_convection_diffusion_priv.h index 7cb0c1c71d..6c5a06ca5b 100644 --- a/src/alge/cs_convection_diffusion_priv.h +++ b/src/alge/cs_convection_diffusion_priv.h @@ -27,6 +27,35 @@ /*----------------------------------------------------------------------------*/ +/*---------------------------------------------------------------------------- + * Local headers + *----------------------------------------------------------------------------*/ + +#include "cs_base.h" +#include "cs_base_accel.h" +#include "cs_halo.h" +#include "cs_internal_coupling.h" +#include "cs_mesh.h" +#include "cs_mesh_quantities.h" + +/*----------------------------------------------------------------------------*/ + +BEGIN_C_DECLS + +/*! \cond DOXYGEN_SHOULD_SKIP_THIS */ + +/*============================================================================ + * Macro definitions + *============================================================================*/ + +/*============================================================================= + * Local type definitions + *============================================================================*/ + +/* Type for symmetric least-squares covariance matrices + as they are adimensional, single-precision should be usable here */ + + #include "cs_defs.h" /*---------------------------------------------------------------------------- @@ -50,6 +79,37 @@ * Global variables *============================================================================*/ +/*============================================================================= + * Semi-private function prototypes + *============================================================================*/ + +#if defined(HAVE_CUDA) + +void +cs_convection_diffusion_vector_cuda(const cs_mesh_t *mesh, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_real_3_t *restrict pvar, + const cs_real_t i_massflux[], + const cs_real_33_t *grad, + cs_real_33_t *grdpa, + const cs_real_3_t *restrict coefav, + const cs_real_33_t *restrict coefbv, + const int inc, + const bool flag1, + const bool flag2, + const bool perf); + +#endif + +/* defined(HAVE_CUDA) */ + +/*! 
(DOXYGEN_SHOULD_SKIP_THIS) \endcond */ + +/*----------------------------------------------------------------------------*/ + +END_C_DECLS + /*============================================================================ * Public inlined function *============================================================================*/ diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index c4e44ad5e9..ece9a34171 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -39,6 +39,14 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include #if defined(HAVE_MPI) #include @@ -190,7 +198,7 @@ const cs_e2n_sum_t _e2n_sum_type = CS_E2N_SUM_SCATTER; /* Strided LSQ gradient variant */ -static int _use_legacy_strided_lsq_gradient = false; +static int _use_legacy_strided_lsq_gradient = true; /*============================================================================ * Private function definitions @@ -690,6 +698,31 @@ _sync_scalar_gradient_halo(const cs_mesh_t *m, } } +/* Compute the unit in the last place (ULP) */ +template +typename std::enable_if::is_integer, T>::type +cs_diff_ulp(T x, T y) +{ + // Since `epsilon()` is the gap size (ULP, unit in the last place) + // of floating-point numbers in interval [1, 2), we can scale it to + // the gap size in interval [2^e, 2^{e+1}), where `e` is the exponent + // of `x` and `y`. + + // If `x` and `y` have different gap sizes (which means they have + // different exponents), we take the smaller one. Taking the bigger + // one is also reasonable, I guess. + const T m = std::min(std::fabs(x), std::fabs(y)); + + // Subnormal numbers have fixed exponent, which is `min_exponent - 1`. + const int exp = m < std::numeric_limits::min() + ? std::numeric_limits::min_exponent - 1 + : std::ilogb(m); + + // We divide the absolute difference by the epsilon times the exponent (1 ulp) + return std::fabs(x - y) / std::ldexp(std::numeric_limits::epsilon(), exp); +} + + /*---------------------------------------------------------------------------- * Synchronize strided gradient ghost cell values. * @@ -5449,6 +5482,8 @@ _reconstruct_strided_gradient(const cs_mesh_t *m, cs_real_t (*restrict r_grad)[stride][3], cs_real_t (*restrict grad)[stride][3]) { + using grad_t = cs_real_t[stride][3]; + const cs_lnum_t n_cells = m->n_cells; const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; @@ -5483,27 +5518,114 @@ _reconstruct_strided_gradient(const cs_mesh_t *m, /* Initialize gradient */ /*---------------------*/ - /* Initialization */ + /* Timing the computation */ -# pragma omp parallel for - for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { - for (cs_lnum_t i = 0; i < stride; i++) { - for (cs_lnum_t j = 0; j < 3; j++) - grad[c_id][i][j] = 0.0; + std::chrono::high_resolution_clock::time_point start, stop; + std::chrono::microseconds elapsed, elapsed_cuda; + + grad_t *grad_cpu, *grad_gpu; + + bool compute_cuda; + bool compute_cpu; + bool res_cpu; + bool perf; + bool accuracy; + +#if defined(HAVE_CUDA) + compute_cuda = (cs_get_device_id() > -1) ? 
true : false; +#else + compute_cuda = false; +#endif + +res_cpu = !compute_cuda; + +#if defined(DEBUG) + compute_cpu = true; + perf = true; + accuracy = true; +#elif defined(NDEBUG) + compute_cpu = true; + perf = false; + accuracy = false; +#else + compute_cpu = false; + perf = false; + accuracy = false; +#endif + + + // Pour l'instant ces lignes sont pour moi + // Elles seront à enlever + // compute_cuda = true; + // compute_cpu = true; + // res_cpu = false; + + // A ne pas garder dans la version finale + // perf = false; + // accuracy = false; + + +#if defined(HAVE_CUDA) + if(compute_cuda){ + if(!res_cpu){ + grad_gpu = grad; + } else { + BFT_MALLOC(grad_gpu, n_cells_ext, grad_t); + } + if(perf){ + start = std::chrono::high_resolution_clock::now(); + } + + cs_reconstruct_vector_gradient_cuda(m, + madj, + fvq, + halo_type, + inc, + coefav, + coefbv, + pvar, + c_weight, + r_grad, + grad_gpu, + cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION, + perf); + if(perf){ + stop = std::chrono::high_resolution_clock::now(); + elapsed_cuda = std::chrono::duration_cast(stop - start); } } +#endif - /* Interior faces contribution */ + if(compute_cpu){ + if(res_cpu){ + grad_cpu = grad; + } else { + BFT_MALLOC(grad_cpu, n_cells_ext, grad_t); + } + + if(perf){ + start = std::chrono::high_resolution_clock::now(); + } + /* Initialization */ + # pragma omp parallel for + for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { + for (cs_lnum_t i = 0; i < stride; i++) { + for (cs_lnum_t j = 0; j < 3; j++) + grad_cpu[c_id][i][j] = 0.0; + } + } cs_lnum_t n_i_groups, n_i_threads; cs_mesh_i_faces_thread_block_count(m, CS_E2N_SUM_SCATTER, 0, &n_i_groups, &n_i_threads); - for (int g_id = 0; g_id < n_i_groups; g_id++) { + /* Interior faces contribution */ -# pragma omp parallel for - for (int t_id = 0; t_id < n_i_threads; t_id++) { + for (int g_id = 0; g_id < n_i_groups; g_id++) { + # pragma omp parallel for + for (int t_id = 0; t_id < n_i_threads; t_id++) { + cs_lnum_t s_id, e_id; cs_mesh_i_faces_thread_block_range(m, CS_E2N_SUM_SCATTER, g_id, t_id, n_i_threads, 0, &s_id, &e_id); @@ -5522,12 +5644,12 @@ _reconstruct_strided_gradient(const cs_mesh_t *m, + (1.0-pond)* c_weight[c_id2]); /* - Remark: \f$ \varia_\face = \alpha_\ij \varia_\celli + Remark: \f$ \varia_\face = \alpha_\ij \varia_\celli + (1-\alpha_\ij) \varia_\cellj\f$ - but for the cell \f$ \celli \f$ we remove - \f$ \varia_\celli \sum_\face \vect{S}_\face = \vect{0} \f$ - and for the cell \f$ \cellj \f$ we remove - \f$ \varia_\cellj \sum_\face \vect{S}_\face = \vect{0} \f$ + but for the cell \f$ \celli \f$ we remove + \f$ \varia_\celli \sum_\face \vect{S}_\face = \vect{0} \f$ + and for the cell \f$ \cellj \f$ we remove + \f$ \varia_\cellj \sum_\face \vect{S}_\face = \vect{0} \f$ */ for (cs_lnum_t i = 0; i < stride; i++) { @@ -5544,10 +5666,9 @@ _reconstruct_strided_gradient(const cs_mesh_t *m, + r_grad[c_id2][i][2])); for (cs_lnum_t j = 0; j < 3; j++) { - grad[c_id1][i][j] += (pfaci + rfac) * i_f_face_normal[f_id][j]; - grad[c_id2][i][j] -= (pfacj + rfac) * i_f_face_normal[f_id][j]; + grad_cpu[c_id1][i][j] += (pfaci + rfac) * i_f_face_normal[f_id][j]; + grad_cpu[c_id2][i][j] -= (pfacj + rfac) * i_f_face_normal[f_id][j]; } - } } /* End of loop on faces */ @@ -5570,10 +5691,10 @@ _reconstruct_strided_gradient(const cs_mesh_t *m, cs_lnum_t f_id = cell_b_faces[fidx]; - /* - Remark: for the cell \f$ \celli \f$ we remove - \f$ \varia_\celli \sum_\face \vect{S}_\face = \vect{0} \f$ - */ + /* + Remark: for the cell \f$ \celli \f$ we remove + \f$ 
\varia_\celli \sum_\face \vect{S}_\face = \vect{0} \f$ + */ for (cs_lnum_t i = 0; i < stride; i++) { @@ -5588,13 +5709,13 @@ _reconstruct_strided_gradient(const cs_mesh_t *m, cs_real_t rfac = 0.; for (cs_lnum_t k = 0; k < stride; k++) { cs_real_t vecfac = r_grad[c_id][k][0] * diipb[f_id][0] - + r_grad[c_id][k][1] * diipb[f_id][1] - + r_grad[c_id][k][2] * diipb[f_id][2]; + + r_grad[c_id][k][1] * diipb[f_id][1] + + r_grad[c_id][k][2] * diipb[f_id][2]; rfac += coefbv[f_id][i][k] * vecfac; } for (cs_lnum_t j = 0; j < 3; j++) { - grad[c_id][i][j] += (pfac + rfac) * b_f_face_normal[f_id][j]; + grad_cpu[c_id][i][j] += (pfac + rfac) * b_f_face_normal[f_id][j]; } } @@ -5602,36 +5723,93 @@ _reconstruct_strided_gradient(const cs_mesh_t *m, } /* loop on boundary cells */ -# pragma omp parallel for - for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { - cs_real_t dvol; - /* Is the cell disabled (for solid or porous)? Not the case if coupled */ - if (has_dc * c_disable_flag[has_dc * c_id] == 0) - dvol = 1. / cell_f_vol[c_id]; - else - dvol = 0.; + # pragma omp parallel for + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + cs_real_t dvol; + /* Is the cell disabled (for solid or porous)? Not the case if coupled */ + if (has_dc * c_disable_flag[has_dc * c_id] == 0) + dvol = 1. / cell_f_vol[c_id]; + else + dvol = 0.; - for (cs_lnum_t i = 0; i < stride; i++) { - for (cs_lnum_t j = 0; j < 3; j++) - grad[c_id][i][j] *= dvol; - } + for (cs_lnum_t i = 0; i < stride; i++) { + for (cs_lnum_t j = 0; j < 3; j++) + grad_cpu[c_id][i][j] *= dvol; + } - if (cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION) { - cs_real_t gradpa[3]; - for (cs_lnum_t i = 0; i < stride; i++) { - for (cs_lnum_t j = 0; j < 3; j++) { - gradpa[j] = grad[c_id][i][j]; - grad[c_id][i][j] = 0.; + if (cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION) { + cs_real_t gradpa[3]; + for (cs_lnum_t i = 0; i < stride; i++) { + for (cs_lnum_t j = 0; j < 3; j++) { + gradpa[j] = grad_cpu[c_id][i][j]; + grad_cpu[c_id][i][j] = 0.; + } + + for (cs_lnum_t j = 0; j < 3; j++) + for (cs_lnum_t k = 0; k < 3; k++) + grad_cpu[c_id][i][j] += corr_grad_lin[c_id][j][k] * gradpa[k]; + } } + } + + if(perf){ + stop = std::chrono::high_resolution_clock::now(); + elapsed = std::chrono::duration_cast(stop - start); + } + } - for (cs_lnum_t j = 0; j < 3; j++) - for (cs_lnum_t k = 0; k < 3; k++) - grad[c_id][i][j] += corr_grad_lin[c_id][j][k] * gradpa[k]; + /* Performances */ + if(perf){ + #if defined(HAVE_CUDA) + if(compute_cuda){ + printf("reconstruct Compute and tranferts time in us: CUDA = %ld\n", elapsed_cuda.count()); } + #endif + + if(compute_cpu){ + printf("reconstruct Compute and tranferts time in us: CPU = %ld\n", elapsed.count()); } } - /* Periodicity and parallelism treatment */ + /* Accuracy grad_cpu and grad_gpu */ + if(accuracy){ + #if defined(HAVE_CUDA) + if(compute_cuda){ + if(compute_cpu){ + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + for (cs_lnum_t i = 0; i < stride; i++) { + for (int j =0; j < 3; ++j) { + auto cpu = grad_cpu[c_id][i][j]; + auto cuda = grad_gpu[c_id][i][j]; + double err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); + if (err> 1e-6) { + printf("reconstruct DIFFERENCE @%d-%d-%d: CPU = %.17f\tCUDA = %.17f\tdiff = %.17f\tdiff relative = %.17f\tulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), err, cs_diff_ulp(cpu, cuda)); + } + } + } + } + } + } + #endif + } + +// Free memory +#if defined(HAVE_CUDA) + if(compute_cuda){ + if(res_cpu){ + BFT_FREE(grad_gpu); + } + } +#endif + +// Free memory + 
if(compute_cpu){ + if(!res_cpu){ + BFT_FREE(grad_cpu); + } + } + + /* Periodicity and parallelism treatment */ if (m->halo != NULL) { cs_halo_sync_var_strided(m->halo, halo_type, (cs_real_t *)grad, stride*3); @@ -5644,6 +5822,7 @@ _reconstruct_strided_gradient(const cs_mesh_t *m, (cs_real_t *)grad); } } + } /*---------------------------------------------------------------------------- @@ -6653,6 +6832,351 @@ _find_bc_coeffs(const char *var_name, * gradv --> gradient of pvar (du_i/dx_j : gradv[][i][j]) *----------------------------------------------------------------------------*/ +BEGIN_C_DECLS +#if defined(HAVE_OPENMP_TARGET) + +void +_lsq_vector_gradient_target(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_halo_type_t halo_type, + const int inc, + const cs_real_3_t *restrict coefav, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict c_weight, + cs_real_33_t *restrict gradv, + cs_cocg_6_t *restrict cocg, + cs_real_33_t *restrict rhs) +{ + const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_b_cells = m->n_b_cells; + const cs_lnum_t n_i_faces = m->n_i_faces; + const cs_lnum_t n_b_faces = m->n_b_faces; + const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; + const int n_i_groups = m->i_face_numbering->n_groups; + const int n_i_threads = m->i_face_numbering->n_threads; + const int n_b_threads = m->b_face_numbering->n_threads; + const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index; + const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index; + + const cs_lnum_2_t *restrict i_face_cells + = (const cs_lnum_2_t *restrict)m->i_face_cells; + const cs_lnum_t *restrict b_face_cells + = (const cs_lnum_t *restrict)m->b_face_cells; + const cs_lnum_t *restrict cell_cells_idx + = (const cs_lnum_t *restrict)madj->cell_cells_idx; + const cs_lnum_t *restrict cell_b_faces_idx + = (const cs_lnum_t *restrict)madj->cell_b_faces_idx; + const cs_lnum_t *restrict cell_cells_lst + = (const cs_lnum_t *restrict)m->cell_cells_lst; + const cs_lnum_t *restrict b_cells + = (const cs_lnum_t *restrict)m->b_cells; + + const cs_lnum_t *restrict cell_cells + = (const cs_lnum_t *restrict)madj->cell_cells; + const short int *restrict cell_i_faces_sgn + = (const short int *restrict)madj->cell_i_faces_sgn; + const cs_lnum_t *restrict cell_i_faces + = (const cs_lnum_t *restrict)madj->cell_i_faces; + const cs_lnum_t *restrict cell_b_faces + = (const cs_lnum_t *restrict)madj->cell_b_faces; + + const cs_real_3_t *restrict cell_f_cen + = (const cs_real_3_t *restrict)fvq->cell_f_cen; + const cs_real_t *restrict weight = fvq->weight; + const cs_real_t *restrict b_dist = fvq->b_dist; + const cs_real_3_t *restrict b_face_normal + = (const cs_real_3_t *restrict)fvq->b_face_normal; + + /* Timing the computation */ + + double t_kernel = 0.0; + double t_begin, t_end; + + bool scatter = true; + + /* Contribution from interior faces */ + int num_device = omp_get_num_devices(); + printf("OMP supported devices %d\n", num_device); + t_begin = omp_get_wtime(); +#pragma omp target data map(tofrom: rhs[0:n_cells_ext]) \ + map(from: gradv[0:n_cells_ext]) \ + map(to: i_face_cells[0:n_i_faces], b_face_normal[0:n_b_faces], \ + coefav[0:n_b_faces], coefbv[0:n_b_faces], b_dist[0:n_b_faces],\ + cell_f_cen[0:n_cells_ext], pvar[0:n_cells_ext],\ + cell_cells_idx[0:n_cells_ext], \ + cell_cells_lst[0:n_cells_ext], \ + cell_b_faces_idx[0:n_cells+1], \ + b_face_cells[0:n_b_faces], \ + b_cells[0:n_b_cells], \ + 
cocg[0:n_cells_ext]) +{ + #pragma omp target teams distribute parallel for \ + schedule(static,1) + for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { + for (cs_lnum_t i = 0; i < 3; i++){ + for (cs_lnum_t j = 0; j < 3; j++){ + rhs[c_id][i][j] = 0.0; + } + } + } + if(scatter){ + #pragma omp target teams distribute parallel for \ + schedule(static,1) + for (cs_lnum_t f_id = 0; f_id < n_i_faces; f_id++) { + + cs_lnum_t c_id1 = i_face_cells[f_id][0]; + cs_lnum_t c_id2 = i_face_cells[f_id][1]; + + cs_real_t dc[3], fctb[3],_weight1, _weight2, _denom, _pond, pfac; + + for (cs_lnum_t i = 0; i < 3; i++){ + dc[i] = cell_f_cen[c_id2][i] - cell_f_cen[c_id1][i]; + } + + cs_real_t ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + + for (cs_lnum_t j = 0; j < 3; j++) { + fctb[j] = dc[j] * pfac; + #pragma omp atomic + rhs[c_id1][i][j] += _weight2 * fctb[j]; + #pragma omp atomic + rhs[c_id2][i][j] += _weight1 * fctb[j]; + } + } + + } + } + else{ + #pragma omp target teams distribute parallel for \ + schedule(static,1) + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + + cs_lnum_t s_id = cell_cells_idx[c_id]; + cs_lnum_t e_id = cell_cells_idx[c_id+1]; + + cs_lnum_t c_id2, f_id; + + // cs_real_t _rhs[64][3][3]; + // cs_lnum_t tid = omp_get_thread_num(); + + // for(cs_lnum_t i = 0; i < 3; i++){ + // for(cs_lnum_t j = 0; j < 3; j++){ + // _rhs[tid][i][j] = 0.0; + // } + // } + + cs_real_t dc[3], fctb[3], _weight, _denom, _pond, pfac; + for(cs_lnum_t index = s_id; index < e_id; index++){ + + c_id2 = cell_cells[index]; + + for (cs_lnum_t i = 0; i < 3; i++){ + dc[i] = cell_f_cen[c_id2][i] - cell_f_cen[c_id][i]; + } + + cs_real_t ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight = 1.; + } + else{ + f_id = cell_i_faces[index]; + _pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id] + + (1. 
- _pond)*c_weight[c_id2]); + _weight = c_weight[c_id2] * _denom; + } + + + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = (pvar[c_id2][i] - pvar[c_id][i]) * ddc; + + for (cs_lnum_t j = 0; j < 3; j++) { + fctb[j] = dc[j] * pfac; + rhs[c_id][i][j] += _weight * fctb[j]; + } + } + } + + // for(cs_lnum_t i = 0; i < 3; i++){ + // for(cs_lnum_t j = 0; j < 3; j++){ + // rhs[c_id][i][j] = _rhs[tid][i][j]; + // } + // } + + } + } + + if (halo_type == CS_HALO_EXTENDED) { + + #pragma omp target teams distribute parallel for \ + schedule(static,1) + for (cs_lnum_t c_id1 = 0; c_id1 < n_cells; c_id1++) { + for (cs_lnum_t cidx = cell_cells_idx[c_id1]; + cidx < cell_cells_idx[c_id1+1]; + cidx++) { + + cs_lnum_t c_id2 = cell_cells_lst[cidx]; + + cs_real_t dc[3]; + + for (cs_lnum_t i = 0; i < 3; i++){ + dc[i] = cell_f_cen[c_id2][i] - cell_f_cen[c_id1][i]; + } + + cs_real_t ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + for (cs_lnum_t i = 0; i < 3; i++) { + + cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + + for (cs_lnum_t j = 0; j < 3; j++) { + rhs[c_id1][i][j] += dc[j] * pfac; + } + } + } + } + + } + + if(scatter){ + #pragma omp target teams distribute parallel for \ + firstprivate(cs_math_zero_threshold) schedule(static,1) + for (cs_lnum_t f_id = 0; f_id < n_b_faces; f_id++) { + + cs_lnum_t c_id1 = b_face_cells[f_id]; + + cs_real_t n_d_dist[3]; + // /* Normal is vector 0 if the b_face_normal norm is too small */ + cs_math_3_normalize(b_face_normal[f_id], n_d_dist); + + cs_real_t d_b_dist = 1. / b_dist[f_id]; + + // /* Normal divided by b_dist */ + for (cs_lnum_t i = 0; i < 3; i++){ + n_d_dist[i] *= d_b_dist; + } + + for (cs_lnum_t i = 0; i < 3; i++) { + cs_real_t pfac = coefav[f_id][i]*inc + + ( coefbv[f_id][0][i] * pvar[c_id1][0] + + coefbv[f_id][1][i] * pvar[c_id1][1] + + coefbv[f_id][2][i] * pvar[c_id1][2] + - pvar[c_id1][i]); + + for (cs_lnum_t j = 0; j < 3; j++){ + #pragma omp atomic + rhs[c_id1][i][j] += n_d_dist[j] * pfac; + } + } + + } + } + else{ + #pragma omp target teams distribute parallel for \ + firstprivate(cs_math_zero_threshold) schedule(static,1) + for (cs_lnum_t c_idx = 0; c_idx < n_b_cells; c_idx++) { + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id+1]; + + cs_lnum_t f_id; + + cs_real_t n_d_dist[3]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + f_id = cell_b_faces[index]; + + cs_math_3_normalize(b_face_normal[f_id], n_d_dist); + + cs_real_t d_b_dist = 1. 
/ b_dist[f_id]; + + // /* Normal divided by b_dist */ + for (cs_lnum_t i = 0; i < 3; i++){ + n_d_dist[i] *= d_b_dist; + } + + for (cs_lnum_t i = 0; i < 3; i++) { + cs_real_t pfac = coefav[f_id][i]*inc + + ( coefbv[f_id][0][i] * pvar[c_id][0] + + coefbv[f_id][1][i] * pvar[c_id][1] + + coefbv[f_id][2][i] * pvar[c_id][2] + - pvar[c_id][i]); + + for (cs_lnum_t j = 0; j < 3; j++){ + rhs[c_id][i][j] += n_d_dist[j] * pfac; + } + } + } + + } + } + + + #pragma omp target teams distribute parallel for \ + schedule(static,1) + for (cs_lnum_t c_idx = 0; c_idx < n_cells*3*3; c_idx++) { + + size_t c_id = c_idx / (3*3); + size_t i = (c_idx / 3) % 3; + size_t j = c_idx % 3; + + auto cocg_temp = cocg[c_id]; + cs_real_t _cocg[3]; + + _cocg[0] = cocg_temp[5]; + _cocg[1] = cocg_temp[4]; + _cocg[2] = cocg_temp[2]; + + if(j == 0){ + _cocg[0] = cocg_temp[0]; + _cocg[1] = cocg_temp[3]; + _cocg[2] = cocg_temp[5]; + } + + if(j == 1){ + _cocg[0] = cocg_temp[3]; + _cocg[1] = cocg_temp[1]; + _cocg[2] = cocg_temp[4]; + } + + gradv[c_id][i][j] = rhs[c_id][i][0] * _cocg[0] + + rhs[c_id][i][1] * _cocg[1] + + rhs[c_id][i][2] * _cocg[2]; + } + +} // end omp data + +t_end = omp_get_wtime(); + +t_kernel = t_end - t_begin; +printf("Time of kernel: %lf\n", t_kernel); + +} + +#endif +END_C_DECLS + static void _lsq_vector_gradient(const cs_mesh_t *m, const cs_mesh_adjacencies_t *madj, @@ -6665,7 +7189,7 @@ _lsq_vector_gradient(const cs_mesh_t *m, const cs_real_t *restrict c_weight, cs_real_33_t *restrict gradv) { - const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_cells = m->n_cells; const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; const int n_i_groups = m->i_face_numbering->n_groups; const int n_i_threads = m->i_face_numbering->n_threads; @@ -6691,16 +7215,113 @@ _lsq_vector_gradient(const cs_mesh_t *m, cs_cocg_6_t *restrict cocgb_s = NULL; cs_cocg_6_t *restrict cocg = NULL; - _get_cell_cocg_lsq(m, halo_type, false, fvq, &cocg, &cocgb_s); - cs_real_33_t *rhs; + /* Timing the computation */ - BFT_MALLOC(rhs, n_cells_ext, cs_real_33_t); + std::chrono::high_resolution_clock::time_point start, stop; + std::chrono::microseconds elapsed, elapsed_cuda, elapsed_target; + +#if defined(HAVE_CUDA) + bool accel = (cs_get_device_id() > -1) ? 
true : false; +#else + bool accel = false; +#endif + + _get_cell_cocg_lsq(m, halo_type, accel, fvq, &cocg, &cocgb_s); + + cs_real_33_t *rhs, *rhs_cuda, *rhs_target, *gradv_cuda, *gradv_cpu, *gradv_target; + bool compute_cuda, compute_cpu, res_cpu, perf, accuracy; + + compute_cuda = accel; + res_cpu = !accel; + +#if defined(DEBUG) + compute_cpu = true; + perf = true; + accuracy = true; +#elif defined(NDEBUG) + compute_cpu = true; + res_cpu = true; + perf = false; + accuracy = false; +#else + compute_cpu = false; + perf = false; + accuracy = false; +#endif + + // Pour l'instant ces lignes sont pour moi + // Elles seront à enlever + // compute_cuda = true; + compute_cpu = true; + // res_cpu = false; + perf = true; + // accuracy = true; + +BFT_MALLOC(rhs, n_cells_ext, cs_real_33_t); +BFT_MALLOC(rhs_cuda, n_cells_ext, cs_real_33_t); +BFT_MALLOC(rhs_target, n_cells_ext, cs_real_33_t); +BFT_MALLOC(gradv_cuda, n_cells_ext, cs_real_33_t); +BFT_MALLOC(gradv_cpu, n_cells_ext, cs_real_33_t); +BFT_MALLOC(gradv_target, n_cells_ext, cs_real_33_t); /* Compute Right-Hand Side */ /*-------------------------*/ +#if defined(HAVE_CUDA) + if(compute_cuda){ + if(perf){ + start = std::chrono::high_resolution_clock::now(); + } + cs_lsq_vector_gradient_cuda( + m, + madj, + fvq, + halo_type, + inc, + coefav, + coefbv, + pvar, + c_weight, + cocg, + cocgb_s, + gradv, + rhs_cuda); + + if(perf){ + stop = std::chrono::high_resolution_clock::now(); + elapsed_cuda = std::chrono::duration_cast(stop - start); + } + } // end if compute_cuda +#endif -# pragma omp parallel for +#if defined(HAVE_OPENMP_TARGET) +if(perf){ + start = std::chrono::high_resolution_clock::now(); +} +_lsq_vector_gradient_target(m, + madj, + fvq, + halo_type, + inc, + coefav, + coefbv, + pvar, + c_weight, + gradv_target, + cocg, + rhs_target); +if(perf){ + stop = std::chrono::high_resolution_clock::now(); + elapsed_target = std::chrono::duration_cast(stop - start); + printf("OMP target lsq %ld\n", elapsed_target.count()); +} +#endif + +if(compute_cpu){ + if(perf){ + start = std::chrono::high_resolution_clock::now(); + } + # pragma omp parallel for for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { for (cs_lnum_t i = 0; i < 3; i++) for (cs_lnum_t j = 0; j < 3; j++) @@ -6708,7 +7329,6 @@ _lsq_vector_gradient(const cs_mesh_t *m, } /* Contribution from interior faces */ - for (int g_id = 0; g_id < n_i_groups; g_id++) { # pragma omp parallel for @@ -6832,17 +7452,18 @@ _lsq_vector_gradient(const cs_mesh_t *m, /* Compute gradient */ /*------------------*/ + #pragma omp parallel for for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { for (cs_lnum_t i = 0; i < 3; i++) { - gradv[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] + gradv_cpu[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] + rhs[c_id][i][1] * cocg[c_id][3] + rhs[c_id][i][2] * cocg[c_id][5]; - gradv[c_id][i][1] = rhs[c_id][i][0] * cocg[c_id][3] + gradv_cpu[c_id][i][1] = rhs[c_id][i][0] * cocg[c_id][3] + rhs[c_id][i][1] * cocg[c_id][1] + rhs[c_id][i][2] * cocg[c_id][4]; - gradv[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] + gradv_cpu[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] + rhs[c_id][i][1] * cocg[c_id][4] + rhs[c_id][i][2] * cocg[c_id][2]; } @@ -6900,12 +7521,38 @@ _lsq_vector_gradient(const cs_mesh_t *m, for (int kk = 0; kk < 9; kk++) { int ii = _33_9_idx[kk][0]; int jj = _33_9_idx[kk][1]; - gradv[c_id][ii][jj] = x[kk]; + gradv_cpu[c_id][ii][jj] = x[kk]; } } } + stop = std::chrono::high_resolution_clock::now(); + elapsed = std::chrono::duration_cast(stop - start); +} // end if COMPUTE_CPU + 
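  /* Illustrative sketch (not part of the patch), added only to make the
     accuracy check below easier to read: a worked example of the
     cs_diff_ulp() helper defined earlier in this file.  cs_diff_ulp(x, y)
     returns |x - y| divided by the size of one unit in the last place (ULP)
     at the smaller exponent of x and y, so two adjacent doubles are exactly
     1 ULP apart at any magnitude.  Assumes <cassert> and <cfloat> are
     visible in this translation unit. */
  {
    const double u0 = 1.0;
    const double u1 = 1.0 + DBL_EPSILON;  /* next representable double above 1.0 */

    /* Adjacent values differ by exactly 1 ULP... */
    assert(cs_diff_ulp(u0, u1) == 1.0);

    /* ...and the measure is scale-invariant: still 1 ULP at 2^10. */
    assert(cs_diff_ulp(1024.0, 1024.0 + 1024.0*DBL_EPSILON) == 1.0);
  }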
+if(accuracy){ + #pragma omp parallel for + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { + for (int j = 0; j < 3; ++j) { + auto cpu = gradv_cpu[c_id][i][j]; + auto cuda = gradv[c_id][i][j]; + + if (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) > 1e-12) { + printf("lsq DIFFERENCE @%d-%d-%d: CPU = %.17f\tCUDA = %.17f\t|CPU - CUDA| = %.17f\t|CPU - CUDA|ulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), cs_diff_ulp(cpu, cuda)); + } + } + } + } +} + +if(perf) + printf("lsq Compute time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); + +if(res_cpu){ + memcpy(gradv, gradv_cpu, sizeof(cs_real_33_t) * n_cells_ext); +} /* Periodicity and parallelism treatment */ @@ -6916,6 +7563,11 @@ _lsq_vector_gradient(const cs_mesh_t *m, } BFT_FREE(rhs); + BFT_FREE(rhs_cuda); + BFT_FREE(rhs_target); + BFT_FREE(gradv_cuda); + BFT_FREE(gradv_cpu); + BFT_FREE(gradv_target); } /*----------------------------------------------------------------------------*/ @@ -7022,10 +7674,20 @@ _lsq_strided_gradient(const cs_mesh_t *m, BFT_MALLOC(rhs, n_cells_ext, grad_t); cs_array_real_fill_zero(n_cells_ext*stride*3, (cs_real_t *)rhs); + grad_t *gradv_cpu; + BFT_MALLOC(gradv_cpu, n_cells_ext*stride*3, grad_t); + + +#if defined(HAVE_CUDA) + bool accel = (cs_get_device_id() > -1) ? true : false; +#else + bool accel = false; +#endif + cs_cocg_6_t *restrict cocgb = NULL; cs_cocg_6_t *restrict cocg = NULL; - _get_cell_cocg_lsq(m, halo_type, false, fvq, &cocg, &cocgb); + _get_cell_cocg_lsq(m, halo_type, accel, fvq, &cocg, &cocgb); /* Contribution from interior faces -------------------------------- */ @@ -7295,24 +7957,45 @@ _lsq_strided_gradient(const cs_mesh_t *m, #pragma omp parallel for if(n_cells >= CS_THR_MIN) for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { for (cs_lnum_t i = 0; i < stride; i++) { - grad[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] - + rhs[c_id][i][1] * cocg[c_id][3] - + rhs[c_id][i][2] * cocg[c_id][5]; + gradv_cpu[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] + + rhs[c_id][i][1] * cocg[c_id][3] + + rhs[c_id][i][2] * cocg[c_id][5]; - grad[c_id][i][1] = rhs[c_id][i][0] * cocg[c_id][3] - + rhs[c_id][i][1] * cocg[c_id][1] - + rhs[c_id][i][2] * cocg[c_id][4]; + gradv_cpu[c_id][i][1] = rhs[c_id][i][0] * cocg[c_id][3] + + rhs[c_id][i][1] * cocg[c_id][1] + + rhs[c_id][i][2] * cocg[c_id][4]; - grad[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] - + rhs[c_id][i][1] * cocg[c_id][4] - + rhs[c_id][i][2] * cocg[c_id][2]; + gradv_cpu[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] + + rhs[c_id][i][1] * cocg[c_id][4] + + rhs[c_id][i][2] * cocg[c_id][2]; } } + memcpy(grad, gradv_cpu, sizeof(cs_real_t) * n_cells_ext * stride * 3); /* Correct gradient on boundary cells */ /*------------------------------------*/ - +cs_real_t c_norm, ref_norm; + +// #if defined(HAVE_CUDA) + // cs_lsq_vector_gradient_strided_cuda + // ( + // m, + // madj, + // fvq, + // halo_type, + // inc, + // coefav, + // coefbv, + // pvar, + // c_weight, + // cocg, + // cocgb, + // gradv, + // rhs, + // n_c_iter_max, + // c_eps); +// #else #pragma omp parallel for schedule(dynamic, CS_THR_MIN) for (cs_lnum_t c_idx = 0; c_idx < n_b_cells; c_idx++) { @@ -7321,7 +8004,7 @@ _lsq_strided_gradient(const cs_mesh_t *m, cs_lnum_t s_id = cell_b_faces_idx[c_id]; cs_lnum_t e_id = cell_b_faces_idx[c_id+1]; - cs_real_3_t *c_grad = grad[c_id]; + cs_real_3_t *c_grad = gradv_cpu[c_id]; cs_real_t grad_0[stride][3], grad_i[stride][3]; @@ -7330,7 +8013,7 @@ _lsq_strided_gradient(const cs_mesh_t *m, /* Compute norm 
for convergence testing. */ - cs_real_t ref_norm = 0; + ref_norm = 0; for (cs_lnum_t kk = 0; kk < stride; kk++) { for (cs_lnum_t ll = 0; ll < 3; ll++) ref_norm += abs(c_grad[kk][ll]); @@ -7338,7 +8021,7 @@ _lsq_strided_gradient(const cs_mesh_t *m, /* Iterate over boundary condition contributions. */ - cs_real_t c_norm = 0; + c_norm = 0; int n_c_it; for (n_c_it = 0; n_c_it < n_c_iter_max; n_c_it++) { @@ -7453,6 +8136,7 @@ _lsq_strided_gradient(const cs_mesh_t *m, #endif n_c_it *= -1; } +// #endif /* Optional postprocessing */ @@ -7464,12 +8148,27 @@ _lsq_strided_gradient(const cs_mesh_t *m, } } /* End of correction for BC coeffs */ + #pragma omp parallel for + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { + for (int j = 0; j < 3; ++j) { + auto cpu = gradv_cpu[c_id][i][j]; + auto cuda = grad[c_id][i][j]; + + if (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) > 1e-6) { + printf("lsq_strided DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\t|CPU - CUDA| = %a\t|CPU - CUDA|ulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), cs_diff_ulp(cpu, cuda)); + } + } + } + } + /* Synchronize halos */ _sync_strided_gradient_halo(m, halo_type, grad); BFT_FREE(rhs); + BFT_FREE(gradv_cpu); } /*---------------------------------------------------------------------------- @@ -8741,27 +9440,184 @@ _gradient_vector(const char *var_name, /* Use Neumann BC's as default if not provided */ + cs_real_3_t *_bc_coeff_a = NULL; cs_real_33_t *_bc_coeff_b = NULL; + /* Timing the computation */ + + std::chrono::high_resolution_clock::time_point start, stop; + std::chrono::microseconds elapsed, elapsed_cuda; + + cs_real_3_t *_bc_coeff_a_gpu = NULL; + cs_real_3_t *_bc_coeff_a_cpu = NULL; + cs_real_33_t *_bc_coeff_b_gpu = NULL; + cs_real_33_t *_bc_coeff_b_cpu = NULL; + + bool compute_cuda; + bool compute_cpu; + bool res_cpu; + bool perf; + bool accuracy; + +#if defined(HAVE_CUDA) + compute_cuda = (cs_get_device_id() > -1) ? 
true : false; +#else + compute_cuda = false; +#endif + + +res_cpu = !compute_cuda; + +#if defined(DEBUG) + compute_cpu = true; + perf = true; + accuracy = true; +#elif defined(NDEBUG) + compute_cpu = true; + perf = false; + accuracy = false; +#else + compute_cpu = false; + perf = false; + accuracy = false; +#endif + + // Pour l'instant ces lignes sont pour moi + // Elles seront à enlever + compute_cuda = false; + compute_cpu = true; + res_cpu = true; + + // A ne pas garder dans la version finale + // perf = false; + // accuracy = false; + +// Compute on GPU +#if defined(HAVE_CUDA) + if(compute_cuda){ + BFT_MALLOC(_bc_coeff_a_gpu, n_b_faces, cs_real_3_t); + BFT_MALLOC(_bc_coeff_b_gpu, n_b_faces, cs_real_33_t); + if(perf){ + start = std::chrono::high_resolution_clock::now(); + } + _gradient_vector_cuda(mesh, _bc_coeff_a_gpu, _bc_coeff_b_gpu, (bc_coeff_a == NULL), (bc_coeff_b == NULL), perf); + if(perf){ + stop = std::chrono::high_resolution_clock::now(); + elapsed_cuda = std::chrono::duration_cast(stop - start); + } + } +#endif + +// Compute on CPU + if(compute_cpu){ + BFT_MALLOC(_bc_coeff_a_cpu, n_b_faces, cs_real_3_t); + BFT_MALLOC(_bc_coeff_b_cpu, n_b_faces, cs_real_33_t); + + if(perf){ + start = std::chrono::high_resolution_clock::now(); + } + + if (bc_coeff_a == NULL) { + for (cs_lnum_t i = 0; i < n_b_faces; i++) { + for (cs_lnum_t j = 0; j < 3; j++) + _bc_coeff_a_cpu[i][j] = 0; + } + } + if (bc_coeff_b == NULL) { + for (cs_lnum_t i = 0; i < n_b_faces; i++) { + for (cs_lnum_t j = 0; j < 3; j++) { + for (cs_lnum_t k = 0; k < 3; k++) + _bc_coeff_b_cpu[i][j][k] = 0; + _bc_coeff_b_cpu[i][j][j] = 1; + } + } + } + + if(perf){ + stop = std::chrono::high_resolution_clock::now(); + elapsed = std::chrono::duration_cast(stop - start); + } + } + +// selected the result of the computation on CPU or GPU if (bc_coeff_a == NULL) { - BFT_MALLOC(_bc_coeff_a, n_b_faces, cs_real_3_t); - for (cs_lnum_t i = 0; i < n_b_faces; i++) { - for (cs_lnum_t j = 0; j < 3; j++) - _bc_coeff_a[i][j] = 0; + if(res_cpu){ + bc_coeff_a = (const cs_real_3_t *)_bc_coeff_a_cpu; + } else { + bc_coeff_a = (const cs_real_3_t *)_bc_coeff_a_gpu; } - bc_coeff_a = (const cs_real_3_t *)_bc_coeff_a; } if (bc_coeff_b == NULL) { - BFT_MALLOC(_bc_coeff_b, n_b_faces, cs_real_33_t); - for (cs_lnum_t i = 0; i < n_b_faces; i++) { - for (cs_lnum_t j = 0; j < 3; j++) { - for (cs_lnum_t k = 0; k < 3; k++) - _bc_coeff_b[i][j][k] = 0; - _bc_coeff_b[i][j][j] = 1; + if(res_cpu){ + bc_coeff_b = (const cs_real_33_t *)_bc_coeff_b_cpu; + } else { + bc_coeff_b = (const cs_real_33_t *)_bc_coeff_b_gpu; + } + } + + /* Performances */ + if(perf){ + #if defined(HAVE_CUDA) + if(compute_cuda){ + printf("_gradient_vector Compute and tranferts time in us: CUDA = %ld\n", elapsed_cuda.count()); } + #endif + + if(compute_cpu){ + printf("_gradient_vector Compute and tranferts time in us: CPU = %ld\n", elapsed.count()); + } + } + + /* Accuracy grad_cpu and grad_gpu */ + if(accuracy){ + #if defined(HAVE_CUDA) + if(compute_cuda){ + if(compute_cpu){ + for (cs_lnum_t f_id = 0; f_id < n_b_faces; f_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { + auto cpu = _bc_coeff_a_cpu[f_id][i]; + auto cuda = _bc_coeff_a_gpu[f_id][i]; + double err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); + if (err> 1e-12) { + printf("_gradient_vector_a DIFFERENCE @%d-%d: CPU = %.17f\tCUDA = %.17f\tdiff = %.17f\tdiff relative = %.17f\tulp = %a\n", f_id, i, cpu, cuda, fabs(cpu - cuda), err, cs_diff_ulp(cpu, cuda)); + } + } + } + + for (cs_lnum_t f_id = 0; f_id < n_b_faces; f_id++) { + for (cs_lnum_t i 
= 0; i < 3; i++) { + for (int j =0; j < 3; ++j) { + auto cpu = _bc_coeff_b_cpu[f_id][i][j]; + auto cuda = _bc_coeff_b_gpu[f_id][i][j]; + double err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); + if (err> 1e-12) { + printf("_gradient_vector_b DIFFERENCE @%d-%d-%d: CPU = %.17f\tCUDA = %.17f\tdiff = %.17f\tdiff relative = %.17f\tulp = %a\n", f_id, i, j, cpu, cuda, fabs(cpu - cuda), err, cs_diff_ulp(cpu, cuda)); + } + } + } + } + } + } + #endif + } + +// Free memory +#if defined(HAVE_CUDA) + if(compute_cuda){ + if(res_cpu){ + BFT_FREE(_bc_coeff_a_gpu); + BFT_FREE(_bc_coeff_b_gpu); + } + } +#endif + +// Free memory + if(compute_cpu){ + if(!res_cpu){ + BFT_FREE(_bc_coeff_a_cpu); + BFT_FREE(_bc_coeff_b_cpu); } - bc_coeff_b = (const cs_real_33_t *)_bc_coeff_b; } /* Update of local BC. coefficients for internal coupling */ diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index b8f16e2e79..df08ace6a0 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -23,53 +23,25 @@ */ /*----------------------------------------------------------------------------*/ +#include "cs_alge_cuda.cuh" -#include "cs_defs.h" - -/*---------------------------------------------------------------------------- - * Standard C library headers - *----------------------------------------------------------------------------*/ - -#include -#include -#include -#include -#include -#include -#include - -#if defined(HAVE_MPI) -#include -#endif - -#include - -/*---------------------------------------------------------------------------- - * Local headers - *----------------------------------------------------------------------------*/ - -#include "bft_error.h" -#include "bft_mem.h" - -#include "cs_base_accel.h" -#include "cs_base_cuda.h" -#include "cs_blas.h" -#include "cs_cell_to_vertex.h" -#include "cs_ext_neighborhood.h" -#include "cs_field.h" -#include "cs_field_pointer.h" -#include "cs_halo.h" -#include "cs_halo_perio.h" -#include "cs_log.h" -#include "cs_math.h" -#include "cs_mesh.h" -#include "cs_mesh_adjacencies.h" -#include "cs_mesh_quantities.h" -#include "cs_parall.h" -#include "cs_porous_model.h" -#include "cs_prototypes.h" -#include "cs_timer.h" -#include "cs_timer_stats.h" +#include "cs_gradient.h" +#include "cs_gradient_lsq_vector.cuh" +#include "cs_gradient_lsq_vector_gather.cuh" +#include "cs_gradient_lsq_vector_gather_v2.cuh" +#include "cs_gradient_lsq_vector_gather_v3.cuh" +#include "cs_gradient_lsq_vector_v2.cuh" +#include "cs_gradient_lsq_vector_v3.cuh" +#include "cs_gradient_priv.h" +#include "cs_reconstruct_vector_gradient_gather.cuh" +#include "cs_reconstruct_vector_gradient_gather_v2.cuh" +#include "cs_reconstruct_vector_gradient_gather_v3.cuh" +#include "cs_reconstruct_vector_gradient_gather_v4.cuh" +#include "cs_reconstruct_vector_gradient_gather_v5.cuh" +#include "cs_reconstruct_vector_gradient_scatter.cuh" +#include "cs_reconstruct_vector_gradient_scatter_cf.cuh" +#include "cs_reconstruct_vector_gradient_scatter_v2.cuh" +#include "cs_reconstruct_vector_gradient_scatter_v2_cf.cuh" /*---------------------------------------------------------------------------- * Header for the current file @@ -98,6 +70,36 @@ * Recompute cocg at boundaries, using saved cocgb *----------------------------------------------------------------------------*/ +#define INSTANTIATE_LSQ(name, stride) template void name (const cs_mesh_t *m,\ + const cs_mesh_adjacencies_t *madj,\ + const cs_mesh_quantities_t *fvq,\ + const cs_halo_type_t halo_type,\ + const int inc,\ + const cs_real_t 
(*restrict coefav)[stride],\ + const cs_real_t (*restrict coefbv)[stride][stride],\ + const cs_real_t (*restrict pvar)[stride],\ + const cs_real_t *restrict c_weight,\ + cs_cocg_6_t *restrict cocg,\ + cs_cocg_6_t *restrict cocgb,\ + cs_real_t (*restrict gradv)[stride][3],\ + cs_real_t (*restrict rhs)[stride][3],\ + cs_lnum_t n_c_iter_max,\ + cs_real_t c_eps) + +#define INSTANTIATE_RECONSTRUCT(name, stride) template void name (const cs_mesh_t *m, \ + const cs_mesh_adjacencies_t *madj, \ + const cs_mesh_quantities_t *fvq, \ + cs_halo_type_t halo_type, \ + int inc, \ + const cs_real_t (*restrict coefav)[stride], \ + const cs_real_t (*restrict coefbv)[stride][stride], \ + const cs_real_t (*restrict pvar)[stride], \ + const cs_real_t *restrict c_weight, \ + const cs_real_t (*restrict r_grad)[stride][3], \ + cs_real_t (*restrict grad)[stride][3], \ + bool test_bool, \ + bool perf) + template __global__ static void _compute_cocg_from_cocgb(cs_lnum_t n_b_cells, @@ -430,8 +432,115 @@ _init_rhsv(cs_lnum_t size, } } +__global__ static void +_init_rhs_v3(cs_lnum_t size, + double3 *restrict rhs) +{ + cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= size) + return; + + rhs[c_id] = make_double3(0.0, 0.0, 0.0); +} + +__global__ static void +_compute_gradient_lsq_v_v3(cs_lnum_t size, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs, + cs_cocg_6_t *restrict cocg) +{ + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= size) + return; + auto& gradc = gradv[c_id]; + auto& rhsc = rhs[c_id]; + auto cocgc = cocg[c_id]; + for(cs_lnum_t i = 0; i < 3; i++){ + auto& gradci = gradc[i]; + auto rhsci = rhsc[i]; + gradci[0] = rhsci[0] * cocgc[0] + + rhsci[1] * cocgc[3] + + rhsci[2] * cocgc[5]; + + gradci[1] = rhsci[0] * cocgc[3] + + rhsci[1] * cocgc[1] + + rhsci[2] * cocgc[4]; + + gradci[2] = rhsci[0] * cocgc[5] + + rhsci[1] * cocgc[4] + + rhsci[2] * cocgc[2]; + } +} + +__global__ static void +_compute_gradient_lsq_b_v(cs_lnum_t size, + cs_lnum_t n_b_cells, + cs_lnum_t *restrict b_cells, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs, + cs_cocg_6_t *restrict cocg, + cs_real_3_t *restrict b_face_normal, + cs_lnum_t *restrict cell_b_faces, + cs_lnum_t *restrict cell_b_faces_idx) +{ + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + + cs_lnum_t _33_9_idx[9][2]; + int nn = 0; + for (int ll = 0; ll < 3; ll++) { + for (int mm = 0; mm < 3; mm++) { + _33_9_idx[nn][0] = ll; + _33_9_idx[nn][1] = mm; + nn++; + } + } + + /* Loop on boundary cells */ + cs_lnum_t c_id1 = b_cells[c_id]; + cs_real_t cocgb[3][3], cocgb_v[45], rhsb_v[9], x[9]; + + cocgb[0][0] = cocg[c_id][0]; + cocgb[0][1] = cocg[c_id][3]; + cocgb[0][2] = cocg[c_id][5]; + cocgb[1][0] = cocg[c_id][3]; + cocgb[1][1] = cocg[c_id][1]; + cocgb[1][2] = cocg[c_id][4]; + cocgb[2][0] = cocg[c_id][5]; + cocgb[2][1] = cocg[c_id][4]; + cocgb[2][2] = cocg[c_id][2]; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id+1]; + cs_lnum_t f_id; + cs_real_3_t normal; + cs_real_t norm, inverse_norm; + + for (cs_lnum_t index = s_id; index < e_id; index++) { + + f_id = cell_b_faces[index]; + + /* Normal is vector 0 if the b_face_normal norm is too small */ + norm = sqrt(b_face_normal[index][0]*b_face_normal[index][0] + + b_face_normal[index][1]*b_face_normal[index][1] + + b_face_normal[index][2]*b_face_normal[index][2]); + + inverse_norm = 1. 
/ norm; + + normal[0] = inverse_norm * b_face_normal[index][0]; + normal[1] = inverse_norm * b_face_normal[index][1]; + normal[2] = inverse_norm * b_face_normal[index][2]; + + for (cs_lnum_t ii = 0; ii < 3; ii++) { + for (cs_lnum_t jj = 0; jj < 3; jj++) + cocgb[ii][jj] += normal[ii] * normal[jj]; + } + + } + +} + /*---------------------------------------------------------------------------- - * Synchronize of copy a cs_real_t type array from the host to a device. + * Synchronize of copy a T type array from the host to a device. * * parameters: * val_h <-- pointer to host data @@ -443,38 +552,6 @@ _init_rhsv(cs_lnum_t size, * after use if non-NULL) *----------------------------------------------------------------------------*/ -static void -_sync_or_copy_real_h2d(const cs_real_t *val_h, - cs_lnum_t n_vals, - int device_id, - cudaStream_t stream, - const cs_real_t **val_d, - void **buf_d) -{ - const cs_real_t *_val_d = NULL; - void *_buf_d = NULL; - - cs_alloc_mode_t alloc_mode = cs_check_device_ptr(val_h); - size_t size = n_vals * sizeof(cs_real_t); - - if (alloc_mode == CS_ALLOC_HOST) { - CS_CUDA_CHECK(cudaMalloc(&_buf_d, size)); - cs_cuda_copy_h2d(_buf_d, val_h, size); - _val_d = (const cs_real_t *)_buf_d; - } - else { - _val_d = (const cs_real_t *)cs_get_device_ptr((void *)val_h); - - if (alloc_mode == CS_ALLOC_HOST_DEVICE_SHARED) - cudaMemPrefetchAsync(val_h, size, device_id, stream); - else - cs_sync_h2d(val_h); - } - - *val_d = _val_d; - *buf_d = _buf_d; -} - /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */ /*============================================================================= @@ -749,3 +826,1362 @@ cs_gradient_scalar_lsq_cuda(const cs_mesh_t *m, } /*----------------------------------------------------------------------------*/ +/*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */ + +/*============================================================================= + * Semi-private function definitions + *============================================================================*/ + +/*---------------------------------------------------------------------------- + * Compute cell gradient using least-squares reconstruction for non-orthogonal + * meshes (nswrgp > 1). + * + * Optionally, a volume force generating a hydrostatic pressure component + * may be accounted for. + * + * cocg is computed to account for variable B.C.'s (flux). + * + * parameters: + * m <-- pointer to associated mesh structure + * madj <-- pointer to mesh adjacencies structure + * fvq <-- pointer to associated finite volume quantities + * halo_type <-- halo type (extended or not) + * inc <-- if 0, solve on increment; 1 otherwise + * coefav <-- B.C. coefficients for boundary face normals + * coefbv <-- B.C. 
coefficients for boundary face normals + * pvar <-- variable + * gradv --> gradient of pvar (du_i/dx_j : gradv[][i][j]) + *----------------------------------------------------------------------------*/ +extern "C" void +cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_halo_type_t halo_type, + const int inc, + const cs_real_3_t *restrict coefav, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict c_weight, + cs_cocg_6_t *restrict cocg, + cs_cocg_6_t *restrict cocgb, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs) +{ + const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; + const cs_lnum_t n_b_faces = m->n_b_faces; + const cs_lnum_t n_i_faces = m->n_i_faces; + + + int device_id; + cudaGetDevice(&device_id); + + cudaStream_t stream; + cudaStreamCreate(&stream); + + cudaEvent_t start, mem_h2d, init, i_faces, halo, b_faces, gradient, gradient_b, stop; + float msec = 0.0f, msecTotal = 0.0f; + CS_CUDA_CHECK(cudaEventCreate(&start)); + CS_CUDA_CHECK(cudaEventCreate(&mem_h2d)); + CS_CUDA_CHECK(cudaEventCreate(&init)); + CS_CUDA_CHECK(cudaEventCreate(&i_faces)); + CS_CUDA_CHECK(cudaEventCreate(&halo)); + CS_CUDA_CHECK(cudaEventCreate(&b_faces)); + CS_CUDA_CHECK(cudaEventCreate(&gradient)); + CS_CUDA_CHECK(cudaEventCreate(&gradient_b)); + CS_CUDA_CHECK(cudaEventCreate(&stop)); + + // Record the start event + CS_CUDA_CHECK(cudaEventRecord(start, stream)); + + cs_real_33_t *rhs_d; + CS_CUDA_CHECK(cudaMalloc(&rhs_d, n_cells_ext * sizeof(cs_real_33_t))); + + + cs_real_33_t *grad_d = NULL; + CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells * sizeof(cs_real_33_t))); + + void *_pvar_d = NULL, *_coefa_d = NULL, *_coefb_d = NULL, + *_cell_cells_idx_d = NULL; + const cs_real_3_t *pvar_d = NULL, *coefa_d = NULL; + const cs_real_33_t *coefb_d = NULL; + const cs_lnum_t *cell_cells_idx_d = NULL; + + // cs_cuda_copy_h2d(rhs_d, rhs, n_cells * sizeof(cs_real_33_t)); + + unsigned int blocksize = 256; + + const cs_lnum_2_t *restrict i_face_cells + = (const cs_lnum_2_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); + const cs_lnum_t *restrict b_face_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_face_cells); + const cs_lnum_t *restrict b_cells + = (cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_cells); + const cs_lnum_t *restrict cell_cells_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells_idx); + const cs_lnum_t *restrict cell_cells_lst + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->cell_cells_lst); + const cs_lnum_t *restrict cell_b_faces_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces_idx); + const cs_lnum_t *restrict cell_b_faces + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces); + const cs_lnum_t *restrict cell_i_faces + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_i_faces); + const short int *restrict cell_i_faces_sgn + = (const short int *restrict)cs_get_device_ptr_const_pf(madj->cell_i_faces_sgn); + const int n_i_groups = m->i_face_numbering->n_groups; + const int n_i_threads = m->i_face_numbering->n_threads; + const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index; + const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index; + + const cs_lnum_t *restrict cell_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells); 
+ const cs_real_3_t *restrict cell_cen + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_cen); + const cs_real_3_t *restrict cell_f_cen + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_cen); + const cs_real_t *restrict weight + = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->weight); + const cs_real_t *restrict b_dist + = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->b_dist); + const cs_real_3_t *restrict b_face_normal + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_face_normal); + const cs_real_3_t *restrict b_face_cog + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_f_face_cog); + + const cs_real_t *restrict cell_f_cen_1d + = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_cen); + const cs_real_3_t *restrict diipb + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->diipb); + + cs_lnum_t stride = 3; + + // printf("n_i_thread:%d\tn_i_groups:%d\tn_cells%d\n", n_i_threads, n_i_groups, n_cells); + + _sync_or_copy_real_h2d(pvar, n_cells_ext, device_id, stream, + &pvar_d, &_pvar_d); + + _sync_or_copy_real_h2d(coefav, n_b_faces, device_id, stream, + &coefa_d, &_coefa_d); + _sync_or_copy_real_h2d(coefbv, n_b_faces, device_id, stream, + &coefb_d, &_coefb_d); + + CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); + + // _init_rhs<<>> + // (n_cells_ext, + // rhs_d); + cudaMemset(rhs_d, 0, n_cells_ext*sizeof(cs_real_33_t)); + + // _init_rhs_v2<<>> + // (n_cells_ext*3*3, + // rhs_d); + + // _init_rhs_v3<<>> + // (n_cells_ext*3, + // rhs_d); + + CS_CUDA_CHECK(cudaEventRecord(init, stream)); + + + // _compute_rhs_lsq_v_i_face_v0<<>> + // (n_i_faces, + // i_face_cells, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); + + // _compute_rhs_lsq_v_i_face_cf<<>> + // (n_i_faces, + // i_face_cells, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); + // _compute_rhs_lsq_v_i_face<<>> + // (n_i_faces, + // i_face_cells, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); + + _compute_rhs_lsq_v_i_face_v2cf<<>> + (n_i_faces, + i_face_cells, + cell_f_cen, + rhs_d, + pvar_d, + weight, + c_weight); + + // _compute_rhs_lsq_v_i_face_v3<<>> + // (n_i_faces*3*3, + // i_face_cells, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); + // assert(cell_cells_idx); + // assert(cell_cells); + // assert(cell_f_cen); + // assert(rhs_d); + // assert(pvar_d); + // assert(weight); + // _compute_rhs_lsq_v_i_face_gather<<>> + // (n_cells, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); + + // _compute_rhs_lsq_v_i_face_gather_v2<<>> + // (n_cells, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); + + // _compute_rhs_lsq_v_i_face_gather_v4<<>> + // (n_cells, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); + + CS_CUDA_CHECK(cudaEventRecord(i_faces, stream)); + + if(halo_type == CS_HALO_EXTENDED && cell_cells_idx != NULL){ + + _compute_rhs_lsq_v_b_neighbor<<>> + (n_cells, + cell_cells_idx, + cell_cells, + cell_f_cen, + rhs_d, + pvar_d); + } + CS_CUDA_CHECK(cudaEventRecord(halo, stream)); + + // _compute_rhs_lsq_v_b_face<<n_b_cells, blocksize), blocksize, 0, stream>>> + // (m->n_b_faces, + // b_face_cells, + // cell_f_cen, + // 
b_face_normal, + // rhs_d, + // pvar_d, + // b_dist, + // coefb_d, + // coefa_d, + // inc); + + // _compute_rhs_lsq_v_b_face_gather_stride_v2<3, cs_real_3_t, cs_real_33_t><<n_b_cells, blocksize), blocksize, 0, stream>>> + // (m->n_b_cells, + // cell_b_faces_idx, + // cell_b_faces, + // b_cells, + // b_face_cog, + // cell_cen, + // rhs_d, + // pvar_d, + // coefb_d, + // coefa_d, + // cocg, + // cocgb, + // inc); + + _compute_rhs_lsq_v_b_face_gather_v3<<n_b_cells, blocksize), blocksize, 0, stream>>> + (m->n_b_cells, + cell_b_faces_idx, + cell_b_faces, + b_cells, + b_face_normal, + rhs_d, + pvar_d, + b_dist, + coefb_d, + coefa_d, + inc); + + // _compute_rhs_lsq_v_b_face_v2<<n_b_cells, blocksize), blocksize, 0, stream>>> + // (m->n_b_faces, + // b_face_cells, + // cell_f_cen, + // b_face_normal, + // rhs_d, + // pvar_d, + // b_dist, + // coefb_d, + // coefa_d, + // inc); + + CS_CUDA_CHECK(cudaEventRecord(b_faces, stream)); + + + // if (rhs_d != NULL) { + // size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + // cs_cuda_copy_d2h(rhs, rhs_d, size); + // } + // else + // cs_sync_d2h(rhs); + + // /* Compute gradient */ + // /*------------------*/ + + // _compute_gradient_lsq_v<<>> + // (n_cells, + // grad_d, + // rhs_d, + // cocg); + + // _compute_gradient_lsq_v_v4<<>> + // (n_cells, + // grad_d, + // rhs_d, + // cocg); + + + // _compute_gradient_lsq_v_v5<<>> + // (n_cells*3*3, + // gradv_d, + // rhs_d, + // cocg); + + _compute_gradient_lsq_v_v6<<>> + (n_cells*3*3, + grad_d, + rhs_d, + cocg); + + CS_CUDA_CHECK(cudaEventRecord(gradient, stream)); + + _compute_gradient_lsq_b_v<<n_b_cells, blocksize), blocksize, 0, stream>>> + (m->n_b_cells, + b_cells, + cell_b_faces_idx, + cell_b_faces, + b_face_normal, + diipb, + pvar_d, + b_dist, + coefb_d, + coefa_d, + grad_d, + rhs_d, + cocgb, + inc); + + CS_CUDA_CHECK(cudaEventRecord(gradient_b, stream)); + + // /* Sync to host */ + if (grad_d != NULL) { + size_t size = n_cells * sizeof(cs_real_t) * 3 * 3; + cs_cuda_copy_d2h(gradv, grad_d, size); + } + else + cs_sync_d2h(gradv); + + CS_CUDA_CHECK(cudaEventRecord(stop, stream)); + CS_CUDA_CHECK(cudaEventSynchronize(stop)); + + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); + + printf("lsq Kernels :"); + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); + printf("Kernels execution time in us: \t"); + printf("Init = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, i_faces)); + printf("I_faces = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, i_faces, halo)); + printf("Halo = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, halo, b_faces)); + printf("B_faces = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces, gradient)); + printf("Gradient = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, gradient, gradient_b)); + printf("Gradient_b = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, gradient_b)); + printf("Total kernel = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); + printf("Total = %f\t", msec*1000.f); + + printf("\n"); + + + if (_pvar_d != NULL) + CS_CUDA_CHECK(cudaFree(_pvar_d)); + if (_coefa_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefa_d)); + if (_coefb_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefb_d)); + + CS_CUDA_CHECK(cudaFree(rhs_d)); + CS_CUDA_CHECK(cudaFree(grad_d)); + +} + 
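/*----------------------------------------------------------------------------
 * Illustrative sketch (not part of the patch): the launch-configuration and
 * cudaEvent timing pattern used by the CUDA wrappers in this file.  The
 * kernel launch parameters are garbled in the hunk above, so the grid-size
 * helper ("_demo_grid_size"), the kernel ("_demo_set_zero") and the wrapper
 * ("_demo_timed_launch") below are hypothetical names introduced only for
 * this example.  The event-pair timing and blocksize = 256 match the code
 * above; the ceil-division grid size is an assumption, since the original
 * launch parameters are not legible here.
 *----------------------------------------------------------------------------*/

static unsigned int
_demo_grid_size(cs_lnum_t n, unsigned int block_size)
{
  return (n % block_size) ? n/block_size + 1 : n/block_size;
}

__global__ static void
_demo_set_zero(cs_lnum_t n, cs_real_t *x)
{
  cs_lnum_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    x[i] = 0.;
}

static void
_demo_timed_launch(cs_lnum_t n, cs_real_t *x_d, cudaStream_t stream)
{
  const unsigned int blocksize = 256;

  cudaEvent_t e_start, e_stop;
  CS_CUDA_CHECK(cudaEventCreate(&e_start));
  CS_CUDA_CHECK(cudaEventCreate(&e_stop));

  CS_CUDA_CHECK(cudaEventRecord(e_start, stream));

  _demo_set_zero<<<_demo_grid_size(n, blocksize), blocksize, 0, stream>>>
    (n, x_d);

  CS_CUDA_CHECK(cudaEventRecord(e_stop, stream));
  CS_CUDA_CHECK(cudaEventSynchronize(e_stop));

  /* cudaEventElapsedTime reports milliseconds; the wrappers above print
     microseconds, hence the factor 1000. */
  float msec = 0.f;
  CS_CUDA_CHECK(cudaEventElapsedTime(&msec, e_start, e_stop));
  printf("demo kernel = %f us\n", msec*1000.f);

  CS_CUDA_CHECK(cudaEventDestroy(e_start));
  CS_CUDA_CHECK(cudaEventDestroy(e_stop));
}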
+/*----------------------------------------------------------------------------*/ +/*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */ + +/*============================================================================= + * Semi-private function definitions + *============================================================================*/ + +/*---------------------------------------------------------------------------- + * Compute cell gradient using least-squares reconstruction for non-orthogonal + * meshes (nswrgp > 1). + * + * Optionally, a volume force generating a hydrostatic pressure component + * may be accounted for. + * + * cocg is computed to account for variable B.C.'s (flux). + * + * parameters: + * m <-- pointer to associated mesh structure + * madj <-- pointer to mesh adjacencies structure + * fvq <-- pointer to associated finite volume quantities + * halo_type <-- halo type (extended or not) + * inc <-- if 0, solve on increment; 1 otherwise + * coefav <-- B.C. coefficients for boundary face normals + * coefbv <-- B.C. coefficients for boundary face normals + * pvar <-- variable + * gradv --> gradient of pvar (du_i/dx_j : gradv[][i][j]) + *----------------------------------------------------------------------------*/ +template +void +cs_lsq_vector_gradient_strided_cuda(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_halo_type_t halo_type, + const int inc, + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *restrict c_weight, + cs_cocg_6_t *restrict cocg, + cs_cocg_6_t *restrict cocgb, + cs_real_t (*restrict gradv)[stride][3], + cs_real_t (*restrict rhs)[stride][3], + cs_lnum_t n_c_iter_max, + cs_real_t c_eps) +{ + const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; + const cs_lnum_t n_b_faces = m->n_b_faces; + const cs_lnum_t n_i_faces = m->n_i_faces; + + + int device_id; + cudaGetDevice(&device_id); + + cudaStream_t stream; + cudaStreamCreate(&stream); + + cudaEvent_t start, mem_h2d, init, i_faces, halo, b_faces, gradient, stop; + float msec = 0.0f, msecTotal = 0.0f; + CS_CUDA_CHECK(cudaEventCreate(&start)); + CS_CUDA_CHECK(cudaEventCreate(&mem_h2d)); + CS_CUDA_CHECK(cudaEventCreate(&init)); + CS_CUDA_CHECK(cudaEventCreate(&i_faces)); + CS_CUDA_CHECK(cudaEventCreate(&halo)); + CS_CUDA_CHECK(cudaEventCreate(&b_faces)); + CS_CUDA_CHECK(cudaEventCreate(&gradient)); + CS_CUDA_CHECK(cudaEventCreate(&stop)); + + // Record the start event + CS_CUDA_CHECK(cudaEventRecord(start, stream)); + + decltype(rhs) rhs_d; + CS_CUDA_CHECK(cudaMalloc(&rhs_d, n_cells_ext * sizeof(cs_real_t)*stride*3)); + + + decltype(gradv) grad_d = NULL; + CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells * sizeof(cs_real_t)*stride*3)); + + void *_pvar_d = NULL, *_coefa_d = NULL, *_coefb_d = NULL, + *_cell_cells_idx_d = NULL; + decltype(pvar) pvar_d = NULL, coefa_d = NULL; + decltype(coefbv) coefb_d = NULL; + const cs_lnum_t *cell_cells_idx_d = NULL; + + unsigned int blocksize = 256; + + const cs_lnum_2_t *restrict i_face_cells + = (const cs_lnum_2_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); + const cs_lnum_t *restrict b_face_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_face_cells); + const cs_lnum_t *restrict b_cells + = (cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_cells); + const cs_lnum_t *restrict cell_cells_idx + = (const cs_lnum_t 
*restrict)cs_get_device_ptr_const_pf(madj->cell_cells_idx); + const cs_lnum_t *restrict cell_cells_lst + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->cell_cells_lst); + const cs_lnum_t *restrict cell_b_faces_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces_idx); + const cs_lnum_t *restrict cell_b_faces + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces); + const cs_lnum_t *restrict cell_i_faces + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_i_faces); + const short int *restrict cell_i_faces_sgn + = (const short int *restrict)cs_get_device_ptr_const_pf(madj->cell_i_faces_sgn); + const int n_i_groups = m->i_face_numbering->n_groups; + const int n_i_threads = m->i_face_numbering->n_threads; + const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index; + const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index; + + const cs_lnum_t *restrict cell_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells); + const cs_real_3_t *restrict cell_cen + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_cen); + const cs_real_3_t *restrict cell_f_cen + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_cen); + const cs_real_t *restrict weight + = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->weight); + const cs_real_t *restrict b_dist + = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->b_dist); + const cs_real_3_t *restrict b_face_normal + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_face_normal); + const cs_real_3_t *restrict b_face_cog + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_f_face_cog); + const cs_real_3_t *restrict diipb + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->diipb); + + _sync_or_copy_real_h2d(pvar, n_cells_ext, device_id, stream, + &pvar_d, &_pvar_d); + + _sync_or_copy_real_h2d(coefav, n_b_faces, device_id, stream, + &coefa_d, &_coefa_d); + _sync_or_copy_real_h2d(coefbv, n_b_faces, device_id, stream, + &coefb_d, &_coefb_d); + + cs_cuda_copy_h2d(grad_d, gradv, sizeof(cs_real_t) * n_cells * stride * 3); + + CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); + + CS_CUDA_CHECK(cudaEventRecord(init, stream)); + + CS_CUDA_CHECK(cudaEventRecord(i_faces, stream)); + + CS_CUDA_CHECK(cudaEventRecord(halo, stream)); + + // assert(b_cells); + // assert(cell_b_faces_idx); + // assert(cell_b_faces); + // assert(b_face_cog); + // assert(cell_cen); + // assert(diipb); + // assert(grad_d); + // assert(coefb_d); + // assert(cocg); + + // for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + // for (cs_lnum_t i = 0; i < stride; i++) { + // for (int j = 0; j < 3; ++j) { + // // if(fabs(gradv[c_id][i][j]) != 0.0) + // // printf("grad = %f\t", gradv[c_id][i][j]); + // } + // } + // } + + _compute_gradient_lsq_b_strided_v<<n_b_cells, blocksize), blocksize, 0, stream>>> + (m->n_b_cells, + b_cells, + cell_b_faces_idx, + cell_b_faces, + b_face_cog, + cell_cen, + diipb, + grad_d, + coefb_d, + cocg, + n_c_iter_max, + c_eps); + + CS_CUDA_CHECK(cudaEventRecord(b_faces, stream)); + + CS_CUDA_CHECK(cudaEventRecord(gradient, stream)); + + // /* Sync to host */ + if (grad_d != NULL) { + size_t size = n_cells * sizeof(cs_real_t) * stride * 3; + cs_cuda_copy_d2h(gradv, grad_d, size); + } + else + cs_sync_d2h(gradv); + + CS_CUDA_CHECK(cudaEventRecord(stop, stream)); + CS_CUDA_CHECK(cudaEventSynchronize(stop)); + + cudaStreamSynchronize(stream); + 
cudaStreamDestroy(stream); + + printf("lsq Kernels :"); + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); + // printf("Kernels execution time in us: \t"); + // printf("Init = %f\t", msec*1000.f); + + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, i_faces)); + // printf("I_faces = %f\t", msec*1000.f); + + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, i_faces, halo)); + // printf("Halo = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, halo, b_faces)); + printf("B_faces = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces, gradient)); + printf("Gradient = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, gradient)); + printf("Total kernel = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); + printf("Total = %f\t", msec*1000.f); + + printf("\n"); + + + if (_pvar_d != NULL) + CS_CUDA_CHECK(cudaFree(_pvar_d)); + if (_coefa_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefa_d)); + if (_coefb_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefb_d)); + + CS_CUDA_CHECK(cudaFree(rhs_d)); + CS_CUDA_CHECK(cudaFree(grad_d)); + +} + +INSTANTIATE_LSQ(cs_lsq_vector_gradient_strided_cuda, 1); +INSTANTIATE_LSQ(cs_lsq_vector_gradient_strided_cuda, 3); +INSTANTIATE_LSQ(cs_lsq_vector_gradient_strided_cuda, 6); + + + + +/*---------------------------------------------------------------------------- + * Reconstruct the gradient of a vector using a given gradient of + * this vector (typically lsq). + * + * parameters: + * m <-- pointer to associated mesh structure + * fvq <-- pointer to associated finite volume quantities + * cpl <-- structure associated with internal coupling, or NULL + * inc <-- if 0, solve on increment; 1 otherwise + * coefav <-- B.C. coefficients for boundary face normals + * coefbv <-- B.C. 
coefficients for boundary face normals + * pvar <-- variable + * c_weight <-- weighted gradient coefficient variable + * r_grad --> gradient used for reconstruction + * grad --> gradient of pvar (du_i/dx_j : grad[][i][j]) + *----------------------------------------------------------------------------*/ +template +void +cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + cs_halo_type_t halo_type, + int inc, + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *restrict c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + bool test_bool, + bool perf) +{ + const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_b_cells = m->n_b_cells; + const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; + const cs_lnum_t n_b_faces = m->n_b_faces; + const cs_lnum_t n_i_faces = m->n_i_faces; + + int device_id; + cudaGetDevice(&device_id); + + + cudaStream_t stream; + cudaStreamCreate(&stream); + + cudaEvent_t start, mem_h2d, init, i_faces, b_faces_1, b_faces_2, b_faces_3, stop; + float msec = 0.0f, msec_tot; + CS_CUDA_CHECK(cudaEventCreate(&start)); + CS_CUDA_CHECK(cudaEventCreate(&mem_h2d)); + CS_CUDA_CHECK(cudaEventCreate(&init)); + CS_CUDA_CHECK(cudaEventCreate(&i_faces)); + CS_CUDA_CHECK(cudaEventCreate(&b_faces_1)); + CS_CUDA_CHECK(cudaEventCreate(&b_faces_2)); + CS_CUDA_CHECK(cudaEventCreate(&b_faces_3)); + CS_CUDA_CHECK(cudaEventCreate(&stop)); + + + // Record the start event + CS_CUDA_CHECK(cudaEventRecord(start, stream)); + + decltype(grad) grad_d; + CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells_ext * sizeof(cs_real_t)*stride*3)); + + void *_pvar_d = NULL, *_coefa_d = NULL, *_coefb_d = NULL, + *_cell_cells_idx_d = NULL, *_r_grad_d = NULL; + decltype(pvar) pvar_d = NULL, coefa_d = NULL; + decltype(coefbv) coefb_d = NULL; + decltype(r_grad) r_grad_d = NULL; + const cs_lnum_t *cell_cells_idx_d = NULL; + + + unsigned int blocksize = 256; + + const cs_lnum_2_t *restrict i_face_cells + = (const cs_lnum_2_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); + const cs_lnum_t *restrict b_face_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_face_cells); + const cs_lnum_t *restrict cell_b_faces_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces_idx); + const int n_i_groups + = m->i_face_numbering->n_groups; + const int n_i_threads + = m->i_face_numbering->n_threads; + cs_lnum_t *restrict i_group_index; + CS_CUDA_CHECK(cudaMalloc(&i_group_index, sizeof(int)*n_i_groups * n_i_threads * 2)); + cs_cuda_copy_h2d(i_group_index, (void *)m->i_face_numbering->group_index, sizeof(int)*n_i_groups * n_i_threads * 2); + + const int n_b_groups + = m->b_face_numbering->n_groups; + const int n_b_threads + = m->b_face_numbering->n_threads; + cs_lnum_t *restrict b_group_index; + CS_CUDA_CHECK(cudaMalloc(&b_group_index, sizeof(int)*n_i_groups * n_i_threads * 2)); + cs_cuda_copy_h2d(b_group_index, (void *)m->b_face_numbering->group_index, sizeof(int)*n_b_groups * n_b_threads * 2); + const cs_lnum_t *restrict cell_cells_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells_idx); + const cs_lnum_t *restrict cell_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells); + // if (madj->cell_i_faces == NULL) { + cs_mesh_adjacencies_update_cell_i_faces(); + // } + assert(madj->cell_i_faces); + const 
cs_lnum_t n_cells_i_face = (madj->cell_cells_idx[n_cells]); + cs_lnum_t *restrict cell_i_faces; + CS_CUDA_CHECK(cudaMalloc(&cell_i_faces, sizeof(cs_lnum_t)*n_cells_i_face)); + cs_cuda_copy_h2d(cell_i_faces, madj->cell_i_faces, sizeof(cs_lnum_t)*n_cells_i_face); + assert(cell_i_faces); + + short int *restrict cell_i_faces_sgn; + CS_CUDA_CHECK(cudaMalloc(&cell_i_faces_sgn, sizeof(short int)*n_cells_i_face)); + cs_cuda_copy_h2d(cell_i_faces_sgn, madj->cell_i_faces_sgn, sizeof(short int)*n_cells_i_face); + + const cs_lnum_t *restrict b_cells + = (cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_cells); + const cs_lnum_t *restrict cell_b_faces + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces); + + cs_real_t *restrict cell_f_vol; + CS_CUDA_CHECK(cudaMalloc(&cell_f_vol, n_cells * sizeof(cs_real_t))); + cs_cuda_copy_h2d(cell_f_vol, (void *)fvq->cell_f_vol, sizeof(cs_real_t)*n_cells); + if (cs_glob_porous_model == 1 || cs_glob_porous_model == 2) + cell_f_vol = fvq->cell_vol; + const cs_real_3_t *restrict cell_f_cen + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_cen); + const cs_real_t *restrict weight + = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->weight); + cs_real_3_t *restrict i_f_face_normal; + CS_CUDA_CHECK(cudaMalloc(&i_f_face_normal, sizeof(cs_real_3_t)*n_i_faces)); + cs_cuda_copy_h2d(i_f_face_normal, (void *)fvq->i_f_face_normal, sizeof(cs_real_3_t)*n_i_faces); + + const cs_real_3_t *restrict b_f_face_normal + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_f_face_normal); + cs_real_3_t *restrict dofij; + CS_CUDA_CHECK(cudaMalloc(&dofij, sizeof(cs_real_3_t)*n_i_faces)); + cs_cuda_copy_h2d(dofij, (void *)fvq->dofij, sizeof(cs_real_3_t)*n_i_faces); + const cs_real_3_t *restrict diipb + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->diipb); + cs_real_33_t *restrict corr_grad_lin; + CS_CUDA_CHECK(cudaMalloc(&corr_grad_lin, n_cells * sizeof(cs_real_33_t))); + cs_cuda_copy_h2d(corr_grad_lin, (void *)fvq->corr_grad_lin, sizeof(cs_real_33_t)*n_cells); + const cs_lnum_t has_dc + = fvq->has_disable_flag; + int *restrict c_disable_flag; + CS_CUDA_CHECK(cudaMalloc(&c_disable_flag, n_cells * sizeof(int))); + cs_cuda_copy_h2d(c_disable_flag, (void *)fvq->c_disable_flag, sizeof(int)*n_cells); + + + _sync_or_copy_real_h2d(pvar, n_cells_ext, device_id, stream, + &pvar_d, &_pvar_d); + + _sync_or_copy_real_h2d(r_grad, n_cells_ext, device_id, stream, + &r_grad_d, &_r_grad_d); + + _sync_or_copy_real_h2d(coefav, n_b_faces, device_id, stream, + &coefa_d, &_coefa_d); + _sync_or_copy_real_h2d(coefbv, n_b_faces, device_id, stream, + &coefb_d, &_coefb_d); + + + // ----------------------------Begin of Kernels part 1------------------------------------------- + + CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); + + /* Initialization */ + + cudaMemset(grad_d, 0, n_cells * sizeof(cs_real_t)*stride*3); + + CS_CUDA_CHECK(cudaEventRecord(init, stream)); + + + /* Interior faces contribution */ + + /*************************************Kernels Scatter**************************************************/ + _compute_reconstruct_v_i_face<<>> + (n_i_faces, + i_face_cells, + pvar_d, + weight, + c_weight, + r_grad_d, + grad_d, + dofij, + i_f_face_normal); + + // _compute_reconstruct_v_i_face_v2<<>> + // (n_i_faces * 3, + // i_face_cells, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal); + + /*************************************Kernels Scatter conflict 
free**************************************/ + // _compute_reconstruct_v_i_face_cf<<>> + // (n_i_faces, + // i_face_cells, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal); + + // _compute_reconstruct_v_i_face_v2_cf<<>> + // (n_i_faces * 3, + // i_face_cells, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal); + + /*************************************Kernels Gather**************************************************/ + // _compute_reconstruct_v_i_face_gather<<>> + // ( n_cells, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn); + + + // _compute_reconstruct_v_i_face_gather_v2<<>> + // ( n_cells * 3 * 3, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn); + + + + /*************************************Kernels Gather registers memory************************************/ + // _compute_reconstruct_v_i_face_gather_v3<<>> + // ( n_cells, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn); + + + // _compute_reconstruct_v_i_face_gather_v4<<>> + // ( n_cells * 3 * 3, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn); + + + + + /*************************************Kernels Gather shared memory***************************************/ + // _compute_reconstruct_v_i_face_gather_v5<<>> + // ( n_cells, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn); + + CS_CUDA_CHECK(cudaEventRecord(i_faces, stream)); + + // ----------------------------End of Kernels part 1------------------------------------------- + + // if (grad_d != NULL) { + // size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + // cs_cuda_copy_d2h(grad, grad_d, size); + // } + // else + // cs_sync_d2h(grad); + + // size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + // cs_cuda_copy_d2h(r_grad, r_grad_d, size); + + + /* Contribution from coupled faces */ + // if (cpl != NULL) { + // cs_internal_coupling_initialize_vector_gradient(cpl, c_weight, pvar, grad); + // cs_internal_coupling_reconstruct_vector_gradient(cpl, r_grad, grad); + // } + + // cs_cuda_copy_h2d(grad_d, grad, n_cells_ext * sizeof(cs_real_33_t)); + + CS_CUDA_CHECK(cudaEventRecord(b_faces_1, stream)); + + // ----------------------------Begin of Kernels part 2------------------------------------------- + + + /*************************************Kernels Scatter**************************************************/ + _compute_reconstruct_v_b_face<<>> + ( n_b_faces, + coefb_d, + coefa_d, + pvar_d, + inc, + diipb, + r_grad_d, + grad_d, + b_f_face_normal, + b_face_cells); + + + // _compute_reconstruct_v_b_face_v2<<>> + // ( n_b_faces * 3, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_face_cells); + + /*************************************Kernels Scatter conflict free************************************/ + // _compute_reconstruct_v_b_face_cf<<>> + // ( 
n_b_faces, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_face_cells); + + // _compute_reconstruct_v_b_face_v2_cf<<>> + // ( n_b_faces * 3, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_face_cells); + + /*************************************Kernels Gather**************************************************/ + // _compute_reconstruct_v_b_face_gather<<>> + // ( n_b_cells, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_cells, + // cell_b_faces, + // cell_b_faces_idx); + + + // _compute_reconstruct_v_b_face_gather_v2<<>> + // ( n_b_cells * 3, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_cells, + // cell_b_faces, + // cell_b_faces_idx); + + /*************************************Kernels Gather registers memory***************************************/ + // _compute_reconstruct_v_b_face_gather_v3<<>> + // ( n_b_cells, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_cells, + // cell_b_faces, + // cell_b_faces_idx); + + + // _compute_reconstruct_v_b_face_gather_v4<<>> + // ( n_b_cells * 3, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_cells, + // cell_b_faces, + // cell_b_faces_idx); + + + + /*************************************Kernels Gather shared memory***************************************/ + // _compute_reconstruct_v_b_face_gather_v5<<>> + // ( n_b_cells, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_cells, + // cell_b_faces, + // cell_b_faces_idx); + + + CS_CUDA_CHECK(cudaEventRecord(b_faces_2, stream)); + + // _compute_reconstruct_correction<<>> + // ( n_cells, + // has_dc, + // c_disable_flag, + // cell_f_vol, + // grad_d, + // corr_grad_lin, + // test_bool + // ); + + _compute_reconstruct_correction_v2<<>> + ( n_cells * 3, + has_dc, + c_disable_flag, + cell_f_vol, + grad_d, + corr_grad_lin, + test_bool + ); + CS_CUDA_CHECK(cudaEventRecord(b_faces_3, stream)); + + // ----------------------------End of Kernels part 2------------------------------------------- + + /* Sync to host */ + if (grad_d != NULL) { + size_t size = n_cells_ext * sizeof(cs_real_t) * stride * 3; + cs_cuda_copy_d2h(grad, grad_d, size); + } + else + cs_sync_d2h(grad); + + + CS_CUDA_CHECK(cudaEventRecord(stop, stream)); + CS_CUDA_CHECK(cudaEventSynchronize(stop)); + + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); + + if(perf){ + printf("reconstruct Kernels times:\t"); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); + printf("Kernels execution time in us: \t"); + printf("Init = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, i_faces)); + printf("I_faces = %f\t", msec*1000.f); + + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, i_faces, b_faces_1)); + // printf("CPU part = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_1, b_faces_2)); + printf("B_faces = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_2, b_faces_3)); + printf("Correction = %f\t", msec*1000.f); + + printf("\n"); + + msec_tot = 0.0f; + msec = 0.0f; + 
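/* All of the timings printed in this block follow the same CUDA event
   pattern: events are recorded on `stream` around each kernel group,
   cudaEventElapsedTime() returns the gap in milliseconds, and the printf()s
   convert it to microseconds with msec * 1000.f. A minimal self-contained
   sketch of that pattern (illustration only; my_kernel, ev_before, ev_after,
   n and data are placeholder names, not part of this patch):

     cudaEvent_t ev_before, ev_after;
     CS_CUDA_CHECK(cudaEventCreate(&ev_before));
     CS_CUDA_CHECK(cudaEventCreate(&ev_after));

     CS_CUDA_CHECK(cudaEventRecord(ev_before, stream));
     my_kernel<<<get_gridsize(n, blocksize), blocksize, 0, stream>>>(n, data);
     CS_CUDA_CHECK(cudaEventRecord(ev_after, stream));

     CS_CUDA_CHECK(cudaEventSynchronize(ev_after));
     float ms = 0.f;
     CS_CUDA_CHECK(cudaEventElapsedTime(&ms, ev_before, ev_after));
     printf("my_kernel = %f us\n", ms * 1000.f);

     CS_CUDA_CHECK(cudaEventDestroy(ev_before));
     CS_CUDA_CHECK(cudaEventDestroy(ev_after));

   In the code shown here the events created at the top of the function are
   not destroyed before returning; a matching cudaEventDestroy() per event
   would avoid leaking them on repeated calls. */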
CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, i_faces)); + printf("reconstruct Total kernel part 1= %f\t", msec*1000.f); + msec_tot = msec; + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_1, b_faces_3)); + printf("Total kernel part 2= %f\t", msec*1000.f); + msec_tot += msec; + + printf("Total kernel 1 and 2= %f\t", msec_tot*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); + printf("Total = %f\t", msec*1000.f); + + printf("\n"); + } + + if (_pvar_d != NULL) + CS_CUDA_CHECK(cudaFree(_pvar_d)); + if (_coefa_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefa_d)); + if (_coefb_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefb_d)); + if (_r_grad_d != NULL) + CS_CUDA_CHECK(cudaFree(_r_grad_d)); + + CS_CUDA_CHECK(cudaFree(cell_i_faces)); + CS_CUDA_CHECK(cudaFree(cell_i_faces_sgn)); + + CS_CUDA_CHECK(cudaFree(i_group_index)); + CS_CUDA_CHECK(cudaFree(b_group_index)); + CS_CUDA_CHECK(cudaFree(cell_f_vol)); + CS_CUDA_CHECK(cudaFree(i_f_face_normal)); + CS_CUDA_CHECK(cudaFree(dofij)); + CS_CUDA_CHECK(cudaFree(corr_grad_lin)); + CS_CUDA_CHECK(cudaFree(c_disable_flag)); + CS_CUDA_CHECK(cudaFree(grad_d)); +} + + +/*---------------------------------------------------------------------------- + * _gradient_vector the gradient of a vector using a given gradient of + * this vector (typically lsq). + * + * parameters: + * m <-- pointer to associated mesh structure + * fvq <-- pointer to associated finite volume quantities + * cpl <-- structure associated with internal coupling, or NULL + * inc <-- if 0, solve on increment; 1 otherwise + * coefav <-- B.C. coefficients for boundary face normals + * coefbv <-- B.C. coefficients for boundary face normals + * pvar <-- variable + * c_weight <-- weighted gradient coefficient variable + * r_grad --> gradient used for reconstruction + * grad --> gradient of pvar (du_i/dx_j : grad[][i][j]) + *----------------------------------------------------------------------------*/ +extern "C" void +_gradient_vector_cuda(const cs_mesh_t *mesh, + cs_real_3_t *_bc_coeff_a, + cs_real_33_t *_bc_coeff_b, + bool a_null, + bool b_null, + bool perf) +{ + const cs_lnum_t n_b_faces = mesh->n_b_faces; + + int device_id; + cudaGetDevice(&device_id); + + cudaStream_t stream; + cudaStreamCreate(&stream); + + cudaEvent_t start, mem_h2d, init1, init2, stop; + float msec = 0.0f; + CS_CUDA_CHECK(cudaEventCreate(&start)); + CS_CUDA_CHECK(cudaEventCreate(&mem_h2d)); + CS_CUDA_CHECK(cudaEventCreate(&init1)); + CS_CUDA_CHECK(cudaEventCreate(&init2)); + CS_CUDA_CHECK(cudaEventCreate(&stop)); + + + // Record the start event + CS_CUDA_CHECK(cudaEventRecord(start, stream)); + + unsigned int blocksize = 256; + + cs_real_3_t *_bc_coeff_a_d; + CS_CUDA_CHECK(cudaMalloc(&_bc_coeff_a_d, n_b_faces * sizeof(cs_real_3_t))); + cs_real_33_t *_bc_coeff_b_d; + CS_CUDA_CHECK(cudaMalloc(&_bc_coeff_b_d, n_b_faces * sizeof(cs_real_33_t))); + + + /* Initialization */ + + CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); + + if(a_null){ + cudaMemset(_bc_coeff_a_d, 0, n_b_faces * sizeof(cs_real_3_t)); + } + + CS_CUDA_CHECK(cudaEventRecord(init1, stream)); + + if(b_null){ + cudaMemset(_bc_coeff_b_d, 0, n_b_faces * sizeof(cs_real_33_t)); + _set_one_to_coeff_b<<< get_gridsize(n_b_faces * 3, blocksize), blocksize, 0, stream>>> + (n_b_faces * 3, _bc_coeff_b_d); + } + + CS_CUDA_CHECK(cudaEventRecord(init2, stream)); + + + /* Sync to host */ + if (_bc_coeff_a_d != NULL) { + size_t size = n_b_faces * sizeof(cs_real_t) * 3; + cs_cuda_copy_d2h(_bc_coeff_a, _bc_coeff_a_d, size); + } + else + 
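/* When b_null is set, the default implicit BC coefficient is built as the
   identity: the cudaMemset() above zeroes _bc_coeff_b_d and
   _set_one_to_coeff_b() (defined elsewhere in this patch) is launched over
   n_b_faces * 3 threads, so each thread presumably writes one diagonal
   entry. A sketch of what such a kernel could look like, as an assumption
   rather than the actual implementation:

     __global__ static void
     _set_one_to_coeff_b(cs_lnum_t n_b_faces_x3,
                         cs_real_33_t *restrict coefb)
     {
       cs_lnum_t t_id = blockIdx.x * blockDim.x + threadIdx.x;
       if (t_id >= n_b_faces_x3)
         return;

       cs_lnum_t f_id = t_id / 3;   // boundary face
       cs_lnum_t i    = t_id % 3;   // diagonal position

       coefb[f_id][i][i] = 1.;      // off-diagonal terms already zeroed
     }
*/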
cs_sync_d2h(_bc_coeff_a); + /* Sync to host */ + if (_bc_coeff_b_d != NULL) { + size_t size = n_b_faces * sizeof(cs_real_t) * 3 * 3; + cs_cuda_copy_d2h(_bc_coeff_b, _bc_coeff_b_d, size); + } + else + cs_sync_d2h(_bc_coeff_b); + + + CS_CUDA_CHECK(cudaEventRecord(stop, stream)); + CS_CUDA_CHECK(cudaEventSynchronize(stop)); + + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); + + if(perf){ + printf("reconstruct Kernels times:\t"); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init1)); + printf("Kernels execution time in us: \t"); + printf("Init1 = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init1, init2)); + printf("Init2 = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); + printf("Total = %f\t", msec*1000.f); + + printf("\n"); + } + CS_CUDA_CHECK(cudaFree(_bc_coeff_a_d)); + CS_CUDA_CHECK(cudaFree(_bc_coeff_b_d)); +} + +INSTANTIATE_RECONSTRUCT(cs_reconstruct_vector_gradient_cuda, 1); +INSTANTIATE_RECONSTRUCT(cs_reconstruct_vector_gradient_cuda, 3); +INSTANTIATE_RECONSTRUCT(cs_reconstruct_vector_gradient_cuda, 6); diff --git a/src/alge/cs_gradient_lsq_vector.cuh b/src/alge/cs_gradient_lsq_vector.cuh new file mode 100644 index 0000000000..0ecacd1d3d --- /dev/null +++ b/src/alge/cs_gradient_lsq_vector.cuh @@ -0,0 +1,595 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+*/ + +/*----------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------- + * Initialize RHS with null values + *----------------------------------------------------------------------------*/ + +__global__ static void +_init_rhs(cs_lnum_t n_cells_ext, + cs_real_33_t *restrict rhs) +{ + cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id < n_cells_ext) { + for (cs_lnum_t i = 0; i < 3; i++) + for (cs_lnum_t j = 0; j < 3; j++) + rhs[c_id][i][j] = 0.0; + } +} + +__global__ static void +_compute_rhs_lsq_v_i_face_v0(cs_lnum_t n_i_faces, + const cs_lnum_2_t *i_face_cells, + const cs_real_3_t *cell_f_cen, + cs_real_33_t *rhs, + const cs_real_3_t *pvar, + const cs_real_t *weight, + const cs_real_t *c_weight) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight != NULL){ + _pond = weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + atomicAdd(&rhs[c_id1][i][j], c_weight[c_id2] * _denom * fctb[j]); + atomicAdd(&rhs[c_id2][i][j], c_weight[c_id1] * _denom * fctb[j]); + } + } + } + else{ + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + atomicAdd(&rhs[c_id1][i][j], fctb[j]); + atomicAdd(&rhs[c_id2][i][j], fctb[j]); + } + } + } +} + +__global__ static void +_compute_rhs_lsq_v_i_face(cs_lnum_t n_i_faces, + const cs_lnum_2_t *restrict i_face_cells, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. 
- _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + atomicAdd(&rhs[c_id1][i][j], _weight2 * fctb[j]); + atomicAdd(&rhs[c_id2][i][j], _weight1 * fctb[j]); + } + } +} + +__global__ static void +_compute_rhs_lsq_v_i_face_cf(cs_lnum_t size, + const cs_lnum_2_t *restrict i_face_cells, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= size){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + using Cell = AtomicCell; + Cell _rhs1, _rhs2; + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + _rhs1[i][j].get() = _weight2 * fctb[j]; + _rhs2[i][j].get() = _weight1 * fctb[j]; + //atomicAdd(&rhs[c_id1][i][j], _weight2 * fctb[j]); + //atomicAdd(&rhs[c_id2][i][j], _weight1 * fctb[j]); + } + } + +#if 1 + Cell::ref(rhs[c_id1]).conflict_free_add(-1u, _rhs1); + Cell::ref(rhs[c_id2]).conflict_free_add(-1u, _rhs2); +#else + Cell::ref(rhs[c_id1]).atomic_add(_rhs1); + Cell::ref(rhs[c_id2]).atomic_add(_rhs2); +#endif +} + +__global__ static void +_compute_rhs_lsq_v_b_neighbor(cs_lnum_t n_cells, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + cs_real_t dc[3], ddc, pfac; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + cs_lnum_t c_id2 = cell_cells[index]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0] * dc[0] + dc[1] * dc[1] + dc[2] * dc[2]); + + for (cs_lnum_t i = 0; i < 3; i++) { + + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + + for (cs_lnum_t j = 0; j < 3; j++) { + rhs[c_id1][i][j] += dc[j] * pfac; + } + } + } + +} + +__global__ static void +_compute_rhs_lsq_v_b_face(cs_lnum_t n_b_faces, + const cs_lnum_t *restrict b_face_cells, + const cs_real_3_t *restrict cell_f_cen, + const cs_real_3_t *restrict b_face_normal, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict b_dist, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const int inc) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_b_faces){ + return; + } + + 
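/* The RHS assembly kernels come in two flavours. The "scatter" kernels
   above (_compute_rhs_lsq_v_i_face*) use one thread per interior face, so
   both adjacent cells must be updated with atomicAdd() (or with the
   AtomicCell conflict-free variant); the "gather" kernels in
   cs_gradient_lsq_vector_gather*.cuh use one thread per cell and loop over
   that cell's neighbours through cell_cells_idx/cell_cells, which removes
   the atomics at the cost of redundant per-face work. The host side picks
   one of them; a launch of each variant would look roughly like the
   following (sketch based on the signatures in these headers; the actual
   call sites live elsewhere in this patch and are not shown in this hunk):

     // scatter: one thread per interior face, atomic accumulation
     _compute_rhs_lsq_v_i_face<<<get_gridsize(n_i_faces, blocksize),
                                 blocksize, 0, stream>>>
       (n_i_faces, i_face_cells, cell_f_cen, rhs_d, pvar_d,
        weight, c_weight);

     // gather: one thread per cell, loops over its interior neighbours
     _compute_rhs_lsq_v_i_face_gather<<<get_gridsize(n_cells, blocksize),
                                        blocksize, 0, stream>>>
       (n_cells, cell_cells_idx, cell_cells, cell_i_faces,
        cell_i_faces_sgn, cell_f_cen, rhs_d, pvar_d, weight, c_weight);
*/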
cs_lnum_t c_id1; + cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; + + c_id1 = b_face_cells[f_id]; + + cs_math_3_normalize_cuda(b_face_normal[f_id], n_d_dist); + + d_b_dist = 1. / b_dist[f_id]; + + /* Normal divided by b_dist */ + n_d_dist[0] *= d_b_dist; + n_d_dist[1] *= d_b_dist; + n_d_dist[2] *= d_b_dist; + + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = coefav[f_id][i]*inc + + ( coefbv[f_id][0][i] * pvar[c_id1][0] + + coefbv[f_id][1][i] * pvar[c_id1][1] + + coefbv[f_id][2][i] * pvar[c_id1][2] + - pvar[c_id1][i]); + + atomicAdd(&rhs[c_id1][i][0], n_d_dist[0] * pfac); + atomicAdd(&rhs[c_id1][i][1], n_d_dist[1] * pfac); + atomicAdd(&rhs[c_id1][i][2], n_d_dist[2] * pfac); + } +} + +__global__ static void +_compute_gradient_lsq_v(cs_lnum_t n_cells, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs, + cs_cocg_6_t *restrict cocg) +{ + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= n_cells) + return; + + for(cs_lnum_t i = 0; i < 3; i++){ + gradv[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] + + rhs[c_id][i][1] * cocg[c_id][3] + + rhs[c_id][i][2] * cocg[c_id][5]; + + gradv[c_id][i][1] = rhs[c_id][i][0] * cocg[c_id][3] + + rhs[c_id][i][1] * cocg[c_id][1] + + rhs[c_id][i][2] * cocg[c_id][4]; + + gradv[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] + + rhs[c_id][i][1] * cocg[c_id][4] + + rhs[c_id][i][2] * cocg[c_id][2]; + } +} + +__global__ static void +_compute_gradient_lsq_b_v(cs_lnum_t n_b_cells, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_real_3_t *restrict b_face_normal, + const cs_real_3_t *restrict diipb, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict b_dist, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs, + cs_cocg_6_t *restrict cocgb_s, + const int inc) +{ + size_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (c_idx >= n_b_cells) + return; + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id+1]; + + cs_lnum_t f_id; + cs_real_t cocgb[3][3], cocgb_v[45], rhsb_v[9], x[9]; + cs_real_3_t normal; + + cs_lnum_t _33_9_idx[9][2]; + int nn = 0; + for (int ll = 0; ll < 3; ll++) { + for (int mm = 0; mm < 3; mm++) { + _33_9_idx[nn][0] = ll; + _33_9_idx[nn][1] = mm; + nn++; + } + } + + auto _cocg = cocgb_s[c_idx]; + auto _rhs = rhs[c_id]; + + cocgb[0][0] = _cocg[0]; + cocgb[0][1] = _cocg[3]; + cocgb[0][2] = _cocg[5]; + cocgb[1][0] = _cocg[3]; + cocgb[1][1] = _cocg[1]; + cocgb[1][2] = _cocg[4]; + cocgb[2][0] = _cocg[5]; + cocgb[2][1] = _cocg[4]; + cocgb[2][2] = _cocg[2]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + cs_math_3_normalize_cuda(b_face_normal[f_id], normal); + for (cs_lnum_t ii = 0; ii < 3; ii++) { + for (cs_lnum_t jj = 0; jj < 3; jj++) + cocgb[ii][jj] += normal[ii] * normal[jj]; + } + } + + for (int ll = 0; ll < 9; ll++) { + + int ll_9 = ll*(ll+1)/2; + + for (int mm = 0; mm <= ll; mm++) { + cocgb_v[ll_9+mm] = 0.; + + int pp = _33_9_idx[ll][0]; + int qq = _33_9_idx[ll][1]; + + int rr = _33_9_idx[mm][0]; + int ss = _33_9_idx[mm][1]; + + if (pp == rr) + cocgb_v[ll_9+mm] = cocgb[qq][ss]; + + rhsb_v[ll] = _rhs[pp][qq]; + } + } + + cs_real_3_t nb; + cs_real_t a[3], bt[3][3], db, db2; + for (cs_lnum_t i = s_id; i < e_id; i++) { + + f_id = cell_b_faces[i]; + + auto iipbf = diipb[f_id]; + + cs_math_3_normalize_cuda(b_face_normal[f_id], nb); + + 
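/* For reference, the 3x3 least-squares matrix is stored in symmetric packed
   form (cs_cocg_6_t) throughout these kernels; the cocgb unpacking at the
   top of this kernel and the products in _compute_gradient_lsq_v above both
   follow the same layout:

     // c[0]=xx  c[1]=yy  c[2]=zz  c[3]=xy  c[4]=yz  c[5]=xz
     //
     //       | c[0] c[3] c[5] |
     //   A = | c[3] c[1] c[4] |
     //       | c[5] c[4] c[2] |

   _compute_gradient_lsq_v multiplies the RHS rows directly by those packed
   entries, so the cocg array it receives is expected to already hold the
   inverse of the accumulated matrix (inverted beforehand, e.g. by
   _math_6_inv_cramer_sym_in_place_cuda as in the gather/stride kernels, or
   on the host). */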
db = 1./b_dist[f_id]; + db2 = db*db; + + for (int ll = 0; ll < 3; ll++) { + for (int pp = 0; pp < 3; pp++) + bt[ll][pp] = coefbv[f_id][ll][pp]; + } + for (int ll = 0; ll < 3; ll++) { + a[ll] = inc*coefav[f_id][ll]; + bt[ll][ll] -= 1; + } + + for (int ll = 0; ll < 9; ll++) { + + int kk = _33_9_idx[ll][0]; + int qq = _33_9_idx[ll][1]; + + int ll_9 = ll*(ll+1)/2; + for (int pp = 0; pp <= ll; pp++) { + + int rr = _33_9_idx[pp][0]; + int ss = _33_9_idx[pp][1]; + + cs_real_t cocgv = 0.; + for (int mm = 0; mm < 3; mm++) + cocgv += bt[mm][kk]*bt[mm][rr]; + cocgb_v[ll_9+pp] += cocgv*(iipbf[qq]*iipbf[ss])*db2; + + cocgb_v[ll_9+pp] -= ( nb[ss]*bt[rr][kk]*iipbf[qq] + + nb[qq]*bt[kk][rr]*iipbf[ss]) + *db; + } + } + + for (int ll = 0; ll < 9; ll++) { + int pp = _33_9_idx[ll][0]; + int qq = _33_9_idx[ll][1]; + + cs_real_t rhsv = 0.; + for (int rr = 0; rr < 3; rr++) { + rhsv += bt[rr][pp]*diipb[f_id][qq] + *(a[rr]+ bt[rr][0]*pvar[c_id][0] + + bt[rr][1]*pvar[c_id][1] + + bt[rr][2]*pvar[c_id][2]); + } + + rhsb_v[ll] -= rhsv*db2; + } + + } + _fact_crout_pp_cuda<9>(cocgb_v); + + _fw_and_bw_ldtl_pp_cuda<9>(cocgb_v, x, rhsb_v); + + for (int kk = 0; kk < 9; kk++) { + int ii = _33_9_idx[kk][0]; + int jj = _33_9_idx[kk][1]; + gradv[c_id][ii][jj] = x[kk]; + } +} + +template +__global__ static void +_compute_gradient_lsq_b_strided_v(const cs_lnum_t n_b_cells, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_real_3_t *restrict b_face_cog, + const cs_real_3_t *restrict cell_cen, + const cs_real_3_t *restrict diipb, + cs_real_t (*restrict gradv)[stride][3], + const cs_real_t (*restrict coefbv)[stride][stride], + cs_cocg_6_t *restrict cocg, + cs_lnum_t n_c_iter_max, + cs_real_t c_eps) +{ + size_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (c_idx >= n_b_cells) + return; + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id+1]; + + auto c_grad = gradv[c_id]; + auto _cocg = cocg[c_id]; + auto _cell_cen = cell_cen[c_id]; + + cs_real_t grad_0[stride][3], grad_i[stride][3], rhs_c[stride][3], dif[3], grad_c[stride][3], + var_ip_f[stride]; + + cs_real_t ref_norm = 0.0, ddif, c_norm = 0; + cs_lnum_t n_c_it, f_id; + cs_real_t eps_dvg = 1e-2; + cs_real_t cs_math_epzero = 1e-12; + + for(cs_lnum_t i = 0; i < stride; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + grad_0[i][j] = c_grad[i][j]; + grad_i[i][j] = c_grad[i][j]; + } + } + + ref_norm = 0; + for (cs_lnum_t kk = 0; kk < stride; kk++) { + for (cs_lnum_t ll = 0; ll < 3; ll++) + ref_norm += cs_math_fabs_cuda(c_grad[kk][ll]); + } + + c_norm = 0; + + for (n_c_it = 0; n_c_it < n_c_iter_max; n_c_it++) { + + for (cs_lnum_t ll = 0; ll < stride; ll++) { + rhs_c[ll][0] = 0; + rhs_c[ll][1] = 0; + rhs_c[ll][2] = 0; + } + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + for (cs_lnum_t ii = 0; ii < 3; ii++) + dif[ii] = b_face_cog[f_id][ii] - _cell_cen[ii]; + + ddif = 1. 
/ cs_math_3_square_norm_cuda(dif); + + for (cs_lnum_t ll = 0; ll < stride; ll++) { + var_ip_f[ll] = cs_math_3_dot_product_cuda(c_grad[ll], diipb[f_id]); + } + + auto b = coefbv[f_id]; + + for (cs_lnum_t kk = 0; kk < stride; kk++) { + cs_real_t pfac = 0; + for (cs_lnum_t ll = 0; ll < stride; ll++) { + pfac += b[kk][ll] * var_ip_f[ll] * ddif; + } + + for (cs_lnum_t ll = 0; ll < 3; ll++) + rhs_c[kk][ll] += dif[ll] * pfac; + } + + } + + for(cs_lnum_t i = 0; i < stride; i++){ + grad_c[i][0] = rhs_c[i][0] * _cocg[0] + + rhs_c[i][1] * _cocg[3] + + rhs_c[i][2] * _cocg[5]; + + grad_c[i][1] = rhs_c[i][0] * _cocg[3] + + rhs_c[i][1] * _cocg[1] + + rhs_c[i][2] * _cocg[4]; + + grad_c[i][2] = rhs_c[i][0] * _cocg[5] + + rhs_c[i][1] * _cocg[4] + + rhs_c[i][2] * _cocg[2]; + } + + c_norm = 0.0; + for (cs_lnum_t ii = 0; ii < stride; ii++) { + for (cs_lnum_t jj = 0; jj < 3; jj++) { + c_grad[ii][jj] = grad_0[ii][jj] + grad_c[ii][jj]; + c_norm += cs_math_fabs_cuda(c_grad[ii][jj] - grad_i[ii][jj]); + grad_i[ii][jj] = c_grad[ii][jj]; + } + } + + if (c_norm < ref_norm * c_eps || c_norm < cs_math_epzero) + break; + } + + for (cs_lnum_t ii = 0; ii < stride; ii++) { + for (cs_lnum_t jj = 0; jj < 3; jj++) { + gradv[c_id][ii][jj] = c_grad[ii][jj]; + } + } + + if (c_norm > eps_dvg * ref_norm) { + for (cs_lnum_t ii = 0; ii < stride; ii++) { + for (cs_lnum_t jj = 0; jj < 3; jj++) { + gradv[c_id][ii][jj] = grad_0[ii][jj]; + } + } + + n_c_it *= -1; + } +} diff --git a/src/alge/cs_gradient_lsq_vector_gather.cuh b/src/alge/cs_gradient_lsq_vector_gather.cuh new file mode 100644 index 0000000000..586764e259 --- /dev/null +++ b/src/alge/cs_gradient_lsq_vector_gather.cuh @@ -0,0 +1,294 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+*/ + +/*----------------------------------------------------------------------------*/ + +__global__ static void +_compute_rhs_lsq_v_i_face_gather(cs_lnum_t n_cells, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _denom, _pond, pfac; + cs_lnum_t c_id2, f_id; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight != NULL){ + f_id = cell_i_faces[index]; + _pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + rhs[c_id1][i][j] += c_weight[c_id2] * _denom * fctb[j]; + } + } + } + else{ + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + rhs[c_id1][i][j] += fctb[j]; + } + } + } +} +} + +__global__ static void +_compute_rhs_lsq_v_b_face_gather(cs_lnum_t n_b_cells, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict b_cells, + const cs_real_3_t *restrict b_face_normal, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict b_dist, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const int inc) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_idx >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t f_id; + cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + f_id = cell_b_faces[index]; + + cs_math_3_normalize_cuda(b_face_normal[f_id], n_d_dist); + + d_b_dist = 1. 
/ b_dist[f_id]; + + /* Normal divided by b_dist */ + n_d_dist[0] *= d_b_dist; + n_d_dist[1] *= d_b_dist; + n_d_dist[2] *= d_b_dist; + + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = coefav[f_id][i]*inc + + ( coefbv[f_id][0][i] * pvar[c_id][0] + + coefbv[f_id][1][i] * pvar[c_id][1] + + coefbv[f_id][2][i] * pvar[c_id][2] + - pvar[c_id][i]); + + rhs[c_id][i][0] += n_d_dist[0] * pfac; + rhs[c_id][i][1] += n_d_dist[1] * pfac; + rhs[c_id][i][2] += n_d_dist[2] * pfac; + } + } +} + +template +__global__ static void +_compute_rhs_lsq_v_b_face_gather_stride(cs_lnum_t n_b_cells, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict b_cells, + const cs_real_3_t *restrict b_face_cog, + const cs_real_3_t *restrict cell_cen, + cs_real_33_t *restrict rhs, + const val_t *restrict pvar, + const coefb_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + cs_cocg_6_t *restrict cocg, + const cs_cocg_6_t *restrict cocgb, + const int inc) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_idx >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t f_id; + cs_real_t dif[stride], ddif, pfac, norm, var_f[stride]; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + for(cs_lnum_t ll = 0; ll < 6; ll++) + cocg[c_id][ll] = cocgb[c_idx][ll]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + f_id = cell_b_faces[index]; + + for (cs_lnum_t ll = 0; ll < 3; ll++) + dif[ll] = b_face_cog[f_id][ll] - cell_cen[c_id][ll]; + + ddif = 1. / cs_math_3_square_norm_cuda(dif); + + cocg[c_id][0] += dif[0]*dif[0]*ddif; + cocg[c_id][1] += dif[1]*dif[1]*ddif; + cocg[c_id][2] += dif[2]*dif[2]*ddif; + cocg[c_id][3] += dif[0]*dif[1]*ddif; + cocg[c_id][4] += dif[1]*dif[2]*ddif; + cocg[c_id][5] += dif[0]*dif[2]*ddif; + + for (cs_lnum_t kk = 0; kk < stride; kk++) { + var_f[kk] = coefav[f_id][kk]*inc; + for (cs_lnum_t ll = 0; ll < stride; ll++) { + var_f[kk] += coefbv[f_id][ll][kk] * pvar[c_id][ll]; + } + + pfac = (var_f[kk] - pvar[c_id][kk]) * ddif; + + for (cs_lnum_t ll = 0; ll < 3; ll++) + rhs[c_id][kk][ll] += dif[ll] * pfac; + } + } + _math_6_inv_cramer_sym_in_place_cuda(cocg[c_id]); +} + + +template +__global__ static void +_compute_rhs_lsq_v_b_face_gather_stride_v2(cs_lnum_t n_b_cells, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict b_cells, + const cs_real_3_t *restrict b_face_cog, + const cs_real_3_t *restrict cell_cen, + cs_real_33_t *restrict rhs, + const val_t *restrict pvar, + const coefb_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + cs_cocg_6_t *restrict cocg, + const cs_cocg_6_t *restrict cocgb, + const int inc) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t lindex = threadIdx.x; + + if(c_idx >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t f_id; + cs_real_t dif[stride], ddif, pfac, norm, var_f[stride]; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + for(cs_lnum_t ll = 0; ll < 6; ll++) + cocg[c_id][ll] = cocgb[c_idx][ll]; + + __shared__ cs_real_t _rhs[256][3][3]; + + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + _rhs[lindex][i][j] = rhs[c_id][i][j]; + } + } + + auto _pvar = pvar[c_id]; + auto _cocg = cocg[c_id]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + f_id = cell_b_faces[index]; + auto _coefbv = coefbv[f_id]; + auto _coefav = coefav[f_id]; + + 
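/* The __shared__ staging buffer declared at the top of this kernel (and the
   similar buffers in the gather_v3/_v4 kernels of
   cs_gradient_lsq_vector_gather_v3.cuh further down) is dimensioned for a
   fixed block of 256 threads: with cs_real_t as a double-precision real
   (the usual default) that is 256 * 3 * 3 * 8 = 18432 bytes of shared
   memory per block, and the kernels implicitly assume blockDim.x <= 256,
   i.e. the blocksize chosen on the host. A block-size-agnostic alternative
   would be dynamic shared memory; a sketch of the idea only, with names
   invented for the illustration, not what this patch does:

     extern __shared__ cs_real_t _rhs_dyn[];          // blockDim.x * 9 reals
     cs_real_t *my_rhs = _rhs_dyn + threadIdx.x * 9;  // this thread's 3x3

     // accumulate into my_rhs[i*3 + j] instead of _rhs[lindex][i][j],
     // and launch with the byte count as the third launch parameter:
     //   kernel<<<grid, block, block * 9 * sizeof(cs_real_t), stream>>>(...);

   By contrast, _compute_rhs_lsq_v_i_face_gather_v2 in
   cs_gradient_lsq_vector_gather_v2.cuh below declares a single
   __shared__ cs_real_t _rhs[3][3] with no per-thread index, so every thread
   of a block accumulates into the same nine values; that looks like a race,
   which is presumably why the per-thread [256][3][3] layout is used here. */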
+ for (cs_lnum_t ll = 0; ll < 3; ll++) + dif[ll] = b_face_cog[f_id][ll] - cell_cen[c_id][ll]; + + ddif = 1. / cs_math_3_square_norm_cuda(dif); + + _cocg[0] += dif[0]*dif[0]*ddif; + _cocg[1] += dif[1]*dif[1]*ddif; + _cocg[2] += dif[2]*dif[2]*ddif; + _cocg[3] += dif[0]*dif[1]*ddif; + _cocg[4] += dif[1]*dif[2]*ddif; + _cocg[5] += dif[0]*dif[2]*ddif; + + for (cs_lnum_t kk = 0; kk < stride; kk++) { + var_f[kk] = _coefav[kk]*inc; + for (cs_lnum_t ll = 0; ll < stride; ll++) { + var_f[kk] += _coefbv[ll][kk] * _pvar[ll]; + } + + pfac = (var_f[kk] - _pvar[kk]) * ddif; + + for (cs_lnum_t ll = 0; ll < 3; ll++) + _rhs[lindex][kk][ll] += dif[ll] * pfac; + } + } + + cocg[c_id][0] += _cocg[0]; + cocg[c_id][1] += _cocg[1]; + cocg[c_id][2] += _cocg[2]; + cocg[c_id][3] += _cocg[3]; + cocg[c_id][4] += _cocg[4]; + cocg[c_id][5] += _cocg[5]; + + for(cs_lnum_t i = 0; i < stride; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + rhs[c_id][i][j] = _rhs[lindex][i][j]; + } + } + // _math_6_inv_cramer_sym_in_place_cuda(cocg[c_id]); +} diff --git a/src/alge/cs_gradient_lsq_vector_gather_v2.cuh b/src/alge/cs_gradient_lsq_vector_gather_v2.cuh new file mode 100644 index 0000000000..cdb831140c --- /dev/null +++ b/src/alge/cs_gradient_lsq_vector_gather_v2.cuh @@ -0,0 +1,168 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+*/ + +/*----------------------------------------------------------------------------*/ + +__global__ static void +_compute_rhs_lsq_v_i_face_gather_v2(cs_lnum_t n_cells, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _denom, _weight, _pond, pfac; + cs_lnum_t c_id2, f_id; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + __shared__ cs_real_t _rhs[3][3]; + + auto temp_rhs = rhs[c_id1]; + _rhs[0][0]= temp_rhs[0][0]; _rhs[0][1]= temp_rhs[0][1]; _rhs[0][2]= temp_rhs[0][2]; + _rhs[1][0]= temp_rhs[1][0]; _rhs[1][1]= temp_rhs[1][1]; _rhs[1][2]= temp_rhs[1][2]; + _rhs[2][0]= temp_rhs[2][0]; _rhs[2][1]= temp_rhs[2][1]; _rhs[2][2]= temp_rhs[2][2]; + + auto _pvar1 = pvar[c_id1]; + + auto _cell_f_cen1 = cell_f_cen[c_id1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + + auto _cell_f_cen2 = cell_f_cen[c_id2]; + + dc[0] = _cell_f_cen2[0] - _cell_f_cen1[0]; + dc[1] = _cell_f_cen2[1] - _cell_f_cen1[1]; + dc[2] = _cell_f_cen2[2] - _cell_f_cen1[2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight = 1.; + } + else{ + f_id = cell_i_faces[index]; + _pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight = c_weight[c_id2] * _denom; + } + + auto _pvar2 = pvar[c_id2]; + // _pvar2[0]= temp_pvar2[0]; _pvar2[1]= temp_pvar2[1]; _pvar2[2]= temp_pvar2[2]; + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (_pvar2[i] - _pvar1[i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + _rhs[i][j] += _weight * fctb[j]; + } + } + + } + rhs[c_id1][0][0] = _rhs[0][0]; rhs[c_id1][0][1] = _rhs[0][1]; rhs[c_id1][0][2] = _rhs[0][2]; + rhs[c_id1][1][0] = _rhs[1][0]; rhs[c_id1][1][1] = _rhs[1][1]; rhs[c_id1][1][2] = _rhs[1][2]; + rhs[c_id1][2][0] = _rhs[2][0]; rhs[c_id1][2][1] = _rhs[2][1]; rhs[c_id1][2][2] = _rhs[2][2]; +} + +__global__ static void +_compute_rhs_lsq_v_b_face_gather_v2(cs_lnum_t n_b_cells, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict b_cells, + const cs_real_3_t *restrict b_face_normal, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict b_dist, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const int inc) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_idx >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t f_id; + cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + cs_real_t _rhs[3][3]; + + auto temp_rhs = rhs[c_id]; + _rhs[0][0]= temp_rhs[0][0]; _rhs[0][1]= temp_rhs[0][1]; _rhs[0][2]= temp_rhs[0][2]; + _rhs[1][0]= temp_rhs[1][0]; _rhs[1][1]= temp_rhs[1][1]; _rhs[1][2]= temp_rhs[1][2]; + _rhs[2][0]= temp_rhs[2][0]; _rhs[2][1]= temp_rhs[2][1]; _rhs[2][2]= temp_rhs[2][2]; + + auto _pvar1 = pvar[c_id]; + + for(cs_lnum_t index = s_id; index < e_id; 
index++){ + + f_id = cell_b_faces[index]; + + auto _coefav = coefav[f_id]; + auto _coefbv = coefbv[f_id]; + + cs_math_3_normalize_cuda(b_face_normal[f_id], n_d_dist); + + d_b_dist = 1. / b_dist[f_id]; + + /* Normal divided by b_dist */ + n_d_dist[0] *= d_b_dist; + n_d_dist[1] *= d_b_dist; + n_d_dist[2] *= d_b_dist; + + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = _coefav[i]*inc + + ( _coefbv[0][i] * _pvar1[0] + + _coefbv[1][i] * _pvar1[1] + + _coefbv[2][i] * _pvar1[2] + - _pvar1[i]); + + _rhs[i][0] += n_d_dist[0] * pfac; + _rhs[i][1] += n_d_dist[1] * pfac; + _rhs[i][2] += n_d_dist[2] * pfac; + } + + } + rhs[c_id][0][0] = _rhs[0][0]; rhs[c_id][0][1] = _rhs[0][1]; rhs[c_id][0][2] = _rhs[0][2]; + rhs[c_id][1][0] = _rhs[1][0]; rhs[c_id][1][1] = _rhs[1][1]; rhs[c_id][1][2] = _rhs[1][2]; + rhs[c_id][2][0] = _rhs[2][0]; rhs[c_id][2][1] = _rhs[2][1]; rhs[c_id][2][2] = _rhs[2][2]; +} diff --git a/src/alge/cs_gradient_lsq_vector_gather_v3.cuh b/src/alge/cs_gradient_lsq_vector_gather_v3.cuh new file mode 100644 index 0000000000..37ce174c8d --- /dev/null +++ b/src/alge/cs_gradient_lsq_vector_gather_v3.cuh @@ -0,0 +1,264 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+*/ + +/*----------------------------------------------------------------------------*/ + +__global__ static void +_compute_rhs_lsq_v_i_face_gather_v3(cs_lnum_t n_cells, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t lindex = threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _denom, _weight, _pond, pfac; + cs_lnum_t c_id2, f_id; + + // size_t c_id1 = c_id / (3*3); + // size_t i = (c_id / 3) % 3; + // size_t j = c_id % 3; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + __shared__ cs_real_t _rhs[256][3][3]; + + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + _rhs[lindex][i][j] = rhs[c_id1][i][j]; + } + } + // __syncthreads(); + auto _pvar1 = pvar[c_id1]; + + auto _cell_f_cen1 = cell_f_cen[c_id1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + + auto _cell_f_cen2 = cell_f_cen[c_id2]; + + dc[0] = _cell_f_cen2[0] - _cell_f_cen1[0]; + dc[1] = _cell_f_cen2[1] - _cell_f_cen1[1]; + dc[2] = _cell_f_cen2[2] - _cell_f_cen1[2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight = 1.; + } + else{ + f_id = cell_i_faces[index]; + _pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight = c_weight[c_id2] * _denom; + } + + auto _pvar2 = pvar[c_id2]; + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (_pvar2[i] - _pvar1[i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + _rhs[lindex][i][j] += _weight * fctb[j]; + } + } + + } + // __syncthreads(); + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + rhs[c_id1][i][j] = _rhs[lindex][i][j]; + } + } +} + +__global__ static void +_compute_rhs_lsq_v_i_face_gather_v4(cs_lnum_t n_cells, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t lindex = threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _denom, _weight, _pond, pfac; + cs_lnum_t c_id2, f_id; + + // size_t c_id1 = c_id / (3*3); + // size_t i = (c_id / 3) % 3; + // size_t j = c_id % 3; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + __shared__ cs_real_t _rhs[256][3][3]; + + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + _rhs[lindex][i][j] = 0.0; + } + } + // __syncthreads(); + auto _pvar1 = pvar[c_id1]; + + auto _cell_f_cen1 = cell_f_cen[c_id1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + + auto _cell_f_cen2 = cell_f_cen[c_id2]; + + dc[0] = _cell_f_cen2[0] - _cell_f_cen1[0]; + dc[1] = _cell_f_cen2[1] - _cell_f_cen1[1]; + dc[2] = _cell_f_cen2[2] - _cell_f_cen1[2]; + + ddc = 1./(dc[0]*dc[0] + 
dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight = 1.; + } + else{ + f_id = cell_i_faces[index]; + _pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight = c_weight[c_id2] * _denom; + } + + auto _pvar2 = pvar[c_id2]; + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (_pvar2[i] - _pvar1[i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + _rhs[lindex][i][j] += _weight * fctb[j]; + } + } + + } + // __syncthreads(); + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + rhs[c_id1][i][j] = _rhs[lindex][i][j]; + } + } +} + +__global__ static void +_compute_rhs_lsq_v_b_face_gather_v3(cs_lnum_t n_b_cells, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict b_cells, + const cs_real_3_t *restrict b_face_normal, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict b_dist, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const int inc) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t lindex = threadIdx.x; + + if(c_idx >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t f_id; + cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + __shared__ cs_real_t _rhs[256][3][3]; + + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + _rhs[lindex][i][j] = rhs[c_id][i][j]; + } + } + + // __syncthreads(); + + auto _pvar1 = pvar[c_id]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + f_id = cell_b_faces[index]; + + auto _coefav = coefav[f_id]; + auto _coefbv = coefbv[f_id]; + + cs_math_3_normalize_cuda(b_face_normal[f_id], n_d_dist); + + d_b_dist = 1. / b_dist[f_id]; + + /* Normal divided by b_dist */ + n_d_dist[0] *= d_b_dist; + n_d_dist[1] *= d_b_dist; + n_d_dist[2] *= d_b_dist; + + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = _coefav[i]*inc + + ( _coefbv[0][i] * _pvar1[0] + + _coefbv[1][i] * _pvar1[1] + + _coefbv[2][i] * _pvar1[2] + - _pvar1[i]); + + _rhs[lindex][i][0] += n_d_dist[0] * pfac; + _rhs[lindex][i][1]+= n_d_dist[1] * pfac; + _rhs[lindex][i][2] += n_d_dist[2] * pfac; + } + + } + // __syncthreads(); + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + rhs[c_id][i][j] = _rhs[lindex][i][j]; + } + } +} diff --git a/src/alge/cs_gradient_lsq_vector_v2.cuh b/src/alge/cs_gradient_lsq_vector_v2.cuh new file mode 100644 index 0000000000..a342fd67f7 --- /dev/null +++ b/src/alge/cs_gradient_lsq_vector_v2.cuh @@ -0,0 +1,249 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. 
+ + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------- + * Initialize RHS with null values + *----------------------------------------------------------------------------*/ + +__global__ static void +_init_rhs_v2(cs_lnum_t n_cells_g, + cs_real_33_t *restrict _rhs) +{ + cs_real_t *rhs = (cs_real_t *) _rhs; + cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= n_cells_g) + return; + + rhs[c_id] = 0.0; +} + +__global__ static void +_compute_rhs_lsq_v_i_face_v2(cs_lnum_t n_i_faces, + const cs_lnum_t *restrict i_face_cells, + const cs_real_t *restrict cell_f_cen, + cs_real_33_t *restrict _rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_real_t *rhs = (cs_real_t *) _rhs; + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + c_id1 = i_face_cells[f_id*2]; + c_id2 = i_face_cells[f_id*2 + 1]; + + dc[0] = cell_f_cen[c_id2*3] - cell_f_cen[c_id1*3]; + dc[1] = cell_f_cen[c_id2*3 + 1] - cell_f_cen[c_id1*3 + 1]; + dc[2] = cell_f_cen[c_id2*3 + 2] - cell_f_cen[c_id1*3 + 2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + atomicAdd(&rhs[c_id1*3*3 + i*3 + j], _weight2 * fctb[j]); + atomicAdd(&rhs[c_id2*3*3 + i*3 + j], _weight1 * fctb[j]); + } + } +} + +__global__ static void +_compute_rhs_lsq_v_i_face_v2cf(cs_lnum_t size, + const cs_lnum_2_t *restrict _i_face_cells, + const cs_real_3_t *restrict _cell_f_cen, + cs_real_33_t *restrict _rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_real_t *rhs = (cs_real_t *) _rhs; + cs_lnum_t *i_face_cells = (cs_lnum_t *) _i_face_cells; + cs_real_t *cell_f_cen = (cs_real_t *) _cell_f_cen; + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= size){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + c_id1 = i_face_cells[f_id*2]; + c_id2 = i_face_cells[f_id*2 + 1]; + + dc[0] = cell_f_cen[c_id2*3] - cell_f_cen[c_id1*3]; + dc[1] = cell_f_cen[c_id2*3 + 1] - cell_f_cen[c_id1*3 + 1]; + dc[2] = cell_f_cen[c_id2*3 + 2] - cell_f_cen[c_id1*3 + 2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. 
- _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + using Cell = AtomicCell; + Cell _rhs1, _rhs2; + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + //_rhs1[i][j].get() += _weight2 * fctb[j]; + //_rhs2[i][j].get() += _weight1 * fctb[j]; + atomicAdd(&rhs[c_id1*3*3 + i*3 + j], _weight2 * fctb[j]); + atomicAdd(&rhs[c_id2*3*3 + i*3 + j], _weight1 * fctb[j]); + } + } + //reinterpret_cast(&rhs[c_id1*3*3][0][0])->atomic_add(_rhs1); + //reinterpret_cast(&rhs[c_id2*3*3][0][0])->atomic_add(_rhs2); +} + +__global__ static void +_compute_rhs_lsq_v_b_face_v2(cs_lnum_t n_b_faces, + const cs_lnum_t *restrict b_face_cells, + const cs_real_3_t *restrict cell_f_cen, + const cs_real_3_t *restrict b_face_normal, + cs_real_33_t *restrict _rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict b_dist, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const int inc) +{ + cs_real_t *rhs = (cs_real_t *) _rhs; + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_b_faces){ + return; + } + + cs_lnum_t c_id1; + cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; + + c_id1 = b_face_cells[f_id]; + + cs_math_3_normalize_cuda(b_face_normal[f_id], n_d_dist); + + d_b_dist = 1. / b_dist[f_id]; + + /* Normal divided by b_dist */ + n_d_dist[0] *= d_b_dist; + n_d_dist[1] *= d_b_dist; + n_d_dist[2] *= d_b_dist; + + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = coefav[f_id][i]*inc + + ( coefbv[f_id][0][i] * pvar[c_id1][0] + + coefbv[f_id][1][i] * pvar[c_id1][1] + + coefbv[f_id][2][i] * pvar[c_id1][2] + - pvar[c_id1][i]); + + atomicAdd(&rhs[c_id1*3*3 + i*3], n_d_dist[0] * pfac); + atomicAdd(&rhs[c_id1*3*3 + i*3 + 1], n_d_dist[1] * pfac); + atomicAdd(&rhs[c_id1*3*3 + i*3 + 2], n_d_dist[2] * pfac); + } +} + +__global__ static void +_compute_gradient_lsq_v_v2(cs_lnum_t n_cells_g, + cs_real_33_t *restrict _gradv, + cs_real_33_t *restrict _rhs, + cs_cocg_6_t *restrict cocg) +{ + cs_real_t *rhs = (cs_real_t *) _rhs; + cs_real_t *gradv = (cs_real_t *) _gradv; + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= n_cells_g) + return; + + for(cs_lnum_t i = 0; i < 3; i++){ + gradv[c_id*3*3 + i*3] = rhs[c_id*3*3 + i*3] * cocg[c_id][0] + + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][3] + + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][5]; + + gradv[c_id*3*3 + i*3 + 1] = rhs[c_id*3*3 + i*3] * cocg[c_id][3] + + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][1] + + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][4]; + + gradv[c_id*3*3 + i*3 + 2] = rhs[c_id*3*3 + i*3] * cocg[c_id][5] + + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][4] + + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][2]; + } +} + +__global__ static void +_compute_gradient_lsq_v_v4(cs_lnum_t n_cells, + cs_real_33_t *restrict gradv_m, + cs_real_33_t *restrict rhs_m, + cs_cocg_6_t *restrict cocg) +{ + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= n_cells) + return; + + cs_real_t *rhs = (cs_real_t *) rhs_m; + cs_real_t *gradv = (cs_real_t *) gradv_m; + + for(cs_lnum_t i = 0; i < 3; i++){ + gradv[c_id*3*3 + i*3] = rhs[c_id*3*3 + i*3] * cocg[c_id][0] + + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][3] + + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][5]; + + gradv[c_id*3*3 + i*3 + 1] = rhs[c_id*3*3 + i*3] * cocg[c_id][3] + + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][1] + + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][4]; + + gradv[c_id*3*3 + i*3 + 2] = rhs[c_id*3*3 + i*3] * cocg[c_id][5] + + rhs[c_id*3*3 
+ i*3 + 1] * cocg[c_id][4] + + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][2]; + } +} diff --git a/src/alge/cs_gradient_lsq_vector_v3.cuh b/src/alge/cs_gradient_lsq_vector_v3.cuh new file mode 100644 index 0000000000..135d0a2520 --- /dev/null +++ b/src/alge/cs_gradient_lsq_vector_v3.cuh @@ -0,0 +1,214 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + + +__global__ static void +_compute_rhs_lsq_v_i_face_v3(cs_lnum_t n_i_faces, + const cs_lnum_2_t *restrict i_face_cells, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + size_t f_id1 = f_id / (3*3); + size_t i = (f_id / 3) % 3; + size_t j = f_id % 3; + + c_id1 = i_face_cells[f_id1][0]; + c_id2 = i_face_cells[f_id1][1]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id1]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. 
- _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + //for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + //for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + atomicAdd(&rhs[c_id1][i][j], _weight2 * fctb[j]); + atomicAdd(&rhs[c_id2][i][j], _weight1 * fctb[j]); + //} + //} +} + +__global__ static void +_compute_rhs_lsq_v_i_face_v3cf(cs_lnum_t size, + const cs_lnum_2_t *restrict i_face_cells, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= size){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + size_t f_id1 = f_id / (3*3); + size_t i = (f_id / 3) % 3; + size_t j = f_id % 3; + + c_id1 = i_face_cells[f_id1][0]; + c_id2 = i_face_cells[f_id1][1]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id1]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + using Cell = AtomicCell; + + //for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + //for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + Cell::ref(rhs[c_id1][i][j]).conflict_free_add(-1u, Cell::ref(_weight2 * fctb[j])); + Cell::ref(rhs[c_id2][i][j]).conflict_free_add(-1u, Cell::ref(_weight1 * fctb[j])); + //atomicAdd(&rhs[c_id1][i][j], _weight2 * fctb[j]); + //atomicAdd(&rhs[c_id2][i][j], _weight1 * fctb[j]); + //} + //} +} + +__global__ static void +_compute_gradient_lsq_v_v5(cs_lnum_t n_cells, + cs_real_t *restrict _gradv, + cs_real_t *restrict _rhs, + cs_cocg_6_t *restrict cocg) +{ + cs_real_t *rhs = (cs_real_t *) _rhs; + cs_real_t *gradv = (cs_real_t *) _gradv; + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= n_cells) + return; + + size_t c_id1 = c_id / (3*3); + size_t i = (c_id / 3) % 3; + size_t j = c_id % 3; + + auto cocg_temp = cocg[c_id1]; + cs_real_t _cocg[3]; + + _cocg[0] = cocg_temp[5]; + _cocg[1] = cocg_temp[4]; + _cocg[2] = cocg_temp[2]; + + if(j == 0){ + _cocg[0] = cocg_temp[0]; + _cocg[1] = cocg_temp[3]; + _cocg[2] = cocg_temp[5]; + } + + if(j == 1){ + _cocg[0] = cocg_temp[3]; + _cocg[1] = cocg_temp[1]; + _cocg[2] = cocg_temp[4]; + } + + gradv[c_id] = rhs[c_id1*3*3 + i*3] * _cocg[0] + + rhs[c_id1*3*3 + i*3 + 1] * _cocg[1] + + rhs[c_id1*3*3 + i*3 + 2] * _cocg[2]; + +} + +__global__ static void +_compute_gradient_lsq_v_v6(cs_lnum_t n_cells, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs, + cs_cocg_6_t *restrict cocg) +{ + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= n_cells) + return; + + size_t c_id1 = c_id / (3*3); + size_t i = (c_id / 3) % 3; + size_t j = c_id % 3; + + auto cocg_temp = cocg[c_id1]; + cs_real_t _cocg[3]; + + _cocg[0] = cocg_temp[5]; + _cocg[1] = cocg_temp[4]; + _cocg[2] = cocg_temp[2]; + + if(j == 0){ + _cocg[0] = cocg_temp[0]; + _cocg[1] = cocg_temp[3]; + _cocg[2] = cocg_temp[5]; + } + + if(j == 1){ + _cocg[0] = cocg_temp[3]; + _cocg[1] = cocg_temp[1]; + _cocg[2] = 
cocg_temp[4]; + } + + gradv[c_id1][i][j] = rhs[c_id1][i][0] * _cocg[0] + + rhs[c_id1][i][1] * _cocg[1] + + rhs[c_id1][i][2] * _cocg[2]; + +} diff --git a/src/alge/cs_gradient_priv.h b/src/alge/cs_gradient_priv.h index 399a9a02e8..867db2cac8 100644 --- a/src/alge/cs_gradient_priv.h +++ b/src/alge/cs_gradient_priv.h @@ -109,12 +109,92 @@ cs_gradient_scalar_lsq_cuda(const cs_mesh_t *m, cs_cocg_6_t *restrict cocgb, cs_real_3_t *restrict grad); -#endif /* defined(HAVE_CUDA) */ +void +cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_halo_type_t halo_type, + const int inc, + const cs_real_3_t *restrict coefav, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict c_weight, + cs_cocg_6_t *restrict cocg, + cs_cocg_6_t *restrict cocgb, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs); + +void +cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_internal_coupling_t *cpl, + cs_halo_type_t halo_type, + int inc, + const cs_real_3_t *restrict coefav, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict c_weight, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const bool *coupled_faces, + cs_lnum_t cpl_stride, + bool test_bool, + bool perf); + +void +_gradient_vector_cuda(const cs_mesh_t *mesh, + cs_real_3_t *_bc_coeff_a, + cs_real_33_t *_bc_coeff_b, + bool a_null, + bool b_null, + bool perf); + +#endif + +/* defined(HAVE_CUDA) */ /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */ /*----------------------------------------------------------------------------*/ END_C_DECLS - +#ifdef __cplusplus +/** + * This template will be instantited with stride = 1, 3, 6, 9 +*/ +template +void +cs_lsq_vector_gradient_strided_cuda(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_halo_type_t halo_type, + const int inc, + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *restrict c_weight, + cs_cocg_6_t *restrict cocg, + cs_cocg_6_t *restrict cocgb, + cs_real_t (*restrict gradv)[stride][3], + cs_real_t (*restrict rhs)[stride][3], + cs_lnum_t n_c_iter_max, + cs_real_t c_eps); + +template +void +cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + cs_halo_type_t halo_type, + int inc, + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *restrict c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + bool test_bool, + bool perf); +#endif #endif /* __CS_GRADIENT_CUDA_H__ */ diff --git a/src/alge/cs_reconstruct_vector_gradient_gather.cuh b/src/alge/cs_reconstruct_vector_gradient_gather.cuh new file mode 100644 index 0000000000..c7262866b2 --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_gather.cuh @@ -0,0 +1,142 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. 
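The _compute_gradient_lsq_v_* kernels above multiply each right-hand-side row by the cell's least-squares matrix, stored as a packed 6-component symmetric array (cs_cocg_6_t), and pick one matrix row per component j. A minimal stand-alone sketch of that row selection follows; the packing order (0:xx, 1:yy, 2:zz, 3:xy, 4:yz, 5:xz) is inferred from the index pattern in the kernels, and the local typedefs are stand-ins so the fragment compiles on its own.

typedef double cs_real_t;              /* assumption: double precision */
typedef cs_real_t cs_cocg_6_t[6];      /* packed symmetric 3x3 matrix */

__host__ __device__ static void
lsq_apply_cocg_row(const cs_cocg_6_t cocg,
                   const cs_real_t   rhs_row[3],
                   int               j,
                   cs_real_t        *grad_ij)
{
  /* Row j of the symmetric matrix, reconstructed from its 6 coefficients
     exactly as in _compute_gradient_lsq_v_v5 / _v6 above. */
  cs_real_t row[3];
  if (j == 0)      { row[0] = cocg[0]; row[1] = cocg[3]; row[2] = cocg[5]; }
  else if (j == 1) { row[0] = cocg[3]; row[1] = cocg[1]; row[2] = cocg[4]; }
  else             { row[0] = cocg[5]; row[1] = cocg[4]; row[2] = cocg[2]; }

  *grad_ij = row[0]*rhs_row[0] + row[1]*rhs_row[1] + row[2]*rhs_row[2];
}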
+ + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + +template +__global__ static void +_compute_reconstruct_v_i_face_gather(cs_lnum_t n_cells, + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + + cs_lnum_t c_id2, f_id; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + f_id = cell_i_faces[index]; + + pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + ktpond = (c_weight == NULL) ? 
+ pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + for (cs_lnum_t i = 0; i < stride; i++) { + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_id][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_id][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + grad[c_id1][i][j] += cell_i_faces_sgn[index] * (pfaci + rfac) * i_f_face_normal[f_id][j]; + } + } + } +} + + + +template +__global__ static void +_compute_reconstruct_v_b_face_gather(cs_lnum_t n_b_cells, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict cell_b_faces_idx) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + + if(c_id1 >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_id1]; + + cs_real_t pfac, rfac, vecfac; + cs_lnum_t f_id; + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + for (cs_lnum_t i = 0; i < stride; i++) { + + pfac = inc*coefav[f_id][i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += coefbv[f_id][i][k] * pvar[c_id][k]; + } + + pfac -= pvar[c_id][i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < stride; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + + r_grad[c_id][k][1] * diipb[f_id][1] + + r_grad[c_id][k][2] * diipb[f_id][2]; + rfac += coefbv[f_id][i][k] * vecfac; + } + + for (cs_lnum_t j = 0; j < 3; j++){ + grad[c_id][i][j] += (pfac + rfac) * b_f_face_normal[f_id][j]; + } + } + } +} diff --git a/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh b/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh new file mode 100644 index 0000000000..ff1723ba0f --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh @@ -0,0 +1,144 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. 
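The gather variants above assign one thread per cell and walk that cell's adjacency through cell_cells_idx / cell_b_faces_idx, so each thread is the sole writer of its own grad row and no atomics are needed, unlike the scatter variants in this patch. A minimal sketch of that exclusive-ownership pattern follows; the names (cell_faces_idx, face_contrib, acc) and the typedefs are illustrative only.

typedef double cs_real_t;   /* assumption */
typedef int    cs_lnum_t;   /* assumption: local index type */

__global__ static void
gather_accumulate(cs_lnum_t        n_cells,
                  const cs_lnum_t *cell_faces_idx,  /* CSR index, size n_cells+1 */
                  const cs_real_t *face_contrib,    /* one value per adjacency entry */
                  cs_real_t       *acc)             /* one value per cell */
{
  cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x;
  if (c_id >= n_cells)
    return;

  cs_real_t sum = 0.;
  for (cs_lnum_t idx = cell_faces_idx[c_id]; idx < cell_faces_idx[c_id+1]; idx++)
    sum += face_contrib[idx];   /* each adjacency entry visited exactly once */

  acc[c_id] += sum;             /* exclusive writer: plain store, no atomicAdd */
}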
+*/ + +/*----------------------------------------------------------------------------*/ + +template +__global__ static void +_compute_reconstruct_v_i_face_gather_v2(cs_lnum_t n_cells, + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + + + cs_lnum_t c_id2, f_id; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + size_t c_idx = c_id1 / (stride*3); + size_t i = (c_id1 / 3) % stride; + size_t j = c_id1 % 3; + + cs_lnum_t s_id = cell_cells_idx[c_idx]; + cs_lnum_t e_id = cell_cells_idx[c_idx + 1]; + + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + f_id = cell_i_faces[index]; + + pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + ktpond = (c_weight == NULL) ? + pond : // no cell weighting + pond * c_weight[c_idx] // cell weighting active + / ( pond * c_weight[c_idx] + + (1.0-pond)* c_weight[c_id2]); + + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_idx][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_idx][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_id][1]*( r_grad[c_idx][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_id][2]*( r_grad[c_idx][i][2] + + r_grad[c_id2][i][2])); + + grad[c_idx][i][j] += cell_i_faces_sgn[index] * (pfaci + rfac) * i_f_face_normal[f_id][j]; + } +} + + + +template +__global__ static void +_compute_reconstruct_v_b_face_gather_v2(cs_lnum_t n_b_cells, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict cell_b_faces_idx) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + + + if(c_idx >= n_b_cells){ + return; + } + + size_t c_id1 = c_idx / stride; + size_t i = c_idx % stride; + + cs_lnum_t c_id = b_cells[c_id1]; + + cs_real_t pfac, rfac, vecfac; + cs_lnum_t f_id; + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + pfac = inc*coefav[f_id][i]; + + pfac += coefbv[f_id][i][0] * pvar[c_id][0] + + coefbv[f_id][i][1] * pvar[c_id][1] + + coefbv[f_id][i][2] * pvar[c_id][2]; + + pfac -= pvar[c_id][i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < stride; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + + r_grad[c_id][k][1] * diipb[f_id][1] + + r_grad[c_id][k][2] * diipb[f_id][2]; + rfac += coefbv[f_id][i][k] * vecfac; + } + + grad[c_id][i][0] += (pfac + rfac) * b_f_face_normal[f_id][0]; + grad[c_id][i][1] += (pfac + rfac) * b_f_face_normal[f_id][1]; + grad[c_id][i][2] += (pfac + rfac) * b_f_face_normal[f_id][2]; + + } +} diff --git a/src/alge/cs_reconstruct_vector_gradient_gather_v3.cuh b/src/alge/cs_reconstruct_vector_gradient_gather_v3.cuh new file mode 100644 
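The *_gather_v2 kernels flatten the (cell, i, j) component space onto the thread index via c_idx = tid / (stride*3), i = (tid / 3) % stride, j = tid % 3, and the bound check compares the raw thread index against the first kernel argument, so the caller appears to be expected to pass the total component count there. A hedged host-side sketch of the corresponding launch sizing follows; launch_per_component and the 256-thread block size are assumptions, not taken from the patch.

static void
launch_per_component(long n_cells, int stride)   /* illustrative helper */
{
  const unsigned block_size = 256;                     /* assumed block size */
  const long n_threads = n_cells * (long)stride * 3;   /* one thread per (cell, i, j) */
  const unsigned grid_size =
    (unsigned)((n_threads + block_size - 1) / block_size);

  /* _compute_reconstruct_v_i_face_gather_v2<3><<<grid_size, block_size>>>(n_threads, ...);
     the first argument is compared against the thread index, so it has to be
     the component count n_cells*stride*3, not the cell count. */
  (void)grid_size;
}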
index 0000000000..aa53aa9f9e --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_gather_v3.cuh @@ -0,0 +1,173 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + + +template +__global__ static void +_compute_reconstruct_v_i_face_gather_v3(cs_lnum_t n_cells, + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + + cs_lnum_t c_id2, f_id; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + auto _grad = grad[c_id1]; + auto _pvar1 = pvar[c_id1]; + auto _r_grad1 = r_grad[c_id1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + f_id = cell_i_faces[index]; + + auto _pvar2 = pvar[c_id2]; + auto _r_grad2 = r_grad[c_id2]; + auto _dofij = dofij[f_id]; + auto _i_f_face_normal = i_f_face_normal[f_id]; + auto _cell_i_faces_sgn = cell_i_faces_sgn[index]; + + pond = (_cell_i_faces_sgn > 0) ? weight[f_id] : 1. - weight[f_id]; + ktpond = (c_weight == NULL) ? 
+ pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + for (cs_lnum_t i = 0; i < stride; i++) { + pfaci = (1.0-ktpond) * (_pvar2[i] - _pvar1[i]); + + /* Reconstruction part */ + rfac = 0.5 * ( _dofij[0]*( _r_grad1[i][0] + + _r_grad2[i][0]) + + _dofij[1]*( _r_grad1[i][1] + + _r_grad2[i][1]) + + _dofij[2]*( _r_grad1[i][2] + + _r_grad2[i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + _grad[i][j] += _cell_i_faces_sgn * (pfaci + rfac) * _i_f_face_normal[j]; + } + } + } + for(cs_lnum_t i = 0; i < stride; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + grad[c_id1][i][j] = _grad[i][j]; + } + } +} + + + +template +__global__ static void +_compute_reconstruct_v_b_face_gather_v3(cs_lnum_t n_b_cells, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict cell_b_faces_idx) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + + if(c_id1 >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_id1]; + + cs_real_t pfac, rfac, vecfac; + cs_lnum_t f_id; + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + auto _grad = grad[c_id]; + auto _r_grad = r_grad[c_id]; + auto _pvar = pvar[c_id]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + auto _diipb = diipb[f_id]; + auto _coefav = coefav[f_id]; + auto _coefbv = coefbv[f_id]; + auto _b_f_face_normal = b_f_face_normal[f_id]; + + for (cs_lnum_t i = 0; i < stride; i++) { + + pfac = inc*_coefav[i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += _coefbv[i][k] * _pvar[k]; + } + + pfac -= _pvar[i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < stride; k++) { + vecfac = _r_grad[k][0] * _diipb[0] + + _r_grad[k][1] * _diipb[1] + + _r_grad[k][2] * _diipb[2]; + rfac += _coefbv[i][k] * vecfac; + } + + for (cs_lnum_t j = 0; j < 3; j++){ + _grad[i][j] += (pfac + rfac) * _b_f_face_normal[j]; + } + + } + } + for(cs_lnum_t i = 0; i < stride; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + grad[c_id][i][j] = _grad[i][j]; + } + } + +} diff --git a/src/alge/cs_reconstruct_vector_gradient_gather_v4.cuh b/src/alge/cs_reconstruct_vector_gradient_gather_v4.cuh new file mode 100644 index 0000000000..35eed09b76 --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_gather_v4.cuh @@ -0,0 +1,150 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. 
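The gather_v3 / gather_v4 variants aim to keep the running gradient out of global memory during the face loop and commit it once at the end. Note that with plain `auto`, an array expression such as grad[c_id1] decays to a pointer rather than producing a local copy, so the sketch below uses an explicit local array to show the intended accumulate-then-commit structure; the function name, the contrib input and the typedefs are illustrative only.

typedef double cs_real_t;   /* assumption */
typedef int    cs_lnum_t;   /* assumption */

template <cs_lnum_t stride>
__device__ static void
accumulate_then_commit(cs_real_t (*grad)[stride][3],
                       cs_lnum_t  c_id,
                       const cs_real_t contrib[stride][3])  /* stand-in for the face loop */
{
  cs_real_t loc[stride][3];

  for (int i = 0; i < stride; i++)
    for (int j = 0; j < 3; j++)
      loc[i][j] = grad[c_id][i][j];      /* load once into registers/local memory */

  for (int i = 0; i < stride; i++)
    for (int j = 0; j < 3; j++)
      loc[i][j] += contrib[i][j];        /* accumulation that the face loop would do */

  for (int i = 0; i < stride; i++)
    for (int j = 0; j < 3; j++)
      grad[c_id][i][j] = loc[i][j];      /* single write-back to global memory */
}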
+ + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + +template +__global__ static void +_compute_reconstruct_v_i_face_gather_v4(cs_lnum_t n_cells, + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + + + cs_lnum_t c_id2, f_id; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + size_t c_idx = c_id1 / (stride*3); + size_t i = (c_id1 / 3) % stride; + size_t j = c_id1 % 3; + + cs_lnum_t s_id = cell_cells_idx[c_idx]; + cs_lnum_t e_id = cell_cells_idx[c_idx + 1]; + + auto _grad = grad[c_idx][i][j]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + f_id = cell_i_faces[index]; + + pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + ktpond = (c_weight == NULL) ? + pond : // no cell weighting + pond * c_weight[c_idx] // cell weighting active + / ( pond * c_weight[c_idx] + + (1.0-pond)* c_weight[c_id2]); + + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_idx][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_idx][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_id][1]*( r_grad[c_idx][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_id][2]*( r_grad[c_idx][i][2] + + r_grad[c_id2][i][2])); + + _grad += cell_i_faces_sgn[index] * (pfaci + rfac) * i_f_face_normal[f_id][j]; + } + grad[c_idx][i][j] = _grad; +} + + + +template +__global__ static void +_compute_reconstruct_v_b_face_gather_v4(cs_lnum_t n_b_cells, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict cell_b_faces_idx) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + + + if(c_idx >= n_b_cells){ + return; + } + + size_t c_id1 = c_idx / stride; + size_t i = c_idx % stride; + + cs_lnum_t c_id = b_cells[c_id1]; + + cs_real_t pfac, rfac, vecfac; + cs_lnum_t f_id; + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + auto _grad = grad[c_id][i]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + pfac = inc*coefav[f_id][i]; + + pfac += coefbv[f_id][i][0] * pvar[c_id][0] + + coefbv[f_id][i][1] * pvar[c_id][1] + + coefbv[f_id][i][2] * pvar[c_id][2]; + + pfac -= pvar[c_id][i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < stride; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + + r_grad[c_id][k][1] * diipb[f_id][1] + + r_grad[c_id][k][2] * diipb[f_id][2]; + rfac += coefbv[f_id][i][k] * vecfac; + } + + _grad[0] += (pfac + rfac) * 
b_f_face_normal[f_id][0]; + _grad[1] += (pfac + rfac) * b_f_face_normal[f_id][1]; + _grad[2] += (pfac + rfac) * b_f_face_normal[f_id][2]; + } + grad[c_id][i][0] = _grad[0]; + grad[c_id][i][1] = _grad[1]; + grad[c_id][i][2] = _grad[2]; +} diff --git a/src/alge/cs_reconstruct_vector_gradient_gather_v5.cuh b/src/alge/cs_reconstruct_vector_gradient_gather_v5.cuh new file mode 100644 index 0000000000..cd7ebe49e1 --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_gather_v5.cuh @@ -0,0 +1,191 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + + + +template +__global__ static void +_compute_reconstruct_v_i_face_gather_v5(cs_lnum_t n_cells, + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t lindex = threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + + cs_lnum_t c_id2, f_id; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + __shared__ cs_real_t _grad[256][stride][3]; + + for(cs_lnum_t i = 0; i < stride; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + _grad[lindex][i][j] = grad[c_id1][i][j]; + } + } + + + auto _pvar1 = pvar[c_id1]; + auto _r_grad1 = r_grad[c_id1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + f_id = cell_i_faces[index]; + + auto _pvar2 = pvar[c_id2]; + auto _r_grad2 = r_grad[c_id2]; + auto _dofij = dofij[f_id]; + auto _i_f_face_normal = i_f_face_normal[f_id]; + auto _cell_i_faces_sgn = cell_i_faces_sgn[index]; + + pond = (_cell_i_faces_sgn > 0) ? weight[f_id] : 1. - weight[f_id]; + ktpond = (c_weight == NULL) ? 
+ pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + for (cs_lnum_t i = 0; i < stride; i++) { + pfaci = (1.0-ktpond) * (_pvar2[i] - _pvar1[i]); + + /* Reconstruction part */ + rfac = 0.5 * ( _dofij[0]*( _r_grad1[i][0] + + _r_grad2[i][0]) + + _dofij[1]*( _r_grad1[i][1] + + _r_grad2[i][1]) + + _dofij[2]*( _r_grad1[i][2] + + _r_grad2[i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + _grad[lindex][i][j] += _cell_i_faces_sgn * (pfaci + rfac) * _i_f_face_normal[j]; + } + } + } + + for(cs_lnum_t i = 0; i < stride; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + grad[c_id1][i][j] = _grad[lindex][i][j]; + } + } + +} + + + +template +__global__ static void +_compute_reconstruct_v_b_face_gather_v5(cs_lnum_t n_b_cells, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict cell_b_faces_idx) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t lindex = threadIdx.x; + + if(c_id1 >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_id1]; + + cs_real_t pfac, rfac, vecfac; + cs_lnum_t f_id; + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + __shared__ cs_real_t _grad[256][stride][3]; + + for(cs_lnum_t i = 0; i < stride; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + _grad[lindex][i][j] = grad[c_id][i][j]; + } + } + + auto _r_grad = r_grad[c_id]; + auto _pvar = pvar[c_id]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + auto _diipb = diipb[f_id]; + auto _coefav = coefav[f_id]; + auto _coefbv = coefbv[f_id]; + auto _b_f_face_normal = b_f_face_normal[f_id]; + + for (cs_lnum_t i = 0; i < stride; i++) { + + pfac = inc*_coefav[i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += _coefbv[i][k] * _pvar[k]; + } + + pfac -= _pvar[i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < stride; k++) { + vecfac = _r_grad[k][0] * _diipb[0] + + _r_grad[k][1] * _diipb[1] + + _r_grad[k][2] * _diipb[2]; + rfac += _coefbv[i][k] * vecfac; + } + + for (cs_lnum_t j = 0; j < 3; j++){ + _grad[lindex][i][j] += (pfac + rfac) * _b_f_face_normal[j]; + } + } + } + + for(cs_lnum_t i = 0; i < stride; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + grad[c_id][i][j] = _grad[lindex][i][j]; + } + } +} diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter.cuh new file mode 100644 index 0000000000..a0d0f2b000 --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_scatter.cuh @@ -0,0 +1,174 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. 
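The *_gather_v5 kernels stage the per-cell gradient in a shared-memory array indexed by threadIdx.x, which implicitly ties the launch to at most 256 threads per block (the static extent of _grad). A cut-down sketch of that staging pattern follows; the kernel name and scalar payload are illustrative.

typedef double cs_real_t;   /* assumption */
typedef int    cs_lnum_t;   /* assumption */

#define BLOCK_SIZE 256      /* must match the shared-memory extent below */

__global__ static void
staged_accumulate(cs_lnum_t n_cells, cs_real_t *out)   /* illustrative kernel */
{
  cs_lnum_t c_id   = blockIdx.x * blockDim.x + threadIdx.x;
  cs_lnum_t lindex = threadIdx.x;

  if (c_id >= n_cells)
    return;

  /* One slot per thread of the block; no __syncthreads() is required because
     each thread only ever reads and writes its own slot. */
  __shared__ cs_real_t _acc[BLOCK_SIZE];

  _acc[lindex]  = out[c_id];    /* load */
  _acc[lindex] += 1.0;          /* stand-in for the face-loop accumulation */
  out[c_id]     = _acc[lindex]; /* write back once */
}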
+ + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + +template +__global__ static void +_compute_reconstruct_v_i_face(cs_lnum_t n_i_faces, + const cs_lnum_2_t *i_face_cells, + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + cs_lnum_t c_id1, c_id2; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; + + pond = weight[f_id]; + ktpond = (c_weight == NULL) ? + pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + + for (cs_lnum_t i = 0; i < stride; i++) { + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_id][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_id][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + atomicAdd(&grad[c_id1][i][j],(pfaci + rfac) * i_f_face_normal[f_id][j]); + atomicAdd(&grad[c_id2][i][j], - ((pfacj + rfac) * i_f_face_normal[f_id][j])); + + } + } + +} + +template +__global__ static void +_compute_reconstruct_v_b_face(cs_lnum_t n_b_faces, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_face_cells) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_b_faces){ + return; + } + cs_lnum_t c_id; + cs_real_t pfac, rfac, vecfac; + + c_id = b_face_cells[f_id]; + + for (cs_lnum_t i = 0; i < stride; i++) { + + pfac = inc*coefav[f_id][i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += coefbv[f_id][i][k] * pvar[c_id][k]; + } + + pfac -= pvar[c_id][i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < 3; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + + r_grad[c_id][k][1] * diipb[f_id][1] + + r_grad[c_id][k][2] * diipb[f_id][2]; + rfac += coefbv[f_id][i][k] * vecfac; + } + + for (cs_lnum_t j = 0; j < 3; j++) + atomicAdd(&grad[c_id][i][j], (pfac + rfac) * b_f_face_normal[f_id][j]); + + } +} + +template +__global__ static void +_compute_reconstruct_correction(cs_lnum_t n_cells, + cs_lnum_t has_dc, + const int *restrict c_disable_flag, + const cs_real_t *restrict cell_f_vol, + cs_real_t (*restrict grad)[stride][3], + const cs_real_33_t *restrict corr_grad_lin, + bool test_bool + ) +{ + cs_lnum_t c_id = blockIdx.x * blockDim.x + 
threadIdx.x; + + if(c_id >= n_cells){ + return; + } + cs_real_t dvol; + /* Is the cell disabled (for solid or porous)? Not the case if coupled */ + if (has_dc * c_disable_flag[has_dc * c_id] == 0) + dvol = 1. / cell_f_vol[c_id]; + else + dvol = 0.; + + + for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t j = 0; j < 3; j++) + grad[c_id][i][j] *= dvol; + } + + + if (test_bool) { + cs_real_t gradpa[3]; + // printf("dvol = %.17lg\n", dvol); + for (cs_lnum_t i = 0; i < stride; i++) { + for (cs_lnum_t j = 0; j < 3; j++) { + gradpa[j] = grad[c_id][i][j]; + grad[c_id][i][j] = 0.; + } + + for (cs_lnum_t j = 0; j < 3; j++){ + for (cs_lnum_t k = 0; k < 3; k++){ + grad[c_id][i][j] += corr_grad_lin[c_id][j][k] * gradpa[k]; + } + } + } + } + +} diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh new file mode 100644 index 0000000000..f68189bfca --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh @@ -0,0 +1,134 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + +template +__global__ static void +_compute_reconstruct_v_i_face_cf(cs_lnum_t n_i_faces, + const cs_lnum_2_t *i_face_cells, + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + cs_lnum_t c_id1, c_id2; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; + + pond = weight[f_id]; + ktpond = (c_weight == NULL) ? 
+ pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + + using Cell = AtomicCell; + Cell grad_cf1, grad_cf2; + + + for (cs_lnum_t i = 0; i < stride; i++) { + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_id][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_id][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + grad_cf1[i][j].get() = (pfaci + rfac) * i_f_face_normal[f_id][j]; + grad_cf2[i][j].get() = - ((pfacj + rfac) * i_f_face_normal[f_id][j]); + } + } + Cell::ref(grad[c_id1]).conflict_free_add(-1u, grad_cf1); + Cell::ref(grad[c_id2]).conflict_free_add(-1u, grad_cf2); + +} + +template +__global__ static void +_compute_reconstruct_v_b_face_cf(cs_lnum_t n_b_faces, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_face_cells) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_b_faces){ + return; + } + cs_lnum_t c_id; + cs_real_t pfac, rfac, vecfac; + + c_id = b_face_cells[f_id]; + + using Cell = AtomicCell; + Cell grad_cf; + + for (cs_lnum_t i = 0; i < stride; i++) { + + pfac = inc*coefav[f_id][i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += coefbv[f_id][i][k] * pvar[c_id][k]; + } + + pfac -= pvar[c_id][i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < stride; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + + r_grad[c_id][k][1] * diipb[f_id][1] + + r_grad[c_id][k][2] * diipb[f_id][2]; + rfac += coefbv[f_id][i][k] * vecfac; + } + + for (cs_lnum_t j = 0; j < 3; j++){ + grad_cf[i][j].get() = (pfac + rfac) * b_f_face_normal[f_id][j]; + } + } + Cell::ref(grad[c_id]).conflict_free_add(-1u, grad_cf); +} diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh new file mode 100644 index 0000000000..a2dfcf3c4c --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh @@ -0,0 +1,182 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. 
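The *_cf kernels first build the per-face contribution blocks (grad_cf1, grad_cf2, grad_cf) in registers and then commit each block to its cell with a single AtomicCell::conflict_free_add call, rather than issuing one atomicAdd per component inside the loops. AtomicCell itself is not part of this patch, so the sketch below shows only the stage-then-commit structure, with plain atomicAdd standing in for the conflict-free commit (double atomicAdd needs an sm_60+ target, as elsewhere in this patch).

typedef double cs_real_t;   /* assumption */
typedef int    cs_lnum_t;   /* assumption */

template <cs_lnum_t stride>
__device__ static void
commit_face_contribution(cs_real_t (*grad)[stride][3],
                         cs_lnum_t  c_id,
                         const cs_real_t staged[stride][3])
{
  /* Stand-in for AtomicCell<...>::ref(grad[c_id]).conflict_free_add(...):
     the contribution is fully built in registers beforehand, and only this
     commit touches global memory atomically. */
  for (int i = 0; i < stride; i++)
    for (int j = 0; j < 3; j++)
      atomicAdd(&grad[c_id][i][j], staged[i][j]);
}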
+*/ + +/*----------------------------------------------------------------------------*/ + + + +template +__global__ static void +_compute_reconstruct_v_i_face_v2(cs_lnum_t n_i_faces, + const cs_lnum_2_t *i_face_cells, + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + + size_t f_idt = f_id / stride; + size_t i = f_id % stride; + + cs_lnum_t c_id1, c_id2; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + c_id1 = i_face_cells[f_idt][0]; + c_id2 = i_face_cells[f_idt][1]; + + pond = weight[f_idt]; + ktpond = (c_weight == NULL) ? + pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_idt][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_idt][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_idt][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + atomicAdd(&grad[c_id1][i][j],(pfaci + rfac) * i_f_face_normal[f_idt][j]); + atomicAdd(&grad[c_id2][i][j], - ((pfacj + rfac) * i_f_face_normal[f_idt][j])); + } + +} + +template +__global__ static void +_compute_reconstruct_v_b_face_v2(cs_lnum_t n_b_faces, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_face_cells) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_b_faces){ + return; + } + + size_t f_idt = f_id / stride; + size_t i = f_id % stride; + + cs_lnum_t c_id; + cs_real_t pond, ktpond, pfac, rfac, vecfac; + + // if (coupled_faces[f_idt * cpl_stride]) + // return; + + c_id = b_face_cells[f_idt]; + + pfac = inc*coefav[f_idt][i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += coefbv[f_idt][i][k] * pvar[c_id][k]; + } + + pfac -= pvar[c_id][i]; + +// /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < 3; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_idt][0] + + r_grad[c_id][k][1] * diipb[f_idt][1] + + r_grad[c_id][k][2] * diipb[f_idt][2]; + rfac += coefbv[f_idt][i][k] * vecfac; + } + + for (cs_lnum_t j = 0; j < 3; j++){ + atomicAdd(&grad[c_id][i][j], (pfac + rfac) * b_f_face_normal[f_idt][j]); + } + +} + + +template +__global__ static void +_compute_reconstruct_correction_v2( cs_lnum_t n_cells, + cs_lnum_t has_dc, + const int *restrict c_disable_flag, + const cs_real_t *restrict cell_f_vol, + cs_real_t (*restrict grad)[stride][3], + const cs_real_33_t *restrict corr_grad_lin, + bool test_bool + ) +{ + cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + + + if(c_id >= n_cells){ + return; + } + + size_t c_idt = c_id / stride; + size_t i = c_id % stride; + + cs_real_t dvol; + /* Is the cell disabled (for solid or porous)? Not the case if coupled */ + if (has_dc * c_disable_flag[has_dc * c_idt] == 0) + dvol = 1. 
/ cell_f_vol[c_idt]; + else + dvol = 0.; + + for (cs_lnum_t j = 0; j < 3; j++){ + grad[c_idt][i][j] *= dvol; + } + + + if (test_bool) { + cs_real_t gradpa[3]; + for (cs_lnum_t j = 0; j < 3; j++) { + gradpa[j] = grad[c_idt][i][j]; + } + + for (cs_lnum_t j = 0; j < 3; j++) { + grad[c_idt][i][j] = corr_grad_lin[c_idt][j][0] * gradpa[0] + + corr_grad_lin[c_idt][j][1] * gradpa[1] + + corr_grad_lin[c_idt][j][2] * gradpa[2]; + } + } + +} \ No newline at end of file diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh new file mode 100644 index 0000000000..ae4dbd5092 --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh @@ -0,0 +1,141 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + + +template +__global__ static void +_compute_reconstruct_v_i_face_v2_cf(cs_lnum_t n_i_faces, + const cs_lnum_2_t *i_face_cells, + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + + size_t f_idt = f_id / stride; + size_t i = f_id % stride; + + cs_lnum_t c_id1, c_id2; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + c_id1 = i_face_cells[f_idt][0]; + c_id2 = i_face_cells[f_idt][1]; + + pond = weight[f_idt]; + ktpond = (c_weight == NULL) ? 
+ pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_idt][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_idt][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_idt][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); + + using Cell = AtomicCell; + Cell grad_cf1, grad_cf2; + + for (cs_lnum_t j = 0; j < 3; j++) { + grad_cf1[j].get() = (pfaci + rfac) * i_f_face_normal[f_idt][j]; + grad_cf2[j].get() = - ((pfacj + rfac) * i_f_face_normal[f_idt][j]); + } + Cell::ref(grad[c_id1][i]).conflict_free_add(-1u, grad_cf1); + Cell::ref(grad[c_id2][i]).conflict_free_add(-1u, grad_cf2); + +} + + + + +template +__global__ static void +_compute_reconstruct_v_b_face_v2_cf(cs_lnum_t n_b_faces, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_face_cells) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_b_faces){ + return; + } + + size_t f_idt = f_id / stride; + size_t i = f_id % stride; + + cs_lnum_t c_id; + cs_real_t pond, ktpond, pfac, rfac, vecfac; + + c_id = b_face_cells[f_idt]; + + pfac = inc*coefav[f_idt][i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += coefbv[f_idt][i][k] * pvar[c_id][k]; + } + + pfac -= pvar[c_id][i]; + +// /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < stride; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_idt][0] + + r_grad[c_id][k][1] * diipb[f_idt][1] + + r_grad[c_id][k][2] * diipb[f_idt][2]; + rfac += coefbv[f_idt][i][k] * vecfac; + } + + using Cell = AtomicCell; + Cell grad_cf; + + for (cs_lnum_t j = 0; j < 3; j++){ + grad_cf[j].get() = (pfac + rfac) * b_f_face_normal[f_idt][j]; + } + Cell::ref(grad[c_id][i]).conflict_free_add(-1u, grad_cf); + +} diff --git a/src/alge/cs_slope_test_gradient_vector_cuda_gather.cuh b/src/alge/cs_slope_test_gradient_vector_cuda_gather.cuh new file mode 100644 index 0000000000..868a179d12 --- /dev/null +++ b/src/alge/cs_slope_test_gradient_vector_cuda_gather.cuh @@ -0,0 +1,109 @@ +__global__ static void +cs_slope_test_gradient_vector_cuda_i_gather( const cs_lnum_t n_cells, + const cs_real_3_t *restrict i_face_cog, + const cs_real_3_t *restrict cell_cen, + const cs_real_3_t *pvar, + const cs_real_t *restrict i_massflux, + const cs_real_3_t *restrict i_f_face_normal, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn, + cs_real_33_t *grad, + cs_real_33_t *grdpa) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + + cs_real_t difv[3], djfv[3], vfac[3]; + cs_real_t pif, pjf, pfac, face_sgn; + cs_lnum_t c_id2, f_id; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + f_id = cell_i_faces[index]; + face_sgn = cell_i_faces_sgn[index]; + + for (int jsou = 0; jsou < 3; jsou++) { + difv[jsou] = i_face_cog[f_id][jsou] - 
cell_cen[c_id1][jsou]; + djfv[jsou] = i_face_cog[f_id][jsou] - cell_cen[c_id2][jsou]; + } + + for (int isou = 0; isou < 3; isou++) { + pif = pvar[c_id1][isou]; + pjf = pvar[c_id2][isou]; + for (int jsou = 0; jsou < 3; jsou++) { + pif = pif + grad[c_id1][isou][jsou] * difv[jsou]; + pjf = pjf + grad[c_id2][isou][jsou] * djfv[jsou]; + } + + pfac = pjf; + if (i_massflux[f_id] * face_sgn > 0.) + pfac = pif; + pfac *= face_sgn; + + for (int jsou = 0; jsou < 3; jsou++) { + vfac[jsou] = pfac*i_f_face_normal[f_id][jsou]; + grdpa[c_id1][isou][jsou] += vfac[jsou]; + } + } + } +} + + +__global__ static void +cs_slope_test_gradient_vector_cuda_b_gather(const cs_lnum_t n_b_cells, + const cs_real_3_t *pvar, + const cs_real_3_t *restrict diipb, + const int inc, + const cs_real_3_t *coefa, + const cs_real_33_t *coefb, + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_real_33_t *grad, + cs_real_33_t *grdpa) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + + if(c_id1 >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_id1]; + + cs_real_t pfac, rfac, vecfac; + cs_real_t diipbv[3]; + cs_lnum_t f_id; + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + for (int jsou = 0; jsou < 3; jsou++) + diipbv[jsou] = diipb[f_id][jsou]; + + for (int isou = 0; isou < 3; isou++) { + pfac = inc*coefa[f_id][isou]; + /*coefu is a matrix */ + for (int jsou = 0; jsou < 3; jsou++) + pfac += coefb[f_id][jsou][isou]*( pvar[c_id][jsou] + + grad[c_id][jsou][0]*diipbv[0] + + grad[c_id][jsou][1]*diipbv[1] + + grad[c_id][jsou][2]*diipbv[2]); + + for (int jsou = 0; jsou < 3; jsou++) + grdpa[c_id][isou][jsou] += pfac*b_f_face_normal[f_id][jsou]; + } + } +} diff --git a/src/alge/cs_slope_test_gradient_vector_cuda_scatter.cuh b/src/alge/cs_slope_test_gradient_vector_cuda_scatter.cuh new file mode 100644 index 0000000000..80daba1938 --- /dev/null +++ b/src/alge/cs_slope_test_gradient_vector_cuda_scatter.cuh @@ -0,0 +1,113 @@ +__global__ static void +cs_slope_test_gradient_vector_cuda_i( const cs_lnum_t n_i_faces, + const cs_lnum_2_t *restrict i_face_cells, + const cs_real_3_t *restrict i_face_cog, + const cs_real_3_t *restrict cell_cen, + const cs_real_3_t *pvar, + const cs_real_t *restrict i_massflux, + const cs_real_3_t *restrict i_f_face_normal, + cs_real_33_t *grad, + cs_real_33_t *grdpa) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + cs_real_t difv[3], djfv[3], vfac[3]; + cs_real_t pif, pjf, pfac; + cs_lnum_t c_id1, c_id2; + + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; + + for (int jsou = 0; jsou < 3; jsou++) { + difv[jsou] = i_face_cog[f_id][jsou] - cell_cen[c_id1][jsou]; + djfv[jsou] = i_face_cog[f_id][jsou] - cell_cen[c_id2][jsou]; + } + + /* x-y-z component, p = u, v, w */ + + for (int isou = 0; isou < 3; isou++) { + pif = pvar[c_id1][isou]; + pjf = pvar[c_id2][isou]; + for (int jsou = 0; jsou < 3; jsou++) { + pif = pif + grad[c_id1][isou][jsou]*difv[jsou]; + pjf = pjf + grad[c_id2][isou][jsou]*djfv[jsou]; + } + + pfac = pjf; + if (i_massflux[f_id] > 0.) 
+      pfac = pif;
+
+    /* U gradient */
+
+    for (int jsou = 0; jsou < 3; jsou++) {
+      vfac[jsou] = pfac*i_f_face_normal[f_id][jsou];
+      atomicAdd(&grdpa[c_id1][isou][jsou],  vfac[jsou]);
+      atomicAdd(&grdpa[c_id2][isou][jsou], -vfac[jsou]);
+    }
+  }
+}
+
+
+__global__ static void
+cs_slope_test_gradient_vector_cuda_b(const cs_lnum_t              n_b_faces,
+                                     const cs_real_3_t           *pvar,
+                                     const cs_lnum_t *restrict    b_face_cells,
+                                     const cs_real_3_t *restrict  diipb,
+                                     const int                    inc,
+                                     const cs_real_3_t           *coefa,
+                                     const cs_real_33_t          *coefb,
+                                     const cs_real_3_t *restrict  b_f_face_normal,
+                                     const cs_real_33_t          *grad,
+                                     cs_real_33_t                *grdpa)
+{
+  cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (f_id >= n_b_faces) {
+    return;
+  }
+
+  cs_real_t diipbv[3];
+  cs_lnum_t c_id1 = b_face_cells[f_id];
+  cs_real_t pfac;
+
+  for (int jsou = 0; jsou < 3; jsou++)
+    diipbv[jsou] = diipb[f_id][jsou];
+
+  /* x-y-z components, p = u, v, w */
+
+  for (int isou = 0; isou < 3; isou++) {
+    pfac = inc*coefa[f_id][isou];
+    /* coefu is a matrix */
+    for (int jsou = 0; jsou < 3; jsou++)
+      pfac += coefb[f_id][jsou][isou]*(  pvar[c_id1][jsou]
+                                       + grad[c_id1][jsou][0]*diipbv[0]
+                                       + grad[c_id1][jsou][1]*diipbv[1]
+                                       + grad[c_id1][jsou][2]*diipbv[2]);
+
+    for (int jsou = 0; jsou < 3; jsou++)
+      atomicAdd(&grdpa[c_id1][isou][jsou], pfac*b_f_face_normal[f_id][jsou]);
+  }
+}
+
+
+__global__ static void
+cs_slope_test_gradient_vector_cuda_f(const cs_lnum_t  n_cells,
+                                     cs_real_t       *cell_vol,
+                                     cs_real_33_t    *grdpa)
+{
+  cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (c_idx >= n_cells) {
+    return;
+  }
+
+  /* one thread per tensor component: n_cells is expected to be
+     the number of cells multiplied by 3*3 */
+  size_t c_id = c_idx / (3*3);
+  size_t i = (c_idx / 3) % 3;
+  size_t j = c_idx % 3;
+
+  cs_real_t unsvol = 1./cell_vol[c_id];
+  grdpa[c_id][i][j] *= unsvol;
+}
diff --git a/src/base/cs_base_cuda.cu b/src/base/cs_base_cuda.cu
index 289a7deeea..21561ae5f8 100644
--- a/src/base/cs_base_cuda.cu
+++ b/src/base/cs_base_cuda.cu
@@ -224,7 +224,7 @@ cs_cuda_mem_free(void *p,
   CS_CUDA_CHECK_CALL(cudaFree(p), file_name, line_num);
 
 #if 0
-  CS_CUDA_CHECK_CALL((cudaDeviceSynchronize(), file_name, line_num);
+  CS_CUDA_CHECK_CALL(cudaDeviceSynchronize(), file_name, line_num);
 #endif
 }
 
@@ -257,7 +257,7 @@ cs_cuda_mem_free_host(void *p,
   CS_CUDA_CHECK_CALL(cudaFreeHost(p), file_name, line_num);
 
 #if 0
-  CS_CUDA_CHECK_CALL((cudaDeviceSynchronize(), file_name, line_num);
+  CS_CUDA_CHECK_CALL(cudaDeviceSynchronize(), file_name, line_num);
 #endif
 }
 
@@ -325,11 +325,13 @@ cs_cuda_copy_h2d_async(void *dst,
 /*----------------------------------------------------------------------------*/
 
 void
-cs_cuda_copy_d2h(void *dst,
+_cs_cuda_copy_d2h(void *dst,
                  const void *src,
-                 size_t size)
+                 size_t size,
+                 const char* filename,
+                 long line)
 {
-  CS_CUDA_CHECK(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost));
+  CS_CUDA_CHECK_CALL(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost), filename, line);
 }
 
 /*----------------------------------------------------------------------------*/
diff --git a/src/base/cs_base_cuda.h b/src/base/cs_base_cuda.h
index fdcf8fac52..88c6748943 100644
--- a/src/base/cs_base_cuda.h
+++ b/src/base/cs_base_cuda.h
@@ -286,9 +286,13 @@ cs_cuda_copy_h2d_async(void *dst,
 /*----------------------------------------------------------------------------*/
 
 void
-cs_cuda_copy_d2h(void *dst,
+_cs_cuda_copy_d2h(void *dst,
                  const void *src,
-                 size_t size);
+                 size_t size,
+                 const char* filename,
+                 long line);
+
+#define cs_cuda_copy_d2h(dst, src, size) _cs_cuda_copy_d2h(dst, src, size, __FILE__, __LINE__)
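For reference, the sketch below shows how a call site might combine one of the kernels above with the new cs_cuda_copy_d2h macro. It is an illustrative sketch, not part of the patch: the helper name _get_grdpa_from_device, the buffers cell_vol_d, grdpa_d and grdpa_h, and the launch sizing (which simply follows the kernel's one-thread-per-tensor-component indexing) are assumptions.

/* Hypothetical helper, for illustration only: normalize the slope-test
   gradient on the device, then copy it back to the host. */

static void
_get_grdpa_from_device(cs_lnum_t      n_cells,
                       cs_real_t     *cell_vol_d,   /* device cell volumes */
                       cs_real_33_t  *grdpa_d,      /* device gradient */
                       cs_real_33_t  *grdpa_h)      /* host destination */
{
  /* one thread per (cell, i, j) tensor component */
  cs_lnum_t n_comp = n_cells * 3 * 3;
  unsigned int blocksize = 256;
  unsigned int gridsize = (unsigned int)((n_comp + blocksize - 1) / blocksize);

  cs_slope_test_gradient_vector_cuda_f<<<gridsize, blocksize>>>
    (n_comp, cell_vol_d, grdpa_d);
  CS_CUDA_CHECK(cudaDeviceSynchronize());

  /* expands to _cs_cuda_copy_d2h(dst, src, size, __FILE__, __LINE__),
     so a failing copy is reported with this file and line */
  cs_cuda_copy_d2h(grdpa_h, grdpa_d, sizeof(cs_real_33_t)*n_cells);
}

Routing the copy through the macro keeps existing call sites unchanged while letting CS_CUDA_CHECK_CALL report the caller's location instead of a fixed line inside cs_base_cuda.cu.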
 
 /*----------------------------------------------------------------------------*/
 /*!