chore(gpu): refactor arithmetic scalar shift

agnesLeroy · agnesLeroy · commit bfd3773322f2 · 2025-02-13T20:58:12.000+01:00
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -177,8 +177,8 @@ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
 
 void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, uint32_t shift, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks);
+    CudaRadixCiphertextFFI *lwe_array, uint32_t shift, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks);
 
 void cleanup_cuda_integer_radix_logical_scalar_shift(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -2877,7 +2877,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
 
   SHIFT_OR_ROTATE_TYPE shift_type;
 
-  Torus *tmp_rotated;
+  CudaRadixCiphertextFFI *tmp_rotated;
 
   cudaStream_t *local_streams_1;
   cudaStream_t *local_streams_2;
@@ -2909,13 +2909,10 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
       uint32_t big_lwe_size = params.big_lwe_dimension + 1;
       uint32_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
 
-      tmp_rotated = (Torus *)cuda_malloc_async((num_radix_blocks + 3) *
-                                                   big_lwe_size_bytes,
-                                               streams[0], gpu_indexes[0]);
-
-      cuda_memset_async(tmp_rotated, 0,
-                        (num_radix_blocks + 3) * big_lwe_size_bytes, streams[0],
-                        gpu_indexes[0]);
+      tmp_rotated = new CudaRadixCiphertextFFI;
+      create_zero_radix_ciphertext_async<Torus>(
+          streams[0], gpu_indexes[0], tmp_rotated, num_radix_blocks + 3,
+          params.big_lwe_dimension);
 
       uint32_t num_bits_in_block = (uint32_t)std::log2(params.message_modulus);
 
@@ -3051,7 +3048,8 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
     lut_buffers_bivariate.clear();
     lut_buffers_univariate.clear();
 
-    cuda_drop_async(tmp_rotated, streams[0], gpu_indexes[0]);
+    release_radix_ciphertext(streams[0], gpu_indexes[0], tmp_rotated);
+    delete tmp_rotated;
   }
 };
 
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
@@ -48,7 +48,7 @@ __host__ void legacy_host_integer_abs_kb_async(
   cuda_memcpy_async_gpu_to_gpu(mask, ct, num_blocks * big_lwe_size_bytes,
                                streams[0], gpu_indexes[0]);
 
-  host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
+  legacy_host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
       streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
       mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, num_blocks);
   legacy_host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
@@ -84,9 +84,8 @@ host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
   copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], mask, ct);
 
   host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-      streams, gpu_indexes, gpu_count, (Torus *)(mask->ptr),
-      num_bits_in_ciphertext - 1, mem_ptr->arithmetic_scalar_shift_mem, bsks,
-      ksks, ct->num_radix_blocks);
+      streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
+      mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
   host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
                        ct->num_radix_blocks);
 
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
@@ -64,14 +64,13 @@ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
 /// zeros as would be done in the logical shift.
 void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, uint32_t shift, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks) {
+    CudaRadixCiphertextFFI *lwe_array, uint32_t shift, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks) {
 
   host_integer_radix_arithmetic_scalar_shift_kb_inplace<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lwe_array), shift,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, shift,
       (int_arithmetic_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)(ksks), num_blocks);
+      (uint64_t **)(ksks));
 }
 
 void cleanup_cuda_integer_radix_logical_scalar_shift(
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -224,7 +224,7 @@ __host__ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
 }
 
 template <typename Torus>
-__host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
+__host__ void legacy_host_integer_radix_arithmetic_scalar_shift_kb_inplace(
     cudaStream_t const *streams, uint32_t const *gpu_indexes,
     uint32_t gpu_count, Torus *lwe_array, uint32_t shift,
     int_arithmetic_scalar_shift_buffer<Torus> *mem, void *const *bsks,
@@ -248,7 +248,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
   size_t rotations = std::min(shift / num_bits_in_block, (size_t)num_blocks);
   size_t shift_within_block = shift % num_bits_in_block;
 
-  Torus *rotated_buffer = mem->tmp_rotated;
+  Torus *rotated_buffer = (Torus *)mem->tmp_rotated->ptr;
   Torus *padding_block = &rotated_buffer[(num_blocks + 1) * big_lwe_size];
   Torus *last_block_copy = &padding_block[big_lwe_size];
 
@@ -339,4 +339,138 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
   }
 }
 
+/// Arithmetic right shift of the radix ciphertext `lwe_array` by the clear
+/// amount `shift`, in place: vacated most significant blocks are filled from
+/// the sign block instead of zeros.
+///
+/// `shift` is reduced modulo the number of message bits in `lwe_array`; a
+/// reduced shift of 0, or an empty ciphertext, leaves `lwe_array` untouched.
+/// `mem->tmp_rotated` is used as scratch: blocks [0, num_blocks) are copied
+/// back into `lwe_array` after the rotation, block num_blocks + 1 is the
+/// padding block and block num_blocks + 2 a copy of the sign block. `bsks` /
+/// `ksks` are forwarded to the lookup-table applications. PANICs if
+/// `mem->shift_type` is not RIGHT_SHIFT.
+template <typename Torus>
+__host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
+    int_arithmetic_scalar_shift_buffer<Torus> *mem, void *const *bsks,
+    Torus *const *ksks) {
+
+  auto num_blocks = lwe_array->num_radix_blocks;
+  auto params = mem->params;
+  auto message_modulus = params.message_modulus;
+
+  size_t num_bits_in_block = (size_t)log2_int(message_modulus);
+  size_t total_num_bits = num_bits_in_block * num_blocks;
+  // Nothing to shift when there are no message bits; returning here also
+  // keeps the modulo below from dividing by zero.
+  if (total_num_bits == 0) {
+    return;
+  }
+  shift = shift % total_num_bits;
+
+  if (shift == 0) {
+    return;
+  }
+  size_t rotations = std::min(shift / num_bits_in_block, (size_t)num_blocks);
+  size_t shift_within_block = shift % num_bits_in_block;
+
+  CudaRadixCiphertextFFI padding_block;
+  as_radix_ciphertext_slice<Torus>(&padding_block, mem->tmp_rotated,
+                                   num_blocks + 1, num_blocks + 2);
+  CudaRadixCiphertextFFI last_block_copy;
+  as_radix_ciphertext_slice<Torus>(&last_block_copy, mem->tmp_rotated,
+                                   num_blocks + 2, num_blocks + 3);
+
+  if (mem->shift_type == RIGHT_SHIFT) {
+    host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
+                                         mem->tmp_rotated, lwe_array, rotations,
+                                         num_blocks);
+    copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
+                                             lwe_array, 0, num_blocks,
+                                             mem->tmp_rotated, 0, num_blocks);
+
+    if (num_bits_in_block == 1) {
+      // if there is only 1 bit in the msg part, it means shift_within block is
+      // 0 thus only rotations is required.
+
+      // We still need to pad with the value of the sign bit.
+      // And here since a block only has 1 bit of message
+      // we can optimize things by not doing the pbs to extract this sign bit
+      //
+      // The result lives in lwe_array, so the `rotations` blocks pulled in
+      // from the left are overwritten there with the sign block, which sits
+      // at index num_blocks - rotations - 1 after the rotation.
+      for (uint i = 0; i < rotations; i++) {
+        copy_radix_ciphertext_slice_async<Torus>(
+            streams[0], gpu_indexes[0], lwe_array, num_blocks - rotations + i,
+            num_blocks - rotations + i + 1, mem->tmp_rotated,
+            num_blocks - rotations - 1, num_blocks - rotations);
+      }
+      return;
+    }
+
+    if (num_blocks != rotations) {
+      // In the arithmetic shift case we have to pad with the value of the sign
+      // bit. This creates the need for a different shifting lut than in the
+      // logical shift case. We also need another PBS to create the padding
+      // block.
+      CudaRadixCiphertextFFI last_block;
+      as_radix_ciphertext_slice<Torus>(&last_block, lwe_array,
+                                       num_blocks - rotations - 1,
+                                       num_blocks - rotations);
+      copy_radix_ciphertext_slice_async<Torus>(
+          streams[0], gpu_indexes[0], &last_block_copy, 0, 1, mem->tmp_rotated,
+          num_blocks - rotations - 1, num_blocks - rotations);
+      if (shift_within_block != 0) {
+        auto partial_current_blocks = lwe_array;
+        CudaRadixCiphertextFFI partial_next_blocks;
+        as_radix_ciphertext_slice<Torus>(&partial_next_blocks, mem->tmp_rotated,
+                                         1, mem->tmp_rotated->num_radix_blocks);
+        size_t partial_block_count = num_blocks - rotations;
+        auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
+
+        integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+            streams, gpu_indexes, gpu_count, partial_current_blocks,
+            partial_current_blocks, &partial_next_blocks, bsks, ksks,
+            lut_bivariate, partial_block_count,
+            lut_bivariate->params.message_modulus);
+      }
+      // Since our CPU threads will be working on different streams we shall
+      // assert the work in the main stream is completed
+      for (uint j = 0; j < gpu_count; j++) {
+        cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+      }
+      auto lut_univariate_padding_block =
+          mem->lut_buffers_univariate[num_bits_in_block - 1];
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+          mem->local_streams_1, gpu_indexes, gpu_count, &padding_block,
+          &last_block_copy, bsks, ksks, lut_univariate_padding_block, 1);
+      // Replace blocks 'pulled' from the left with the correct padding
+      // block
+      for (uint i = 0; i < rotations; i++) {
+        copy_radix_ciphertext_slice_async<Torus>(
+            mem->local_streams_1[0], gpu_indexes[0], lwe_array,
+            num_blocks - rotations + i, num_blocks - rotations + i + 1,
+            &padding_block, 0, 1);
+      }
+      if (shift_within_block != 0) {
+        auto lut_univariate_shift_last_block =
+            mem->lut_buffers_univariate[shift_within_block - 1];
+        integer_radix_apply_univariate_lookup_table_kb<Torus>(
+            mem->local_streams_2, gpu_indexes, gpu_count, &last_block,
+            &last_block_copy, bsks, ksks, lut_univariate_shift_last_block, 1);
+      }
+      for (uint j = 0; j < mem->active_gpu_count; j++) {
+        cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]);
+        cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]);
+      }
+    }
+  } else {
+    PANIC("Cuda error (scalar shift): left scalar shift is never of the "
+          "arithmetic type")
+  }
+}
+
 #endif // CUDA_SCALAR_SHIFT_CUH
diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -467,12 +467,11 @@ unsafe extern "C" {
         streams: *const *mut ffi::c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        lwe_array: *mut ffi::c_void,
+        lwe_array: *mut CudaRadixCiphertextFFI,
         shift: u32,
         mem_ptr: *mut i8,
         bsks: *const *mut ffi::c_void,
         ksks: *const *mut ffi::c_void,
-        num_blocks: u32,
     );
 }
 unsafe extern "C" {
diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs
@@ -1671,7 +1671,7 @@ pub unsafe fn unchecked_scalar_arithmetic_right_shift_integer_radix_kb_assign_as
     B: Numeric,
 >(
     streams: &CudaStreams,
-    radix_lwe_left: &mut CudaVec<T>,
+    radix_lwe_left: &mut CudaRadixCiphertext,
     shift: u32,
     bootstrapping_key: &CudaVec<B>,
     keyswitch_key: &CudaVec<T>,
@@ -1685,13 +1685,12 @@ pub unsafe fn unchecked_scalar_arithmetic_right_shift_integer_radix_kb_assign_as
     ks_base_log: DecompositionBaseLog,
     pbs_level: DecompositionLevelCount,
     pbs_base_log: DecompositionBaseLog,
-    num_blocks: u32,
     pbs_type: PBSType,
     grouping_factor: LweBskGroupingFactor,
 ) {
     assert_eq!(
         streams.gpu_indexes[0],
-        radix_lwe_left.gpu_index(0),
+        radix_lwe_left.d_blocks.0.d_vec.gpu_index(0),
         "GPU error: all data should reside on the same GPU."
     );
     assert_eq!(
@@ -1705,6 +1704,24 @@ pub unsafe fn unchecked_scalar_arithmetic_right_shift_integer_radix_kb_assign_as
         "GPU error: all data should reside on the same GPU."
     );
     let mut mem_ptr: *mut i8 = std::ptr::null_mut();
+    let mut radix_lwe_left_degrees = radix_lwe_left
+        .info
+        .blocks
+        .iter()
+        .map(|b| b.degree.0)
+        .collect();
+    let mut radix_lwe_left_noise_levels = radix_lwe_left
+        .info
+        .blocks
+        .iter()
+        .map(|b| b.noise_level.0)
+        .collect();
+    let mut cuda_ffi_radix_lwe_left = prepare_cuda_radix_ffi(
+        radix_lwe_left,
+        &mut radix_lwe_left_degrees,
+        &mut radix_lwe_left_noise_levels,
+    );
+
     scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
         streams.ptr.as_ptr(),
         streams.gpu_indexes_ptr(),
@@ -1719,7 +1736,7 @@ pub unsafe fn unchecked_scalar_arithmetic_right_shift_integer_radix_kb_assign_as
         pbs_level.0 as u32,
         pbs_base_log.0 as u32,
         grouping_factor.0 as u32,
-        num_blocks,
+        radix_lwe_left.d_blocks.lwe_ciphertext_count().0 as u32,
         message_modulus.0 as u32,
         carry_modulus.0 as u32,
         pbs_type as u32,
@@ -1730,19 +1747,19 @@ pub unsafe fn unchecked_scalar_arithmetic_right_shift_integer_radix_kb_assign_as
         streams.ptr.as_ptr(),
         streams.gpu_indexes_ptr(),
         streams.len() as u32,
-        radix_lwe_left.as_mut_c_ptr(0),
+        &mut cuda_ffi_radix_lwe_left,
         shift,
         mem_ptr,
         bootstrapping_key.ptr.as_ptr(),
         keyswitch_key.ptr.as_ptr(),
-        num_blocks,
     );
     cleanup_cuda_integer_radix_arithmetic_scalar_shift(
         streams.ptr.as_ptr(),
         streams.gpu_indexes_ptr(),
         streams.len() as u32,
         std::ptr::addr_of_mut!(mem_ptr),
     );
+    update_noise_degree(radix_lwe_left, &cuda_ffi_radix_lwe_left);
 }
 
 #[allow(clippy::too_many_arguments)]
diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_shift.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_shift.rs
@@ -194,7 +194,7 @@ impl CudaServerKey {
                 CudaBootstrappingKey::Classic(d_bsk) => {
                     unchecked_scalar_arithmetic_right_shift_integer_radix_kb_assign_async(
                         streams,
-                        &mut ct.as_mut().d_blocks.0.d_vec,
+                        ct.as_mut(),
                         u32::cast_from(shift),
                         &d_bsk.d_vec,
                         &self.key_switching_key.d_vec,
@@ -212,15 +212,14 @@ impl CudaServerKey {
                         self.key_switching_key.decomposition_base_log(),
                         d_bsk.decomp_level_count,
                         d_bsk.decomp_base_log,
-                        lwe_ciphertext_count.0 as u32,
                         PBSType::Classical,
                         LweBskGroupingFactor(0),
                     );
                 }
                 CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
                     unchecked_scalar_arithmetic_right_shift_integer_radix_kb_assign_async(
                         streams,
-                        &mut ct.as_mut().d_blocks.0.d_vec,
+                        ct.as_mut(),
                         u32::cast_from(shift),
                         &d_multibit_bsk.d_vec,
                         &self.key_switching_key.d_vec,
@@ -238,7 +237,6 @@ impl CudaServerKey {
                         self.key_switching_key.decomposition_base_log(),
                         d_multibit_bsk.decomp_level_count,
                         d_multibit_bsk.decomp_base_log,
-                        lwe_ciphertext_count.0 as u32,
                         PBSType::MultiBit,
                         d_multibit_bsk.grouping_factor,
                     );