template <typename Torus>
__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
-                     uint32_t num_coeffs, uint32_t in_len, uint32_t out_len) {
-  auto nbits = sizeof(Torus) * 8;
+                     uint32_t num_glwes, uint32_t in_len, uint32_t out_len) {

  auto tid = threadIdx.x + blockIdx.x * blockDim.x;

-  auto glwe_index = tid / out_len;
-  auto i = tid % out_len;
-  auto chunk_array_in = array_in + glwe_index * in_len;
-  auto chunk_array_out = array_out + glwe_index * out_len;
+  if (tid < num_glwes * out_len) {
+    auto NBITS = sizeof(Torus) * 8;
+    auto glwe_index = tid / out_len;
+    auto i = tid % out_len;
+    auto chunk_array_in = array_in + glwe_index * in_len;
+    auto chunk_array_out = array_out + glwe_index * out_len;

-  if (tid < num_coeffs) {
-
-    auto k = nbits * i / log_modulus;
+    auto k = NBITS * i / log_modulus;
    auto j = k;

-    auto start_shift = i * nbits - j * log_modulus;
+    auto start_shift = i * NBITS - j * log_modulus;

    auto value = chunk_array_in[j] >> start_shift;
    j++;

-    while (j * log_modulus < ((i + 1) * nbits) && j < in_len) {
-      auto shift = j * log_modulus - i * nbits;
+    while (j * log_modulus < ((i + 1) * NBITS) && j < in_len) {
+      auto shift = j * log_modulus - i * NBITS;
      value |= chunk_array_in[j] << shift;
      j++;
    }
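For reference, the packing loop above is an output-word-centric rewrite of plain bitstream concatenation: each thread owns one word of array_out and gathers every log_modulus-bit input field overlapping it, so no two threads write the same word and no atomics are needed. A minimal CPU sketch of the same layout, written input-word-first (a hypothetical helper for illustration, assuming 0 < log_modulus < 64 and that each input word carries its payload in its low log_modulus bits):

#include <cstdint>
#include <vector>

// Scatter the low `log_modulus` bits of each input word into a dense
// bitstream; the GPU kernel produces the identical layout but iterates
// output-word-first so each thread writes exactly one word.
std::vector<uint64_t> pack_reference(const std::vector<uint64_t> &in,
                                     uint32_t log_modulus) {
  const uint32_t NBITS = 64;
  const uint64_t mask = (uint64_t{1} << log_modulus) - 1;
  size_t total_bits = in.size() * log_modulus;
  std::vector<uint64_t> out((total_bits + NBITS - 1) / NBITS, 0);
  for (size_t j = 0; j < in.size(); ++j) {
    size_t bit = j * log_modulus; // absolute bit offset of field j
    out[bit / NBITS] |= (in[j] & mask) << (bit % NBITS);
    if (bit % NBITS + log_modulus > NBITS) // field straddles two words
      out[bit / NBITS + 1] |= (in[j] & mask) >> (NBITS - bit % NBITS);
  }
  return out;
}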
@@ -51,30 +50,30 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
    PANIC("Cuda error: Input and output must be different");

  cuda_set_device(gpu_index);
+  auto NBITS = sizeof(Torus) * 8;
  auto compression_params = mem_ptr->compression_params;
-
  auto log_modulus = mem_ptr->storage_log_modulus;
-  // [0..num_glwes-1) GLWEs
-  auto in_len = (compression_params.glwe_dimension + 1) *
-                compression_params.polynomial_size;
-  auto number_bits_to_pack = in_len * log_modulus;
-  auto nbits = sizeof(Torus) * 8;
-  // number_bits_to_pack.div_ceil(Scalar::BITS)
-  auto out_len = (number_bits_to_pack + nbits - 1) / nbits;

-  // Last GLWE
-  number_bits_to_pack = in_len * log_modulus;
-  auto last_out_len = (number_bits_to_pack + nbits - 1) / nbits;
+  auto glwe_ciphertext_size = (compression_params.glwe_dimension + 1) *
+                              compression_params.polynomial_size;
+  auto glwe_mask_size =
+      compression_params.glwe_dimension * compression_params.polynomial_size;

-  auto num_coeffs = (num_glwes - 1) * out_len + last_out_len;
+  auto uncompressed_len = num_glwes * glwe_mask_size + num_lwes;
+  auto number_bits_to_pack = uncompressed_len * log_modulus;

-  int num_blocks = 0, num_threads = 0;
-  getNumBlocksAndThreads(num_coeffs, 1024, num_blocks, num_threads);
+  // equivalent to number_bits_to_pack.div_ceil(Scalar::BITS)
+  auto compressed_len = (number_bits_to_pack + NBITS - 1) / NBITS;
+
+  // Kernel settings
+  int num_blocks = 0, num_threads = 0;
+  getNumBlocksAndThreads(num_glwes * compressed_len, 1024, num_blocks,
+                         num_threads);
  dim3 grid(num_blocks);
  dim3 threads(num_threads);
  pack<Torus><<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus,
-                                            num_coeffs, in_len, out_len);
+                                            num_glwes, uncompressed_len,
+                                            compressed_len);
  check_cuda_error(cudaGetLastError());
}
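The recurring (bits + NBITS - 1) / NBITS expression is integer ceiling division, matching the div_ceil the comment refers to. Worked through with hypothetical sizes: 1000 coefficients at log_modulus = 12 occupy 12000 bits, hence (12000 + 63) / 64 = 188 words of a 64-bit Torus, with the last word only partially filled:

// Ceiling division as used above (equivalent to Rust's div_ceil).
constexpr uint64_t div_ceil(uint64_t num, uint64_t den) {
  return (num + den - 1) / den;
}
static_assert(div_ceil(1000 * 12, 64) == 188, "12000 bits -> 188 words");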
@@ -144,7 +143,7 @@ template <typename Torus>
__global__ void extract(Torus *glwe_array_out, Torus const *array_in,
                        uint32_t index, uint32_t log_modulus,
                        uint32_t input_len, uint32_t initial_out_len) {
-  auto nbits = sizeof(Torus) * 8;
+  auto NBITS = sizeof(Torus) * 8;

  auto i = threadIdx.x + blockIdx.x * blockDim.x;
  auto chunk_array_in = array_in + index * input_len;
@@ -154,10 +153,10 @@ __global__ void extract(Torus *glwe_array_out, Torus const *array_in,
    auto start = i * log_modulus;
    auto end = (i + 1) * log_modulus;

-    auto start_block = start / nbits;
-    auto start_remainder = start % nbits;
+    auto start_block = start / NBITS;
+    auto start_remainder = start % NBITS;

-    auto end_block_inclusive = (end - 1) / nbits;
+    auto end_block_inclusive = (end - 1) / NBITS;

    Torus unpacked_i;
    if (start_block == end_block_inclusive) {
@@ -166,13 +165,13 @@ __global__ void extract(Torus *glwe_array_out, Torus const *array_in,
    } else {
      auto first_part = chunk_array_in[start_block] >> start_remainder;
      auto second_part = chunk_array_in[start_block + 1]
-                         << (nbits - start_remainder);
+                         << (NBITS - start_remainder);

      unpacked_i = (first_part | second_part) & mask;
    }

    // Extract
-    glwe_array_out[i] = unpacked_i << (nbits - log_modulus);
+    glwe_array_out[i] = unpacked_i << (NBITS - log_modulus);
  }
}
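The arithmetic above locates field i at bit offset i * log_modulus, stitches two adjacent words together when the field straddles a word boundary, and re-aligns the result to the most significant bits. A single-value CPU sketch of the same index math (hypothetical helper, assuming 0 < log_modulus < 64):

#include <cstdint>
#include <vector>

// Recover the i-th log_modulus-bit field from the packed stream and
// re-align it to the top of the word, mirroring
// `glwe_array_out[i] = unpacked_i << (NBITS - log_modulus)` above.
uint64_t extract_reference(const std::vector<uint64_t> &packed, size_t i,
                           uint32_t log_modulus) {
  const uint32_t NBITS = 64;
  const uint64_t mask = (uint64_t{1} << log_modulus) - 1;
  size_t start = i * log_modulus;
  uint64_t value = packed[start / NBITS] >> (start % NBITS);
  // True exactly when start_block != end_block_inclusive in the kernel.
  if (start % NBITS + log_modulus > NBITS)
    value |= packed[start / NBITS + 1] << (NBITS - start % NBITS);
  return (value & mask) << (NBITS - log_modulus);
}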
@@ -186,38 +185,38 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
    PANIC("Cuda error: Input and output must be different");

  cuda_set_device(gpu_index);
-
+  auto NBITS = sizeof(Torus) * 8;
  auto compression_params = mem_ptr->compression_params;
-
  auto log_modulus = mem_ptr->storage_log_modulus;

  uint32_t body_count =
      std::min(mem_ptr->body_count, compression_params.polynomial_size);
-  auto initial_out_len =
+  // num_glwes = 1 in this case
+  auto uncompressed_len =
      compression_params.glwe_dimension * compression_params.polynomial_size +
      body_count;

-  auto compressed_glwe_accumulator_size =
-      (compression_params.glwe_dimension + 1) *
-      compression_params.polynomial_size;
-  auto number_bits_to_unpack = compressed_glwe_accumulator_size * log_modulus;
-  auto nbits = sizeof(Torus) * 8;
+  auto glwe_ciphertext_size = (compression_params.glwe_dimension + 1) *
+                              compression_params.polynomial_size;
+  auto number_bits_to_unpack = uncompressed_len * log_modulus;
  // number_bits_to_unpack.div_ceil(Scalar::BITS)
-  auto input_len = (number_bits_to_unpack + nbits - 1) / nbits;
+  auto compressed_len = (number_bits_to_unpack + NBITS - 1) / NBITS;

  // We ensure the tail of the glwe is zeroed
-  auto zeroed_slice = glwe_array_out + initial_out_len;
+  auto zeroed_slice = glwe_array_out + uncompressed_len;
  cuda_memset_async(zeroed_slice, 0,
                    (compression_params.polynomial_size - body_count) *
                        sizeof(Torus),
                    stream, gpu_index);
+
+  // Kernel settings
  int num_blocks = 0, num_threads = 0;
-  getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
+  getNumBlocksAndThreads(uncompressed_len, 128, num_blocks, num_threads);
  dim3 grid(num_blocks);
  dim3 threads(num_threads);
-  extract<Torus><<<grid, threads, 0, stream>>>(glwe_array_out, array_in,
-                                               glwe_index, log_modulus,
-                                               input_len, initial_out_len);
+  extract<Torus><<<grid, threads, 0, stream>>>(
+      glwe_array_out, array_in, glwe_index, log_modulus, compressed_len,
+      uncompressed_len);
  check_cuda_error(cudaGetLastError());
}
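Because uncompressed_len covers only the mask plus body_count body coefficients, the decompressed GLWE ends with a tail of polynomial_size - body_count words the kernel never writes, which is exactly the span cleared by cuda_memset_async. A short sketch with assumed example parameters to make the layout concrete:

// Hypothetical sizes for illustration
// (glwe_dimension = 1, polynomial_size = 1024, body_count = 600):
uint32_t glwe_dimension = 1, polynomial_size = 1024, body_count = 600;
uint32_t uncompressed_len =
    glwe_dimension * polynomial_size + body_count;       // 1624 words written
uint32_t full_glwe_size = (glwe_dimension + 1) * polynomial_size; // 2048 total
uint32_t zeroed_tail = full_glwe_size - uncompressed_len;         // 424 words
// zeroed_tail == polynomial_size - body_count, the count passed to
// cuda_memset_async above.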