 #include "polynomial/functions.cuh"
 #include "utils/kernel_dimensions.cuh"
 
+// This kernel follows the naming used in the rust implementation
 template <typename Torus>
 __global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
                      uint32_t num_glwes, uint32_t in_len, uint32_t out_len) {
   auto tid = threadIdx.x + blockIdx.x * blockDim.x;
 
   if (tid < num_glwes * out_len) {
-    auto NBITS = sizeof(Torus) * 8;
+    const auto NBITS = sizeof(Torus) * 8;
     auto glwe_index = tid / out_len;
     auto i = tid % out_len;
     auto chunk_array_in = array_in + glwe_index * in_len;
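For reference, the packing that `pack` performs can be modeled on the host with the sketch below. This is an illustrative scalar reference, not code from this PR: `pack_reference` is a hypothetical name, and it assumes `log_modulus < NBITS` and the bit layout that the `extract` kernel below undoes, i.e. element `i` contributes its low `log_modulus` bits at bit offset `i * log_modulus` of the packed stream.

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical host-side reference of the bit-packing, for illustration only
template <typename Torus>
std::vector<Torus> pack_reference(const std::vector<Torus> &in,
                                  uint32_t log_modulus) {
  const auto NBITS = sizeof(Torus) * 8;
  const auto total_bits = in.size() * log_modulus;
  std::vector<Torus> out((total_bits + NBITS - 1) / NBITS, 0);
  const Torus mask = (static_cast<Torus>(1) << log_modulus) - 1;
  for (std::size_t i = 0; i < in.size(); ++i) {
    const auto start = i * log_modulus; // first bit of element i
    const auto block = start / NBITS;   // word holding that bit
    const auto shift = start % NBITS;   // offset inside the word
    out[block] |= (in[i] & mask) << shift;
    if (shift + log_modulus > NBITS) // element straddles two words
      out[block + 1] |= (in[i] & mask) >> (NBITS - shift);
  }
  return out;
}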
@@ -50,7 +51,7 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
     PANIC("Cuda error: Input and output must be different");
 
   cuda_set_device(gpu_index);
-  auto NBITS = sizeof(Torus) * 8;
+  const auto NBITS = sizeof(Torus) * 8;
   auto compression_params = mem_ptr->compression_params;
   auto log_modulus = mem_ptr->storage_log_modulus;
 
@@ -139,24 +140,26 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
       num_radix_blocks, mem_ptr);
 }
 
+// This kernel follows the naming used in the rust implementation
+// except for output_len, which relates to initial_len
 template <typename Torus>
-__global__ void extract(Torus *glwe_array_out, Torus const *array_in,
-                        uint32_t index, uint32_t log_modulus,
-                        uint32_t input_len, uint32_t initial_out_len) {
-  auto NBITS = sizeof(Torus) * 8;
+__global__ void extract(Torus *glwe_array_out, const Torus *array_in,
+                        const uint32_t index, const uint32_t log_modulus,
+                        const uint32_t input_len, const uint32_t output_len) {
+  const auto NBITS = sizeof(Torus) * 8;
 
   auto i = threadIdx.x + blockIdx.x * blockDim.x;
   auto chunk_array_in = array_in + index * input_len;
-  if (i < initial_out_len) {
+  if (i < output_len) {
     // Unpack
-    Torus mask = ((Torus)1 << log_modulus) - 1;
-    auto start = i * log_modulus;
-    auto end = (i + 1) * log_modulus;
+    auto mask = (static_cast<Torus>(1) << log_modulus) - 1;
+    const auto start = i * log_modulus;
+    const auto end = (i + 1) * log_modulus;
 
     auto start_block = start / NBITS;
     auto start_remainder = start % NBITS;
 
-    auto end_block_inclusive = (end - 1) / NBITS;
+    const auto end_block_inclusive = (end - 1) / NBITS;
 
     Torus unpacked_i;
     if (start_block == end_block_inclusive) {
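As a concrete trace of the index arithmetic above (illustrative values, not taken from the PR): with `Torus = uint64_t`, so `NBITS = 64`, and `log_modulus = 12`, element `i = 10` occupies bits [120, 132) of the packed stream, so `start_block = 1`, `start_remainder = 56` and `end_block_inclusive = 131 / 64 = 2`. Since `start_block != end_block_inclusive`, the element straddles two words. The straddling branch is outside this hunk, so the expression below is an assumption consistent with the visible index computation:

// Hypothetical reconstruction of the two-word branch, for illustration only:
// 8 bits come from the top of word 1, 4 bits from the bottom of word 2
Torus unpacked_i =
    (chunk_array_in[start_block] >> start_remainder) |
    (chunk_array_in[start_block + 1] << (NBITS - start_remainder));
unpacked_i &= mask; // keep only the 12 packed bits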
@@ -185,30 +188,52 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
     PANIC("Cuda error: Input and output must be different");
 
   cuda_set_device(gpu_index);
-  auto NBITS = sizeof(Torus) * 8;
   auto compression_params = mem_ptr->compression_params;
+  auto num_glwes =
+      (mem_ptr->body_count + compression_params.polynomial_size - 1) /
+      compression_params.polynomial_size;
+  printf("glwe_index: %u / %u\n", glwe_index, num_glwes);
+  const auto NBITS = sizeof(Torus) * 8;
+  printf("CUDA NBITS: %zu\n", NBITS);
   auto log_modulus = mem_ptr->storage_log_modulus;
+  printf("CUDA log_modulus: %u\n", log_modulus);
+
+  auto glwe_ciphertext_size = (compression_params.glwe_dimension + 1) *
+                              compression_params.polynomial_size;
+  printf("CUDA glwe_ciphertext_size: %u\n", glwe_ciphertext_size);
+
+  auto glwe_mask_size =
+      compression_params.glwe_dimension * compression_params.polynomial_size;
+  printf("CUDA glwe_mask_size: %u\n", glwe_mask_size);
+
+  // The last GLWE may be partially filled; guard the exact-multiple case so
+  // the modulo does not yield 0 lwes for a full last GLWE
+  const auto remainder =
+      mem_ptr->body_count % compression_params.polynomial_size;
+  uint32_t num_lwes = (glwe_index == num_glwes - 1 && remainder != 0)
+                          ? remainder
+                          : compression_params.polynomial_size;
+  printf("CUDA num_lwes: %u\n", num_lwes);
 
-  uint32_t body_count =
-      std::min(mem_ptr->body_count, compression_params.polynomial_size);
   // num_glwes = 1 in this case
   auto uncompressed_len =
-      compression_params.glwe_dimension * compression_params.polynomial_size +
-      body_count;
+      glwe_ciphertext_size;
+  printf("CUDA uncompressed_len: %u\n", uncompressed_len);
 
-  auto glwe_ciphertext_size = (compression_params.glwe_dimension + 1) *
-                              compression_params.polynomial_size;
   auto number_bits_to_unpack = uncompressed_len * log_modulus;
+  printf("CUDA number_bits_to_unpack: %u\n", number_bits_to_unpack);
+
   // number_bits_to_unpack.div_ceil(Scalar::BITS)
   auto compressed_len = (number_bits_to_unpack + NBITS - 1) / NBITS;
+  printf("CUDA compressed_len: %zu\n", compressed_len);
 
-  // We assure the tail of the glwe is zeroed
+  // We ensure the tail of the glwe is zeroed
   auto zeroed_slice = glwe_array_out + uncompressed_len;
   cuda_memset_async(zeroed_slice, 0,
-                    (compression_params.polynomial_size - body_count) *
+                    (compression_params.polynomial_size - num_lwes) *
                         sizeof(Torus),
                     stream, gpu_index);
-
+  // cuda_memset_async(glwe_array_out, 0, glwe_ciphertext_size * sizeof(Torus), stream, gpu_index);
   // Kernel settings
   int num_blocks = 0, num_threads = 0;
   getNumBlocksAndThreads(uncompressed_len, 128, num_blocks, num_threads);
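A quick sanity check on the sizing arithmetic: `(number_bits_to_unpack + NBITS - 1) / NBITS` is the integer ceiling division the `div_ceil` comment refers to. A minimal standalone check, with a hypothetical helper and illustrative parameters (glwe_dimension = 1, polynomial_size = 2048, log_modulus = 12, 64-bit Torus, so uncompressed_len = (1 + 1) * 2048 = 4096 and number_bits_to_unpack = 4096 * 12 = 49152):

#include <cstdint>

// Illustrative mirror of Rust's div_ceil, not part of this codebase
constexpr std::uint64_t div_ceil(std::uint64_t n, std::uint64_t d) {
  return (n + d - 1) / d;
}

static_assert(div_ceil(49152, 64) == 768, "exact multiple: no rounding needed");
static_assert(div_ceil(49153, 64) == 769, "one extra bit costs a full word");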