diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
index bd2dc4805c..fceb07544a 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -14,27 +14,26 @@ template <typename Torus>
 __global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
-                     uint32_t num_coeffs, uint32_t in_len, uint32_t out_len) {
-  auto nbits = sizeof(Torus) * 8;
+                     uint32_t num_glwes, uint32_t in_len, uint32_t out_len) {
   auto tid = threadIdx.x + blockIdx.x * blockDim.x;
-  auto glwe_index = tid / out_len;
-  auto i = tid % out_len;
-  auto chunk_array_in = array_in + glwe_index * in_len;
-  auto chunk_array_out = array_out + glwe_index * out_len;
+  if (tid < num_glwes * out_len) {
+    auto NBITS = sizeof(Torus) * 8;
+    auto glwe_index = tid / out_len;
+    auto i = tid % out_len;
+    auto chunk_array_in = array_in + glwe_index * in_len;
+    auto chunk_array_out = array_out + glwe_index * out_len;
 
-  if (tid < num_coeffs) {
-
-    auto k = nbits * i / log_modulus;
+    auto k = NBITS * i / log_modulus;
     auto j = k;
-    auto start_shift = i * nbits - j * log_modulus;
+    auto start_shift = i * NBITS - j * log_modulus;
 
     auto value = chunk_array_in[j] >> start_shift;
     j++;
 
-    while (j * log_modulus < ((i + 1) * nbits) && j < in_len) {
-      auto shift = j * log_modulus - i * nbits;
+    while (j * log_modulus < ((i + 1) * NBITS) && j < in_len) {
+      auto shift = j * log_modulus - i * NBITS;
       value |= chunk_array_in[j] << shift;
       j++;
     }
 
     chunk_array_out[i] = value;
   }
 }
@@ -51,30 +50,31 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
     PANIC("Cuda error: Input and output must be different");
 
   cuda_set_device(gpu_index);
+  auto NBITS = sizeof(Torus) * 8;
   auto compression_params = mem_ptr->compression_params;
-
   auto log_modulus = mem_ptr->storage_log_modulus;
-  // [0..num_glwes-1) GLWEs
-  auto in_len = (compression_params.glwe_dimension + 1) *
-                compression_params.polynomial_size;
-  auto number_bits_to_pack = in_len * log_modulus;
-  auto nbits = sizeof(Torus) * 8;
-  // number_bits_to_pack.div_ceil(Scalar::BITS)
-  auto out_len = (number_bits_to_pack + nbits - 1) / nbits;
-  // Last GLWE
-  number_bits_to_pack = in_len * log_modulus;
-  auto last_out_len = (number_bits_to_pack + nbits - 1) / nbits;
+  auto glwe_ciphertext_size = (compression_params.glwe_dimension + 1) *
+                              compression_params.polynomial_size;
+  auto glwe_mask_size =
+      compression_params.glwe_dimension * compression_params.polynomial_size;
+
+  auto uncompressed_len = num_glwes * glwe_mask_size + num_lwes;
+  auto number_bits_to_pack = uncompressed_len * log_modulus;
 
-  auto num_coeffs = (num_glwes - 1) * out_len + last_out_len;
+  // equivalent to number_bits_to_pack.div_ceil(Scalar::BITS)
+  auto compressed_len = (number_bits_to_pack + NBITS - 1) / NBITS;
 
+  // Kernel settings
   int num_blocks = 0, num_threads = 0;
-  getNumBlocksAndThreads(num_coeffs, 1024, num_blocks, num_threads);
+  getNumBlocksAndThreads(num_glwes * compressed_len, 1024, num_blocks,
+                         num_threads);
   dim3 grid(num_blocks);
   dim3 threads(num_threads);
 
   pack<Torus><<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus,
-                                            num_coeffs, in_len, out_len);
+                                            num_glwes, uncompressed_len,
+                                            compressed_len);
 
   check_cuda_error(cudaGetLastError());
 }
 
@@ -144,7 +144,7 @@ template <typename Torus>
 __global__ void extract(Torus *glwe_array_out, Torus const *array_in,
                         uint32_t index, uint32_t log_modulus,
                         uint32_t input_len, uint32_t initial_out_len) {
-  auto nbits = sizeof(Torus) * 8;
+  auto NBITS = sizeof(Torus) * 8;
 
   auto i = threadIdx.x + blockIdx.x * blockDim.x;
   auto chunk_array_in = array_in + index * input_len;
@@ -154,10 +154,10 @@ __global__ void extract(Torus *glwe_array_out, Torus const *array_in,
     auto start = i * log_modulus;
     auto end = (i + 1) * log_modulus;
 
-    auto start_block = start / nbits;
-    auto start_remainder = start % nbits;
+    auto start_block = start / NBITS;
+    auto start_remainder = start % NBITS;
 
-    auto end_block_inclusive = (end - 1) / nbits;
+    auto end_block_inclusive = (end - 1) / NBITS;
 
     Torus unpacked_i;
     if (start_block == end_block_inclusive) {
@@ -166,13 +166,13 @@ __global__ void extract(Torus *glwe_array_out, Torus const *array_in,
     } else {
       auto first_part = chunk_array_in[start_block] >> start_remainder;
       auto second_part = chunk_array_in[start_block + 1]
-                         << (nbits - start_remainder);
+                         << (NBITS - start_remainder);
 
       unpacked_i = (first_part | second_part) & mask;
     }
 
     // Extract
-    glwe_array_out[i] = unpacked_i << (nbits - log_modulus);
+    glwe_array_out[i] = unpacked_i << (NBITS - log_modulus);
   }
 }
 
@@ -201,9 +201,9 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
       (compression_params.glwe_dimension + 1) *
       compression_params.polynomial_size;
   auto number_bits_to_unpack = compressed_glwe_accumulator_size * log_modulus;
-  auto nbits = sizeof(Torus) * 8;
+  auto NBITS = sizeof(Torus) * 8;
   // number_bits_to_unpack.div_ceil(Scalar::BITS)
-  auto input_len = (number_bits_to_unpack + nbits - 1) / nbits;
+  auto input_len = (number_bits_to_unpack + NBITS - 1) / NBITS;
 
   // We assure the tail of the glwe is zeroed
   auto zeroed_slice = glwe_array_out + initial_out_len;
diff --git a/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs
index 38933d1729..27f93462dd 100644
--- a/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs
+++ b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs
@@ -2,7 +2,7 @@ use crate::core_crypto::entities::packed_integers::PackedIntegers;
 use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex};
 use crate::core_crypto::gpu::CudaStreams;
 use crate::core_crypto::prelude::compressed_modulus_switched_glwe_ciphertext::CompressedModulusSwitchedGlweCiphertext;
-use crate::core_crypto::prelude::{glwe_ciphertext_size, CiphertextCount, LweCiphertextCount};
+use crate::core_crypto::prelude::{glwe_ciphertext_size, glwe_mask_size, CiphertextCount, LweCiphertextCount};
 use crate::integer::ciphertext::{CompressedCiphertextList, DataKind};
 use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
 use crate::integer::gpu::ciphertext::{
@@ -348,20 +348,6 @@ impl CompressedCiphertextList {
             .flat_map(|ct| ct.packed_integers.packed_coeffs.clone())
             .collect_vec();
 
-        let glwe_ciphertext_count = self.packed_list.modulus_switched_glwe_ciphertext_list.len();
-        let glwe_size = self.packed_list.modulus_switched_glwe_ciphertext_list[0]
-            .glwe_dimension()
-            .to_glwe_size();
-        let polynomial_size =
-            self.packed_list.modulus_switched_glwe_ciphertext_list[0].polynomial_size();
-
-        // FIXME: have a more precise memory handling, this is too long and should be "just" the
-        // original flat_cpu_data.len()
-        let unpacked_glwe_ciphertext_flat_len =
-            glwe_ciphertext_count * glwe_ciphertext_size(glwe_size, polynomial_size);
-
-        flat_cpu_data.resize(unpacked_glwe_ciphertext_flat_len, 0u64);
-
         let flat_gpu_data = unsafe {
            let v = CudaVec::from_cpu_async(flat_cpu_data.as_slice(), streams, 0);
            streams.synchronize();
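Reviewer note: the bit-packing scheme the `pack`/`extract` kernels implement simply concatenates `log_modulus`-bit values into full `Torus` words, which is why both directions reduce to block-index and shift arithmetic. Below is a minimal, standalone host-side C++ sketch of the same round trip for a single chunk, assuming inputs already reduced modulo 2^log_modulus; function names and the test values (log_modulus = 12, 100 coefficients) are made up for illustration and are not part of this patch.

// Host-side sketch mirroring the pack/extract kernels on one chunk.
#include <cassert>
#include <cstdint>
#include <vector>

constexpr uint32_t NBITS = 64; // bits per storage word (Torus = uint64_t here)

// Mirrors the `pack` kernel: output word i gathers the log_modulus-bit
// values overlapping bit range [i * NBITS, (i + 1) * NBITS).
std::vector<uint64_t> pack_chunk(const std::vector<uint64_t> &in,
                                 uint32_t log_modulus) {
  size_t out_len = (in.size() * log_modulus + NBITS - 1) / NBITS; // div_ceil
  std::vector<uint64_t> out(out_len);
  for (size_t i = 0; i < out_len; ++i) {
    size_t j = NBITS * i / log_modulus; // first value touching word i
    uint64_t value = in[j] >> (i * NBITS - j * log_modulus);
    ++j;
    while (j * log_modulus < (i + 1) * NBITS && j < in.size()) {
      value |= in[j] << (j * log_modulus - i * NBITS);
      ++j;
    }
    out[i] = value;
  }
  return out;
}

// Mirrors the `extract` kernel: reads value i back and re-aligns it to the
// most significant bits, as the GLWE decompression expects.
uint64_t extract_one(const std::vector<uint64_t> &packed, size_t i,
                     uint32_t log_modulus) {
  uint64_t mask = (uint64_t{1} << log_modulus) - 1;
  size_t start = i * log_modulus, end = (i + 1) * log_modulus;
  size_t start_block = start / NBITS, start_remainder = start % NBITS;
  size_t end_block_inclusive = (end - 1) / NBITS;
  uint64_t unpacked;
  if (start_block == end_block_inclusive) {
    unpacked = (packed[start_block] >> start_remainder) & mask;
  } else { // value straddles two words; start_remainder > 0 here
    uint64_t first_part = packed[start_block] >> start_remainder;
    uint64_t second_part = packed[start_block + 1] << (NBITS - start_remainder);
    unpacked = (first_part | second_part) & mask;
  }
  return unpacked << (NBITS - log_modulus);
}

int main() {
  uint32_t log_modulus = 12; // made-up storage modulus for the test
  std::vector<uint64_t> coeffs;
  for (uint64_t c = 0; c < 100; ++c)
    coeffs.push_back((c * 0x9e3779b97f4a7c15ULL) &
                     ((uint64_t{1} << log_modulus) - 1));
  auto packed = pack_chunk(coeffs, log_modulus);
  for (size_t i = 0; i < coeffs.size(); ++i)
    assert(extract_one(packed, i, log_modulus) ==
           coeffs[i] << (NBITS - log_modulus));
  return 0;
}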
diff --git a/tfhe/src/integer/gpu/list_compression/server_keys.rs b/tfhe/src/integer/gpu/list_compression/server_keys.rs
index c19f6ceabe..ee734e1a46 100644
--- a/tfhe/src/integer/gpu/list_compression/server_keys.rs
+++ b/tfhe/src/integer/gpu/list_compression/server_keys.rs
@@ -3,8 +3,8 @@ use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
 use crate::core_crypto::gpu::vec::CudaVec;
 use crate::core_crypto::gpu::CudaStreams;
 use crate::core_crypto::prelude::{
-    glwe_ciphertext_size, CiphertextModulus, CiphertextModulusLog, GlweCiphertextCount,
-    LweCiphertextCount, PolynomialSize,
+    glwe_ciphertext_size, glwe_mask_size, CiphertextModulus, CiphertextModulusLog,
+    GlweCiphertextCount, LweCiphertextCount, PolynomialSize,
 };
 use crate::integer::ciphertext::DataKind;
 use crate::integer::compression_keys::CompressionKey;
@@ -173,15 +173,15 @@ impl CudaCompressionKey {
             .sum();
 
         let num_glwes = num_lwes.div_ceil(self.lwe_per_glwe.0);
-        let glwe_ciphertext_size =
-            glwe_ciphertext_size(compressed_glwe_size, compressed_polynomial_size);
+        let glwe_mask_size = glwe_mask_size(
+            compressed_glwe_size.to_glwe_dimension(),
+            compressed_polynomial_size,
+        );
         // The number of u64 (both mask and bodies)
-        // FIXME: have a more precise memory handling, this is too long and should be
-        // num_glwes * glwe_mask_size + num_lwes
-        let uncompressed_len = num_glwes * glwe_ciphertext_size;
+        let uncompressed_len = num_glwes * glwe_mask_size + num_lwes;
         let number_bits_to_pack = uncompressed_len * self.storage_log_modulus.0;
         let compressed_len = number_bits_to_pack.div_ceil(u64::BITS as usize);
-        let mut packed_glwe_list = CudaVec::new(compressed_len, streams, 0);
+        let mut packed_glwe_list = CudaVec::new(num_glwes * compressed_len, streams, 0);
 
         unsafe {
             let input_lwes = Self::flatten_async(ciphertexts, streams);
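Reviewer note: the server_keys.rs hunk resolves the removed FIXME. The old sizing reserved a full GLWE ciphertext (mask and body) per packed GLWE, even when the last GLWE's body slots were mostly empty; the new sizing stores num_glwes masks plus only the num_lwes body coefficients actually occupied, which is exactly what the patched host_pack packs. A quick standalone C++ sketch of the two formulas, using made-up example parameters (not taken from any library parameter set):

#include <cstddef>
#include <cstdio>

int main() {
  // Example parameters, for illustration only.
  size_t glwe_dimension = 1, polynomial_size = 2048;
  size_t lwe_per_glwe = polynomial_size;
  size_t num_lwes = 2500; // second GLWE only partially filled
  size_t num_glwes = (num_lwes + lwe_per_glwe - 1) / lwe_per_glwe; // div_ceil
  size_t log_modulus = 12, nbits = 64;

  size_t glwe_ciphertext_size = (glwe_dimension + 1) * polynomial_size;
  size_t glwe_mask_size = glwe_dimension * polynomial_size;

  // Old sizing: every GLWE counted in full, including unused body slots.
  size_t old_len = num_glwes * glwe_ciphertext_size;
  // New sizing: all masks, plus only the body coefficients holding an LWE.
  size_t new_len = num_glwes * glwe_mask_size + num_lwes;

  printf("old: %zu words -> %zu packed\n", old_len,
         (old_len * log_modulus + nbits - 1) / nbits);  // 8192 -> 1536
  printf("new: %zu words -> %zu packed\n", new_len,
         (new_len * log_modulus + nbits - 1) / nbits);  // 6596 -> 1237
  return 0;
}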