fix(gpu): enforce tighter bounds on compression output
pdroalves committed Feb 17, 2025
1 parent 0809eb9 commit 0a52484
Showing 3 changed files with 43 additions and 57 deletions.
@@ -14,27 +14,26 @@

 template <typename Torus>
 __global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
-                     uint32_t num_coeffs, uint32_t in_len, uint32_t out_len) {
-  auto nbits = sizeof(Torus) * 8;
+                     uint32_t num_glwes, uint32_t in_len, uint32_t out_len) {
   auto tid = threadIdx.x + blockIdx.x * blockDim.x;
 
-  auto glwe_index = tid / out_len;
-  auto i = tid % out_len;
-  auto chunk_array_in = array_in + glwe_index * in_len;
-  auto chunk_array_out = array_out + glwe_index * out_len;
-
-  if (tid < num_coeffs) {
-
-    auto k = nbits * i / log_modulus;
+  if (tid < num_glwes * out_len) {
+    auto NBITS = sizeof(Torus) * 8;
+    auto glwe_index = tid / out_len;
+    auto i = tid % out_len;
+    auto chunk_array_in = array_in + glwe_index * in_len;
+    auto chunk_array_out = array_out + glwe_index * out_len;
+
+    auto k = NBITS * i / log_modulus;
     auto j = k;
 
-    auto start_shift = i * nbits - j * log_modulus;
+    auto start_shift = i * NBITS - j * log_modulus;
 
     auto value = chunk_array_in[j] >> start_shift;
    j++;
 
-    while (j * log_modulus < ((i + 1) * nbits) && j < in_len) {
-      auto shift = j * log_modulus - i * nbits;
+    while (j * log_modulus < ((i + 1) * NBITS) && j < in_len) {
+      auto shift = j * log_modulus - i * NBITS;
       value |= chunk_array_in[j] << shift;
       j++;
     }
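Note: the pack kernel treats each chunk as a dense bit stream, keeping only the low log_modulus bits of every coefficient. A minimal CPU sketch of the same indexing, assuming Torus = uint64_t and every input word already reduced below 2^log_modulus (hypothetical standalone code, not part of this commit):

#include <cstdint>
#include <vector>

// CPU mirror of the `pack` kernel's indexing for one chunk. Output word i
// gathers every input word whose log_modulus-bit field overlaps stream
// bits [i * 64, (i + 1) * 64).
std::vector<uint64_t> cpu_pack(const std::vector<uint64_t> &in,
                               uint32_t log_modulus) {
  const uint64_t NBITS = 64;
  const uint64_t out_len = (in.size() * log_modulus + NBITS - 1) / NBITS;
  std::vector<uint64_t> out(out_len);
  for (uint64_t i = 0; i < out_len; ++i) {
    // First input word whose bits land in output word i.
    uint64_t j = NBITS * i / log_modulus;
    uint64_t value = in[j] >> (i * NBITS - j * log_modulus);
    ++j;
    // OR in every following word that still overlaps this output word.
    while (j * log_modulus < (i + 1) * NBITS && j < in.size()) {
      value |= in[j] << (j * log_modulus - i * NBITS);
      ++j;
    }
    out[i] = value;
  }
  return out;
}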
@@ -51,30 +50,31 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
PANIC("Cuda error: Input and output must be different");

cuda_set_device(gpu_index);
auto NBITS = sizeof(Torus) * 8;
auto compression_params = mem_ptr->compression_params;

auto log_modulus = mem_ptr->storage_log_modulus;
// [0..num_glwes-1) GLWEs
auto in_len = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
auto number_bits_to_pack = in_len * log_modulus;
auto nbits = sizeof(Torus) * 8;
// number_bits_to_pack.div_ceil(Scalar::BITS)
auto out_len = (number_bits_to_pack + nbits - 1) / nbits;

// Last GLWE
number_bits_to_pack = in_len * log_modulus;
auto last_out_len = (number_bits_to_pack + nbits - 1) / nbits;
auto glwe_ciphertext_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
auto glwe_mask_size =
compression_params.glwe_dimension * compression_params.polynomial_size;

auto uncompressed_len = num_glwes * glwe_mask_size + num_lwes;
auto number_bits_to_pack = uncompressed_len * log_modulus;

auto num_coeffs = (num_glwes - 1) * out_len + last_out_len;
// equivalent to number_bits_to_pack.div_ceil(Scalar::BITS)
auto compressed_len = (number_bits_to_pack + NBITS - 1) / NBITS;

// Kernel settings
int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(num_coeffs, 1024, num_blocks, num_threads);
getNumBlocksAndThreads(num_glwes * compressed_len, 1024, num_blocks,
num_threads);

dim3 grid(num_blocks);
dim3 threads(num_threads);
pack<Torus><<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus,
num_coeffs, in_len, out_len);
num_glwes, uncompressed_len,
compressed_len);
check_cuda_error(cudaGetLastError());
}
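Note: the tightened bound counts the masks of every GLWE plus one body coefficient per packed LWE, rather than a full (mask + body) GLWE per chunk. A worked check of the arithmetic with hypothetical parameters (illustration only, values not from this commit):

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical parameters, for illustration only.
  const uint64_t glwe_dimension = 1, polynomial_size = 1024;
  const uint64_t num_lwes = 1500, lwe_per_glwe = 1024;
  const uint64_t log_modulus = 12, NBITS = 64;

  const uint64_t num_glwes = (num_lwes + lwe_per_glwe - 1) / lwe_per_glwe; // 2
  const uint64_t glwe_mask_size = glwe_dimension * polynomial_size;        // 1024

  // Old bound: a full (mask + body) GLWE per chunk.
  const uint64_t old_len = num_glwes * (glwe_dimension + 1) * polynomial_size; // 4096
  // New bound: all masks, plus only the num_lwes body coefficients that
  // actually carry data (the last GLWE is partially filled).
  const uint64_t uncompressed_len = num_glwes * glwe_mask_size + num_lwes;     // 3548

  // number_bits_to_pack.div_ceil(NBITS)
  const uint64_t compressed_len =
      (uncompressed_len * log_modulus + NBITS - 1) / NBITS;                    // 666
  printf("old=%llu new=%llu compressed=%llu words\n",
         (unsigned long long)old_len, (unsigned long long)uncompressed_len,
         (unsigned long long)compressed_len);
  return 0;
}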

@@ -144,7 +144,7 @@ template <typename Torus>
 __global__ void extract(Torus *glwe_array_out, Torus const *array_in,
                         uint32_t index, uint32_t log_modulus,
                         uint32_t input_len, uint32_t initial_out_len) {
-  auto nbits = sizeof(Torus) * 8;
+  auto NBITS = sizeof(Torus) * 8;
 
   auto i = threadIdx.x + blockIdx.x * blockDim.x;
   auto chunk_array_in = array_in + index * input_len;
@@ -154,10 +154,10 @@ __global__ void extract(Torus *glwe_array_out, Torus const *array_in,
     auto start = i * log_modulus;
     auto end = (i + 1) * log_modulus;
 
-    auto start_block = start / nbits;
-    auto start_remainder = start % nbits;
+    auto start_block = start / NBITS;
+    auto start_remainder = start % NBITS;
 
-    auto end_block_inclusive = (end - 1) / nbits;
+    auto end_block_inclusive = (end - 1) / NBITS;
 
     Torus unpacked_i;
     if (start_block == end_block_inclusive) {
@@ -166,13 +166,13 @@ __global__ void extract(Torus *glwe_array_out, Torus const *array_in,
     } else {
       auto first_part = chunk_array_in[start_block] >> start_remainder;
       auto second_part = chunk_array_in[start_block + 1]
-                         << (nbits - start_remainder);
+                         << (NBITS - start_remainder);
 
       unpacked_i = (first_part | second_part) & mask;
     }
 
     // Extract
-    glwe_array_out[i] = unpacked_i << (nbits - log_modulus);
+    glwe_array_out[i] = unpacked_i << (NBITS - log_modulus);
   }
 }
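Note: extract is the inverse walk over the packed bit stream: each log_modulus-bit field is read from a single word, or stitched from two when it straddles a word boundary, then re-aligned to the top bits. A CPU sketch under the same assumptions as above (hypothetical code, not the library API):

#include <cstdint>

// CPU mirror of the `extract` kernel for one field, assuming
// Torus = uint64_t. Field i occupies stream bits
// [i * log_modulus, (i + 1) * log_modulus).
uint64_t cpu_extract(const uint64_t *in, uint64_t i, uint32_t log_modulus) {
  const uint32_t NBITS = 64;
  const uint64_t start = i * log_modulus;
  const uint64_t end = (i + 1) * log_modulus;
  const uint64_t mask =
      (log_modulus == NBITS) ? ~0ULL : ((1ULL << log_modulus) - 1);

  const uint64_t start_block = start / NBITS;
  const uint32_t start_remainder = start % NBITS;
  const uint64_t end_block_inclusive = (end - 1) / NBITS;

  uint64_t unpacked;
  if (start_block == end_block_inclusive) {
    // The whole field sits inside one word.
    unpacked = (in[start_block] >> start_remainder) & mask;
  } else {
    // The field straddles a boundary: low bits from one word, high bits
    // from the next (start_remainder > 0 here, so the shift is valid).
    const uint64_t first_part = in[start_block] >> start_remainder;
    const uint64_t second_part = in[start_block + 1] << (NBITS - start_remainder);
    unpacked = (first_part | second_part) & mask;
  }
  // Re-align the value to the most significant bits, as the kernel does.
  return unpacked << (NBITS - log_modulus);
}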

@@ -201,9 +201,9 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
       (compression_params.glwe_dimension + 1) *
       compression_params.polynomial_size;
   auto number_bits_to_unpack = compressed_glwe_accumulator_size * log_modulus;
-  auto nbits = sizeof(Torus) * 8;
+  auto NBITS = sizeof(Torus) * 8;
   // number_bits_to_unpack.div_ceil(Scalar::BITS)
-  auto input_len = (number_bits_to_unpack + nbits - 1) / nbits;
+  auto input_len = (number_bits_to_unpack + NBITS - 1) / NBITS;
 
   // We assure the tail of the glwe is zeroed
   auto zeroed_slice = glwe_array_out + initial_out_len;
16 changes: 1 addition & 15 deletions tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs
@@ -2,7 +2,7 @@ use crate::core_crypto::entities::packed_integers::PackedIntegers;
 use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex};
 use crate::core_crypto::gpu::CudaStreams;
 use crate::core_crypto::prelude::compressed_modulus_switched_glwe_ciphertext::CompressedModulusSwitchedGlweCiphertext;
-use crate::core_crypto::prelude::{glwe_ciphertext_size, CiphertextCount, LweCiphertextCount};
+use crate::core_crypto::prelude::{glwe_ciphertext_size, glwe_mask_size, CiphertextCount, LweCiphertextCount};
 use crate::integer::ciphertext::{CompressedCiphertextList, DataKind};
 use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
 use crate::integer::gpu::ciphertext::{
@@ -348,20 +348,6 @@ impl CompressedCiphertextList {
             .flat_map(|ct| ct.packed_integers.packed_coeffs.clone())
             .collect_vec();
 
-        let glwe_ciphertext_count = self.packed_list.modulus_switched_glwe_ciphertext_list.len();
-        let glwe_size = self.packed_list.modulus_switched_glwe_ciphertext_list[0]
-            .glwe_dimension()
-            .to_glwe_size();
-        let polynomial_size =
-            self.packed_list.modulus_switched_glwe_ciphertext_list[0].polynomial_size();
-
-        // FIXME: have a more precise memory handling, this is too long and should be "just" the
-        // original flat_cpu_data.len()
-        let unpacked_glwe_ciphertext_flat_len =
-            glwe_ciphertext_count * glwe_ciphertext_size(glwe_size, polynomial_size);
-
-        flat_cpu_data.resize(unpacked_glwe_ciphertext_flat_len, 0u64);
-
         let flat_gpu_data = unsafe {
             let v = CudaVec::from_cpu_async(flat_cpu_data.as_slice(), streams, 0);
             streams.synchronize();
16 changes: 8 additions & 8 deletions tfhe/src/integer/gpu/list_compression/server_keys.rs
@@ -3,8 +3,8 @@ use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
 use crate::core_crypto::gpu::vec::CudaVec;
 use crate::core_crypto::gpu::CudaStreams;
 use crate::core_crypto::prelude::{
-    glwe_ciphertext_size, CiphertextModulus, CiphertextModulusLog, GlweCiphertextCount,
-    LweCiphertextCount, PolynomialSize,
+    glwe_ciphertext_size, glwe_mask_size, CiphertextModulus, CiphertextModulusLog,
+    GlweCiphertextCount, LweCiphertextCount, PolynomialSize,
 };
 use crate::integer::ciphertext::DataKind;
 use crate::integer::compression_keys::CompressionKey;
@@ -173,15 +173,15 @@ impl CudaCompressionKey {
             .sum();
 
         let num_glwes = num_lwes.div_ceil(self.lwe_per_glwe.0);
-        let glwe_ciphertext_size =
-            glwe_ciphertext_size(compressed_glwe_size, compressed_polynomial_size);
+        let glwe_mask_size = glwe_mask_size(
+            compressed_glwe_size.to_glwe_dimension(),
+            compressed_polynomial_size,
+        );
         // The number of u64 (both mask and bodies)
-        // FIXME: have a more precise memory handling, this is too long and should be
-        // num_glwes * glwe_mask_size + num_lwes
-        let uncompressed_len = num_glwes * glwe_ciphertext_size;
+        let uncompressed_len = num_glwes * glwe_mask_size + num_lwes;
         let number_bits_to_pack = uncompressed_len * self.storage_log_modulus.0;
         let compressed_len = number_bits_to_pack.div_ceil(u64::BITS as usize);
-        let mut packed_glwe_list = CudaVec::new(compressed_len, streams, 0);
+        let mut packed_glwe_list = CudaVec::new(num_glwes * compressed_len, streams, 0);
 
         unsafe {
             let input_lwes = Self::flatten_async(ciphertexts, streams);
