From 380a1d1e13e8e680fb36d2046b515a8ac70cf73a Mon Sep 17 00:00:00 2001
From: Guillermo Oyarzun
Date: Fri, 14 Feb 2025 17:59:03 +0100
Subject: [PATCH] fix(gpu): enable large number of samples in pbs tbc

---
 .../cuda/src/pbs/programmable_bootstrap.cuh   | 49 +++++++++++++++++++
 .../programmable_bootstrap_tbc_multibit.cuh   | 40 +++++++--------
 2 files changed, 69 insertions(+), 20 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
index 7209aba72e..40483092b5 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -76,6 +76,55 @@ mul_ggsw_glwe_in_fourier_domain(double2 *fft, double2 *join_buffer,
   synchronize_threads_in_block();
 }
 
+// This is a temporary function to have a working version of the tbc-PBS
+// with large integers.
+// TO DO: we should erase this function when the other flavors of the PBS also
+// work with the number of samples in the first block dimension.
+template <typename G, class params>
+__device__ void mul_ggsw_glwe_in_fourier_domain_tbc(
+    double2 *fft, double2 *join_buffer,
+    const double2 *__restrict__ bootstrapping_key, int iteration, G &group,
+    bool support_dsm = false) {
+  const uint32_t polynomial_size = params::degree;
+  const uint32_t glwe_dimension = gridDim.y - 1;
+  const uint32_t level_count = gridDim.z;
+
+  // The first product is used to initialize level_join_buffer
+  auto this_block_rank = get_this_block_rank(group, support_dsm);
+
+  // Continues multiplying fft by every polynomial in that particular bsk level
+  // Each y-block accumulates in a different polynomial at each iteration
+  auto bsk_slice = get_ith_mask_kth_block(
+      bootstrapping_key, iteration, blockIdx.y, blockIdx.z, polynomial_size,
+      glwe_dimension, level_count);
+  for (int j = 0; j < glwe_dimension + 1; j++) {
+    int idx = (j + this_block_rank) % (glwe_dimension + 1);
+
+    auto bsk_poly = bsk_slice + idx * polynomial_size / 2;
+    auto buffer_slice = get_join_buffer_element(blockIdx.z, idx, group,
+                                                join_buffer, polynomial_size,
+                                                glwe_dimension, support_dsm);
+
+    polynomial_product_accumulate_in_fourier_domain(
+        buffer_slice, fft, bsk_poly, j == 0);
+    group.sync();
+  }
+
+  // -----------------------------------------------------------------
+  // All blocks are synchronized here; after this sync, level_join_buffer has
+  // the values needed from every other block
+
+  // accumulate rest of the products into fft buffer
+  for (int l = 0; l < level_count; l++) {
+    auto cur_src_acc = get_join_buffer_element(l, blockIdx.y, group,
+                                               join_buffer, polynomial_size,
+                                               glwe_dimension, support_dsm);
+
+    polynomial_accumulate_in_fourier_domain(fft, cur_src_acc, l == 0);
+  }
+
+  synchronize_threads_in_block();
+}
 template <typename Torus>
 void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes,
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
index 3184e956e7..d9a57b6013 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -49,8 +49,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
     if (support_dsm)
       selected_memory += sizeof(Torus) * polynomial_size;
   } else {
-    int block_index = blockIdx.x + blockIdx.y * gridDim.x +
-                      blockIdx.z * gridDim.x * gridDim.y;
+    int block_index = blockIdx.z + blockIdx.y * gridDim.z +
+                      blockIdx.x * gridDim.z * gridDim.y;
     selected_memory = &device_mem[block_index * device_memory_size_per_block];
   }
@@ -65,25 +65,25 @@ __global__ void __launch_bounds__(params::degree / params::opt)
     accumulator_fft += sizeof(double2) * (polynomial_size / 2);
   }
 
-  // The third dimension of the block is used to determine on which ciphertext
+  // The first dimension of the block is used to determine on which ciphertext
   // this block is operating, in the case of batch bootstraps
   const Torus *block_lwe_array_in =
-      &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];
+      &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
 
   const Torus *block_lut_vector =
-      &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
+      &lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
                   (glwe_dimension + 1)];
 
   double2 *block_join_buffer =
-      &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
+      &join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) *
                    params::degree / 2];
 
   Torus *global_accumulator_slice =
-      &global_accumulator[(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) *
+      &global_accumulator[(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) *
                           params::degree];
 
   const double2 *keybundle =
-      &keybundle_array[blockIdx.z * keybundle_size_per_input];
+      &keybundle_array[blockIdx.x * keybundle_size_per_input];
 
   if (lwe_offset == 0) {
     // Put "b" in [0, 2N[
@@ -113,12 +113,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
     // accumulator decomposed at level 0, 1 at 1, etc.)
     GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
                                            accumulator_rotated);
-    gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
+    gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.z);
 
     NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
     synchronize_threads_in_block();
 
     // Perform G^-1(ACC) * GGSW -> GLWE
-    mul_ggsw_glwe_in_fourier_domain<cluster_group, params>(
+    mul_ggsw_glwe_in_fourier_domain_tbc<cluster_group, params>(
        accumulator_fft, block_join_buffer, keybundle, i, cluster, support_dsm);
     NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
     synchronize_threads_in_block();
@@ -128,10 +128,10 @@ __global__ void __launch_bounds__(params::degree / params::opt)
 
     auto accumulator = accumulator_rotated;
 
-    if (blockIdx.x == 0) {
+    if (blockIdx.z == 0) {
      if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
        auto block_lwe_array_out =
-           &lwe_array_out[lwe_output_indexes[blockIdx.z] *
+           &lwe_array_out[lwe_output_indexes[blockIdx.x] *
                               (glwe_dimension * polynomial_size + 1) +
                           blockIdx.y * polynomial_size];
@@ -145,9 +145,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
          for (int i = 1; i < num_many_lut; i++) {
            auto next_lwe_array_out =
                lwe_array_out +
-               (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+               (i * gridDim.x * (glwe_dimension * polynomial_size + 1));
            auto next_block_lwe_array_out =
-               &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+               &next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
                                        (glwe_dimension * polynomial_size + 1) +
                                    blockIdx.y * polynomial_size];
@@ -162,9 +162,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
 
        auto next_lwe_array_out =
            lwe_array_out +
-           (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+           (i * gridDim.x * (glwe_dimension * polynomial_size + 1));
        auto next_block_lwe_array_out =
-           &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+           &next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
                                    (glwe_dimension * polynomial_size + 1) +
                                blockIdx.y * polynomial_size];
@@ -334,7 +334,7 @@ __host__ void execute_tbc_external_product_loop(
   auto global_accumulator = buffer->global_accumulator;
   auto buffer_fft = buffer->global_join_buffer;
 
-  dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
+  dim3 grid_accumulate(num_samples, glwe_dimension + 1, level_count);
   dim3 thds(polynomial_size / params::opt, 1, 1);
 
   cudaLaunchConfig_t config = {0};
@@ -346,9 +346,9 @@ __host__ void execute_tbc_external_product_loop(
 
   cudaLaunchAttribute attribute[1];
   attribute[0].id = cudaLaunchAttributeClusterDimension;
-  attribute[0].val.clusterDim.x = level_count; // Cluster size in X-dimension
+  attribute[0].val.clusterDim.x = 1;
   attribute[0].val.clusterDim.y = (glwe_dimension + 1);
-  attribute[0].val.clusterDim.z = 1;
+  attribute[0].val.clusterDim.z = level_count; // Cluster size in Z-dimension
   config.attrs = attribute;
   config.numAttrs = 1;
   config.stream = stream;
@@ -463,7 +463,7 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
   int cluster_size;
 
-  dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
+  dim3 grid_accumulate(num_samples, glwe_dimension + 1, level_count);
   dim3 thds(polynomial_size / params::opt, 1, 1);
 
   cudaLaunchConfig_t config = {0};
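
Why the swap helps: CUDA caps gridDim.y and gridDim.z at 65535 blocks, while gridDim.x can go up to 2^31 - 1, so carrying the ciphertext index in blockIdx.z limited how many samples a single launch could bootstrap; moving it to blockIdx.x removes that cap, and the thread block cluster moves to the y/z dimensions ((glwe_dimension + 1) x level_count) accordingly. The standalone sketch below reproduces only this launch layout with cudaLaunchKernelEx; the kernel name and the numeric sizes (dummy_kernel, the num_samples/glwe_dimension/level_count values) are placeholders for illustration, not code from the backend, and the cluster launch only succeeds on a GPU with compute capability 9.0 or newer.

// cluster_layout_sketch.cu -- illustrative only, not part of the patch.
// Build for a cluster-capable target, e.g.: nvcc -arch=sm_90 cluster_layout_sketch.cu
#include <cstdio>
#include <cuda_runtime.h>

__global__ void dummy_kernel(int *out) {
  // Mirrors the new layout: blockIdx.x = sample, blockIdx.y = GLWE polynomial,
  // blockIdx.z = decomposition level. One block per sample writes its index.
  if (threadIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0)
    out[blockIdx.x] = static_cast<int>(blockIdx.x);
}

int main() {
  // Placeholder sizes: num_samples deliberately exceeds 65535, which would be
  // rejected on gridDim.z but is allowed on gridDim.x.
  const unsigned num_samples = 100000;
  const unsigned glwe_dimension = 1;
  const unsigned level_count = 2;

  int *out = nullptr;
  cudaMalloc(&out, num_samples * sizeof(int));

  // Samples on x, glwe_dimension + 1 on y, level_count on z, as in the patch.
  cudaLaunchConfig_t config = {0};
  config.gridDim = dim3(num_samples, glwe_dimension + 1, level_count);
  config.blockDim = dim3(256, 1, 1);

  // The cluster covers only the y/z dimensions; x stays cluster-free so the
  // number of samples is not constrained by the cluster shape.
  cudaLaunchAttribute attribute[1];
  attribute[0].id = cudaLaunchAttributeClusterDimension;
  attribute[0].val.clusterDim.x = 1;
  attribute[0].val.clusterDim.y = glwe_dimension + 1;
  attribute[0].val.clusterDim.z = level_count;
  config.attrs = attribute;
  config.numAttrs = 1;

  cudaError_t err = cudaLaunchKernelEx(&config, dummy_kernel, out);
  printf("launch: %s\n", cudaGetErrorString(err));
  cudaDeviceSynchronize();

  cudaFree(out);
  return 0;
}

With the old layout (samples on the z dimension), the same launch would be rejected at launch time as soon as num_samples exceeds 65535; with samples on x it goes through, which is what the patch title refers to.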