chore(gpu): improve compression tests #1571

Merged
merged 1 commit into from Sep 26, 2024
22 changes: 13 additions & 9 deletions backends/tfhe-cuda-backend/cuda/include/compression.h
@@ -72,7 +72,8 @@ template <typename Torus> struct int_compression {
sizeof(Torus),
streams[0], gpu_indexes[0]);
tmp_glwe_array_out = (Torus *)cuda_malloc_async(
glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]);
lwe_per_glwe * glwe_accumulator_size * sizeof(Torus), streams[0],
gpu_indexes[0]);

scratch_packing_keyswitch_lwe_list_to_glwe_64(
streams[0], gpu_indexes[0], &fp_ks_buffer,
@@ -100,6 +101,7 @@ template <typename Torus> struct int_decompression {

Torus *tmp_extracted_glwe;
Torus *tmp_extracted_lwe;
uint32_t *tmp_indexes_array;

int_radix_lut<Torus> *carry_extract_lut;

@@ -117,20 +119,21 @@ template <typename Torus> struct int_decompression {
if (allocate_gpu_memory) {
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;

Torus lwe_accumulator_size = (compression_params.glwe_dimension *
compression_params.polynomial_size +
1);
carry_extract_lut = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, 1,
num_radix_blocks, allocate_gpu_memory);

tmp_extracted_glwe = (Torus *)cuda_malloc_async(
glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]);
num_radix_blocks * glwe_accumulator_size * sizeof(Torus), streams[0],
gpu_indexes[0]);
tmp_indexes_array = (uint32_t *)cuda_malloc_async(
num_radix_blocks * sizeof(uint32_t), streams[0], gpu_indexes[0]);
tmp_extracted_lwe = (Torus *)cuda_malloc_async(
num_radix_blocks *
(compression_params.glwe_dimension *
compression_params.polynomial_size +
1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
num_radix_blocks * lwe_accumulator_size * sizeof(Torus), streams[0],
gpu_indexes[0]);
// Decompression
// Carry extract LUT
auto carry_extract_f = [encryption_params](Torus x) -> Torus {
@@ -151,6 +154,7 @@ template <typename Torus> struct int_decompression {
uint32_t gpu_count) {
cuda_drop_async(tmp_extracted_glwe, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_indexes_array, streams[0], gpu_indexes[0]);

carry_extract_lut->release(streams, gpu_indexes, gpu_count);
delete (carry_extract_lut);
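For reference, here is a minimal host-side sketch of the scratch-buffer sizes implied by the new cuda_malloc_async calls above, in Torus elements (assuming a 64-bit Torus). The struct and helper are illustrative only and are not part of this PR; parameter names mirror the diff.

#include <cstddef>
#include <cstdint>

// Illustrative helper (not in the PR): sizes, in Torus elements, of the
// scratch buffers allocated by int_compression / int_decompression after
// this change.
struct compression_scratch_sizes {
  size_t glwe_accumulator;    // (glwe_dimension + 1) * polynomial_size
  size_t compress_glwe_array; // lwe_per_glwe GLWEs for the compression path
  size_t decompress_glwe;     // one extracted GLWE per radix block
  size_t decompress_lwe;      // one extracted LWE per radix block
  size_t indexes;             // uint32_t entries, one per radix block
};

inline compression_scratch_sizes
compute_compression_scratch_sizes(uint32_t glwe_dimension,
                                  uint32_t polynomial_size,
                                  uint32_t lwe_per_glwe,
                                  uint32_t num_radix_blocks) {
  compression_scratch_sizes s;
  s.glwe_accumulator = size_t(glwe_dimension + 1) * polynomial_size;
  s.compress_glwe_array = size_t(lwe_per_glwe) * s.glwe_accumulator;
  s.decompress_glwe = size_t(num_radix_blocks) * s.glwe_accumulator;
  s.decompress_lwe =
      size_t(num_radix_blocks) * (size_t(glwe_dimension) * polynomial_size + 1);
  s.indexes = num_radix_blocks;
  return s;
}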
3 changes: 2 additions & 1 deletion backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
@@ -41,7 +41,8 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
uint32_t lwe_per_glwe = params::degree;
auto glwe_in = glwe_array_in + (input_id / lwe_per_glwe) * glwe_input_size;

auto nth = nth_array[input_id];
// nth is ensured to be in [0, lwe_per_glwe)
auto nth = nth_array[input_id] % lwe_per_glwe;

sample_extract_mask<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
sample_extract_body<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
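The modulo above guarantees that the coefficient index passed to sample_extract_mask/body stays inside a single GLWE. For orientation, a small sketch of the convention the decompression path relies on (hypothetical helper, not in the PR): a global LWE index is split into the GLWE it lives in and the coefficient to extract.

#include <cstdint>

// Hypothetical helper, not part of the PR: split a global LWE index into
// (which GLWE, which coefficient), matching the h_indexes_array[i] /
// lwe_per_glwe and nth_array[input_id] % lwe_per_glwe arithmetic used by
// the decompression path.
struct glwe_coordinate {
  uint32_t glwe_index; // which packed GLWE the LWE was folded into
  uint32_t nth;        // coefficient to sample-extract, in [0, lwe_per_glwe)
};

inline glwe_coordinate split_lwe_index(uint32_t global_index,
                                       uint32_t lwe_per_glwe) {
  return glwe_coordinate{global_index / lwe_per_glwe,
                         global_index % lwe_per_glwe};
}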
189 changes: 131 additions & 58 deletions backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -12,50 +12,57 @@

template <typename Torus>
__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
uint32_t in_len, uint32_t len) {
uint32_t num_glwes, uint32_t in_len, uint32_t out_len) {
auto nbits = sizeof(Torus) * 8;
auto tid = threadIdx.x + blockIdx.x * blockDim.x;

auto glwe_index = tid / out_len;
auto i = tid % out_len;
auto chunk_array_in = array_in + glwe_index * in_len;
auto chunk_array_out = array_out + glwe_index * out_len;

if (tid < num_glwes * out_len) {

auto i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < len) {
auto k = nbits * i / log_modulus;
auto j = k;

auto start_shift = i * nbits - j * log_modulus;

auto value = array_in[j] >> start_shift;
auto value = chunk_array_in[j] >> start_shift;
j++;

while (j * log_modulus < ((i + 1) * nbits) && j < in_len) {
auto shift = j * log_modulus - i * nbits;
value |= array_in[j] << shift;
value |= chunk_array_in[j] << shift;
j++;
}

array_out[i] = value;
chunk_array_out[i] = value;
}
}

template <typename Torus>
__host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
Torus *array_out, Torus *array_in, uint32_t body_count,
Torus *array_out, Torus *array_in, uint32_t num_glwes,
int_compression<Torus> *mem_ptr) {
cudaSetDevice(gpu_index);
auto params = mem_ptr->compression_params;
auto compression_params = mem_ptr->compression_params;

auto log_modulus = mem_ptr->storage_log_modulus;
auto in_len = params.glwe_dimension * params.polynomial_size + body_count;
auto in_len = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
auto number_bits_to_pack = in_len * log_modulus;
auto nbits = sizeof(Torus) * 8;
// number_bits_to_pack.div_ceil(Scalar::BITS)
auto len = (number_bits_to_pack + nbits - 1) / nbits;
auto out_len = (number_bits_to_pack + nbits - 1) / nbits;

int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(len, 128, num_blocks, num_threads);
getNumBlocksAndThreads(num_glwes * out_len, 1024, num_blocks, num_threads);

dim3 grid(num_blocks);
dim3 threads(num_threads);
pack<Torus><<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus,
in_len, len);
num_glwes, in_len, out_len);
}

template <typename Torus>
@@ -79,42 +86,52 @@ __host__ void host_integer_compress(cudaStream_t *streams,
uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe + 1;
auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);

// Keyswitch LWEs to GLWE
auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
cuda_memset_async(tmp_glwe_array_out, 0,
num_glwes * (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
auto fp_ks_buffer = mem_ptr->fp_ks_buffer;
for (int i = 0; i < num_glwes; i++) {
auto lwe_subset = lwe_shifted + i * lwe_in_size;
auto glwe_out = tmp_glwe_array_out + i * glwe_out_size;
auto rem_lwes = num_lwes;

auto lwe_subset = lwe_shifted;
auto glwe_out = tmp_glwe_array_out;
while (rem_lwes > 0) {
auto chunk_size = min(rem_lwes, mem_ptr->lwe_per_glwe);

host_packing_keyswitch_lwe_list_to_glwe<Torus>(
streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
compression_params.polynomial_size, compression_params.ks_base_log,
compression_params.ks_level, body_count);
compression_params.ks_level, chunk_size);

rem_lwes -= chunk_size;
lwe_subset += chunk_size * lwe_in_size;
glwe_out += glwe_out_size;
}

// Modulus switch
host_modulus_switch_inplace<Torus>(
streams[0], gpu_indexes[0], tmp_glwe_array_out,
num_glwes * (compression_params.glwe_dimension *
compression_params.polynomial_size +
body_count),
num_glwes * (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size,
mem_ptr->storage_log_modulus);
check_cuda_error(cudaGetLastError());

host_pack<Torus>(streams[0], gpu_indexes[0], glwe_array_out,
tmp_glwe_array_out, body_count, mem_ptr);
tmp_glwe_array_out, num_glwes, mem_ptr);
}

template <typename Torus>
__global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index,
uint32_t log_modulus, uint32_t initial_out_len) {
uint32_t log_modulus, uint32_t input_len,
uint32_t initial_out_len) {
auto nbits = sizeof(Torus) * 8;

auto i = threadIdx.x + blockIdx.x * blockDim.x;

auto chunk_array_in = array_in + index * input_len;
if (i < initial_out_len) {
// Unpack
Torus mask = ((Torus)1 << log_modulus) - 1;
@@ -128,11 +145,12 @@ __global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index,

Torus unpacked_i;
if (start_block == end_block_inclusive) {
auto single_part = array_in[start_block] >> start_remainder;
auto single_part = chunk_array_in[start_block] >> start_remainder;
unpacked_i = single_part & mask;
} else {
auto first_part = array_in[start_block] >> start_remainder;
auto second_part = array_in[start_block + 1] << (nbits - start_remainder);
auto first_part = chunk_array_in[start_block] >> start_remainder;
auto second_part = chunk_array_in[start_block + 1]
<< (nbits - start_remainder);

unpacked_i = (first_part | second_part) & mask;
}
@@ -149,71 +167,126 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
int_decompression<Torus> *mem_ptr) {
cudaSetDevice(gpu_index);

auto params = mem_ptr->compression_params;
auto compression_params = mem_ptr->compression_params;

auto log_modulus = mem_ptr->storage_log_modulus;

uint32_t body_count = mem_ptr->body_count;

uint32_t body_count =
std::min(mem_ptr->body_count, compression_params.polynomial_size);
auto initial_out_len =
params.glwe_dimension * params.polynomial_size + body_count;
compression_params.glwe_dimension * compression_params.polynomial_size +
body_count;

auto compressed_glwe_accumulator_size =
(compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
auto number_bits_to_unpack = compressed_glwe_accumulator_size * log_modulus;
auto nbits = sizeof(Torus) * 8;
// number_bits_to_unpack.div_ceil(Scalar::BITS)
auto input_len = (number_bits_to_unpack + nbits - 1) / nbits;

// We assure the tail of the glwe is zeroed
auto zeroed_slice = glwe_array_out + initial_out_len;
cuda_memset_async(zeroed_slice, 0,
(params.polynomial_size - body_count) * sizeof(Torus),
(compression_params.polynomial_size - body_count) *
sizeof(Torus),
stream, gpu_index);
int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
dim3 grid(num_blocks);
dim3 threads(num_threads);
extract<Torus><<<grid, threads, 0, stream>>>(
glwe_array_out, array_in, glwe_index, log_modulus, initial_out_len);
extract<Torus><<<grid, threads, 0, stream>>>(glwe_array_out, array_in,
glwe_index, log_modulus,
input_len, initial_out_len);
check_cuda_error(cudaGetLastError());
}

template <typename Torus>
__host__ void
host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
Torus *packed_glwe_in, uint32_t *indexes_array,
uint32_t indexes_array_size, void **bsks,
int_decompression<Torus> *mem_ptr) {

auto polynomial_size = mem_ptr->encryption_params.polynomial_size;
if (indexes_array_size > polynomial_size)
uint32_t gpu_count, Torus *d_lwe_array_out,
Torus *d_packed_glwe_in, uint32_t *h_indexes_array,
uint32_t indexes_array_size, void **d_bsks,
int_decompression<Torus> *h_mem_ptr) {

auto d_indexes_array = h_mem_ptr->tmp_indexes_array;
cuda_memcpy_async_to_gpu(d_indexes_array, h_indexes_array,
indexes_array_size * sizeof(uint32_t), streams[0],
gpu_indexes[0]);

auto compression_params = h_mem_ptr->compression_params;
auto lwe_per_glwe = compression_params.polynomial_size;
if (indexes_array_size > lwe_per_glwe)
PANIC("Cuda error: too many LWEs to decompress. The number of LWEs should "
"be smaller than "
"polynomial_size.")

auto extracted_glwe = mem_ptr->tmp_extracted_glwe;
auto compression_params = mem_ptr->compression_params;
host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
packed_glwe_in, 0, mem_ptr);
auto num_lwes = h_mem_ptr->num_lwes;

// the first element is the last index in h_indexes_array that lies in the
// related GLWE
std::vector<std::pair<int, Torus *>> glwe_vec;

// Extract all GLWEs
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;

auto num_lwes = mem_ptr->num_lwes;
auto current_glwe_index = h_indexes_array[0] / lwe_per_glwe;
auto extracted_glwe = h_mem_ptr->tmp_extracted_glwe;
host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
d_packed_glwe_in, current_glwe_index, h_mem_ptr);
glwe_vec.push_back(std::make_pair(0, extracted_glwe));
for (int i = 1; i < indexes_array_size; i++) {
auto glwe_index = h_indexes_array[i] / lwe_per_glwe;
if (glwe_index != current_glwe_index) {
extracted_glwe += glwe_accumulator_size;
current_glwe_index = glwe_index;
// Extracts a new GLWE
host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
d_packed_glwe_in, glwe_index, h_mem_ptr);
glwe_vec.push_back(std::make_pair(i, extracted_glwe));
} else {
// Updates the index
glwe_vec.back().first++;
}
}
// Sample extract all LWEs
Torus lwe_accumulator_size =
(compression_params.glwe_dimension * compression_params.polynomial_size +
1);

auto extracted_lwe = h_mem_ptr->tmp_extracted_lwe;
uint32_t current_idx = 0;
for (const auto &max_idx_and_glwe : glwe_vec) {
uint32_t max_idx = max_idx_and_glwe.first;
extracted_glwe = max_idx_and_glwe.second;

cuda_glwe_sample_extract_64(
streams[0], gpu_indexes[0], extracted_lwe, extracted_glwe,
d_indexes_array, max_idx + 1 - current_idx,
compression_params.glwe_dimension, compression_params.polynomial_size);

extracted_lwe += lwe_accumulator_size;
current_idx = max_idx;
}

// Sample extract
auto extracted_lwe = mem_ptr->tmp_extracted_lwe;
cuda_glwe_sample_extract_64(streams[0], gpu_indexes[0], extracted_lwe,
extracted_glwe, indexes_array, indexes_array_size,
compression_params.glwe_dimension,
compression_params.polynomial_size);
// Reset
extracted_lwe = h_mem_ptr->tmp_extracted_lwe;

// In the case of extracting a single LWE this parameters are dummy
// In the case of extracting a single LWE these parameters are dummy
uint32_t lut_count = 1;
uint32_t lut_stride = 0;
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
auto encryption_params = mem_ptr->encryption_params;
auto lut = mem_ptr->carry_extract_lut;
auto encryption_params = h_mem_ptr->encryption_params;
auto lut = h_mem_ptr->carry_extract_lut;
auto active_gpu_count = get_active_gpu_count(num_lwes, gpu_count);
if (active_gpu_count == 1) {

execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_out,
streams, gpu_indexes, active_gpu_count, d_lwe_array_out,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
lut->lwe_indexes_in, bsks, lut->buffer,
lut->lwe_indexes_in, d_bsks, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
@@ -240,7 +313,7 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
lwe_array_in_vec, lwe_trivial_indexes_vec, d_bsks, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
@@ -249,7 +322,7 @@

/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_array_out, lwe_after_pbs_vec,
d_lwe_array_out, lwe_after_pbs_vec,
lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_lwes,
encryption_params.big_lwe_dimension + 1);
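To make the bit-packing in the diff above easier to follow, here is a hedged CPU reference of the same scheme, written from the scatter side (one input value at a time) rather than the gather side the pack kernel uses (one output word per thread). It assumes a 64-bit Torus and log_modulus <= 64, and is illustrative only, not part of the PR.

#include <cstdint>
#include <vector>

// CPU reference (illustrative only): each input word carries a value that,
// after the modulus switch, fits in the low log_modulus bits; consecutive
// values are concatenated bit-by-bit into the output words.
std::vector<uint64_t> pack_reference(const std::vector<uint64_t> &in,
                                     uint32_t log_modulus) {
  const uint32_t nbits = 64;
  const uint64_t mask =
      (log_modulus == nbits) ? ~0ull : ((uint64_t(1) << log_modulus) - 1);
  const uint64_t total_bits = uint64_t(in.size()) * log_modulus;
  std::vector<uint64_t> out((total_bits + nbits - 1) / nbits, 0);
  for (size_t j = 0; j < in.size(); ++j) {
    const uint64_t v = in[j] & mask;
    const uint64_t bit_pos = uint64_t(j) * log_modulus;
    const size_t word = size_t(bit_pos / nbits);
    const uint32_t offset = uint32_t(bit_pos % nbits);
    out[word] |= v << offset;
    if (offset + log_modulus > nbits) // value straddles two output words
      out[word + 1] |= v >> (nbits - offset);
  }
  return out;
}

The extract kernel in the diff walks the same layout in reverse: it reads the log_modulus-bit slice that starts at bit i * log_modulus, spanning at most two packed words.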
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -141,7 +141,7 @@ extern "C" {
gpu_count: u32,
lwe_array_out: *mut c_void,
glwe_in: *const c_void,
indexes_array: *const c_void,
indexes_array: *const u32,
Contributor commented:

I think the C++ entry point should be modified as well if you change this; right now it takes void*. We should always pass u32 for indexes arrays.

indexes_array_size: u32,
bsks: *const *mut c_void,
mem_ptr: *mut i8,
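Following the reviewer's remark above, a hedged sketch of what the matching C++ entry-point change could look like. The function name and the surrounding parameters are placeholders inferred from the visible part of the Rust binding; only the indexes_array type change is the point being illustrated.

// Hypothetical signature, for illustration only; the real entry point in the
// CUDA backend may differ in name and parameter list. The suggestion is that
// indexes_array becomes uint32_t const * instead of void const *, matching
// the Rust binding above.
extern "C" void cuda_integer_decompress_64( // name is a placeholder
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void const *glwe_in,
    uint32_t const *indexes_array, // was: void const *indexes_array
    uint32_t indexes_array_size, void *const *bsks, int8_t *mem_ptr);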