Skip to content

Commit

Permalink
chore(gpu): refactor overflowing sub to track noise / degree
Browse files Browse the repository at this point in the history
  • Loading branch information
agnesLeroy committed Feb 14, 2025
1 parent cda43fd commit c5f44a6
Show file tree
Hide file tree
Showing 16 changed files with 275 additions and 183 deletions.
7 changes: 4 additions & 3 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -351,9 +351,10 @@ void scratch_cuda_integer_overflowing_sub_kb_64_inplace(

void cuda_integer_overflowing_sub_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lhs_array, const void *rhs_array, void *overflow_block,
const void *input_borrow, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
CudaRadixCiphertextFFI *overflow_block,
const CudaRadixCiphertextFFI *input_borrow, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t compute_overflow,
uint32_t uses_input_borrow);

void cleanup_cuda_integer_overflowing_sub(void *const *streams,
Expand Down
60 changes: 29 additions & 31 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
Original file line number Diff line number Diff line change
Expand Up @@ -2118,9 +2118,9 @@ template <typename Torus> struct int_sc_prop_memory {
};

template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
Torus *shifted_blocks_and_borrow_states;
Torus *shifted_blocks;
Torus *borrow_states;
CudaRadixCiphertextFFI *shifted_blocks_and_borrow_states;
CudaRadixCiphertextFFI *shifted_blocks;
CudaRadixCiphertextFFI *borrow_states;

int_radix_lut<Torus> *luts_array_first_step;

Expand All @@ -2133,23 +2133,19 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
auto big_lwe_size = (polynomial_size * glwe_dimension + 1);
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

shifted_blocks_and_borrow_states = (Torus *)cuda_malloc_async(
num_many_lut * num_radix_blocks * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
cuda_memset_async(shifted_blocks_and_borrow_states, 0,
num_many_lut * num_radix_blocks * big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
shifted_blocks = (Torus *)cuda_malloc_async(
num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
cuda_memset_async(shifted_blocks, 0, num_radix_blocks * big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
borrow_states = (Torus *)cuda_malloc_async(
num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
cuda_memset_async(borrow_states, 0, num_radix_blocks * big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
shifted_blocks_and_borrow_states = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], shifted_blocks_and_borrow_states,
num_radix_blocks * num_many_lut, params.big_lwe_dimension);
shifted_blocks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
shifted_blocks, num_radix_blocks,
params.big_lwe_dimension);
borrow_states = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
borrow_states, num_radix_blocks,
params.big_lwe_dimension);

uint32_t num_luts_first_step = 2 * grouping_size + 1;

Expand Down Expand Up @@ -2299,10 +2295,13 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {

cuda_drop_async(shifted_blocks_and_borrow_states, streams[0],
gpu_indexes[0]);
cuda_drop_async(shifted_blocks, streams[0], gpu_indexes[0]);
cuda_drop_async(borrow_states, streams[0], gpu_indexes[0]);
release_radix_ciphertext(streams[0], gpu_indexes[0],
shifted_blocks_and_borrow_states);
delete shifted_blocks_and_borrow_states;
release_radix_ciphertext(streams[0], gpu_indexes[0], shifted_blocks);
delete shifted_blocks;
release_radix_ciphertext(streams[0], gpu_indexes[0], borrow_states);
delete borrow_states;

luts_array_first_step->release(streams, gpu_indexes, gpu_count);
delete luts_array_first_step;
Expand All @@ -2315,7 +2314,7 @@ template <typename Torus> struct int_borrow_prop_memory {

uint32_t group_size;
uint32_t num_groups;
Torus *overflow_block;
CudaRadixCiphertextFFI *overflow_block;

int_radix_lut<Torus> *lut_message_extract;
int_radix_lut<Torus> *lut_borrow_flag;
Expand Down Expand Up @@ -2345,8 +2344,6 @@ template <typename Torus> struct int_borrow_prop_memory {
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
auto big_lwe_size = (polynomial_size * glwe_dimension + 1);
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
compute_overflow = compute_overflow_in;
// for compute shifted blocks and block states
uint32_t block_modulus = message_modulus * carry_modulus;
Expand All @@ -2368,10 +2365,10 @@ template <typename Torus> struct int_borrow_prop_memory {
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
grouping_size, num_groups, true);

overflow_block = (Torus *)cuda_malloc_async(big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
cuda_memset_async(overflow_block, 0, big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
overflow_block = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
overflow_block, 1,
params.big_lwe_dimension);

lut_message_extract =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
Expand Down Expand Up @@ -2447,7 +2444,8 @@ template <typename Torus> struct int_borrow_prop_memory {

shifted_blocks_borrow_state_mem->release(streams, gpu_indexes, gpu_count);
prop_simu_group_carries_mem->release(streams, gpu_indexes, gpu_count);
cuda_drop_async(overflow_block, streams[0], gpu_indexes[0]);
release_radix_ciphertext(streams[0], gpu_indexes[0], overflow_block);
delete overflow_block;

lut_message_extract->release(streams, gpu_indexes, gpu_count);
delete lut_message_extract;
Expand Down
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ __host__ void compare_radix_blocks_kb(
// Add one
// Here Lhs can have the following values: (-1) % (message modulus * carry
// modulus), 0, 1. So the output values after the addition will be: 0, 1, 2.
host_integer_radix_add_scalar_one_inplace<Torus>(
legacy_host_integer_radix_add_scalar_one_inplace<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
num_radix_blocks, message_modulus, carry_modulus);
}
Expand Down
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
mem_ptr->overflow_sub_mem->update_lut_indexes(
streams, gpu_indexes, first_indexes, second_indexes, scalar_indexes,
merged_interesting_remainder.len);
host_integer_overflowing_sub<uint64_t>(
legacy_host_integer_overflowing_sub<uint64_t>(
streams, gpu_indexes, gpu_count, new_remainder.data,
(uint64_t *)merged_interesting_remainder.data,
interesting_divisor.data, subtraction_overflowed.data,
Expand Down
16 changes: 7 additions & 9 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -131,19 +131,17 @@ void cuda_add_and_propagate_single_carry_kb_64_inplace(

void cuda_integer_overflowing_sub_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lhs_array, const void *rhs_array, void *overflow_block,
const void *input_borrow, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
CudaRadixCiphertextFFI *overflow_block,
const CudaRadixCiphertextFFI *input_borrow, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t compute_overflow,
uint32_t uses_input_borrow) {

host_integer_overflowing_sub<uint64_t>(
(cudaStream_t const *)streams, gpu_indexes, gpu_count,
static_cast<uint64_t *>(lhs_array), static_cast<uint64_t *>(lhs_array),
static_cast<const uint64_t *>(rhs_array),
static_cast<uint64_t *>(overflow_block),
static_cast<const uint64_t *>(input_borrow),
(cudaStream_t const *)streams, gpu_indexes, gpu_count, lhs_array,
lhs_array, rhs_array, overflow_block, input_borrow,
(int_borrow_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
num_blocks, compute_overflow, uses_input_borrow);
compute_overflow, uses_input_borrow);
}

void cleanup_cuda_propagate_single_carry(void *const *streams,
Expand Down
41 changes: 19 additions & 22 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1887,14 +1887,8 @@ void host_compute_propagation_simulators_and_group_carries(
num_groups - 1);
}
}
// This function is used to perform step 1 of Thomas' new borrow propagation
// algorithm. It uses a many lut to calculate two luts in parallel:
// shifted_blocks: contains (block % message modulus) << 1
// block states: contains the propagation states for the different blocks
// depending on the group it belongs to and the internal position within the
// block.
template <typename Torus>
void host_compute_shifted_blocks_and_borrow_states(
void legacy_host_compute_shifted_blocks_and_borrow_states(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array, int_radix_params params,
int_shifted_blocks_and_borrow_states_memory<Torus> *mem, void *const *bsks,
Expand All @@ -1906,16 +1900,17 @@ void host_compute_shifted_blocks_and_borrow_states(
uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

auto shifted_blocks_and_borrow_states = mem->shifted_blocks_and_borrow_states;
auto shifted_blocks_and_borrow_states =
(Torus *)mem->shifted_blocks_and_borrow_states->ptr;
auto luts_array_first_step = mem->luts_array_first_step;

legacy_integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, shifted_blocks_and_borrow_states,
lwe_array, bsks, ksks, num_radix_blocks, luts_array_first_step,
num_many_lut, lut_stride);

auto shifted_blocks = mem->shifted_blocks;
auto borrow_states = mem->borrow_states;
auto shifted_blocks = (Torus *)mem->shifted_blocks->ptr;
auto borrow_states = (Torus *)mem->borrow_states->ptr;
cuda_memcpy_async_gpu_to_gpu(borrow_states, shifted_blocks_and_borrow_states,
big_lwe_size_bytes * num_radix_blocks,
streams[0], gpu_indexes[0]);
Expand Down Expand Up @@ -2735,10 +2730,8 @@ void scratch_cuda_integer_overflowing_sub(
compute_overflow, allocate_gpu_memory);
}

// This function performs the three steps of Thomas' new borrow propagation
// and includes the logic to extract the overflow when requested
template <typename Torus>
void host_single_borrow_propagate(
void legacy_host_single_borrow_propagate(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lhsrhs_array, Torus *overflow_block,
const Torus *input_borrow, int_borrow_prop_memory<Torus> *mem,
Expand All @@ -2758,19 +2751,20 @@ void host_single_borrow_propagate(

assert(mem->num_groups >= num_groups);
if (uses_input_borrow == 1) {
host_unchecked_sub_with_correcting_term<Torus>(
legacy_host_unchecked_sub_with_correcting_term<Torus>(
streams[0], gpu_indexes[0], lhsrhs_array, lhsrhs_array, input_borrow,
big_lwe_dimension, 1, message_modulus, carry_modulus,
message_modulus - 1);
}
// Step 1
host_compute_shifted_blocks_and_borrow_states<Torus>(
legacy_host_compute_shifted_blocks_and_borrow_states<Torus>(
streams, gpu_indexes, gpu_count, lhsrhs_array, params,
mem->shifted_blocks_borrow_state_mem, bsks, ksks, num_radix_blocks,
lut_stride, num_many_lut);

auto borrow_states = mem->shifted_blocks_borrow_state_mem->borrow_states;
cuda_memcpy_async_gpu_to_gpu(mem->overflow_block,
auto borrow_states =
(Torus *)mem->shifted_blocks_borrow_state_mem->borrow_states->ptr;
cuda_memcpy_async_gpu_to_gpu((Torus *)mem->overflow_block->ptr,
borrow_states +
(num_radix_blocks - 1) * big_lwe_size,
big_lwe_size_bytes, streams[0], gpu_indexes[0]);
Expand All @@ -2781,7 +2775,8 @@ void host_single_borrow_propagate(
mem->prop_simu_group_carries_mem, bsks, ksks, num_radix_blocks,
num_groups);

auto shifted_blocks = mem->shifted_blocks_borrow_state_mem->shifted_blocks;
auto shifted_blocks =
(Torus *)mem->shifted_blocks_borrow_state_mem->shifted_blocks->ptr;
auto prepared_blocks =
(Torus *)mem->prop_simu_group_carries_mem->prepared_blocks->ptr;
auto simulators = (Torus *)mem->prop_simu_group_carries_mem->simulators->ptr;
Expand All @@ -2790,13 +2785,14 @@ void host_single_borrow_propagate(
shifted_blocks, simulators, big_lwe_dimension,
num_radix_blocks);

host_integer_radix_add_scalar_one_inplace<Torus>(
legacy_host_integer_radix_add_scalar_one_inplace<Torus>(
streams, gpu_indexes, gpu_count, prepared_blocks, big_lwe_dimension,
num_radix_blocks, message_modulus, carry_modulus);

if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
legacy_host_addition<Torus>(
streams[0], gpu_indexes[0], mem->overflow_block, mem->overflow_block,
streams[0], gpu_indexes[0], (Torus *)mem->overflow_block->ptr,
(Torus *)mem->overflow_block->ptr,
(Torus *)mem->prop_simu_group_carries_mem->simulators->ptr +
(num_radix_blocks - 1) * big_lwe_size,
big_lwe_dimension, 1);
Expand All @@ -2809,7 +2805,8 @@ void host_single_borrow_propagate(
// borrows
if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
legacy_host_addition<Torus>(
streams[0], gpu_indexes[0], mem->overflow_block, mem->overflow_block,
streams[0], gpu_indexes[0], (Torus *)mem->overflow_block->ptr,
(Torus *)mem->overflow_block->ptr,
resolved_borrows + (num_groups - 1) * big_lwe_size, big_lwe_dimension,
1);
}
Expand All @@ -2826,7 +2823,7 @@ void host_single_borrow_propagate(
auto borrow_flag = mem->lut_borrow_flag;
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->sub_streams_1, gpu_indexes, gpu_count, overflow_block,
mem->overflow_block, bsks, ksks, 1, borrow_flag);
(Torus *)mem->overflow_block->ptr, bsks, ksks, 1, borrow_flag);
}
for (int j = 0; j < mem->active_gpu_count; j++) {
cuda_event_record(mem->outgoing_events1[j], mem->sub_streams_1[j],
Expand Down
49 changes: 46 additions & 3 deletions backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ __host__ void host_integer_overflowing_sub_kb(
*/
template <typename Torus>
__host__ void host_integer_overflowing_sub(
__host__ void legacy_host_integer_overflowing_sub(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_out_array, Torus *lhs_array,
const Torus *rhs_array, Torus *overflow_block, const Torus *input_borrow,
Expand All @@ -191,18 +191,61 @@ __host__ void host_integer_overflowing_sub(
uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;

auto stream = (cudaStream_t *)streams;
host_unchecked_sub_with_correcting_term<Torus>(
legacy_host_unchecked_sub_with_correcting_term<Torus>(
stream[0], gpu_indexes[0], static_cast<Torus *>(lwe_out_array),
static_cast<Torus *>(lhs_array), static_cast<const Torus *>(rhs_array),
radix_params.big_lwe_dimension, num_blocks, radix_params.message_modulus,
radix_params.carry_modulus, radix_params.message_modulus - 1);

host_single_borrow_propagate<Torus>(
legacy_host_single_borrow_propagate<Torus>(
streams, gpu_indexes, gpu_count, static_cast<Torus *>(lwe_out_array),
static_cast<Torus *>(overflow_block),
static_cast<const Torus *>(input_borrow),
(int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
num_blocks, num_groups, compute_overflow, uses_input_borrow);
}

template <typename Torus>
__host__ void host_integer_overflowing_sub(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI *input_left,
const CudaRadixCiphertextFFI *input_right,
CudaRadixCiphertextFFI *overflow_block,
const CudaRadixCiphertextFFI *input_borrow,
int_borrow_prop_memory<uint64_t> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t compute_overflow, uint32_t uses_input_borrow) {

if (output->num_radix_blocks != input_left->num_radix_blocks ||
output->num_radix_blocks != input_right->num_radix_blocks)
PANIC("Cuda error: lwe_array_in and output num radix blocks must be "
"the same")

if (output->lwe_dimension != input_left->lwe_dimension ||
output->lwe_dimension != input_right->lwe_dimension)
PANIC("Cuda error: lwe_array_in and output lwe_dimension must be "
"the same")

auto num_blocks = output->num_radix_blocks;
auto radix_params = mem_ptr->params;

// We need to recalculate the num_groups, because on the division the number
// of num_blocks changes
uint32_t block_modulus =
radix_params.message_modulus * radix_params.carry_modulus;
uint32_t num_bits_in_block = log2_int(block_modulus);
uint32_t grouping_size = num_bits_in_block;
uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;

auto stream = (cudaStream_t *)streams;
host_unchecked_sub_with_correcting_term<Torus>(
stream[0], gpu_indexes[0], output, input_left, input_right, num_blocks,
radix_params.message_modulus, radix_params.carry_modulus);

host_single_borrow_propagate<Torus>(
streams, gpu_indexes, gpu_count, output, overflow_block, input_borrow,
(int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
num_groups, compute_overflow, uses_input_borrow);
}

#endif
Loading

0 comments on commit c5f44a6

Please sign in to comment.