Skip to content

Commit c5f44a6

Browse files
committed
chore(gpu): refactor overflowing sub to track noise / degree
1 parent cda43fd commit c5f44a6

File tree

16 files changed

+275
-183
lines changed

16 files changed

+275
-183
lines changed

backends/tfhe-cuda-backend/cuda/include/integer/integer.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -351,9 +351,10 @@ void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
351351

352352
void cuda_integer_overflowing_sub_kb_64_inplace(
353353
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
354-
void *lhs_array, const void *rhs_array, void *overflow_block,
355-
const void *input_borrow, int8_t *mem_ptr, void *const *bsks,
356-
void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
354+
CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
355+
CudaRadixCiphertextFFI *overflow_block,
356+
const CudaRadixCiphertextFFI *input_borrow, int8_t *mem_ptr,
357+
void *const *bsks, void *const *ksks, uint32_t compute_overflow,
357358
uint32_t uses_input_borrow);
358359

359360
void cleanup_cuda_integer_overflowing_sub(void *const *streams,

backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h

Lines changed: 29 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2118,9 +2118,9 @@ template <typename Torus> struct int_sc_prop_memory {
21182118
};
21192119

21202120
template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
2121-
Torus *shifted_blocks_and_borrow_states;
2122-
Torus *shifted_blocks;
2123-
Torus *borrow_states;
2121+
CudaRadixCiphertextFFI *shifted_blocks_and_borrow_states;
2122+
CudaRadixCiphertextFFI *shifted_blocks;
2123+
CudaRadixCiphertextFFI *borrow_states;
21242124

21252125
int_radix_lut<Torus> *luts_array_first_step;
21262126

@@ -2133,23 +2133,19 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
21332133
auto polynomial_size = params.polynomial_size;
21342134
auto message_modulus = params.message_modulus;
21352135
auto carry_modulus = params.carry_modulus;
2136-
auto big_lwe_size = (polynomial_size * glwe_dimension + 1);
2137-
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
21382136

2139-
shifted_blocks_and_borrow_states = (Torus *)cuda_malloc_async(
2140-
num_many_lut * num_radix_blocks * big_lwe_size_bytes, streams[0],
2141-
gpu_indexes[0]);
2142-
cuda_memset_async(shifted_blocks_and_borrow_states, 0,
2143-
num_many_lut * num_radix_blocks * big_lwe_size_bytes,
2144-
streams[0], gpu_indexes[0]);
2145-
shifted_blocks = (Torus *)cuda_malloc_async(
2146-
num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
2147-
cuda_memset_async(shifted_blocks, 0, num_radix_blocks * big_lwe_size_bytes,
2148-
streams[0], gpu_indexes[0]);
2149-
borrow_states = (Torus *)cuda_malloc_async(
2150-
num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
2151-
cuda_memset_async(borrow_states, 0, num_radix_blocks * big_lwe_size_bytes,
2152-
streams[0], gpu_indexes[0]);
2137+
shifted_blocks_and_borrow_states = new CudaRadixCiphertextFFI;
2138+
create_zero_radix_ciphertext_async<Torus>(
2139+
streams[0], gpu_indexes[0], shifted_blocks_and_borrow_states,
2140+
num_radix_blocks * num_many_lut, params.big_lwe_dimension);
2141+
shifted_blocks = new CudaRadixCiphertextFFI;
2142+
create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
2143+
shifted_blocks, num_radix_blocks,
2144+
params.big_lwe_dimension);
2145+
borrow_states = new CudaRadixCiphertextFFI;
2146+
create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
2147+
borrow_states, num_radix_blocks,
2148+
params.big_lwe_dimension);
21532149

21542150
uint32_t num_luts_first_step = 2 * grouping_size + 1;
21552151

@@ -2299,10 +2295,13 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
22992295
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
23002296
uint32_t gpu_count) {
23012297

2302-
cuda_drop_async(shifted_blocks_and_borrow_states, streams[0],
2303-
gpu_indexes[0]);
2304-
cuda_drop_async(shifted_blocks, streams[0], gpu_indexes[0]);
2305-
cuda_drop_async(borrow_states, streams[0], gpu_indexes[0]);
2298+
release_radix_ciphertext(streams[0], gpu_indexes[0],
2299+
shifted_blocks_and_borrow_states);
2300+
delete shifted_blocks_and_borrow_states;
2301+
release_radix_ciphertext(streams[0], gpu_indexes[0], shifted_blocks);
2302+
delete shifted_blocks;
2303+
release_radix_ciphertext(streams[0], gpu_indexes[0], borrow_states);
2304+
delete borrow_states;
23062305

23072306
luts_array_first_step->release(streams, gpu_indexes, gpu_count);
23082307
delete luts_array_first_step;
@@ -2315,7 +2314,7 @@ template <typename Torus> struct int_borrow_prop_memory {
23152314

23162315
uint32_t group_size;
23172316
uint32_t num_groups;
2318-
Torus *overflow_block;
2317+
CudaRadixCiphertextFFI *overflow_block;
23192318

23202319
int_radix_lut<Torus> *lut_message_extract;
23212320
int_radix_lut<Torus> *lut_borrow_flag;
@@ -2345,8 +2344,6 @@ template <typename Torus> struct int_borrow_prop_memory {
23452344
auto polynomial_size = params.polynomial_size;
23462345
auto message_modulus = params.message_modulus;
23472346
auto carry_modulus = params.carry_modulus;
2348-
auto big_lwe_size = (polynomial_size * glwe_dimension + 1);
2349-
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
23502347
compute_overflow = compute_overflow_in;
23512348
// for compute shifted blocks and block states
23522349
uint32_t block_modulus = message_modulus * carry_modulus;
@@ -2368,10 +2365,10 @@ template <typename Torus> struct int_borrow_prop_memory {
23682365
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
23692366
grouping_size, num_groups, true);
23702367

2371-
overflow_block = (Torus *)cuda_malloc_async(big_lwe_size_bytes, streams[0],
2372-
gpu_indexes[0]);
2373-
cuda_memset_async(overflow_block, 0, big_lwe_size_bytes, streams[0],
2374-
gpu_indexes[0]);
2368+
overflow_block = new CudaRadixCiphertextFFI;
2369+
create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
2370+
overflow_block, 1,
2371+
params.big_lwe_dimension);
23752372

23762373
lut_message_extract =
23772374
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
@@ -2447,7 +2444,8 @@ template <typename Torus> struct int_borrow_prop_memory {
24472444

24482445
shifted_blocks_borrow_state_mem->release(streams, gpu_indexes, gpu_count);
24492446
prop_simu_group_carries_mem->release(streams, gpu_indexes, gpu_count);
2450-
cuda_drop_async(overflow_block, streams[0], gpu_indexes[0]);
2447+
release_radix_ciphertext(streams[0], gpu_indexes[0], overflow_block);
2448+
delete overflow_block;
24512449

24522450
lut_message_extract->release(streams, gpu_indexes, gpu_count);
24532451
delete lut_message_extract;

backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,7 @@ __host__ void compare_radix_blocks_kb(
378378
// Add one
379379
// Here Lhs can have the following values: (-1) % (message modulus * carry
380380
// modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
381-
host_integer_radix_add_scalar_one_inplace<Torus>(
381+
legacy_host_integer_radix_add_scalar_one_inplace<Torus>(
382382
streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
383383
num_radix_blocks, message_modulus, carry_modulus);
384384
}

backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -437,7 +437,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
437437
mem_ptr->overflow_sub_mem->update_lut_indexes(
438438
streams, gpu_indexes, first_indexes, second_indexes, scalar_indexes,
439439
merged_interesting_remainder.len);
440-
host_integer_overflowing_sub<uint64_t>(
440+
legacy_host_integer_overflowing_sub<uint64_t>(
441441
streams, gpu_indexes, gpu_count, new_remainder.data,
442442
(uint64_t *)merged_interesting_remainder.data,
443443
interesting_divisor.data, subtraction_overflowed.data,

backends/tfhe-cuda-backend/cuda/src/integer/integer.cu

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -131,19 +131,17 @@ void cuda_add_and_propagate_single_carry_kb_64_inplace(
131131

132132
void cuda_integer_overflowing_sub_kb_64_inplace(
133133
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
134-
void *lhs_array, const void *rhs_array, void *overflow_block,
135-
const void *input_borrow, int8_t *mem_ptr, void *const *bsks,
136-
void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
134+
CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
135+
CudaRadixCiphertextFFI *overflow_block,
136+
const CudaRadixCiphertextFFI *input_borrow, int8_t *mem_ptr,
137+
void *const *bsks, void *const *ksks, uint32_t compute_overflow,
137138
uint32_t uses_input_borrow) {
138139

139140
host_integer_overflowing_sub<uint64_t>(
140-
(cudaStream_t const *)streams, gpu_indexes, gpu_count,
141-
static_cast<uint64_t *>(lhs_array), static_cast<uint64_t *>(lhs_array),
142-
static_cast<const uint64_t *>(rhs_array),
143-
static_cast<uint64_t *>(overflow_block),
144-
static_cast<const uint64_t *>(input_borrow),
141+
(cudaStream_t const *)streams, gpu_indexes, gpu_count, lhs_array,
142+
lhs_array, rhs_array, overflow_block, input_borrow,
145143
(int_borrow_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
146-
num_blocks, compute_overflow, uses_input_borrow);
144+
compute_overflow, uses_input_borrow);
147145
}
148146

149147
void cleanup_cuda_propagate_single_carry(void *const *streams,

backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh

Lines changed: 19 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1887,14 +1887,8 @@ void host_compute_propagation_simulators_and_group_carries(
18871887
num_groups - 1);
18881888
}
18891889
}
1890-
// This function is used to perform step 1 of Thomas' new borrow propagation
1891-
// algorithm It uses a many lut to calculate two luts in parallel
1892-
// shifted_blocks: contains (block % message modulus) << 1
1893-
// block states: contains the propagation states for the different blocks
1894-
// depending on the group it belongs to and the internal position within the
1895-
// block.
18961890
template <typename Torus>
1897-
void host_compute_shifted_blocks_and_borrow_states(
1891+
void legacy_host_compute_shifted_blocks_and_borrow_states(
18981892
cudaStream_t const *streams, uint32_t const *gpu_indexes,
18991893
uint32_t gpu_count, Torus *lwe_array, int_radix_params params,
19001894
int_shifted_blocks_and_borrow_states_memory<Torus> *mem, void *const *bsks,
@@ -1906,16 +1900,17 @@ void host_compute_shifted_blocks_and_borrow_states(
19061900
uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
19071901
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
19081902

1909-
auto shifted_blocks_and_borrow_states = mem->shifted_blocks_and_borrow_states;
1903+
auto shifted_blocks_and_borrow_states =
1904+
(Torus *)mem->shifted_blocks_and_borrow_states->ptr;
19101905
auto luts_array_first_step = mem->luts_array_first_step;
19111906

19121907
legacy_integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
19131908
streams, gpu_indexes, gpu_count, shifted_blocks_and_borrow_states,
19141909
lwe_array, bsks, ksks, num_radix_blocks, luts_array_first_step,
19151910
num_many_lut, lut_stride);
19161911

1917-
auto shifted_blocks = mem->shifted_blocks;
1918-
auto borrow_states = mem->borrow_states;
1912+
auto shifted_blocks = (Torus *)mem->shifted_blocks->ptr;
1913+
auto borrow_states = (Torus *)mem->borrow_states->ptr;
19191914
cuda_memcpy_async_gpu_to_gpu(borrow_states, shifted_blocks_and_borrow_states,
19201915
big_lwe_size_bytes * num_radix_blocks,
19211916
streams[0], gpu_indexes[0]);
@@ -2735,10 +2730,8 @@ void scratch_cuda_integer_overflowing_sub(
27352730
compute_overflow, allocate_gpu_memory);
27362731
}
27372732

2738-
// This function perform the three steps of Thomas' new borrow propagation
2739-
// includes the logic to extract overflow when requested
27402733
template <typename Torus>
2741-
void host_single_borrow_propagate(
2734+
void legacy_host_single_borrow_propagate(
27422735
cudaStream_t const *streams, uint32_t const *gpu_indexes,
27432736
uint32_t gpu_count, Torus *lhsrhs_array, Torus *overflow_block,
27442737
const Torus *input_borrow, int_borrow_prop_memory<Torus> *mem,
@@ -2758,19 +2751,20 @@ void host_single_borrow_propagate(
27582751

27592752
assert(mem->num_groups >= num_groups);
27602753
if (uses_input_borrow == 1) {
2761-
host_unchecked_sub_with_correcting_term<Torus>(
2754+
legacy_host_unchecked_sub_with_correcting_term<Torus>(
27622755
streams[0], gpu_indexes[0], lhsrhs_array, lhsrhs_array, input_borrow,
27632756
big_lwe_dimension, 1, message_modulus, carry_modulus,
27642757
message_modulus - 1);
27652758
}
27662759
// Step 1
2767-
host_compute_shifted_blocks_and_borrow_states<Torus>(
2760+
legacy_host_compute_shifted_blocks_and_borrow_states<Torus>(
27682761
streams, gpu_indexes, gpu_count, lhsrhs_array, params,
27692762
mem->shifted_blocks_borrow_state_mem, bsks, ksks, num_radix_blocks,
27702763
lut_stride, num_many_lut);
27712764

2772-
auto borrow_states = mem->shifted_blocks_borrow_state_mem->borrow_states;
2773-
cuda_memcpy_async_gpu_to_gpu(mem->overflow_block,
2765+
auto borrow_states =
2766+
(Torus *)mem->shifted_blocks_borrow_state_mem->borrow_states->ptr;
2767+
cuda_memcpy_async_gpu_to_gpu((Torus *)mem->overflow_block->ptr,
27742768
borrow_states +
27752769
(num_radix_blocks - 1) * big_lwe_size,
27762770
big_lwe_size_bytes, streams[0], gpu_indexes[0]);
@@ -2781,7 +2775,8 @@ void host_single_borrow_propagate(
27812775
mem->prop_simu_group_carries_mem, bsks, ksks, num_radix_blocks,
27822776
num_groups);
27832777

2784-
auto shifted_blocks = mem->shifted_blocks_borrow_state_mem->shifted_blocks;
2778+
auto shifted_blocks =
2779+
(Torus *)mem->shifted_blocks_borrow_state_mem->shifted_blocks->ptr;
27852780
auto prepared_blocks =
27862781
(Torus *)mem->prop_simu_group_carries_mem->prepared_blocks->ptr;
27872782
auto simulators = (Torus *)mem->prop_simu_group_carries_mem->simulators->ptr;
@@ -2790,13 +2785,14 @@ void host_single_borrow_propagate(
27902785
shifted_blocks, simulators, big_lwe_dimension,
27912786
num_radix_blocks);
27922787

2793-
host_integer_radix_add_scalar_one_inplace<Torus>(
2788+
legacy_host_integer_radix_add_scalar_one_inplace<Torus>(
27942789
streams, gpu_indexes, gpu_count, prepared_blocks, big_lwe_dimension,
27952790
num_radix_blocks, message_modulus, carry_modulus);
27962791

27972792
if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
27982793
legacy_host_addition<Torus>(
2799-
streams[0], gpu_indexes[0], mem->overflow_block, mem->overflow_block,
2794+
streams[0], gpu_indexes[0], (Torus *)mem->overflow_block->ptr,
2795+
(Torus *)mem->overflow_block->ptr,
28002796
(Torus *)mem->prop_simu_group_carries_mem->simulators->ptr +
28012797
(num_radix_blocks - 1) * big_lwe_size,
28022798
big_lwe_dimension, 1);
@@ -2809,7 +2805,8 @@ void host_single_borrow_propagate(
28092805
// borrows
28102806
if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
28112807
legacy_host_addition<Torus>(
2812-
streams[0], gpu_indexes[0], mem->overflow_block, mem->overflow_block,
2808+
streams[0], gpu_indexes[0], (Torus *)mem->overflow_block->ptr,
2809+
(Torus *)mem->overflow_block->ptr,
28132810
resolved_borrows + (num_groups - 1) * big_lwe_size, big_lwe_dimension,
28142811
1);
28152812
}
@@ -2826,7 +2823,7 @@ void host_single_borrow_propagate(
28262823
auto borrow_flag = mem->lut_borrow_flag;
28272824
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
28282825
mem->sub_streams_1, gpu_indexes, gpu_count, overflow_block,
2829-
mem->overflow_block, bsks, ksks, 1, borrow_flag);
2826+
(Torus *)mem->overflow_block->ptr, bsks, ksks, 1, borrow_flag);
28302827
}
28312828
for (int j = 0; j < mem->active_gpu_count; j++) {
28322829
cuda_event_record(mem->outgoing_events1[j], mem->sub_streams_1[j],

backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ __host__ void host_integer_overflowing_sub_kb(
172172
173173
*/
174174
template <typename Torus>
175-
__host__ void host_integer_overflowing_sub(
175+
__host__ void legacy_host_integer_overflowing_sub(
176176
cudaStream_t const *streams, uint32_t const *gpu_indexes,
177177
uint32_t gpu_count, Torus *lwe_out_array, Torus *lhs_array,
178178
const Torus *rhs_array, Torus *overflow_block, const Torus *input_borrow,
@@ -191,18 +191,61 @@ __host__ void host_integer_overflowing_sub(
191191
uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;
192192

193193
auto stream = (cudaStream_t *)streams;
194-
host_unchecked_sub_with_correcting_term<Torus>(
194+
legacy_host_unchecked_sub_with_correcting_term<Torus>(
195195
stream[0], gpu_indexes[0], static_cast<Torus *>(lwe_out_array),
196196
static_cast<Torus *>(lhs_array), static_cast<const Torus *>(rhs_array),
197197
radix_params.big_lwe_dimension, num_blocks, radix_params.message_modulus,
198198
radix_params.carry_modulus, radix_params.message_modulus - 1);
199199

200-
host_single_borrow_propagate<Torus>(
200+
legacy_host_single_borrow_propagate<Torus>(
201201
streams, gpu_indexes, gpu_count, static_cast<Torus *>(lwe_out_array),
202202
static_cast<Torus *>(overflow_block),
203203
static_cast<const Torus *>(input_borrow),
204204
(int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
205205
num_blocks, num_groups, compute_overflow, uses_input_borrow);
206206
}
207207

208+
/*
 * Overflowing subtraction on radix ciphertexts described by
 * CudaRadixCiphertextFFI.
 *
 * Two steps are performed, both implemented elsewhere:
 *  1. host_unchecked_sub_with_correcting_term writes the block-wise
 *     subtraction of `input_right` from `input_left` into `output`;
 *  2. host_single_borrow_propagate runs the borrow propagation on `output`,
 *     receiving `overflow_block` and `input_borrow`.
 *
 * Parameters:
 *  - streams / gpu_indexes / gpu_count: forwarded to the propagation step;
 *    the subtraction only uses streams[0] / gpu_indexes[0].
 *  - output, input_left, input_right: must share the same num_radix_blocks
 *    and lwe_dimension, otherwise the function PANICs.
 *  - overflow_block, input_borrow: forwarded untouched to the propagation
 *    step. NOTE(review): presumably only read/written depending on
 *    compute_overflow / uses_input_borrow -- confirm in
 *    host_single_borrow_propagate.
 *  - mem_ptr: scratch memory; its `params` provide message and carry modulus.
 *  - bsks / ksks: keys forwarded to the propagation step.
 *  - compute_overflow, uses_input_borrow: flags forwarded as-is.
 */
template <typename Torus>
__host__ void host_integer_overflowing_sub(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *output,
    CudaRadixCiphertextFFI *input_left,
    const CudaRadixCiphertextFFI *input_right,
    CudaRadixCiphertextFFI *overflow_block,
    const CudaRadixCiphertextFFI *input_borrow,
    int_borrow_prop_memory<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks, uint32_t compute_overflow, uint32_t uses_input_borrow) {

  if (output->num_radix_blocks != input_left->num_radix_blocks ||
      output->num_radix_blocks != input_right->num_radix_blocks) {
    PANIC("Cuda error: input_left, input_right and output num radix blocks "
          "must be the same")
  }

  if (output->lwe_dimension != input_left->lwe_dimension ||
      output->lwe_dimension != input_right->lwe_dimension) {
    PANIC("Cuda error: input_left, input_right and output lwe_dimension "
          "must be the same")
  }

  auto num_blocks = output->num_radix_blocks;
  auto radix_params = mem_ptr->params;

  // num_groups is recomputed on every call instead of being read from the
  // scratch memory: callers such as the division invoke this function with a
  // number of blocks that changes from one call to the next.
  // One group holds as many blocks as there are bits in a block.
  uint32_t block_modulus =
      radix_params.message_modulus * radix_params.carry_modulus;
  uint32_t num_bits_in_block = log2_int(block_modulus);
  uint32_t grouping_size = num_bits_in_block;
  // Ceiling division so that a trailing partial group is still counted.
  uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;

  // Step 1: block-wise subtraction into `output` (no propagation yet).
  host_unchecked_sub_with_correcting_term<Torus>(
      streams[0], gpu_indexes[0], output, input_left, input_right, num_blocks,
      radix_params.message_modulus, radix_params.carry_modulus);

  // Step 2: borrow propagation over `output`.
  host_single_borrow_propagate<Torus>(
      streams, gpu_indexes, gpu_count, output, overflow_block, input_borrow,
      mem_ptr, bsks, (Torus **)(ksks), num_groups, compute_overflow,
      uses_input_borrow);
}
250+
208251
#endif

0 commit comments

Comments
 (0)