zama-ai
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
Lines changed: 4 additions & 3 deletions
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
Lines changed: 29 additions & 31 deletions
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
Lines changed: 1 addition & 1 deletion
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
Lines changed: 1 addition & 1 deletion
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
Lines changed: 7 additions & 9 deletions
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
Lines changed: 19 additions & 22 deletions
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
Lines changed: 46 additions & 3 deletions
@@ -351,9 +351,10 @@ void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
 
 void cuda_integer_overflowing_sub_kb_64_inplace(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lhs_array, const void *rhs_array, void *overflow_block,
-    const void *input_borrow, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
+    CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
+    CudaRadixCiphertextFFI *overflow_block,
+    const CudaRadixCiphertextFFI *input_borrow, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks, uint32_t compute_overflow,
     uint32_t uses_input_borrow);
 
 void cleanup_cuda_integer_overflowing_sub(void *const *streams,
 
@@ -2118,9 +2118,9 @@ template <typename Torus> struct int_sc_prop_memory {
 };
 
 template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
-  Torus *shifted_blocks_and_borrow_states;
-  Torus *shifted_blocks;
-  Torus *borrow_states;
+  CudaRadixCiphertextFFI *shifted_blocks_and_borrow_states;
+  CudaRadixCiphertextFFI *shifted_blocks;
+  CudaRadixCiphertextFFI *borrow_states;
 
   int_radix_lut<Torus> *luts_array_first_step;
 
@@ -2133,23 +2133,19 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
     auto polynomial_size = params.polynomial_size;
     auto message_modulus = params.message_modulus;
     auto carry_modulus = params.carry_modulus;
-    auto big_lwe_size = (polynomial_size * glwe_dimension + 1);
-    auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
 
-    shifted_blocks_and_borrow_states = (Torus *)cuda_malloc_async(
-        num_many_lut * num_radix_blocks * big_lwe_size_bytes, streams[0],
-        gpu_indexes[0]);
-    cuda_memset_async(shifted_blocks_and_borrow_states, 0,
-                      num_many_lut * num_radix_blocks * big_lwe_size_bytes,
-                      streams[0], gpu_indexes[0]);
-    shifted_blocks = (Torus *)cuda_malloc_async(
-        num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
-    cuda_memset_async(shifted_blocks, 0, num_radix_blocks * big_lwe_size_bytes,
-                      streams[0], gpu_indexes[0]);
-    borrow_states = (Torus *)cuda_malloc_async(
-        num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
-    cuda_memset_async(borrow_states, 0, num_radix_blocks * big_lwe_size_bytes,
-                      streams[0], gpu_indexes[0]);
+    shifted_blocks_and_borrow_states = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams[0], gpu_indexes[0], shifted_blocks_and_borrow_states,
+        num_radix_blocks * num_many_lut, params.big_lwe_dimension);
+    shifted_blocks = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
+                                              shifted_blocks, num_radix_blocks,
+                                              params.big_lwe_dimension);
+    borrow_states = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
+                                              borrow_states, num_radix_blocks,
+                                              params.big_lwe_dimension);
 
     uint32_t num_luts_first_step = 2 * grouping_size + 1;
 
@@ -2299,10 +2295,13 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                uint32_t gpu_count) {
 
-    cuda_drop_async(shifted_blocks_and_borrow_states, streams[0],
-                    gpu_indexes[0]);
-    cuda_drop_async(shifted_blocks, streams[0], gpu_indexes[0]);
-    cuda_drop_async(borrow_states, streams[0], gpu_indexes[0]);
+    release_radix_ciphertext(streams[0], gpu_indexes[0],
+                             shifted_blocks_and_borrow_states);
+    delete shifted_blocks_and_borrow_states;
+    release_radix_ciphertext(streams[0], gpu_indexes[0], shifted_blocks);
+    delete shifted_blocks;
+    release_radix_ciphertext(streams[0], gpu_indexes[0], borrow_states);
+    delete borrow_states;
 
     luts_array_first_step->release(streams, gpu_indexes, gpu_count);
     delete luts_array_first_step;
@@ -2315,7 +2314,7 @@ template <typename Torus> struct int_borrow_prop_memory {
 
   uint32_t group_size;
   uint32_t num_groups;
-  Torus *overflow_block;
+  CudaRadixCiphertextFFI *overflow_block;
 
   int_radix_lut<Torus> *lut_message_extract;
   int_radix_lut<Torus> *lut_borrow_flag;
@@ -2345,8 +2344,6 @@ template <typename Torus> struct int_borrow_prop_memory {
     auto polynomial_size = params.polynomial_size;
     auto message_modulus = params.message_modulus;
     auto carry_modulus = params.carry_modulus;
-    auto big_lwe_size = (polynomial_size * glwe_dimension + 1);
-    auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
     compute_overflow = compute_overflow_in;
     // for compute shifted blocks and block states
     uint32_t block_modulus = message_modulus * carry_modulus;
@@ -2368,10 +2365,10 @@ template <typename Torus> struct int_borrow_prop_memory {
         streams, gpu_indexes, gpu_count, params, num_radix_blocks,
         grouping_size, num_groups, true);
 
-    overflow_block = (Torus *)cuda_malloc_async(big_lwe_size_bytes, streams[0],
-                                                gpu_indexes[0]);
-    cuda_memset_async(overflow_block, 0, big_lwe_size_bytes, streams[0],
-                      gpu_indexes[0]);
+    overflow_block = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
+                                              overflow_block, 1,
+                                              params.big_lwe_dimension);
 
     lut_message_extract =
         new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
@@ -2447,7 +2444,8 @@ template <typename Torus> struct int_borrow_prop_memory {
 
     shifted_blocks_borrow_state_mem->release(streams, gpu_indexes, gpu_count);
     prop_simu_group_carries_mem->release(streams, gpu_indexes, gpu_count);
-    cuda_drop_async(overflow_block, streams[0], gpu_indexes[0]);
+    release_radix_ciphertext(streams[0], gpu_indexes[0], overflow_block);
+    delete overflow_block;
 
     lut_message_extract->release(streams, gpu_indexes, gpu_count);
     delete lut_message_extract;
 
@@ -378,7 +378,7 @@ __host__ void compare_radix_blocks_kb(
   // Add one
   // Here Lhs can have the following values: (-1) % (message modulus * carry
   // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
-  host_integer_radix_add_scalar_one_inplace<Torus>(
+  legacy_host_integer_radix_add_scalar_one_inplace<Torus>(
       streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
       num_radix_blocks, message_modulus, carry_modulus);
 }
 
@@ -437,7 +437,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
       mem_ptr->overflow_sub_mem->update_lut_indexes(
           streams, gpu_indexes, first_indexes, second_indexes, scalar_indexes,
           merged_interesting_remainder.len);
-      host_integer_overflowing_sub<uint64_t>(
+      legacy_host_integer_overflowing_sub<uint64_t>(
           streams, gpu_indexes, gpu_count, new_remainder.data,
           (uint64_t *)merged_interesting_remainder.data,
           interesting_divisor.data, subtraction_overflowed.data,
 
@@ -131,19 +131,17 @@ void cuda_add_and_propagate_single_carry_kb_64_inplace(
 
 void cuda_integer_overflowing_sub_kb_64_inplace(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lhs_array, const void *rhs_array, void *overflow_block,
-    const void *input_borrow, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
+    CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
+    CudaRadixCiphertextFFI *overflow_block,
+    const CudaRadixCiphertextFFI *input_borrow, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks, uint32_t compute_overflow,
     uint32_t uses_input_borrow) {
 
   host_integer_overflowing_sub<uint64_t>(
-      (cudaStream_t const *)streams, gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lhs_array), static_cast<uint64_t *>(lhs_array),
-      static_cast<const uint64_t *>(rhs_array),
-      static_cast<uint64_t *>(overflow_block),
-      static_cast<const uint64_t *>(input_borrow),
+      (cudaStream_t const *)streams, gpu_indexes, gpu_count, lhs_array,
+      lhs_array, rhs_array, overflow_block, input_borrow,
       (int_borrow_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
-      num_blocks, compute_overflow, uses_input_borrow);
+      compute_overflow, uses_input_borrow);
 }
 
 void cleanup_cuda_propagate_single_carry(void *const *streams,
 
@@ -1887,14 +1887,8 @@ void host_compute_propagation_simulators_and_group_carries(
         num_groups - 1);
   }
 }
-// This function is used to perform step 1 of Thomas' new borrow propagation
-// algorithm It uses a many lut to calculate two luts in parallel
-// shifted_blocks: contains (block % message modulus) << 1
-// block states: contains the propagation states for the different blocks
-// depending on the group it belongs to and the internal position within the
-// block.
 template <typename Torus>
-void host_compute_shifted_blocks_and_borrow_states(
+void legacy_host_compute_shifted_blocks_and_borrow_states(
     cudaStream_t const *streams, uint32_t const *gpu_indexes,
     uint32_t gpu_count, Torus *lwe_array, int_radix_params params,
     int_shifted_blocks_and_borrow_states_memory<Torus> *mem, void *const *bsks,
@@ -1906,16 +1900,17 @@ void host_compute_shifted_blocks_and_borrow_states(
   uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
   auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
 
-  auto shifted_blocks_and_borrow_states = mem->shifted_blocks_and_borrow_states;
+  auto shifted_blocks_and_borrow_states =
+      (Torus *)mem->shifted_blocks_and_borrow_states->ptr;
   auto luts_array_first_step = mem->luts_array_first_step;
 
   legacy_integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, shifted_blocks_and_borrow_states,
       lwe_array, bsks, ksks, num_radix_blocks, luts_array_first_step,
       num_many_lut, lut_stride);
 
-  auto shifted_blocks = mem->shifted_blocks;
-  auto borrow_states = mem->borrow_states;
+  auto shifted_blocks = (Torus *)mem->shifted_blocks->ptr;
+  auto borrow_states = (Torus *)mem->borrow_states->ptr;
   cuda_memcpy_async_gpu_to_gpu(borrow_states, shifted_blocks_and_borrow_states,
                                big_lwe_size_bytes * num_radix_blocks,
                                streams[0], gpu_indexes[0]);
@@ -2735,10 +2730,8 @@ void scratch_cuda_integer_overflowing_sub(
       compute_overflow, allocate_gpu_memory);
 }
 
-// This function perform the three steps of Thomas' new borrow propagation
-// includes the logic to extract overflow when requested
 template <typename Torus>
-void host_single_borrow_propagate(
+void legacy_host_single_borrow_propagate(
     cudaStream_t const *streams, uint32_t const *gpu_indexes,
     uint32_t gpu_count, Torus *lhsrhs_array, Torus *overflow_block,
     const Torus *input_borrow, int_borrow_prop_memory<Torus> *mem,
@@ -2758,19 +2751,20 @@ void host_single_borrow_propagate(
 
   assert(mem->num_groups >= num_groups);
   if (uses_input_borrow == 1) {
-    host_unchecked_sub_with_correcting_term<Torus>(
+    legacy_host_unchecked_sub_with_correcting_term<Torus>(
         streams[0], gpu_indexes[0], lhsrhs_array, lhsrhs_array, input_borrow,
         big_lwe_dimension, 1, message_modulus, carry_modulus,
         message_modulus - 1);
   }
   // Step 1
-  host_compute_shifted_blocks_and_borrow_states<Torus>(
+  legacy_host_compute_shifted_blocks_and_borrow_states<Torus>(
       streams, gpu_indexes, gpu_count, lhsrhs_array, params,
       mem->shifted_blocks_borrow_state_mem, bsks, ksks, num_radix_blocks,
       lut_stride, num_many_lut);
 
-  auto borrow_states = mem->shifted_blocks_borrow_state_mem->borrow_states;
-  cuda_memcpy_async_gpu_to_gpu(mem->overflow_block,
+  auto borrow_states =
+      (Torus *)mem->shifted_blocks_borrow_state_mem->borrow_states->ptr;
+  cuda_memcpy_async_gpu_to_gpu((Torus *)mem->overflow_block->ptr,
                                borrow_states +
                                    (num_radix_blocks - 1) * big_lwe_size,
                                big_lwe_size_bytes, streams[0], gpu_indexes[0]);
@@ -2781,7 +2775,8 @@ void host_single_borrow_propagate(
       mem->prop_simu_group_carries_mem, bsks, ksks, num_radix_blocks,
       num_groups);
 
-  auto shifted_blocks = mem->shifted_blocks_borrow_state_mem->shifted_blocks;
+  auto shifted_blocks =
+      (Torus *)mem->shifted_blocks_borrow_state_mem->shifted_blocks->ptr;
   auto prepared_blocks =
       (Torus *)mem->prop_simu_group_carries_mem->prepared_blocks->ptr;
   auto simulators = (Torus *)mem->prop_simu_group_carries_mem->simulators->ptr;
@@ -2790,13 +2785,14 @@ void host_single_borrow_propagate(
                           shifted_blocks, simulators, big_lwe_dimension,
                           num_radix_blocks);
 
-  host_integer_radix_add_scalar_one_inplace<Torus>(
+  legacy_host_integer_radix_add_scalar_one_inplace<Torus>(
       streams, gpu_indexes, gpu_count, prepared_blocks, big_lwe_dimension,
       num_radix_blocks, message_modulus, carry_modulus);
 
   if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
     legacy_host_addition<Torus>(
-        streams[0], gpu_indexes[0], mem->overflow_block, mem->overflow_block,
+        streams[0], gpu_indexes[0], (Torus *)mem->overflow_block->ptr,
+        (Torus *)mem->overflow_block->ptr,
         (Torus *)mem->prop_simu_group_carries_mem->simulators->ptr +
             (num_radix_blocks - 1) * big_lwe_size,
         big_lwe_dimension, 1);
@@ -2809,7 +2805,8 @@ void host_single_borrow_propagate(
   //  borrows
   if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
     legacy_host_addition<Torus>(
-        streams[0], gpu_indexes[0], mem->overflow_block, mem->overflow_block,
+        streams[0], gpu_indexes[0], (Torus *)mem->overflow_block->ptr,
+        (Torus *)mem->overflow_block->ptr,
         resolved_borrows + (num_groups - 1) * big_lwe_size, big_lwe_dimension,
         1);
   }
@@ -2826,7 +2823,7 @@ void host_single_borrow_propagate(
     auto borrow_flag = mem->lut_borrow_flag;
     legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
         mem->sub_streams_1, gpu_indexes, gpu_count, overflow_block,
-        mem->overflow_block, bsks, ksks, 1, borrow_flag);
+        (Torus *)mem->overflow_block->ptr, bsks, ksks, 1, borrow_flag);
   }
   for (int j = 0; j < mem->active_gpu_count; j++) {
     cuda_event_record(mem->outgoing_events1[j], mem->sub_streams_1[j],
 
@@ -172,7 +172,7 @@ __host__ void host_integer_overflowing_sub_kb(
 
 */
 template <typename Torus>
-__host__ void host_integer_overflowing_sub(
+__host__ void legacy_host_integer_overflowing_sub(
     cudaStream_t const *streams, uint32_t const *gpu_indexes,
     uint32_t gpu_count, Torus *lwe_out_array, Torus *lhs_array,
     const Torus *rhs_array, Torus *overflow_block, const Torus *input_borrow,
@@ -191,18 +191,61 @@ __host__ void host_integer_overflowing_sub(
   uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;
 
   auto stream = (cudaStream_t *)streams;
-  host_unchecked_sub_with_correcting_term<Torus>(
+  legacy_host_unchecked_sub_with_correcting_term<Torus>(
       stream[0], gpu_indexes[0], static_cast<Torus *>(lwe_out_array),
       static_cast<Torus *>(lhs_array), static_cast<const Torus *>(rhs_array),
       radix_params.big_lwe_dimension, num_blocks, radix_params.message_modulus,
       radix_params.carry_modulus, radix_params.message_modulus - 1);
 
-  host_single_borrow_propagate<Torus>(
+  legacy_host_single_borrow_propagate<Torus>(
       streams, gpu_indexes, gpu_count, static_cast<Torus *>(lwe_out_array),
       static_cast<Torus *>(overflow_block),
       static_cast<const Torus *>(input_borrow),
       (int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
       num_blocks, num_groups, compute_overflow, uses_input_borrow);
 }
 
+// Radix-ciphertext (CudaRadixCiphertextFFI) entry point for the overflowing
+// subtraction.
+//
+// Runs host_unchecked_sub_with_correcting_term on (output, input_left,
+// input_right) and then host_single_borrow_propagate on output.
+//
+// Parameters:
+//  - streams, gpu_indexes, gpu_count: forwarded to the borrow propagation;
+//    the subtraction step only uses streams[0] / gpu_indexes[0].
+//  - output, input_left, input_right: must all have the same
+//    num_radix_blocks and the same lwe_dimension, otherwise the function
+//    panics.
+//  - overflow_block, input_borrow, compute_overflow, uses_input_borrow:
+//    forwarded unchanged to host_single_borrow_propagate.
+//  - mem_ptr: provides the radix parameters (message / carry modulus) used
+//    to derive the number of propagation groups.
+//  - bsks, ksks: forwarded to host_single_borrow_propagate (presumably the
+//    bootstrapping and keyswitching keys -- confirm against the callee).
+template <typename Torus>
+__host__ void host_integer_overflowing_sub(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, CudaRadixCiphertextFFI *output,
+    CudaRadixCiphertextFFI *input_left,
+    const CudaRadixCiphertextFFI *input_right,
+    CudaRadixCiphertextFFI *overflow_block,
+    const CudaRadixCiphertextFFI *input_borrow,
+    int_borrow_prop_memory<uint64_t> *mem_ptr, void *const *bsks,
+    Torus *const *ksks, uint32_t compute_overflow, uint32_t uses_input_borrow) {
+
+  // The three operands are processed block by block, so their shapes must
+  // agree before anything is launched.
+  if (output->num_radix_blocks != input_left->num_radix_blocks ||
+      output->num_radix_blocks != input_right->num_radix_blocks)
+    PANIC("Cuda error: input_left, input_right and output num radix blocks "
+          "must be the same")
+
+  if (output->lwe_dimension != input_left->lwe_dimension ||
+      output->lwe_dimension != input_right->lwe_dimension)
+    PANIC("Cuda error: input_left, input_right and output lwe_dimension "
+          "must be the same")
+
+  auto num_blocks = output->num_radix_blocks;
+  auto radix_params = mem_ptr->params;
+
+  // We need to recalculate the num_groups, because on the division the number
+  // of num_blocks changes
+  uint32_t block_modulus =
+      radix_params.message_modulus * radix_params.carry_modulus;
+  // One group spans as many blocks as there are bits in a block.
+  uint32_t grouping_size = log2_int(block_modulus);
+  // Ceiling division: a trailing, partially filled group still counts.
+  uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;
+
+  // streams is indexed directly; no const-stripping alias is needed just to
+  // read its first element.
+  host_unchecked_sub_with_correcting_term<Torus>(
+      streams[0], gpu_indexes[0], output, input_left, input_right, num_blocks,
+      radix_params.message_modulus, radix_params.carry_modulus);
+
+  // mem_ptr is declared with uint64_t in the signature; the cast below is a
+  // no-op when Torus is uint64_t.
+  host_single_borrow_propagate<Torus>(
+      streams, gpu_indexes, gpu_count, output, overflow_block, input_borrow,
+      (int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
+      num_groups, compute_overflow, uses_input_borrow);
+}
+
 #endif
Original file line number	Diff line number	Diff line change
`@@ -378,7 +378,7 @@ __host__ void compare_radix_blocks_kb(`
`378`	`378`	`// Add one`
`379`	`379`	`// Here Lhs can have the following values: (-1) % (message modulus * carry`
`380`	`380`	`// modulus), 0, 1 So the output values after the addition will be: 0, 1, 2`
`381`		`- host_integer_radix_add_scalar_one_inplace<Torus>(`
	`381`	`+ legacy_host_integer_radix_add_scalar_one_inplace<Torus>(`
`382`	`382`	`streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,`
`383`	`383`	`num_radix_blocks, message_modulus, carry_modulus);`
`384`	`384`	`}`