Skip to content

Commit 6e79255

Browse files
committed
.
1 parent 7807162 commit 6e79255

File tree

4 files changed

+39
-67
lines changed

4 files changed

+39
-67
lines changed

backends/tfhe-cuda-backend/cuda/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ endif()
6868
add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})
6969

7070
# Check if the DEBUG flag is defined
71+
set(CMAKE_BUILD_TYPE "Debug")
7172
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
7273
# Debug mode
7374
message("Compiling in Debug mode")

backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ template <typename Torus> struct int_decompression {
102102
// Example: in the 2_2 case we are mapping a 2-bit message onto a 4-bit
103103
// space; we want to keep the original 2-bit value in the 4-bit space,
104104
// so we apply the identity and the encoding will rescale it for us.
105-
auto decompression_rescale_f = [encryption_params](Torus x) -> Torus {
105+
auto decompression_rescale_f = [](Torus x) -> Torus {
106106
return x;
107107
};
108108

backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh

Lines changed: 37 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,14 @@
1212
#include "polynomial/functions.cuh"
1313
#include "utils/kernel_dimensions.cuh"
1414

15+
// This kernel follows the naming used in the Rust implementation
1516
template <typename Torus>
1617
__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
1718
uint32_t num_glwes, uint32_t in_len, uint32_t out_len) {
1819
auto tid = threadIdx.x + blockIdx.x * blockDim.x;
1920

2021
if (tid < num_glwes * out_len) {
21-
auto NBITS = sizeof(Torus) * 8;
22+
const auto NBITS = sizeof(Torus) * 8;
2223
auto glwe_index = tid / out_len;
2324
auto i = tid % out_len;
2425
auto chunk_array_in = array_in + glwe_index * in_len;
@@ -50,7 +51,7 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
5051
PANIC("Cuda error: Input and output must be different");
5152

5253
cuda_set_device(gpu_index);
53-
auto NBITS = sizeof(Torus) * 8;
54+
const auto NBITS = sizeof(Torus) * 8;
5455
auto compression_params = mem_ptr->compression_params;
5556
auto log_modulus = mem_ptr->storage_log_modulus;
5657

@@ -139,24 +140,26 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
139140
num_radix_blocks, mem_ptr);
140141
}
141142

143+
// This kernel follows the naming used in the Rust implementation,
144+
// except for output_len, which relates to initial_len
142145
template <typename Torus>
143-
__global__ void extract(Torus *glwe_array_out, Torus const *array_in,
144-
uint32_t index, uint32_t log_modulus,
145-
uint32_t input_len, uint32_t initial_out_len) {
146-
auto NBITS = sizeof(Torus) * 8;
146+
__global__ void extract(Torus *glwe_array_out, const Torus *array_in,
147+
const uint32_t index, const uint32_t log_modulus,
148+
const uint32_t input_len, const uint32_t output_len) {
149+
const auto NBITS = sizeof(Torus) * 8;
147150

148151
auto i = threadIdx.x + blockIdx.x * blockDim.x;
149152
auto chunk_array_in = array_in + index * input_len;
150-
if (i < initial_out_len) {
153+
if (i < output_len) {
151154
// Unpack
152-
Torus mask = ((Torus)1 << log_modulus) - 1;
153-
auto start = i * log_modulus;
154-
auto end = (i + 1) * log_modulus;
155+
auto mask = (static_cast<Torus>(1) << log_modulus) - 1;
156+
const auto start = i * log_modulus;
157+
const auto end = (i + 1) * log_modulus;
155158

156159
auto start_block = start / NBITS;
157160
auto start_remainder = start % NBITS;
158161

159-
auto end_block_inclusive = (end - 1) / NBITS;
162+
const auto end_block_inclusive = (end - 1) / NBITS;
160163

161164
Torus unpacked_i;
162165
if (start_block == end_block_inclusive) {
@@ -185,30 +188,44 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
185188
PANIC("Cuda error: Input and output must be different");
186189

187190
cuda_set_device(gpu_index);
188-
auto NBITS = sizeof(Torus) * 8;
189191
auto compression_params = mem_ptr->compression_params;
192+
auto num_glwes = (mem_ptr->body_count + compression_params.polynomial_size - 1) / compression_params.polynomial_size;
193+
printf("glwe_index: %u / %u\n", glwe_index, num_glwes);
194+
const auto NBITS = sizeof(Torus) * 8;
195+
printf("CUDA NBITS: %u\n", NBITS);
190196
auto log_modulus = mem_ptr->storage_log_modulus;
197+
printf("CUDA log_modulus: %u\n", log_modulus);
198+
199+
auto glwe_ciphertext_size = (compression_params.glwe_dimension + 1) *
200+
compression_params.polynomial_size;
201+
printf("CUDA glwe_ciphertext_size: %u\n", glwe_ciphertext_size);
202+
203+
auto glwe_mask_size =
204+
compression_params.glwe_dimension * compression_params.polynomial_size;
205+
printf("CUDA glwe_mask_size: %u\n", glwe_mask_size);
206+
207+
uint32_t num_lwes = (glwe_index == num_glwes - 1) ? (mem_ptr->body_count % compression_params.polynomial_size) : compression_params.polynomial_size;
208+
printf("CUDA body_count: %u\n", num_lwes);
191209

192-
uint32_t body_count =
193-
std::min(mem_ptr->body_count, compression_params.polynomial_size);
194210
// num_glwes = 1 in this case
195211
auto uncompressed_len =
196-
compression_params.glwe_dimension * compression_params.polynomial_size +
197-
body_count;
212+
glwe_ciphertext_size;
213+
printf("CUDA uncompressed_len: %u\n", uncompressed_len);
198214

199-
auto glwe_ciphertext_size = (compression_params.glwe_dimension + 1) *
200-
compression_params.polynomial_size;
201215
auto number_bits_to_unpack = uncompressed_len * log_modulus;
216+
printf("CUDA number_bits_to_unpack: %u\n", number_bits_to_unpack);
217+
202218
// number_bits_to_unpack.div_ceil(Scalar::BITS)
203219
auto compressed_len = (number_bits_to_unpack + NBITS - 1) / NBITS;
220+
printf("CUDA compressed_len: %u\n", compressed_len);
204221

205222
// We ensure the tail of the GLWE is zeroed
206223
auto zeroed_slice = glwe_array_out + uncompressed_len;
207224
cuda_memset_async(zeroed_slice, 0,
208-
(compression_params.polynomial_size - body_count) *
225+
(compression_params.polynomial_size - num_lwes) *
209226
sizeof(Torus),
210227
stream, gpu_index);
211-
228+
// cuda_memset_async(glwe_array_out, 0, glwe_ciphertext_size * sizeof(Torus), stream, gpu_index);
212229
// Kernel settings
213230
int num_blocks = 0, num_threads = 0;
214231
getNumBlocksAndThreads(uncompressed_len, 128, num_blocks, num_threads);

tfhe/src/core_crypto/entities/packed_integers.rs

Lines changed: 0 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -105,56 +105,10 @@ impl<Scalar: UnsignedInteger> PackedIntegers<Scalar> {
105105
let end_block_inclusive = (end - 1) / Scalar::BITS;
106106

107107
if start_block == end_block_inclusive {
108-
// Lowest bits are on the right
109-
//
110-
// Target mapping:
111-
// Scalar::BITS
112-
// |---------------|
113-
//
114-
// packed_coeffs: | start_block+1 | start_block |
115-
// container : | i+1 | i | i-1 |
116-
//
117-
// |-------|
118-
// log_modulus
119-
//
120-
// |---|
121-
// start_remainder
122-
//
123-
// In container[i] we want the bits of packed_coeffs[start_block] starting from
124-
// index start_remainder
125-
//
126-
// container[i] = lowest_bits of single_part
127-
//
128108
let single_part = self.packed_coeffs[start_block] >> start_remainder;
129109

130110
single_part & mask
131111
} else {
132-
// Lowest bits are on the right
133-
//
134-
// Target mapping:
135-
// Scalar::BITS
136-
// |---------------|
137-
//
138-
// packed_coeffs: | start_block+1 | start_block |
139-
// container : | i+1 | i | i-1 |
140-
//
141-
// |-------|
142-
// log_modulus
143-
//
144-
// |-----------|
145-
// start_remainder
146-
//
147-
// |---|
148-
// Scalar::BITS - start_remainder
149-
//
150-
// In the lowest bits of container[i] we want the highest bits of
151-
// packed_coeffs[start_block] starting from index start_remainder
152-
//
153-
// In the next bits, we want the lowest bits of packed_coeffs[start_block + 1]
154-
// left shifted to avoid overlapping
155-
//
156-
// container[i] = lowest_bits of (first_part|second_part)
157-
//
158112
assert_eq!(end_block_inclusive, start_block + 1);
159113

160114
let first_part = self.packed_coeffs[start_block] >> start_remainder;

0 commit comments

Comments
 (0)