template <typename Torus>
__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
-                     uint32_t num_coeffs, uint32_t in_len, uint32_t out_len) {
-  auto nbits = sizeof(Torus) * 8;
+                     uint32_t num_glwes, uint32_t in_len, uint32_t out_len) {

  auto tid = threadIdx.x + blockIdx.x * blockDim.x;

-  auto glwe_index = tid / out_len;
-  auto i = tid % out_len;
-  auto chunk_array_in = array_in + glwe_index * in_len;
-  auto chunk_array_out = array_out + glwe_index * out_len;
+  if (tid < num_glwes * out_len) {
+    auto NBITS = sizeof(Torus) * 8;
+    auto glwe_index = tid / out_len;
+    auto i = tid % out_len;
+    auto chunk_array_in = array_in + glwe_index * in_len;
+    auto chunk_array_out = array_out + glwe_index * out_len;

-  if (tid < num_coeffs) {
-
-    auto k = nbits * i / log_modulus;
+    auto k = NBITS * i / log_modulus;
    auto j = k;

-    auto start_shift = i * nbits - j * log_modulus;
+    auto start_shift = i * NBITS - j * log_modulus;

    auto value = chunk_array_in[j] >> start_shift;
    j++;

-    while (j * log_modulus < ((i + 1) * nbits) && j < in_len) {
-      auto shift = j * log_modulus - i * nbits;
+    while (j * log_modulus < ((i + 1) * NBITS) && j < in_len) {
+      auto shift = j * log_modulus - i * NBITS;
      value |= chunk_array_in[j] << shift;
      j++;
    }
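For reference, the packing loop above is an output-word-centric rewrite of plain bitstream concatenation: each thread owns one word of array_out and gathers every log_modulus-bit input field overlapping it, so no two threads write the same word and no atomics are needed. A minimal CPU sketch of the same layout, written input-word-first (a hypothetical helper for illustration, assuming 0 < log_modulus < 64 and that each input word carries its payload in its low log_modulus bits):

#include <cstdint>
#include <vector>

// Scatter the low `log_modulus` bits of each input word into a dense
// bitstream; the GPU kernel produces the identical layout but iterates
// output-word-first so each thread writes exactly one word.
std::vector<uint64_t> pack_reference(const std::vector<uint64_t> &in,
                                     uint32_t log_modulus) {
  const uint32_t NBITS = 64;
  const uint64_t mask = (uint64_t{1} << log_modulus) - 1;
  size_t total_bits = in.size() * log_modulus;
  std::vector<uint64_t> out((total_bits + NBITS - 1) / NBITS, 0);
  for (size_t j = 0; j < in.size(); ++j) {
    size_t bit = j * log_modulus; // absolute bit offset of field j
    out[bit / NBITS] |= (in[j] & mask) << (bit % NBITS);
    if (bit % NBITS + log_modulus > NBITS) // field straddles two words
      out[bit / NBITS + 1] |= (in[j] & mask) >> (NBITS - bit % NBITS);
  }
  return out;
}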
@@ -51,30 +50,30 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
    PANIC("Cuda error: Input and output must be different");

  cuda_set_device(gpu_index);
+  auto NBITS = sizeof(Torus) * 8;
  auto compression_params = mem_ptr->compression_params;
-
  auto log_modulus = mem_ptr->storage_log_modulus;
-  // [0..num_glwes-1) GLWEs
-  auto in_len = (compression_params.glwe_dimension + 1) *
-                compression_params.polynomial_size;
-  auto number_bits_to_pack = in_len * log_modulus;
-  auto nbits = sizeof(Torus) * 8;
-  // number_bits_to_pack.div_ceil(Scalar::BITS)
-  auto out_len = (number_bits_to_pack + nbits - 1) / nbits;

-  // Last GLWE
-  number_bits_to_pack = in_len * log_modulus;
-  auto last_out_len = (number_bits_to_pack + nbits - 1) / nbits;
+  auto glwe_ciphertext_size = (compression_params.glwe_dimension + 1) *
+                              compression_params.polynomial_size;
+  auto glwe_mask_size =
+      compression_params.glwe_dimension * compression_params.polynomial_size;

-  auto num_coeffs = (num_glwes - 1) * out_len + last_out_len;
+  auto uncompressed_len = num_glwes * glwe_mask_size + num_lwes;
+  auto number_bits_to_pack = uncompressed_len * log_modulus;

-  int num_blocks = 0, num_threads = 0;
-  getNumBlocksAndThreads(num_coeffs, 1024, num_blocks, num_threads);
+  // equivalent to number_bits_to_pack.div_ceil(Scalar::BITS)
+  auto compressed_len = (number_bits_to_pack + NBITS - 1) / NBITS;
+
+  // Kernel settings
+  int num_blocks = 0, num_threads = 0;
+  getNumBlocksAndThreads(num_glwes * compressed_len, 1024, num_blocks,
+                         num_threads);
  dim3 grid(num_blocks);
  dim3 threads(num_threads);
  pack<Torus><<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus,
-                                            num_coeffs, in_len, out_len);
+                                            num_glwes, uncompressed_len,
+                                            compressed_len);
  check_cuda_error(cudaGetLastError());
}
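The recurring (bits + NBITS - 1) / NBITS expression is integer ceiling division, matching the div_ceil the comment refers to. Worked through with hypothetical sizes: 1000 coefficients at log_modulus = 12 occupy 12000 bits, hence (12000 + 63) / 64 = 188 words of a 64-bit Torus, with the last word only partially filled:

// Ceiling division as used above (equivalent to Rust's div_ceil).
constexpr uint64_t div_ceil(uint64_t num, uint64_t den) {
  return (num + den - 1) / den;
}
static_assert(div_ceil(1000 * 12, 64) == 188, "12000 bits -> 188 words");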
@@ -144,7 +143,7 @@ template <typename Torus>
__global__ void extract(Torus *glwe_array_out, Torus const *array_in,
                        uint32_t index, uint32_t log_modulus,
                        uint32_t input_len, uint32_t initial_out_len) {
-  auto nbits = sizeof(Torus) * 8;
+  auto NBITS = sizeof(Torus) * 8;

  auto i = threadIdx.x + blockIdx.x * blockDim.x;
  auto chunk_array_in = array_in + index * input_len;
@@ -154,10 +153,10 @@ __global__ void extract(Torus *glwe_array_out, Torus const *array_in,
    auto start = i * log_modulus;
    auto end = (i + 1) * log_modulus;

-    auto start_block = start / nbits;
-    auto start_remainder = start % nbits;
+    auto start_block = start / NBITS;
+    auto start_remainder = start % NBITS;

-    auto end_block_inclusive = (end - 1) / nbits;
+    auto end_block_inclusive = (end - 1) / NBITS;

    Torus unpacked_i;
    if (start_block == end_block_inclusive) {
@@ -166,13 +165,13 @@ __global__ void extract(Torus *glwe_array_out, Torus const *array_in,
    } else {
      auto first_part = chunk_array_in[start_block] >> start_remainder;
      auto second_part = chunk_array_in[start_block + 1]
-                         << (nbits - start_remainder);
+                         << (NBITS - start_remainder);

      unpacked_i = (first_part | second_part) & mask;
    }

    // Extract
-    glwe_array_out[i] = unpacked_i << (nbits - log_modulus);
+    glwe_array_out[i] = unpacked_i << (NBITS - log_modulus);
  }
}
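The arithmetic above locates field i at bit offset i * log_modulus, stitches two adjacent words together when the field straddles a word boundary, and re-aligns the result to the most significant bits. A single-value CPU sketch of the same index math (hypothetical helper, assuming 0 < log_modulus < 64):

#include <cstdint>
#include <vector>

// Recover the i-th log_modulus-bit field from the packed stream and
// re-align it to the top of the word, mirroring
// `glwe_array_out[i] = unpacked_i << (NBITS - log_modulus)` above.
uint64_t extract_reference(const std::vector<uint64_t> &packed, size_t i,
                           uint32_t log_modulus) {
  const uint32_t NBITS = 64;
  const uint64_t mask = (uint64_t{1} << log_modulus) - 1;
  size_t start = i * log_modulus;
  uint64_t value = packed[start / NBITS] >> (start % NBITS);
  // True exactly when start_block != end_block_inclusive in the kernel.
  if (start % NBITS + log_modulus > NBITS)
    value |= packed[start / NBITS + 1] << (NBITS - start % NBITS);
  return (value & mask) << (NBITS - log_modulus);
}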
@@ -186,38 +185,38 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
    PANIC("Cuda error: Input and output must be different");

  cuda_set_device(gpu_index);
-
+  auto NBITS = sizeof(Torus) * 8;
  auto compression_params = mem_ptr->compression_params;
-
  auto log_modulus = mem_ptr->storage_log_modulus;

  uint32_t body_count =
      std::min(mem_ptr->body_count, compression_params.polynomial_size);
-  auto initial_out_len =
+  // num_glwes = 1 in this case
+  auto uncompressed_len =
      compression_params.glwe_dimension * compression_params.polynomial_size +
      body_count;

-  auto compressed_glwe_accumulator_size =
-      (compression_params.glwe_dimension + 1) *
-      compression_params.polynomial_size;
-  auto number_bits_to_unpack = compressed_glwe_accumulator_size * log_modulus;
-  auto nbits = sizeof(Torus) * 8;
+  auto glwe_ciphertext_size = (compression_params.glwe_dimension + 1) *
+                              compression_params.polynomial_size;
+  auto number_bits_to_unpack = uncompressed_len * log_modulus;
  // number_bits_to_unpack.div_ceil(Scalar::BITS)
-  auto input_len = (number_bits_to_unpack + nbits - 1) / nbits;
+  auto compressed_len = (number_bits_to_unpack + NBITS - 1) / NBITS;

  // We ensure the tail of the glwe is zeroed
-  auto zeroed_slice = glwe_array_out + initial_out_len;
+  auto zeroed_slice = glwe_array_out + uncompressed_len;
  cuda_memset_async(zeroed_slice, 0,
                    (compression_params.polynomial_size - body_count) *
                        sizeof(Torus),
                    stream, gpu_index);
+
+  // Kernel settings
  int num_blocks = 0, num_threads = 0;
-  getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
+  getNumBlocksAndThreads(uncompressed_len, 128, num_blocks, num_threads);
  dim3 grid(num_blocks);
  dim3 threads(num_threads);
-  extract<Torus><<<grid, threads, 0, stream>>>(glwe_array_out, array_in,
-                                               glwe_index, log_modulus,
-                                               input_len, initial_out_len);
+  extract<Torus><<<grid, threads, 0, stream>>>(
+      glwe_array_out, array_in, glwe_index, log_modulus, compressed_len,
+      uncompressed_len);
  check_cuda_error(cudaGetLastError());
}
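Because uncompressed_len covers only the mask plus body_count body coefficients, the decompressed GLWE ends with a tail of polynomial_size - body_count words the kernel never writes, which is exactly the span cleared by cuda_memset_async. A short sketch with assumed example parameters to make the layout concrete:

// Hypothetical sizes for illustration
// (glwe_dimension = 1, polynomial_size = 1024, body_count = 600):
uint32_t glwe_dimension = 1, polynomial_size = 1024, body_count = 600;
uint32_t uncompressed_len =
    glwe_dimension * polynomial_size + body_count;       // 1624 words written
uint32_t full_glwe_size = (glwe_dimension + 1) * polynomial_size; // 2048 total
uint32_t zeroed_tail = full_glwe_size - uncompressed_len;         // 424 words
// zeroed_tail == polynomial_size - body_count, the count passed to
// cuda_memset_async above.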