 #include "polynomial/functions.cuh"
 #include "utils/kernel_dimensions.cuh"
 
+// This kernel follows the naming used in the rust implementation
 template <typename Torus>
 __global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
                      uint32_t num_glwes, uint32_t in_len, uint32_t out_len) {
   auto tid = threadIdx.x + blockIdx.x * blockDim.x;
 
   if (tid < num_glwes * out_len) {
-    auto NBITS = sizeof(Torus) * 8;
+    const auto NBITS = sizeof(Torus) * 8;
     auto glwe_index = tid / out_len;
     auto i = tid % out_len;
     auto chunk_array_in = array_in + glwe_index * in_len;
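For reference, the packing that `pack` performs can be modeled on the host with the sketch below. This is an illustrative scalar reference, not code from this PR: `pack_reference` is a hypothetical name, and it assumes `log_modulus < NBITS` and the bit layout that the `extract` kernel below undoes, i.e. element `i` contributes its low `log_modulus` bits at bit offset `i * log_modulus` of the packed stream.

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical host-side reference of the bit-packing, for illustration only
template <typename Torus>
std::vector<Torus> pack_reference(const std::vector<Torus> &in,
                                  uint32_t log_modulus) {
  const auto NBITS = sizeof(Torus) * 8;
  const auto total_bits = in.size() * log_modulus;
  std::vector<Torus> out((total_bits + NBITS - 1) / NBITS, 0);
  const Torus mask = (static_cast<Torus>(1) << log_modulus) - 1;
  for (std::size_t i = 0; i < in.size(); ++i) {
    const auto start = i * log_modulus; // first bit of element i
    const auto block = start / NBITS;   // word holding that bit
    const auto shift = start % NBITS;   // offset inside the word
    out[block] |= (in[i] & mask) << shift;
    if (shift + log_modulus > NBITS) // element straddles two words
      out[block + 1] |= (in[i] & mask) >> (NBITS - shift);
  }
  return out;
}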
@@ -50,7 +51,7 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
     PANIC("Cuda error: Input and output must be different");
 
   cuda_set_device(gpu_index);
-  auto NBITS = sizeof(Torus) * 8;
+  const auto NBITS = sizeof(Torus) * 8;
   auto compression_params = mem_ptr->compression_params;
   auto log_modulus = mem_ptr->storage_log_modulus;
 
@@ -139,24 +140,26 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
       num_radix_blocks, mem_ptr);
 }
 
+// This kernel follows the naming used in the rust implementation
+// except for output_len, which relates to initial_len
 template <typename Torus>
-__global__ void extract(Torus *glwe_array_out, Torus const *array_in,
-                        uint32_t index, uint32_t log_modulus,
-                        uint32_t input_len, uint32_t initial_out_len) {
-  auto NBITS = sizeof(Torus) * 8;
+__global__ void extract(Torus *glwe_array_out, const Torus *array_in,
+                        const uint32_t index, const uint32_t log_modulus,
+                        const uint32_t input_len, const uint32_t output_len) {
+  const auto NBITS = sizeof(Torus) * 8;
 
   auto i = threadIdx.x + blockIdx.x * blockDim.x;
   auto chunk_array_in = array_in + index * input_len;
-  if (i < initial_out_len) {
+  if (i < output_len) {
     // Unpack
-    Torus mask = ((Torus)1 << log_modulus) - 1;
-    auto start = i * log_modulus;
-    auto end = (i + 1) * log_modulus;
+    auto mask = (static_cast<Torus>(1) << log_modulus) - 1;
+    const auto start = i * log_modulus;
+    const auto end = (i + 1) * log_modulus;
 
     auto start_block = start / NBITS;
     auto start_remainder = start % NBITS;
 
-    auto end_block_inclusive = (end - 1) / NBITS;
+    const auto end_block_inclusive = (end - 1) / NBITS;
 
     Torus unpacked_i;
     if (start_block == end_block_inclusive) {
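As a concrete trace of the index arithmetic above (illustrative values, not taken from the PR): with `Torus = uint64_t`, so `NBITS = 64`, and `log_modulus = 12`, element `i = 10` occupies bits [120, 132) of the packed stream, so `start_block = 1`, `start_remainder = 56` and `end_block_inclusive = 131 / 64 = 2`. Since `start_block != end_block_inclusive`, the element straddles two words. The straddling branch is outside this hunk, so the expression below is an assumption consistent with the visible index computation:

// Hypothetical reconstruction of the two-word branch, for illustration only:
// 8 bits come from the top of word 1, 4 bits from the bottom of word 2
Torus unpacked_i =
    (chunk_array_in[start_block] >> start_remainder) |
    (chunk_array_in[start_block + 1] << (NBITS - start_remainder));
unpacked_i &= mask; // keep only the 12 packed bits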
@@ -185,30 +188,52 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
     PANIC("Cuda error: Input and output must be different");
 
   cuda_set_device(gpu_index);
-  auto NBITS = sizeof(Torus) * 8;
   auto compression_params = mem_ptr->compression_params;
+  auto num_glwes =
+      (mem_ptr->body_count + compression_params.polynomial_size - 1) /
+      compression_params.polynomial_size;
+  printf("glwe_index: %u / %u\n", glwe_index, num_glwes);
+  const auto NBITS = sizeof(Torus) * 8;
+  printf("CUDA NBITS: %zu\n", NBITS);
   auto log_modulus = mem_ptr->storage_log_modulus;
+  printf("CUDA log_modulus: %u\n", log_modulus);
+
+  auto glwe_ciphertext_size = (compression_params.glwe_dimension + 1) *
+                              compression_params.polynomial_size;
+  printf("CUDA glwe_ciphertext_size: %u\n", glwe_ciphertext_size);
+
+  auto glwe_mask_size =
+      compression_params.glwe_dimension * compression_params.polynomial_size;
+  printf("CUDA glwe_mask_size: %u\n", glwe_mask_size);
+
+  // The last GLWE may be partially filled; guard the exact-multiple case so
+  // the modulo does not yield 0 lwes for a full last GLWE
+  const auto remainder =
+      mem_ptr->body_count % compression_params.polynomial_size;
+  uint32_t num_lwes = (glwe_index == num_glwes - 1 && remainder != 0)
+                          ? remainder
+                          : compression_params.polynomial_size;
+  printf("CUDA num_lwes: %u\n", num_lwes);
 
-  uint32_t body_count =
-      std::min(mem_ptr->body_count, compression_params.polynomial_size);
   // num_glwes = 1 in this case
   auto uncompressed_len =
-      compression_params.glwe_dimension * compression_params.polynomial_size +
-      body_count;
+      glwe_ciphertext_size;
+  printf("CUDA uncompressed_len: %u\n", uncompressed_len);
 
-  auto glwe_ciphertext_size = (compression_params.glwe_dimension + 1) *
-                              compression_params.polynomial_size;
   auto number_bits_to_unpack = uncompressed_len * log_modulus;
+  printf("CUDA number_bits_to_unpack: %u\n", number_bits_to_unpack);
+
   // number_bits_to_unpack.div_ceil(Scalar::BITS)
   auto compressed_len = (number_bits_to_unpack + NBITS - 1) / NBITS;
+  printf("CUDA compressed_len: %zu\n", compressed_len);
 
-  // We assure the tail of the glwe is zeroed
+  // We ensure the tail of the glwe is zeroed
   auto zeroed_slice = glwe_array_out + uncompressed_len;
   cuda_memset_async(zeroed_slice, 0,
-                    (compression_params.polynomial_size - body_count) *
+                    (compression_params.polynomial_size - num_lwes) *
                         sizeof(Torus),
                     stream, gpu_index);
-
+  // cuda_memset_async(glwe_array_out, 0, glwe_ciphertext_size * sizeof(Torus), stream, gpu_index);
   // Kernel settings
   int num_blocks = 0, num_threads = 0;
   getNumBlocksAndThreads(uncompressed_len, 128, num_blocks, num_threads);
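A quick sanity check on the sizing arithmetic: `(number_bits_to_unpack + NBITS - 1) / NBITS` is the integer ceiling division the `div_ceil` comment refers to. A minimal standalone check, with a hypothetical helper and illustrative parameters (glwe_dimension = 1, polynomial_size = 2048, log_modulus = 12, 64-bit Torus, so uncompressed_len = (1 + 1) * 2048 = 4096 and number_bits_to_unpack = 4096 * 12 = 49152):

#include <cstdint>

// Illustrative mirror of Rust's div_ceil, not part of this codebase
constexpr std::uint64_t div_ceil(std::uint64_t n, std::uint64_t d) {
  return (n + d - 1) / d;
}

static_assert(div_ceil(49152, 64) == 768, "exact multiple: no rounding needed");
static_assert(div_ceil(49153, 64) == 769, "one extra bit costs a full word");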