Skip to content

Commit

Permalink
.
Browse files Browse the repository at this point in the history
  • Loading branch information
pdroalves committed Feb 18, 2025
1 parent 7807162 commit 6e79255
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 67 deletions.
1 change: 1 addition & 0 deletions backends/tfhe-cuda-backend/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ endif()
add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})

# Check if the DEBUG flag is defined
set(CMAKE_BUILD_TYPE "Debug")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# Debug mode
message("Compiling in Debug mode")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ template <typename Torus> struct int_decompression {
// Example: in the 2_2 case we are mapping a 2 bits message onto a 4 bits
// space, we want to keep the original 2 bits value in the 4 bits space,
// so we apply the identity and the encoding will rescale it for us.
auto decompression_rescale_f = [encryption_params](Torus x) -> Torus {
auto decompression_rescale_f = [](Torus x) -> Torus {
return x;
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,14 @@
#include "polynomial/functions.cuh"
#include "utils/kernel_dimensions.cuh"

// This kernel follows the naming used in the Rust implementation
template <typename Torus>
__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
uint32_t num_glwes, uint32_t in_len, uint32_t out_len) {
auto tid = threadIdx.x + blockIdx.x * blockDim.x;

if (tid < num_glwes * out_len) {
auto NBITS = sizeof(Torus) * 8;
const auto NBITS = sizeof(Torus) * 8;
auto glwe_index = tid / out_len;
auto i = tid % out_len;
auto chunk_array_in = array_in + glwe_index * in_len;
Expand Down Expand Up @@ -50,7 +51,7 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
PANIC("Cuda error: Input and output must be different");

cuda_set_device(gpu_index);
auto NBITS = sizeof(Torus) * 8;
const auto NBITS = sizeof(Torus) * 8;
auto compression_params = mem_ptr->compression_params;
auto log_modulus = mem_ptr->storage_log_modulus;

Expand Down Expand Up @@ -139,24 +140,26 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
num_radix_blocks, mem_ptr);
}

// This kernel follows the naming used in the Rust implementation,
// except for output_len, which corresponds to initial_len there
template <typename Torus>
__global__ void extract(Torus *glwe_array_out, Torus const *array_in,
uint32_t index, uint32_t log_modulus,
uint32_t input_len, uint32_t initial_out_len) {
auto NBITS = sizeof(Torus) * 8;
__global__ void extract(Torus *glwe_array_out, const Torus *array_in,
const uint32_t index, const uint32_t log_modulus,
const uint32_t input_len, const uint32_t output_len) {
const auto NBITS = sizeof(Torus) * 8;

auto i = threadIdx.x + blockIdx.x * blockDim.x;
auto chunk_array_in = array_in + index * input_len;
if (i < initial_out_len) {
if (i < output_len) {
// Unpack
Torus mask = ((Torus)1 << log_modulus) - 1;
auto start = i * log_modulus;
auto end = (i + 1) * log_modulus;
auto mask = (static_cast<Torus>(1) << log_modulus) - 1;
const auto start = i * log_modulus;
const auto end = (i + 1) * log_modulus;

auto start_block = start / NBITS;
auto start_remainder = start % NBITS;

auto end_block_inclusive = (end - 1) / NBITS;
const auto end_block_inclusive = (end - 1) / NBITS;

Torus unpacked_i;
if (start_block == end_block_inclusive) {
Expand Down Expand Up @@ -185,30 +188,44 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
PANIC("Cuda error: Input and output must be different");

cuda_set_device(gpu_index);
auto NBITS = sizeof(Torus) * 8;
auto compression_params = mem_ptr->compression_params;
auto num_glwes = (mem_ptr->body_count + compression_params.polynomial_size - 1) / compression_params.polynomial_size;
printf("glwe_index: %u / %u\n", glwe_index, num_glwes);
const auto NBITS = sizeof(Torus) * 8;
printf("CUDA NBITS: %u\n", NBITS);
auto log_modulus = mem_ptr->storage_log_modulus;
printf("CUDA log_modulus: %u\n", log_modulus);

auto glwe_ciphertext_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
printf("CUDA glwe_ciphertext_size: %u\n", glwe_ciphertext_size);

auto glwe_mask_size =
compression_params.glwe_dimension * compression_params.polynomial_size;
printf("CUDA glwe_mask_size: %u\n", glwe_mask_size);

uint32_t num_lwes = (glwe_index == num_glwes - 1) ? (mem_ptr->body_count % compression_params.polynomial_size) : compression_params.polynomial_size;
printf("CUDA body_count: %u\n", num_lwes);

uint32_t body_count =
std::min(mem_ptr->body_count, compression_params.polynomial_size);
// num_glwes = 1 in this case
auto uncompressed_len =
compression_params.glwe_dimension * compression_params.polynomial_size +
body_count;
glwe_ciphertext_size;
printf("CUDA uncompressed_len: %u\n", uncompressed_len);

auto glwe_ciphertext_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
auto number_bits_to_unpack = uncompressed_len * log_modulus;
printf("CUDA number_bits_to_unpack: %u\n", number_bits_to_unpack);

// number_bits_to_unpack.div_ceil(Scalar::BITS)
auto compressed_len = (number_bits_to_unpack + NBITS - 1) / NBITS;
printf("CUDA compressed_len: %u\n", compressed_len);

// Ensure the tail of the GLWE is zeroed
auto zeroed_slice = glwe_array_out + uncompressed_len;
cuda_memset_async(zeroed_slice, 0,
(compression_params.polynomial_size - body_count) *
(compression_params.polynomial_size - num_lwes) *
sizeof(Torus),
stream, gpu_index);

// cuda_memset_async(glwe_array_out, 0, glwe_ciphertext_size * sizeof(Torus), stream, gpu_index);
// Kernel settings
int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(uncompressed_len, 128, num_blocks, num_threads);
Expand Down
46 changes: 0 additions & 46 deletions tfhe/src/core_crypto/entities/packed_integers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,56 +105,10 @@ impl<Scalar: UnsignedInteger> PackedIntegers<Scalar> {
let end_block_inclusive = (end - 1) / Scalar::BITS;

if start_block == end_block_inclusive {
// Lowest bits are on the right
//
// Target mapping:
// Scalar::BITS
// |---------------|
//
// packed_coeffs: | start_block+1 | start_block |
// container : | i+1 | i | i-1 |
//
// |-------|
// log_modulus
//
// |---|
// start_remainder
//
// In container[i] we want the bits of packed_coeffs[start_block] starting from
// index start_remainder
//
// container[i] = lowest_bits of single_part
//
let single_part = self.packed_coeffs[start_block] >> start_remainder;

single_part & mask
} else {
// Lowest bits are on the right
//
// Target mapping:
// Scalar::BITS
// |---------------|
//
// packed_coeffs: | start_block+1 | start_block |
// container : | i+1 | i | i-1 |
//
// |-------|
// log_modulus
//
// |-----------|
// start_remainder
//
// |---|
// Scalar::BITS - start_remainder
//
// In the lowest bits of container[i] we want the highest bits of
// packed_coeffs[start_block] starting from index start_remainder
//
// In the next bits, we want the lowest bits of packed_coeffs[start_block + 1]
// left shifted to avoid overlapping
//
// container[i] = lowest_bits of (first_part|second_part)
//
assert_eq!(end_block_inclusive, start_block + 1);

let first_part = self.packed_coeffs[start_block] >> start_remainder;
Expand Down

0 comments on commit 6e79255

Please sign in to comment.