From 416e642c3dd1bd892d0e25971f454fe4df51d5dd Mon Sep 17 00:00:00 2001 From: Stephen Akridge Date: Mon, 27 Apr 2020 10:19:04 -0700 Subject: [PATCH] Optimize gpu sigverify --- ci/buildkite.yml | 2 +- src/cuda-ecc-ed25519/common.cu | 15 +++ src/cuda-ecc-ed25519/ed25519.h | 4 + src/cuda-ecc-ed25519/fe.cu | 22 +++ src/cuda-ecc-ed25519/fe.h | 2 + src/cuda-ecc-ed25519/ge.cu | 23 ++++ src/cuda-ecc-ed25519/int128.h | 37 +++++ src/cuda-ecc-ed25519/license.txt | 31 +++++ src/cuda-ecc-ed25519/sc.cu | 223 +++++++++++++++++++++++++++++++ src/cuda-ecc-ed25519/sc.h | 1 + src/cuda-ecc-ed25519/verify.cu | 59 +++++++- src/gpu-common.mk | 3 +- 12 files changed, 418 insertions(+), 4 deletions(-) create mode 100644 src/cuda-ecc-ed25519/int128.h diff --git a/ci/buildkite.yml b/ci/buildkite.yml index 92c7833..9ecf072 100644 --- a/ci/buildkite.yml +++ b/ci/buildkite.yml @@ -1,6 +1,6 @@ steps: - command: "ci/build.sh" name: "build" - timeout_in_minutes: 30 + timeout_in_minutes: 45 agents: - "queue=cuda" diff --git a/src/cuda-ecc-ed25519/common.cu b/src/cuda-ecc-ed25519/common.cu index a238f7c..e98c1dd 100644 --- a/src/cuda-ecc-ed25519/common.cu +++ b/src/cuda-ecc-ed25519/common.cu @@ -23,4 +23,19 @@ static uint64_t __host__ __device__ load_4(const unsigned char *in) { return result; } +static uint64_t __host__ __device__ load_7(const unsigned char *in) { + uint64_t result; + + result = (uint64_t) in[0]; + result |= ((uint64_t) in[1]) << 8; + result |= ((uint64_t) in[2]) << 16; + result |= ((uint64_t) in[3]) << 24; + result |= ((uint64_t) in[4]) << 32; + result |= ((uint64_t) in[5]) << 40; + result |= ((uint64_t) in[6]) << 48; + + return result; +} + + #endif diff --git a/src/cuda-ecc-ed25519/ed25519.h b/src/cuda-ecc-ed25519/ed25519.h index 4615959..0efbda8 100644 --- a/src/cuda-ecc-ed25519/ed25519.h +++ b/src/cuda-ecc-ed25519/ed25519.h @@ -75,6 +75,10 @@ bool ED25519_DECLSPEC ed25519_init(); int cuda_host_register(void* ptr, size_t size, unsigned int flags); int cuda_host_unregister(void* ptr); +int ED25519_DECLSPEC ed25519_get_checked_scalar(unsigned char* out_scalar, const unsigned char* in_scalar); + +int ED25519_DECLSPEC ed25519_check_packed_ge_small_order(const unsigned char* packed_group_element); + #ifdef __cplusplus } #endif diff --git a/src/cuda-ecc-ed25519/fe.cu b/src/cuda-ecc-ed25519/fe.cu index 84748b7..04c3986 100644 --- a/src/cuda-ecc-ed25519/fe.cu +++ b/src/cuda-ecc-ed25519/fe.cu @@ -43,6 +43,28 @@ void __device__ __host__ fe_1(fe h) { } +int __host__ __device__ fe_is_1(fe h) { + if (h[0] != 1) { + return 0; + } + for (int i = 1; i < 9; i++) { + if (h[i] != 0) { + return 0; + } + } + return 1; +} + +int __host__ __device__ fe_is_0(fe h) { + for (int i = 0; i < 9; i++) { + if (h[i] != 0) { + return 0; + } + } + return 1; +} + + /* h = f + g diff --git a/src/cuda-ecc-ed25519/fe.h b/src/cuda-ecc-ed25519/fe.h index 2a33879..40abad5 100644 --- a/src/cuda-ecc-ed25519/fe.h +++ b/src/cuda-ecc-ed25519/fe.h @@ -18,6 +18,8 @@ typedef int32_t fe[10]; void __host__ __device__ fe_0(fe h); void __device__ __host__ fe_1(fe h); +int __device__ __host__ fe_is_0(fe h); +int __device__ __host__ fe_is_1(fe h); void __device__ __host__ fe_frombytes(fe h, const unsigned char *s); void __device__ __host__ fe_tobytes(unsigned char *s, const fe h); diff --git a/src/cuda-ecc-ed25519/ge.cu b/src/cuda-ecc-ed25519/ge.cu index 7284e83..4513dbd 100644 --- a/src/cuda-ecc-ed25519/ge.cu +++ b/src/cuda-ecc-ed25519/ge.cu @@ -180,6 +180,29 @@ int __device__ __host__ ge_frombytes_negate_vartime(ge_p3 *h, const unsigned cha return 0; } +// x = 1, y = 0, z = 0, t = 1 +int __host__ __device__ ge_is_identity(ge_p3* p) { + return (fe_is_0(p->X) && + fe_is_1(p->Y) && + fe_is_1(p->Z) && + fe_is_0(p->T)) ? 1 : 0; +} + +int __host__ __device__ ge_is_small_order(ge_p3* p) { + ge_p1p1 r; + ge_p2 s; + ge_p3 q; + + // calculate q = p * 2*3 + ge_p3_dbl(&r, p); + ge_p1p1_to_p2(&s, &r); + ge_p2_dbl(&r, &s); + ge_p1p1_to_p2(&s, &r); + ge_p2_dbl(&r, &s); + ge_p1p1_to_p3(&q, &r); + + return ge_is_identity(&q); +} /* r = p + q diff --git a/src/cuda-ecc-ed25519/int128.h b/src/cuda-ecc-ed25519/int128.h new file mode 100644 index 0000000..7f23efe --- /dev/null +++ b/src/cuda-ecc-ed25519/int128.h @@ -0,0 +1,37 @@ +#ifndef INT128_H +#define INT128_H + +struct uint128_t { + uint64_t low; + uint64_t high; +}; + +static __device__ __host__ uint128_t mul_128(uint64_t a, uint64_t b) { + uint128_t result; +#ifdef __CUDA_ARCH__ + result.low = a * b; + result.high = __mul64hi(a, b); +#elif __x86_64__ + asm( "mulq %3\n\t" + : "=a" (result.low), "=d" (result.high) + : "%0" (a), "rm" (b)); +#endif + return result; +} + +static __device__ __host__ uint128_t add_128(uint128_t a, uint128_t b) { + uint128_t result; +#ifdef __CUDA_ARCH__ + asm( "add.cc.u64 %0, %2, %4;\n\t" + "addc.u64 %1, %3, %5;\n\t" + : "=l" (result.low), "=l" (result.high) + : "l" (a.low), "l" (a.high), + "l" (b.low), "l" (b.high)); +#else + result.low = a.low + b.low; + result.high = a.high + b.high + (result.low < a.low); +#endif + return result; +} + +#endif diff --git a/src/cuda-ecc-ed25519/license.txt b/src/cuda-ecc-ed25519/license.txt index c1503f9..af69c95 100644 --- a/src/cuda-ecc-ed25519/license.txt +++ b/src/cuda-ecc-ed25519/license.txt @@ -14,3 +14,34 @@ applications, and to alter it and redistribute it freely, subject to the followi being the original software. 3. This notice may not be removed or altered from any source distribution. + +================================ + +Copyright (c) 2017-2019 isis agora lovecruft. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/src/cuda-ecc-ed25519/sc.cu b/src/cuda-ecc-ed25519/sc.cu index 299e163..176df38 100644 --- a/src/cuda-ecc-ed25519/sc.cu +++ b/src/cuda-ecc-ed25519/sc.cu @@ -1,6 +1,228 @@ #include "fixedint.h" #include "sc.h" #include "common.cu" +#include "int128.h" + +#ifdef __CUDA_ARCH__ +#define CONSTANT __constant__ +#else +#define CONSTANT const +#endif + +/// R = R % L where R = 2^260 +const __device__ uint64_t R_scalar_u52[] = { + 0x000f48bd6721e6ed, + 0x0003bab5ac67e45a, + 0x000fffffeb35e51b, + 0x000fffffffffffff, + 0x00000fffffffffff, + }; + +CONSTANT uint64_t L_scalar_u52[] = { + 0x0002631a5cf5d3ed, + 0x000dea2f79cd6581, + 0x000000000014def9, + 0x0000000000000000, + 0x0000100000000000, + }; + +const uint64_t L_FACTOR = 0x51da312547e1b; + +// Unpacked 32-byte scalar with 5x 52-bit limbs +typedef uint64_t scalar32_u52_t[5]; + +// Packed 32-byte scalar +typedef uint8_t scalar32_t[32]; + +void __host__ __device__ scalar52_mul(uint128_t* out, const scalar32_u52_t a, const scalar32_u52_t b) { + out[0] = mul_128(a[0], b[0]); + + out[1] = add_128(mul_128(a[0], b[1]), mul_128(a[1], b[0])); + + out[2] = add_128(mul_128(a[0], b[2]), mul_128(a[1], b[1])); + out[2] = add_128(out[2], mul_128(a[2], b[0])); + + out[3] = add_128(mul_128(a[0], b[3]), mul_128(a[1], b[2])); + out[3] = add_128(out[3], mul_128(a[2], b[1])); + out[3] = add_128(out[3], mul_128(a[3], b[0])); + + out[4] = add_128(mul_128(a[0], b[4]), mul_128(a[1], b[3])); + out[4] = add_128(out[4], mul_128(a[2], b[2])); + out[4] = add_128(out[4], mul_128(a[3], b[1])); + out[4] = add_128(out[4], mul_128(a[4], b[0])); + + out[5] = add_128(mul_128(a[1], b[4]), mul_128(a[2], b[3])); + out[5] = add_128(out[5], mul_128(a[3], b[2])); + out[5] = add_128(out[5], mul_128(a[4], b[1])); + + out[6] = add_128(mul_128(a[2], b[4]), mul_128(a[3], b[3])); + out[6] = add_128(out[5], mul_128(a[4], b[2])); + + out[7] = add_128(mul_128(a[3], b[4]), mul_128(a[4], b[3])); + + out[8] = mul_128(a[4], b[4]); +} + +#define MASK_52 ((UINT64_C(1) << 52) - 1) + +void __host__ __device__ scalar32_unpack(scalar32_u52_t out, scalar32_t in) { + out[0] = MASK_52 & load_7(in); // 0-51 + out[1] = MASK_52 & (load_7(in + 6) >> 4); // 52-103 + out[2] = MASK_52 & load_7(in + 13); // 104-156 + out[3] = MASK_52 & (load_7(in + 19) >> 4); // 156-208 + out[4] = MASK_52 & load_7(in + 26); // 208-256 +} + +void __host__ __device__ scalar32_pack(scalar32_t out, scalar32_u52_t in) { + out[0] = (uint8_t)in[0]; + out[1] = (uint8_t)(in[0] >> 8); + out[2] = (uint8_t)(in[0] >> 16); + out[3] = (uint8_t)(in[0] >> 24); + out[4] = (uint8_t)(in[0] >> 32); + out[5] = (uint8_t)(in[0] >> 40); + + out[6] = (uint8_t)((in[0] >> 48) | (in[1] << 4)); + + out[7] = (uint8_t)(in[1] >> 4); + out[8] = (uint8_t)(in[1] >> 12); + out[9] = (uint8_t)(in[1] >> 20); + out[10] = (uint8_t)(in[1] >> 28); + out[11] = (uint8_t)(in[1] >> 36); + out[12] = (uint8_t)(in[1] >> 44); + + out[13] = (uint8_t)(in[2] >> 0); + out[14] = (uint8_t)(in[2] >> 8); + out[15] = (uint8_t)(in[2] >> 16); + out[16] = (uint8_t)(in[2] >> 24); + out[17] = (uint8_t)(in[2] >> 32); + out[18] = (uint8_t)(in[2] >> 40); + + out[19] = ((uint8_t)(in[2] >> 48) | (uint8_t)(in[ 3] << 4)); + + out[20] = (uint8_t)(in[3] >> 4); + out[21] = (uint8_t)(in[3] >> 12); + out[22] = (uint8_t)(in[3] >> 20); + out[23] = (uint8_t)(in[3] >> 28); + out[24] = (uint8_t)(in[3] >> 36); + out[25] = (uint8_t)(in[3] >> 44); + + out[26] = (uint8_t)(in[4] >> 0); + out[27] = (uint8_t)(in[4] >> 8); + out[28] = (uint8_t)(in[4] >> 16); + out[29] = (uint8_t)(in[4] >> 24); + out[30] = (uint8_t)(in[4] >> 32); + out[31] = (uint8_t)(in[4] >> 40); +} + +uint128_t __host__ __device__ rshift_128(uint128_t val, uint64_t n) { + val.low >>= n; + uint64_t high_to_low = (val.high & MASK_52) << (64 - n); + val.low |= high_to_low; + val.high >>= n; + return val; +} + +void __host__ __device__ part1(uint128_t sum, uint128_t* carry, uint64_t* adjust) { + *adjust = (sum.low * L_FACTOR) & MASK_52; + *carry = rshift_128(add_128(sum, mul_128(*adjust, L_scalar_u52[0])), 52); +} + +void __host__ __device__ part2(uint128_t sum, uint128_t* carry, uint64_t* adjust) { + *adjust = sum.low & MASK_52; + *carry = rshift_128(sum, 52); +} + +// r = a - b +void __host__ __device__ scalar_u52_sub(scalar32_u52_t r, scalar32_u52_t a, const scalar32_u52_t b) { + for (int i = 0; i < 5; i++) { + r[i] = 0; + } + + // a - b + uint64_t borrow = 0; + for (int i = 0; i < 5; i++) { + borrow = a[i] - (b[i] + (borrow >> 63)); + r[i] = borrow & MASK_52; + } + + // conditionally add l if the difference is negative + uint64_t underflow_mask = ((borrow >> 63) ^ 1) - 1; + uint64_t carry = 0; + for (int i = 0; i < 5; i++) { + carry = (carry >> 52) + r[i] + (L_scalar_u52[i] & underflow_mask); + r[i] = carry & MASK_52; + } +} + +void __host__ __device__ montgomery_reduce(scalar32_u52_t s, uint128_t r[9]) { + uint128_t carry; + uint64_t n0; + part1(r[0], &carry, &n0); + + uint64_t n1; + uint128_t sum1 = add_128(mul_128(n0, L_scalar_u52[1]), r[1]); + sum1 = add_128(sum1, carry); + part1(sum1, &carry, &n1); + + uint64_t n2; + uint128_t sum2 = add_128(mul_128(n0, L_scalar_u52[2]), mul_128(n1, L_scalar_u52[1])); + sum2 = add_128(sum2, r[2]); + sum2 = add_128(sum2, carry); + part1(sum2, &carry, &n2); + + uint64_t n3; + uint128_t sum3 = add_128(mul_128(n1, L_scalar_u52[2]), mul_128(n2, L_scalar_u52[1])); + sum3 = add_128(sum3, r[3]); + sum3 = add_128(sum3, carry); + part1(sum3, &carry, &n3); + + uint64_t n4; + uint128_t sum4 = add_128(mul_128(n0, L_scalar_u52[4]), mul_128(n3, L_scalar_u52[1])); + sum4 = add_128(sum4, mul_128(n2, L_scalar_u52[2])); + sum4 = add_128(sum4, r[4]); + sum4 = add_128(sum4, carry); + part1(sum4, &carry, &n4); + + uint128_t sum0; + scalar32_u52_t r_scalar; + + sum0 = add_128(carry, r[5]); + sum0 = add_128(sum0, mul_128(n1, L_scalar_u52[4])); + sum0 = add_128(sum0, mul_128(n3, L_scalar_u52[2])); + sum0 = add_128(sum0, mul_128(n4, L_scalar_u52[1])); + part2(sum0, &carry, &r_scalar[0]); + + sum0 = add_128(carry, r[6]); + sum0 = add_128(sum0, mul_128(n2, L_scalar_u52[4])); + sum0 = add_128(sum0, mul_128(n4, L_scalar_u52[2])); + part2(sum0, &carry, &r_scalar[1]); + + sum0 = add_128(carry, r[7]); + sum0 = add_128(sum0, mul_128(n3, L_scalar_u52[4])); + part2(sum0, &carry, &r_scalar[2]); + + sum0 = add_128(carry, r[8]); + sum0 = add_128(sum0, mul_128(n4, L_scalar_u52[4])); + part2(sum0, &carry, &r_scalar[3]); + + r_scalar[4] = carry.low; + + scalar_u52_sub(s, r_scalar, L_scalar_u52); +} + +void __host__ __device__ scalar32_reduce(unsigned char *s) { + scalar32_u52_t s_u52; + scalar32_unpack(s_u52, s); + + uint128_t s_R[9]; + scalar52_mul(s_R, s_u52, R_scalar_u52); + + scalar32_u52_t s_R_mod_l; + montgomery_reduce(s_R_mod_l, s_R); + + scalar32_pack(s, s_R_mod_l); +} + /* Input: @@ -12,6 +234,7 @@ Output: Overwrites s in place. */ +// 23x 21-bit limbs void __host__ __device__ sc_reduce(unsigned char *s) { int64_t s0 = 2097151 & load_3(s); int64_t s1 = 2097151 & (load_4(s + 2) >> 5); diff --git a/src/cuda-ecc-ed25519/sc.h b/src/cuda-ecc-ed25519/sc.h index 2cd2d1a..bec6c37 100644 --- a/src/cuda-ecc-ed25519/sc.h +++ b/src/cuda-ecc-ed25519/sc.h @@ -6,6 +6,7 @@ The set of scalars is \Z/l where l = 2^252 + 27742317777372353535851937790883648493. */ +void __host__ __device__ scalar32_reduce(unsigned char* s); void __host__ __device__ sc_reduce(unsigned char *s); void __host__ __device__ sc_muladd(unsigned char *s, const unsigned char *a, const unsigned char *b, const unsigned char *c); diff --git a/src/cuda-ecc-ed25519/verify.cu b/src/cuda-ecc-ed25519/verify.cu index 572c013..79b78c4 100644 --- a/src/cuda-ecc-ed25519/verify.cu +++ b/src/cuda-ecc-ed25519/verify.cu @@ -56,6 +56,56 @@ static int __host__ __device__ consttime_equal(const unsigned char *x, const uns return !r; } +// 0 == success +static int __host__ __device__ +get_checked_scalar(unsigned char* scalar, const unsigned char* signature) { + // Check if top 4-bits are clear + // then scalar is reduced. + if ((signature[31] & 0xf0) == 0) { + for (int i = 0; i < 32; i++) { + scalar[i] = signature[i]; + } + return 0; + } + + if ((signature[31] >> 7) != 0) { + return 1; + } + + scalar32_reduce(scalar); + if (!consttime_equal(scalar, signature)) { + return 1; + } + return 0; + +} + +int ed25519_get_checked_scalar(unsigned char* out_scalar, const unsigned char* in_scalar) { + return get_checked_scalar(out_scalar, in_scalar); +} + +// Return 0=success if ge unpacks and is not small order +static int __device__ __host__ +check_packed_ge_small_order(const unsigned char* packed_group_element) { + ge_p3 signature_R; + + // fail if ge does not unpack + if (0 != ge_frombytes_negate_vartime(&signature_R, packed_group_element)) { + return 1; + } + + // fail if ge is small order + if (0 != ge_is_small_order(&signature_R)) { + return 1; + } + + return 0; +} + +int ed25519_check_packed_ge_small_order(const unsigned char* packed_group_element) { + return check_packed_ge_small_order(packed_group_element); +} + static int __device__ __host__ ed25519_verify_device(const unsigned char *signature, const unsigned char *message, @@ -67,11 +117,16 @@ ed25519_verify_device(const unsigned char *signature, ge_p3 A; ge_p2 R; - if (signature[63] & 224) { + // Check that s.reduce() == s + if (0 != get_checked_scalar(checker, signature + 32)) { + return 0; + } + + if (0 != check_packed_ge_small_order(signature)) { return 0; } - if (ge_frombytes_negate_vartime(&A, public_key) != 0) { + if (0 != ge_frombytes_negate_vartime(&A, public_key)) { return 0; } diff --git a/src/gpu-common.mk b/src/gpu-common.mk index 759c124..0dbfca8 100644 --- a/src/gpu-common.mk +++ b/src/gpu-common.mk @@ -3,6 +3,7 @@ GPU_PTX_ARCH:=compute_35 GPU_ARCHS?=sm_37,sm_50,sm_61,sm_70 HOST_CFLAGS:=-Wall -Werror -fPIC -Wno-strict-aliasing GPU_CFLAGS:=--gpu-code=$(GPU_ARCHS),$(GPU_PTX_ARCH) --gpu-architecture=$(GPU_PTX_ARCH) -CFLAGS_release:=-Icommon --ptxas-options=-v $(GPU_CFLAGS) -O3 -Xcompiler "$(HOST_CFLAGS)" +#--ptxas-options=-v +CFLAGS_release:=-Icommon $(GPU_CFLAGS) -O3 -Xcompiler "$(HOST_CFLAGS)" CFLAGS_debug:=$(CFLAGS_release) -g CFLAGS:=$(CFLAGS_$V)