From 416e642c3dd1bd892d0e25971f454fe4df51d5dd Mon Sep 17 00:00:00 2001
From: Stephen Akridge <sakridge@gmail.com>
Date: Mon, 27 Apr 2020 10:19:04 -0700
Subject: [PATCH] Optimize gpu sigverify

---
 ci/buildkite.yml                 |   2 +-
 src/cuda-ecc-ed25519/common.cu   |  15 +++
 src/cuda-ecc-ed25519/ed25519.h   |   4 +
 src/cuda-ecc-ed25519/fe.cu       |  22 +++
 src/cuda-ecc-ed25519/fe.h        |   2 +
 src/cuda-ecc-ed25519/ge.cu       |  23 ++++
 src/cuda-ecc-ed25519/int128.h    |  37 +++++
 src/cuda-ecc-ed25519/license.txt |  31 +++++
 src/cuda-ecc-ed25519/sc.cu       | 223 +++++++++++++++++++++++++++++++
 src/cuda-ecc-ed25519/sc.h        |   1 +
 src/cuda-ecc-ed25519/verify.cu   |  59 +++++++-
 src/gpu-common.mk                |   3 +-
 12 files changed, 418 insertions(+), 4 deletions(-)
 create mode 100644 src/cuda-ecc-ed25519/int128.h

diff --git a/ci/buildkite.yml b/ci/buildkite.yml
index 92c7833..9ecf072 100644
--- a/ci/buildkite.yml
+++ b/ci/buildkite.yml
@@ -1,6 +1,6 @@
 steps:
   - command: "ci/build.sh"
     name: "build"
-    timeout_in_minutes: 30
+    timeout_in_minutes: 45
     agents:
       - "queue=cuda"
diff --git a/src/cuda-ecc-ed25519/common.cu b/src/cuda-ecc-ed25519/common.cu
index a238f7c..e98c1dd 100644
--- a/src/cuda-ecc-ed25519/common.cu
+++ b/src/cuda-ecc-ed25519/common.cu
@@ -23,4 +23,19 @@ static uint64_t __host__ __device__ load_4(const unsigned char *in) {
     return result;
 }
 
+static uint64_t __host__ __device__ load_7(const unsigned char *in) {
+    uint64_t result;
+
+    result = (uint64_t) in[0];
+    result |= ((uint64_t) in[1]) << 8;
+    result |= ((uint64_t) in[2]) << 16;
+    result |= ((uint64_t) in[3]) << 24;
+    result |= ((uint64_t) in[4]) << 32;
+    result |= ((uint64_t) in[5]) << 40;
+    result |= ((uint64_t) in[6]) << 48;
+
+    return result;
+}
+
+
 #endif
diff --git a/src/cuda-ecc-ed25519/ed25519.h b/src/cuda-ecc-ed25519/ed25519.h
index 4615959..0efbda8 100644
--- a/src/cuda-ecc-ed25519/ed25519.h
+++ b/src/cuda-ecc-ed25519/ed25519.h
@@ -75,6 +75,10 @@ bool ED25519_DECLSPEC ed25519_init();
 int cuda_host_register(void* ptr, size_t size, unsigned int flags);
 int cuda_host_unregister(void* ptr);
 
+int ED25519_DECLSPEC ed25519_get_checked_scalar(unsigned char* out_scalar, const unsigned char* in_scalar);
+
+int ED25519_DECLSPEC ed25519_check_packed_ge_small_order(const unsigned char* packed_group_element);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/cuda-ecc-ed25519/fe.cu b/src/cuda-ecc-ed25519/fe.cu
index 84748b7..04c3986 100644
--- a/src/cuda-ecc-ed25519/fe.cu
+++ b/src/cuda-ecc-ed25519/fe.cu
@@ -43,6 +43,28 @@ void __device__ __host__ fe_1(fe h) {
 }
 
 
+int __host__ __device__ fe_is_1(fe h) {
+    if (h[0] != 1) {
+        return 0;
+    }
+    for (int i = 1; i < 9; i++) {
+        if (h[i] != 0) {
+            return 0;
+        }
+    }
+    return 1;
+}
+
+int __host__ __device__ fe_is_0(fe h) {
+    for (int i = 0; i < 9; i++) {
+        if (h[i] != 0) {
+            return 0;
+        }
+    }
+    return 1;
+}
+
+
 
 /*
     h = f + g
diff --git a/src/cuda-ecc-ed25519/fe.h b/src/cuda-ecc-ed25519/fe.h
index 2a33879..40abad5 100644
--- a/src/cuda-ecc-ed25519/fe.h
+++ b/src/cuda-ecc-ed25519/fe.h
@@ -18,6 +18,8 @@ typedef int32_t fe[10];
 
 void __host__ __device__ fe_0(fe h);
 void __device__ __host__ fe_1(fe h);
+int __device__ __host__ fe_is_0(fe h);
+int __device__ __host__ fe_is_1(fe h);
 
 void __device__ __host__ fe_frombytes(fe h, const unsigned char *s);
 void __device__ __host__ fe_tobytes(unsigned char *s, const fe h);
diff --git a/src/cuda-ecc-ed25519/ge.cu b/src/cuda-ecc-ed25519/ge.cu
index 7284e83..4513dbd 100644
--- a/src/cuda-ecc-ed25519/ge.cu
+++ b/src/cuda-ecc-ed25519/ge.cu
@@ -180,6 +180,29 @@ int __device__ __host__ ge_frombytes_negate_vartime(ge_p3 *h, const unsigned cha
     return 0;
 }
 
+// x = 1, y = 0, z = 0, t = 1
+int __host__ __device__ ge_is_identity(ge_p3* p) {
+    return (fe_is_0(p->X) &&
+            fe_is_1(p->Y) &&
+            fe_is_1(p->Z) &&
+            fe_is_0(p->T)) ? 1 : 0;
+}
+
+int __host__ __device__ ge_is_small_order(ge_p3* p) {
+    ge_p1p1 r;
+    ge_p2 s;
+    ge_p3 q;
+
+    // calculate q = p * 2*3
+    ge_p3_dbl(&r, p);
+    ge_p1p1_to_p2(&s, &r);
+    ge_p2_dbl(&r, &s);
+    ge_p1p1_to_p2(&s, &r);
+    ge_p2_dbl(&r, &s);
+    ge_p1p1_to_p3(&q, &r);
+
+    return ge_is_identity(&q);
+}
 
 /*
 r = p + q
diff --git a/src/cuda-ecc-ed25519/int128.h b/src/cuda-ecc-ed25519/int128.h
new file mode 100644
index 0000000..7f23efe
--- /dev/null
+++ b/src/cuda-ecc-ed25519/int128.h
@@ -0,0 +1,37 @@
+#ifndef INT128_H
+#define INT128_H
+
+struct uint128_t {
+  uint64_t low;
+  uint64_t high;
+};
+
+static __device__ __host__ uint128_t mul_128(uint64_t a, uint64_t b) {
+  uint128_t result;
+#ifdef __CUDA_ARCH__
+  result.low = a * b;
+  result.high = __mul64hi(a, b);
+#elif __x86_64__
+  asm( "mulq %3\n\t"
+      : "=a" (result.low), "=d" (result.high)
+      : "%0" (a), "rm" (b));
+#endif
+  return result;
+}
+
+static __device__ __host__ uint128_t add_128(uint128_t a, uint128_t b) {
+  uint128_t result;
+#ifdef __CUDA_ARCH__
+  asm( "add.cc.u64    %0, %2, %4;\n\t"
+       "addc.u64      %1, %3, %5;\n\t"
+       : "=l" (result.low), "=l" (result.high)
+       : "l" (a.low), "l" (a.high),
+       "l" (b.low), "l" (b.high));
+#else
+  result.low = a.low + b.low;
+  result.high = a.high + b.high + (result.low < a.low);
+#endif
+  return result;
+}
+
+#endif
diff --git a/src/cuda-ecc-ed25519/license.txt b/src/cuda-ecc-ed25519/license.txt
index c1503f9..af69c95 100644
--- a/src/cuda-ecc-ed25519/license.txt
+++ b/src/cuda-ecc-ed25519/license.txt
@@ -14,3 +14,34 @@ applications, and to alter it and redistribute it freely, subject to the followi
    being the original software.
 
 3. This notice may not be removed or altered from any source distribution.
+
+================================
+
+Copyright (c) 2017-2019 isis agora lovecruft. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/cuda-ecc-ed25519/sc.cu b/src/cuda-ecc-ed25519/sc.cu
index 299e163..176df38 100644
--- a/src/cuda-ecc-ed25519/sc.cu
+++ b/src/cuda-ecc-ed25519/sc.cu
@@ -1,6 +1,228 @@
 #include "fixedint.h"
 #include "sc.h"
 #include "common.cu"
+#include "int128.h"
+
+#ifdef __CUDA_ARCH__
+#define CONSTANT __constant__
+#else
+#define CONSTANT const
+#endif
+
+/// R = R % L where R = 2^260
+const __device__ uint64_t R_scalar_u52[] = {
+    0x000f48bd6721e6ed,
+    0x0003bab5ac67e45a,
+    0x000fffffeb35e51b,
+    0x000fffffffffffff,
+    0x00000fffffffffff,
+    };
+
+CONSTANT uint64_t L_scalar_u52[] = {
+    0x0002631a5cf5d3ed,
+    0x000dea2f79cd6581,
+    0x000000000014def9,
+    0x0000000000000000,
+    0x0000100000000000,
+    };
+
+const uint64_t L_FACTOR = 0x51da312547e1b;
+
+// Unpacked 32-byte scalar with 5x 52-bit limbs
+typedef uint64_t scalar32_u52_t[5];
+
+// Packed 32-byte scalar
+typedef uint8_t scalar32_t[32];
+
+void __host__ __device__ scalar52_mul(uint128_t* out, const scalar32_u52_t a, const scalar32_u52_t b) {
+    out[0] = mul_128(a[0], b[0]);
+
+    out[1] = add_128(mul_128(a[0], b[1]), mul_128(a[1], b[0]));
+
+    out[2] = add_128(mul_128(a[0], b[2]), mul_128(a[1], b[1]));
+    out[2] = add_128(out[2], mul_128(a[2], b[0]));
+
+    out[3] = add_128(mul_128(a[0], b[3]), mul_128(a[1], b[2]));
+    out[3] = add_128(out[3], mul_128(a[2], b[1]));
+    out[3] = add_128(out[3], mul_128(a[3], b[0]));
+
+    out[4] = add_128(mul_128(a[0], b[4]), mul_128(a[1], b[3]));
+    out[4] = add_128(out[4], mul_128(a[2], b[2]));
+    out[4] = add_128(out[4], mul_128(a[3], b[1]));
+    out[4] = add_128(out[4], mul_128(a[4], b[0]));
+
+    out[5] = add_128(mul_128(a[1], b[4]), mul_128(a[2], b[3]));
+    out[5] = add_128(out[5], mul_128(a[3], b[2]));
+    out[5] = add_128(out[5], mul_128(a[4], b[1]));
+
+    out[6] = add_128(mul_128(a[2], b[4]), mul_128(a[3], b[3]));
+    out[6] = add_128(out[5], mul_128(a[4], b[2]));
+
+    out[7] = add_128(mul_128(a[3], b[4]), mul_128(a[4], b[3]));
+
+    out[8] = mul_128(a[4], b[4]);
+}
+
+#define MASK_52 ((UINT64_C(1) << 52) - 1)
+
+void __host__ __device__ scalar32_unpack(scalar32_u52_t out, scalar32_t in) {
+    out[0] = MASK_52 & load_7(in);             //   0-51
+    out[1] = MASK_52 & (load_7(in + 6) >> 4);  //  52-103
+    out[2] = MASK_52 & load_7(in + 13);        // 104-156
+    out[3] = MASK_52 & (load_7(in + 19) >> 4); // 156-208
+    out[4] = MASK_52 & load_7(in + 26);        // 208-256
+}
+
+void __host__ __device__ scalar32_pack(scalar32_t out, scalar32_u52_t in) {
+    out[0] = (uint8_t)in[0];
+    out[1] = (uint8_t)(in[0] >> 8);
+    out[2] = (uint8_t)(in[0] >> 16);
+    out[3] = (uint8_t)(in[0] >> 24);
+    out[4] = (uint8_t)(in[0] >> 32);
+    out[5] = (uint8_t)(in[0] >> 40);
+
+    out[6] = (uint8_t)((in[0] >> 48) | (in[1] << 4));
+
+    out[7]  = (uint8_t)(in[1] >>  4);
+    out[8]  = (uint8_t)(in[1] >> 12);
+    out[9]  = (uint8_t)(in[1] >> 20);
+    out[10] = (uint8_t)(in[1] >> 28);
+    out[11] = (uint8_t)(in[1] >> 36);
+    out[12] = (uint8_t)(in[1] >> 44);
+
+    out[13] = (uint8_t)(in[2] >>  0);
+    out[14] = (uint8_t)(in[2] >>  8);
+    out[15] = (uint8_t)(in[2] >> 16);
+    out[16] = (uint8_t)(in[2] >> 24);
+    out[17] = (uint8_t)(in[2] >> 32);
+    out[18] = (uint8_t)(in[2] >> 40);
+
+    out[19] = ((uint8_t)(in[2] >> 48) | (uint8_t)(in[ 3] << 4));
+
+    out[20] = (uint8_t)(in[3] >>  4);
+    out[21] = (uint8_t)(in[3] >> 12);
+    out[22] = (uint8_t)(in[3] >> 20);
+    out[23] = (uint8_t)(in[3] >> 28);
+    out[24] = (uint8_t)(in[3] >> 36);
+    out[25] = (uint8_t)(in[3] >> 44);
+
+    out[26] = (uint8_t)(in[4] >>  0);
+    out[27] = (uint8_t)(in[4] >>  8);
+    out[28] = (uint8_t)(in[4] >> 16);
+    out[29] = (uint8_t)(in[4] >> 24);
+    out[30] = (uint8_t)(in[4] >> 32);
+    out[31] = (uint8_t)(in[4] >> 40);
+}
+
+uint128_t __host__ __device__ rshift_128(uint128_t val, uint64_t n) {
+    val.low >>= n;
+    uint64_t high_to_low = (val.high & MASK_52) << (64 - n);
+    val.low |= high_to_low;
+    val.high >>= n;
+    return val;
+}
+
+void __host__ __device__ part1(uint128_t sum, uint128_t* carry, uint64_t* adjust) {
+    *adjust = (sum.low * L_FACTOR) & MASK_52;
+    *carry = rshift_128(add_128(sum, mul_128(*adjust, L_scalar_u52[0])), 52);
+}
+
+void __host__ __device__ part2(uint128_t sum, uint128_t* carry, uint64_t* adjust) {
+    *adjust = sum.low & MASK_52;
+    *carry = rshift_128(sum, 52);
+}
+
+// r = a - b
+void __host__ __device__ scalar_u52_sub(scalar32_u52_t r, scalar32_u52_t a, const scalar32_u52_t b) {
+    for (int i = 0; i < 5; i++) {
+        r[i] = 0;
+    }
+
+    // a - b
+    uint64_t borrow = 0;
+    for (int i = 0; i < 5; i++) {
+        borrow = a[i] - (b[i] + (borrow >> 63));
+        r[i] = borrow & MASK_52;
+    }
+
+    // conditionally add l if the difference is negative
+    uint64_t underflow_mask = ((borrow >> 63) ^ 1) - 1;
+    uint64_t carry = 0;
+    for (int i = 0; i < 5; i++) {
+        carry = (carry >> 52) + r[i] + (L_scalar_u52[i] & underflow_mask);
+        r[i] = carry & MASK_52;
+    }
+}
+
+void __host__ __device__ montgomery_reduce(scalar32_u52_t s, uint128_t r[9]) {
+    uint128_t carry;
+    uint64_t n0;
+    part1(r[0], &carry, &n0);
+
+    uint64_t n1;
+    uint128_t sum1 = add_128(mul_128(n0, L_scalar_u52[1]), r[1]);
+    sum1 = add_128(sum1, carry);
+    part1(sum1, &carry, &n1);
+
+    uint64_t n2;
+    uint128_t sum2 = add_128(mul_128(n0, L_scalar_u52[2]), mul_128(n1, L_scalar_u52[1]));
+    sum2 = add_128(sum2, r[2]);
+    sum2 = add_128(sum2, carry);
+    part1(sum2, &carry, &n2);
+
+    uint64_t n3;
+    uint128_t sum3 = add_128(mul_128(n1, L_scalar_u52[2]), mul_128(n2, L_scalar_u52[1]));
+    sum3 = add_128(sum3, r[3]);
+    sum3 = add_128(sum3, carry);
+    part1(sum3, &carry, &n3);
+
+    uint64_t n4;
+    uint128_t sum4 = add_128(mul_128(n0, L_scalar_u52[4]), mul_128(n3, L_scalar_u52[1]));
+    sum4 = add_128(sum4, mul_128(n2, L_scalar_u52[2]));
+    sum4 = add_128(sum4, r[4]);
+    sum4 = add_128(sum4, carry);
+    part1(sum4, &carry, &n4);
+
+    uint128_t sum0;
+    scalar32_u52_t r_scalar;
+
+    sum0 = add_128(carry, r[5]);
+    sum0 = add_128(sum0, mul_128(n1, L_scalar_u52[4]));
+    sum0 = add_128(sum0, mul_128(n3, L_scalar_u52[2]));
+    sum0 = add_128(sum0, mul_128(n4, L_scalar_u52[1]));
+    part2(sum0, &carry, &r_scalar[0]);
+
+    sum0 = add_128(carry, r[6]);
+    sum0 = add_128(sum0, mul_128(n2, L_scalar_u52[4]));
+    sum0 = add_128(sum0, mul_128(n4, L_scalar_u52[2]));
+    part2(sum0, &carry, &r_scalar[1]);
+
+    sum0 = add_128(carry, r[7]);
+    sum0 = add_128(sum0, mul_128(n3, L_scalar_u52[4]));
+    part2(sum0, &carry, &r_scalar[2]);
+
+    sum0 = add_128(carry, r[8]);
+    sum0 = add_128(sum0, mul_128(n4, L_scalar_u52[4]));
+    part2(sum0, &carry, &r_scalar[3]);
+
+    r_scalar[4] = carry.low;
+
+    scalar_u52_sub(s, r_scalar, L_scalar_u52);
+}
+
+void __host__ __device__ scalar32_reduce(unsigned char *s) {
+    scalar32_u52_t s_u52;
+    scalar32_unpack(s_u52, s);
+
+    uint128_t s_R[9];
+    scalar52_mul(s_R, s_u52, R_scalar_u52);
+
+    scalar32_u52_t s_R_mod_l;
+    montgomery_reduce(s_R_mod_l, s_R);
+
+    scalar32_pack(s, s_R_mod_l);
+}
+
 
 /*
 Input:
@@ -12,6 +234,7 @@ Output:
   Overwrites s in place.
 */
 
+// 23x 21-bit limbs
 void __host__ __device__ sc_reduce(unsigned char *s) {
     int64_t s0 = 2097151 & load_3(s);
     int64_t s1 = 2097151 & (load_4(s + 2) >> 5);
diff --git a/src/cuda-ecc-ed25519/sc.h b/src/cuda-ecc-ed25519/sc.h
index 2cd2d1a..bec6c37 100644
--- a/src/cuda-ecc-ed25519/sc.h
+++ b/src/cuda-ecc-ed25519/sc.h
@@ -6,6 +6,7 @@ The set of scalars is \Z/l
 where l = 2^252 + 27742317777372353535851937790883648493.
 */
 
+void __host__ __device__ scalar32_reduce(unsigned char* s);
 void __host__ __device__ sc_reduce(unsigned char *s);
 void __host__ __device__ sc_muladd(unsigned char *s, const unsigned char *a, const unsigned char *b, const unsigned char *c);
 
diff --git a/src/cuda-ecc-ed25519/verify.cu b/src/cuda-ecc-ed25519/verify.cu
index 572c013..79b78c4 100644
--- a/src/cuda-ecc-ed25519/verify.cu
+++ b/src/cuda-ecc-ed25519/verify.cu
@@ -56,6 +56,56 @@ static int __host__ __device__ consttime_equal(const unsigned char *x, const uns
     return !r;
 }
 
+// 0 == success
+static int __host__ __device__
+get_checked_scalar(unsigned char* scalar, const unsigned char* signature) {
+    // Check if top 4-bits are clear
+    // then scalar is reduced.
+    if ((signature[31] & 0xf0) == 0) {
+        for (int i = 0; i < 32; i++) {
+            scalar[i] = signature[i];
+        }
+        return 0;
+    }
+
+    if ((signature[31] >> 7) != 0) {
+        return 1;
+    }
+
+    scalar32_reduce(scalar);
+    if (!consttime_equal(scalar, signature)) {
+        return 1;
+    }
+    return 0;
+
+}
+
+int ed25519_get_checked_scalar(unsigned char* out_scalar, const unsigned char* in_scalar) {
+    return get_checked_scalar(out_scalar, in_scalar);
+}
+
+// Return 0=success if ge unpacks and is not small order
+static int __device__ __host__
+check_packed_ge_small_order(const unsigned char* packed_group_element) {
+    ge_p3 signature_R;
+
+    // fail if ge does not unpack
+    if (0 != ge_frombytes_negate_vartime(&signature_R, packed_group_element)) {
+        return 1;
+    }
+
+    // fail if ge is small order
+    if (0 != ge_is_small_order(&signature_R)) {
+        return 1;
+    }
+
+    return 0;
+}
+
+int ed25519_check_packed_ge_small_order(const unsigned char* packed_group_element) {
+    return check_packed_ge_small_order(packed_group_element);
+}
+
 static int __device__ __host__
 ed25519_verify_device(const unsigned char *signature,
                       const unsigned char *message,
@@ -67,11 +117,16 @@ ed25519_verify_device(const unsigned char *signature,
     ge_p3 A;
     ge_p2 R;
 
-    if (signature[63] & 224) {
+    // Check that s.reduce() == s
+    if (0 != get_checked_scalar(checker, signature + 32)) {
+        return 0;
+    }
+
+    if (0 != check_packed_ge_small_order(signature)) {
         return 0;
     }
 
-    if (ge_frombytes_negate_vartime(&A, public_key) != 0) {
+    if (0 != ge_frombytes_negate_vartime(&A, public_key)) {
         return 0;
     }
 
diff --git a/src/gpu-common.mk b/src/gpu-common.mk
index 759c124..0dbfca8 100644
--- a/src/gpu-common.mk
+++ b/src/gpu-common.mk
@@ -3,6 +3,7 @@ GPU_PTX_ARCH:=compute_35
 GPU_ARCHS?=sm_37,sm_50,sm_61,sm_70
 HOST_CFLAGS:=-Wall -Werror -fPIC -Wno-strict-aliasing
 GPU_CFLAGS:=--gpu-code=$(GPU_ARCHS),$(GPU_PTX_ARCH) --gpu-architecture=$(GPU_PTX_ARCH)
-CFLAGS_release:=-Icommon --ptxas-options=-v $(GPU_CFLAGS) -O3 -Xcompiler "$(HOST_CFLAGS)"
+#--ptxas-options=-v
+CFLAGS_release:=-Icommon $(GPU_CFLAGS) -O3 -Xcompiler "$(HOST_CFLAGS)"
 CFLAGS_debug:=$(CFLAGS_release) -g
 CFLAGS:=$(CFLAGS_$V)