sha2: Add aarch64 backends (RustCrypto#490)

codahale · web-flow · commit 3fa561ed2eb5 · 2023-06-15T16:13:13.000Z
diff --git a/.github/workflows/sha2.yml b/.github/workflows/sha2.yml
@@ -23,7 +23,7 @@ jobs:
     with:
         # Crate supports MSRV 1.41 without `oid` feature. We test true MSRV
         # in the `test-msrv` job.
-        msrv: 1.57.0
+        msrv: 1.59.0
 
   # Builds for no_std platforms
   build:
@@ -32,7 +32,7 @@ jobs:
     strategy:
       matrix:
         rust:
-          - 1.57
+          - 1.59
           - stable
         target:
           - thumbv7em-none-eabi
diff --git a/sha2/src/sha256/aarch64.rs b/sha2/src/sha256/aarch64.rs
@@ -1,15 +1,159 @@
 //! SHA-256 `aarch64` backend.
 
+// Implementation adapted from mbedtls.
+
 // TODO: stdarch intrinsics: RustCrypto/hashes#257
 
+use core::arch::{aarch64::*, asm};
+
+use crate::consts::K32;
+
 cpufeatures::new!(sha2_hwcap, "sha2");
 
 pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
     // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
     // after stabilization
     if sha2_hwcap::get() {
-        sha2_asm::compress256(state, blocks);
+        unsafe { sha256_compress(state, blocks) }
     } else {
         super::soft::compress(state, blocks);
     }
 }
+
+#[target_feature(enable = "sha2")]
+unsafe fn sha256_compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+    // SAFETY: Requires the sha2 feature.
+
+    // Load state into vectors.
+    let mut abcd = vld1q_u32(state[0..4].as_ptr());
+    let mut efgh = vld1q_u32(state[4..8].as_ptr());
+
+    // Iterate through the message blocks.
+    for block in blocks {
+        // Keep original state values.
+        let abcd_orig = abcd;
+        let efgh_orig = efgh;
+
+        // Load the message block into vectors, assuming little endianness.
+        let mut s0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[0..16].as_ptr())));
+        let mut s1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[16..32].as_ptr())));
+        let mut s2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[32..48].as_ptr())));
+        let mut s3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[48..64].as_ptr())));
+
+        // Rounds 0 to 3
+        let mut tmp = vaddq_u32(s0, vld1q_u32(&K32[0]));
+        let mut abcd_prev = abcd;
+        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+        // Rounds 4 to 7
+        tmp = vaddq_u32(s1, vld1q_u32(&K32[4]));
+        abcd_prev = abcd;
+        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+        // Rounds 8 to 11
+        tmp = vaddq_u32(s2, vld1q_u32(&K32[8]));
+        abcd_prev = abcd;
+        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+        // Rounds 12 to 15
+        tmp = vaddq_u32(s3, vld1q_u32(&K32[12]));
+        abcd_prev = abcd;
+        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+        for t in (16..64).step_by(16) {
+            // Rounds t to t + 3
+            s0 = vsha256su1q_u32(vsha256su0q_u32(s0, s1), s2, s3);
+            tmp = vaddq_u32(s0, vld1q_u32(&K32[t]));
+            abcd_prev = abcd;
+            abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+            efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+            // Rounds t + 4 to t + 7
+            s1 = vsha256su1q_u32(vsha256su0q_u32(s1, s2), s3, s0);
+            tmp = vaddq_u32(s1, vld1q_u32(&K32[t + 4]));
+            abcd_prev = abcd;
+            abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+            efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+            // Rounds t + 8 to t + 11
+            s2 = vsha256su1q_u32(vsha256su0q_u32(s2, s3), s0, s1);
+            tmp = vaddq_u32(s2, vld1q_u32(&K32[t + 8]));
+            abcd_prev = abcd;
+            abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+            efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+            // Rounds t + 12 to t + 15
+            s3 = vsha256su1q_u32(vsha256su0q_u32(s3, s0), s1, s2);
+            tmp = vaddq_u32(s3, vld1q_u32(&K32[t + 12]));
+            abcd_prev = abcd;
+            abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+            efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+        }
+
+        // Add the block-specific state to the original state.
+        abcd = vaddq_u32(abcd, abcd_orig);
+        efgh = vaddq_u32(efgh, efgh_orig);
+    }
+
+    // Store vectors into state.
+    vst1q_u32(state[0..4].as_mut_ptr(), abcd);
+    vst1q_u32(state[4..8].as_mut_ptr(), efgh);
+}
+
+// TODO remove these polyfills once SHA2 intrinsics land
+
+#[inline(always)]
+unsafe fn vsha256hq_u32(
+    mut hash_efgh: uint32x4_t,
+    hash_abcd: uint32x4_t,
+    wk: uint32x4_t,
+) -> uint32x4_t {
+    asm!(
+        "SHA256H {:q}, {:q}, {:v}.4S",
+        inout(vreg) hash_efgh, in(vreg) hash_abcd, in(vreg) wk,
+        options(pure, nomem, nostack, preserves_flags)
+    );
+    hash_efgh
+}
+
+#[inline(always)]
+unsafe fn vsha256h2q_u32(
+    mut hash_efgh: uint32x4_t,
+    hash_abcd: uint32x4_t,
+    wk: uint32x4_t,
+) -> uint32x4_t {
+    asm!(
+        "SHA256H2 {:q}, {:q}, {:v}.4S",
+        inout(vreg) hash_efgh, in(vreg) hash_abcd, in(vreg) wk,
+        options(pure, nomem, nostack, preserves_flags)
+    );
+    hash_efgh
+}
+
+#[inline(always)]
+unsafe fn vsha256su0q_u32(mut w0_3: uint32x4_t, w4_7: uint32x4_t) -> uint32x4_t {
+    asm!(
+        "SHA256SU0 {:v}.4S, {:v}.4S",
+        inout(vreg) w0_3, in(vreg) w4_7,
+        options(pure, nomem, nostack, preserves_flags)
+    );
+    w0_3
+}
+
+#[inline(always)]
+unsafe fn vsha256su1q_u32(
+    mut tw0_3: uint32x4_t,
+    w8_11: uint32x4_t,
+    w12_15: uint32x4_t,
+) -> uint32x4_t {
+    asm!(
+        "SHA256SU1 {:v}.4S, {:v}.4S, {:v}.4S",
+        inout(vreg) tw0_3, in(vreg) w8_11, in(vreg) w12_15,
+        options(pure, nomem, nostack, preserves_flags)
+    );
+    tw0_3
+}
diff --git a/sha2/src/sha512.rs b/sha2/src/sha512.rs
@@ -15,6 +15,10 @@ cfg_if::cfg_if! {
         }
         mod x86;
         use x86::compress;
+    } else if #[cfg(all(feature = "asm", target_arch = "aarch64"))] {
+        mod soft;
+        mod aarch64;
+        use aarch64::compress;
     } else {
         mod soft;
         use soft::compress;
diff --git a/sha2/src/sha512/aarch64.rs b/sha2/src/sha512/aarch64.rs