-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathso.71936833.c
112 lines (77 loc) · 2.98 KB
/
so.71936833.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
/*******************************************************************************
*
* stackoverflow.com/questions/71936833/nibble-shuffling-with-x64-simd
* https://godbolt.org/z/qMca4sPbh
*
* Authors: Brett Hale (906839), Peter Cordes (224132)
* SPDX-License-Identifier: CC-BY-SA-4.0 OR CC0-1.0
*
*******************************************************************************/
#include <inttypes.h>
#include <stdio.h>
#include <immintrin.h>
/* Expand the 16 nibbles packed in `x` into the 16 bytes of an XMM register:
 * byte i receives nibble i of `x` (nibble 0 = bits 3:0) in its low 4 bits,
 * with the high 4 bits cleared, i.e. [0:n[15], .., 0:n[0]].
 * Only SSE2 intrinsics are used here. */
static inline __m128i u4x16_to_u8x16 (uint64_t x)
{
#if (1) /* Cordes: SSE2 instructions */

    __m128i v_lo = _mm_cvtsi64_si128((int64_t) x);

    /* bring every byte's high nibble down to bits 3:0; the bits dragged in
     * from the neighbouring byte are removed by the 0x0f mask below: */
    __m128i v_hi = _mm_srli_epi32(v_lo, 4);

    /* interleave: even bytes from v_lo, odd bytes from v_hi: */
    return _mm_and_si128(
        _mm_unpacklo_epi8(v_lo, v_hi), _mm_set1_epi8(0x0f));

#else /* u64 AND + SHIFT */

    const uint64_t split = UINT64_C(0x0f0f0f0f0f0f0f0f);

    __m128i v_lo = _mm_cvtsi64_si128((int64_t) (x & split));
    __m128i v_hi = _mm_cvtsi64_si128((int64_t) ((x & ~(split)) >> 4));

    return _mm_unpacklo_epi8(v_lo, v_hi);

#endif
}

/* Nibble shuffle: `src` and `idx` are each treated as 16 packed 4-bit
 * elements (element 0 = bits 3:0). Element i of the result is the element
 * of `src` selected by element i of `idx`:
 *
 *     dst.n[i] = src.n[idx.n[i]]
 *
 * Uses the SSSE3 instructions pshufb and pmaddubsw. When the translation
 * unit is not compiled with SSSE3 enabled, GCC-compatible compilers refuse
 * to expand those intrinsics in an ordinary function, so the function is
 * given a per-function target attribute in that case. */
#if defined(__GNUC__) && !defined(__SSSE3__)
__attribute__((target("ssse3")))
#endif
uint64_t u4x16_sse_shuffle (uint64_t src, uint64_t idx)
{
    /* u4x16 nibbles to xmm u8x16: [0:n[15], .., 0:n[0]] */

    __m128i v_src = u4x16_to_u8x16(src);
    __m128i v_idx = u4x16_to_u8x16(idx);

    /* the 'nibble' shuffle, using xmm u8x16 elements; every index byte is
     * in [0, 15], so bit 7 is clear and pshufb never zeroes a lane: */

    __m128i v_dst = _mm_shuffle_epi8(v_src, v_idx);

    /* recombine nibbles: [0:n15, 0:n14, .., 0:n1, 0:n0]
     * as: [[127:64 = any], [63:0 = n15:n14, .., n1:n0]] */

#if (1) /* Cordes recombine: pmaddubsw + packuswb */

    /* per 16-bit lane: (even byte * 0x01) + (odd byte * 0x10), at most
     * 15 + 240 = 255, so the unsigned-saturating pack is lossless: */
    const __m128i m_mul = _mm_set1_epi16(0x1001);

    v_dst = _mm_maddubs_epi16(v_dst, m_mul);
    v_dst = _mm_packus_epi16(v_dst, v_dst);

#else /* recombine: SHIFT + OR + SHUFFLE: */

    /* after the OR, every even byte holds n[2k+1]:n[2k]; m_mix gathers the
     * even bytes into the low 64 bits (index bytes of 0xff zero the rest): */
    const __m128i m_mix = _mm_set_epi64x(
        INT64_C(-1), INT64_C(0x0e0c0a0806040200));

    __m128i tmp = _mm_srli_epi64(v_dst, 4);
    v_dst = _mm_shuffle_epi8(_mm_or_si128(v_dst, tmp), m_mix);

#endif

    return ((uint64_t) (_mm_cvtsi128_si64(v_dst)));
}
/* Demo driver: shuffles the nibbles of one test vector by another, first
 * with a plain scalar loop and then with u4x16_sse_shuffle(), and prints
 * both results to stdout.
 *
 * Returns 0 when the two results agree, 1 otherwise. */
int main (void)
{
    /* test vectors (no trailing ';' in the macro bodies, so that they
     * expand to plain expressions): */

#define u64_identity UINT64_C(0xFEDCBA9876543210)
#define u64_inv_gold UINT64_C(0x9E3779B97F4A7C15)
#define u64_rnd_nrpt UINT64_C(0xB74E05C2FD83169A)

    uint64_t src = u64_inv_gold;
    uint64_t idx = u64_rnd_nrpt;

    fprintf(stdout, "src: 0x%016" PRIX64 "\n", src);
    fprintf(stdout, "idx: 0x%016" PRIX64 "\n", idx);

    /* scalar reference: nibble i of the result is the nibble of src
     * selected by nibble i of idx. */
    uint64_t ref = 0;

    for (int i = 0; i < 16; i++)
    {
        uint8_t index = (idx >> (i * 4)) & 0xf;
        ref |= ((src >> (index * 4)) & 0xf) << (i * 4);
    }

    fprintf(stdout, "dst (serial) : 0x%016" PRIX64 "\n", ref);

    uint64_t dst = u4x16_sse_shuffle(src, idx);

    fprintf(stdout, "dst (SSSE3+) : 0x%016" PRIX64 "\n", dst);

    /* report a mismatch between the two implementations to the caller: */
    return ((dst == ref) ? 0 : 1);
}
/******************************************************************************/