Skip to content

Commit e6e42b4

Browse files
committed
Simplify SSSE3 nibble2base function
1 parent b8145e6 commit e6e42b4

File tree

1 file changed

+17
-39
lines changed

1 file changed

+17
-39
lines changed

simd.c

+17-39
Original file line numberDiff line numberDiff line change
@@ -119,52 +119,30 @@ void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_defau
119119

120120
__attribute__((target("ssse3")))
121121
static void nibble2base_ssse3(uint8_t *nib, char *seq, int len) {
122+
__m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)seq_nt16_str);
122123
const char *seq_end_ptr = seq + len;
123124
char *seq_cursor = seq;
124125
uint8_t *nibble_cursor = nib;
125126
const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i) - 1);
126-
__m128i first_upper_shuffle = _mm_setr_epi8(
127-
0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1);
128-
__m128i first_lower_shuffle = _mm_setr_epi8(
129-
-1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7);
130-
__m128i second_upper_shuffle = _mm_setr_epi8(
131-
8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1);
132-
__m128i second_lower_shuffle = _mm_setr_epi8(
133-
-1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15);
134-
__m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)seq_nt16_str);
135-
/* Work on 16 encoded characters at a time, resulting in 32 decoded characters.
136-
Examples are given for 8 encoded characters A until H to keep it readable.
137-
Encoded stored as |AB|CD|EF|GH|
138-
Shuffle into |AB|00|CD|00|EF|00|GH|00| and
139-
|00|AB|00|CD|00|EF|00|GH|
140-
Shift upper to the right resulting into
141-
|0A|B0|0C|D0|0E|F0|0G|H0| and
142-
|00|AB|00|CD|00|EF|00|GH|
143-
Merge with or resulting into (X stands for garbage)
144-
|0A|XB|0C|XD|0E|XF|0G|XH|
145-
Bitwise and with 0b1111 leads to:
146-
|0A|0B|0C|0D|0E|0F|0G|0H|
147-
We can use the resulting 4-bit integers as indexes for the shuffle of
148-
the nucleotide lookup. */
127+
/* Nucleotides are encoded as 4 bits per nucleotide and stored in 8-bit bytes
128+
as follows: |AB|CD|EF|GH|. The 4-bit codes (going from 0-15) can be used
129+
together with the pshufb instruction as a lookup table. The most efficient
130+
way is to use bitwise AND and shift to create two vectors: one with all
131+
the upper codes (|A|C|E|G|) and one with the lower codes (|B|D|F|H|).
132+
The lookup can then be performed and the resulting vectors can be
133+
interleaved again using the unpack instructions. */
149134
while (seq_cursor < seq_vec_end_ptr) {
150135
__m128i encoded = _mm_lddqu_si128((__m128i *)nibble_cursor);
151-
152-
__m128i first_upper = _mm_shuffle_epi8(encoded, first_upper_shuffle);
153-
__m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle);
154-
__m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4);
155-
__m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower);
156-
__m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(15));
157-
__m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes);
136+
__m128i encoded_upper = _mm_srli_epi64(encoded, 4);
137+
encoded_upper = _mm_and_si128(encoded_upper, _mm_set1_epi8(15));
138+
__m128i encoded_lower = _mm_and_si128(encoded, _mm_set1_epi8(15));
139+
__m128i nucs_upper = _mm_shuffle_epi8(nuc_lookup_vec, encoded_upper);
140+
__m128i nucs_lower = _mm_shuffle_epi8(nuc_lookup_vec, encoded_lower);
141+
__m128i first_nucleotides = _mm_unpacklo_epi8(nucs_upper, nucs_lower);
142+
__m128i second_nucleotides = _mm_unpackhi_epi8(nucs_upper, nucs_lower);
158143
_mm_storeu_si128((__m128i *)seq_cursor, first_nucleotides);
159-
160-
__m128i second_upper = _mm_shuffle_epi8(encoded, second_upper_shuffle);
161-
__m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle);
162-
__m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4);
163-
__m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower);
164-
__m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(15));
165-
__m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes);
166-
_mm_storeu_si128((__m128i *)(seq_cursor + 16), second_nucleotides);
167-
144+
_mm_storeu_si128((__m128i *)(seq_cursor + (sizeof(__m128i))),
145+
second_nucleotides);
168146
nibble_cursor += sizeof(__m128i);
169147
seq_cursor += 2 * sizeof(__m128i);
170148
}

0 commit comments

Comments
 (0)