@@ -123,48 +123,26 @@ static void nibble2base_ssse3(uint8_t *nib, char *seq, int len) {
123
123
char * seq_cursor = seq ;
124
124
uint8_t * nibble_cursor = nib ;
125
125
const char * seq_vec_end_ptr = seq_end_ptr - (2 * sizeof (__m128i ) - 1 );
126
- __m128i first_upper_shuffle = _mm_setr_epi8 (
127
- 0 , -1 , 1 , -1 , 2 , -1 , 3 , -1 , 4 , -1 , 5 , -1 , 6 , -1 , 7 , -1 );
128
- __m128i first_lower_shuffle = _mm_setr_epi8 (
129
- -1 , 0 , -1 , 1 , -1 , 2 , -1 , 3 , -1 , 4 , -1 , 5 , -1 , 6 , -1 , 7 );
130
- __m128i second_upper_shuffle = _mm_setr_epi8 (
131
- 8 , -1 , 9 , -1 , 10 , -1 , 11 , -1 , 12 , -1 , 13 , -1 , 14 , -1 , 15 , -1 );
132
- __m128i second_lower_shuffle = _mm_setr_epi8 (
133
- -1 , 8 , -1 , 9 , -1 , 10 , -1 , 11 , -1 , 12 , -1 , 13 , -1 , 14 , -1 , 15 );
134
126
__m128i nuc_lookup_vec = _mm_lddqu_si128 ((__m128i * )seq_nt16_str );
135
- /* Work on 16 encoded characters at the time resulting in 32 decoded characters
136
- Examples are given for 8 encoded characters A until H to keep it readable.
137
- Encoded stored as |AB|CD|EF|GH|
138
- Shuffle into |AB|00|CD|00|EF|00|GH|00| and
139
- |00|AB|00|CD|00|EF|00|GH|
140
- Shift upper to the right resulting into
141
- |0A|B0|0C|D0|0E|F0|0G|H0| and
142
- |00|AB|00|CD|00|EF|00|GH|
143
- Merge with or resulting into (X stands for garbage)
144
- |0A|XB|0C|XD|0E|XF|0G|XH|
145
- Bitwise and with 0b1111 leads to:
146
- |0A|0B|0C|0D|0E|0F|0G|0H|
147
- We can use the resulting 4-bit integers as indexes for the shuffle of
148
- the nucleotide lookup. */
127
+ /* Nucleotides are encoded at 4 bits per nucleotide and stored in 8-bit bytes
128
+ as follows: |AB|CD|EF|GH|. The 4-bit codes (going from 0-15) can be used
129
+ together with the pshufb instruction as a lookup table. The most efficient
130
+ way is to use bitwise AND and shift to create two vectors: one with all
131
+ the upper codes (|A|C|E|G|) and one with the lower codes (|B|D|F|H|).
132
+ The lookup can then be performed and the resulting vectors can be
133
+ interleaved again using the unpack instructions. */
149
134
while (seq_cursor < seq_vec_end_ptr ) {
150
135
__m128i encoded = _mm_lddqu_si128 ((__m128i * )nibble_cursor );
151
-
152
- __m128i first_upper = _mm_shuffle_epi8 ( encoded , first_upper_shuffle );
153
- __m128i first_lower = _mm_shuffle_epi8 (encoded , first_lower_shuffle );
154
- __m128i shifted_first_upper = _mm_srli_epi64 ( first_upper , 4 );
155
- __m128i first_merged = _mm_or_si128 ( shifted_first_upper , first_lower );
156
- __m128i first_indexes = _mm_and_si128 ( first_merged , _mm_set1_epi8 ( 15 ) );
157
- __m128i first_nucleotides = _mm_shuffle_epi8 ( nuc_lookup_vec , first_indexes );
136
+ __m128i encoded_upper = _mm_srli_epi64 ( encoded , 4 );
137
+ encoded_upper = _mm_and_si128 ( encoded_upper , _mm_set1_epi8 ( 15 ) );
138
+ __m128i encoded_lower = _mm_and_si128 (encoded , _mm_set1_epi8 ( 15 ) );
139
+ __m128i nucs_upper = _mm_shuffle_epi8 ( nuc_lookup_vec , encoded_upper );
140
+ __m128i nucs_lower = _mm_shuffle_epi8 ( nuc_lookup_vec , encoded_lower );
141
+ __m128i first_nucleotides = _mm_unpacklo_epi8 ( nucs_upper , nucs_lower );
142
+ __m128i second_nucleotides = _mm_unpackhi_epi8 ( nucs_upper , nucs_lower );
158
143
_mm_storeu_si128 ((__m128i * )seq_cursor , first_nucleotides );
159
-
160
- __m128i second_upper = _mm_shuffle_epi8 (encoded , second_upper_shuffle );
161
- __m128i second_lower = _mm_shuffle_epi8 (encoded , second_lower_shuffle );
162
- __m128i shifted_second_upper = _mm_srli_epi64 (second_upper , 4 );
163
- __m128i second_merged = _mm_or_si128 (shifted_second_upper , second_lower );
164
- __m128i second_indexes = _mm_and_si128 (second_merged , _mm_set1_epi8 (15 ));
165
- __m128i second_nucleotides = _mm_shuffle_epi8 (nuc_lookup_vec , second_indexes );
166
- _mm_storeu_si128 ((__m128i * )(seq_cursor + 16 ), second_nucleotides );
167
-
144
+ _mm_storeu_si128 ((__m128i * )(seq_cursor + sizeof (__m128i )),
145
+ second_nucleotides );
168
146
nibble_cursor += sizeof (__m128i );
169
147
seq_cursor += 2 * sizeof (__m128i );
170
148
}
0 commit comments