Skip to content

Commit 9de8e4f

Browse files
committed
Add API functions to parse and format SEQ and QUAL fields
1 parent 2d2d5aa commit 9de8e4f

File tree

2 files changed

+114
-0
lines changed

2 files changed

+114
-0
lines changed

htslib/sam.h

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1143,6 +1143,75 @@ ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *
11431143
HTSLIB_EXPORT
11441144
ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b);
11451145

1146+
/// Unpack nibble-encoded sequence into a char buffer
1147+
/** @param dest Destination character buffer
1148+
@param seq Pointer to sequence bases packed as per bam1_t
1149+
@param seqlen Number of bases at @p seq to be unpacked
1150+
1151+
Unpacks nucleotides encoded in the (seqlen+1)/2 bytes at @p seq.
1152+
Writes exactly @p seqlen characters to @p dest. If NUL-termination
1153+
is desired, the calling code should add '\0' appropriately itself.
1154+
*/
1155+
HTSLIB_EXPORT
1156+
void sam_format_seq(char *dest, const uint8_t *seq, size_t seqlen);
1157+
1158+
/// Unpack BAM record's sequence field into a char buffer
1159+
/** @param dest Destination character buffer
1160+
@param b Source alignment record
1161+
*/
1162+
static inline void bam_format_seq(char *dest, const bam1_t *b) {
1163+
sam_format_seq(dest, bam_get_seq(b), b->core.l_qseq);
1164+
}
1165+
1166+
/// Decode base qualities into a char buffer
1167+
/** @param dest Destination character buffer
1168+
@param qual Pointer to base qualities encoded as per bam1_t
1169+
@param quallen Number of base qualities at @p seq to be decoded
1170+
1171+
Decodes base qualities encoded in the @p quallen bytes at @p seq.
1172+
Writes exactly @p quallen characters to @p dest. If the base qualities
1173+
are missing (encoded as '\xff'), writes '*' followed by quallen-1 NULs.
1174+
Otherwise if NUL-termination is desired, the calling code should add
1175+
'\0' appropriately itself.
1176+
*/
1177+
HTSLIB_EXPORT
1178+
void sam_format_qual(char *dest, const uint8_t *qual, size_t quallen);
1179+
1180+
/// Decode BAM record's base qualities field into a char buffer
1181+
/** @param dest Destination character buffer
1182+
@param b Source alignment record
1183+
*/
1184+
static inline void bam_format_qual(char *dest, const bam1_t *b) {
1185+
sam_format_qual(dest, bam_get_qual(b), b->core.l_qseq);
1186+
}
1187+
1188+
/// Pack ASCII sequence string into nibble-encoded sequence buffer
1189+
/** @param seq Destination sequence base buffer to be packed as per bam1_t
1190+
@param src Pointer to ASCII-encoded base characters
1191+
@param len Number of base characters at @p src to be packed
1192+
@return The number of bytes written to @p seq, or negative on error
1193+
1194+
Packs @p len ASCII base characters into a nibble-encoded sequence buffer.
1195+
If the text at @p src is "*\0", writes 0 bytes to @p seq; otherwise writes
1196+
exactly floor((len+1)/2) bytes to @p seq.
1197+
*/
1198+
HTSLIB_EXPORT
1199+
int sam_parse_seq(uint8_t *seq, const char *src, size_t len);
1200+
1201+
/// Encode ASCII base qualities into uint8_t buffer
1202+
/** @param qual Destination base quality buffer to be encoded as per bam1_t
1203+
@param src Pointer to ASCII-encoded base quality characters
1204+
@param len Number of base qualities at @p src to be encoded
1205+
@return The number of bytes written to @p qual, or negative on error
1206+
1207+
Encodes @p len ASCII base qualities into a quality buffer encoded as per bam1_t.
1208+
If the text at @p src is "*\0" (i.e., if len==1 and src is '*', or len>1 and src
1209+
is '*' followed by a NUL), encodes as @p len missing ('\xff') qualities. Always
1210+
writes exactly @p len bytes to @p qual.
1211+
*/
1212+
HTSLIB_EXPORT
1213+
int sam_parse_qual(uint8_t *qual, const char *src, size_t len);
1214+
11461215
/*************************
11471216
*** BAM/CRAM indexing ***
11481217
*************************/

sam.c

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4400,6 +4400,51 @@ int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
44004400
return sam_format1_append(h, b, str);
44014401
}
44024402

4403+
void sam_format_seq(char *dest, const uint8_t *seq, size_t seqlen)
4404+
{
4405+
nibble2base(seq, dest, seqlen);
4406+
}
4407+
4408+
void sam_format_qual(char *dest, const uint8_t *qual, size_t quallen)
4409+
{
4410+
if (qual[0] != 0xff) {
4411+
add33((uint8_t *) dest, qual, quallen);
4412+
}
4413+
else {
4414+
memset(dest, '\0', quallen);
4415+
dest[0] = '*';
4416+
}
4417+
}
4418+
4419+
int sam_parse_seq(uint8_t *seq, const char *src, size_t len)
4420+
{
4421+
if (src[0] == '*' && (len == 1 || src[1] == '\0')) return 0;
4422+
4423+
const uint8_t *usrc = (const uint8_t *) src;
4424+
uint8_t *seq0 = seq;
4425+
size_t i, even_len = len & ~1;
4426+
for (i = 0; i < even_len; i += 2)
4427+
*seq++ = (seq_nt16_table[usrc[i]] << 4) | seq_nt16_table[usrc[i + 1]];
4428+
for (; i < len; i++)
4429+
*seq++ = seq_nt16_table[usrc[i]] << 4;
4430+
4431+
return seq - seq0;
4432+
}
4433+
4434+
int sam_parse_qual(uint8_t *qual, const char *src, size_t len)
4435+
{
4436+
if (src[0] == '*' && (len == 1 || src[1] == '\0')) {
4437+
memset(qual, 0xff, len);
4438+
}
4439+
else {
4440+
int failed = 0;
4441+
COPY_MINUS_N(qual, src, 33, len, failed);
4442+
if (failed) { errno = EINVAL; return -1; }
4443+
}
4444+
4445+
return len;
4446+
}
4447+
44034448
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end);
44044449
int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str)
44054450
{

0 commit comments

Comments
 (0)