From 329e7943b7ba3f0af15b0eaa00a367a1ac15bd83 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 7 Jan 2025 12:52:43 +0000 Subject: [PATCH] Allow BYTE_ARRAY_STOP to work on non-zero STOP code with TOK3. Our htscodecs name tokeniser decoder always adds nul bytes between names. This happens to match the default STOP byte used in htslib's CRAM implementation, but there's nothing to say it has to be 0 and indeed Java uses 9 (tab). This is an oversight and ideally we'd change the name tokeniser decode function to take an additional parameter to specify the stop byte, but that's changing the API. Easiest is just to recognise this on-the-fly and correct the error by looking for a different stop byte. Also fixed cram_uncompress_block setting of b->orig_method. This was only correct when the original prototype definitions of RANS_PR0 were in use, and with the RANSPR official numbering the calculation caused RLE+O1 to be mislabelled as TOK3. This field isn't used in anything else anyway during decode (but has some diagnostic usage during encode). The official API is via cram_block_get_method and cram_expand_method. --- cram/cram_codecs.c | 5 ++++- cram/cram_io.c | 6 ++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index a72419e1c..fe454f794 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -3613,7 +3613,10 @@ int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c, cp = b->data + b->idx; cp_end = b->data + b->uncomp_size; - stop = c->u.byte_array_stop.stop; + // STOP byte is hard-coded as zero by our name tokeniser decoder + // implementation, so we may ignore what was requested. + stop = b->orig_method == TOK3 ? 
0 : c->u.byte_array_stop.stop; + if (cp_end - cp < out->alloc - out->byte) { unsigned char *out_cp = BLOCK_END(out); while (cp != cp_end && *cp != stop) diff --git a/cram/cram_io.c b/cram/cram_io.c index 35dbb99c8..5e5891114 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1698,8 +1698,7 @@ int cram_uncompress_block(cram_block *b) { free(uncomp); return -1; } - b->orig_method = RANS_PR0 + (b->data[0]&1) - + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0); + b->orig_method = RANSPR; free(b->data); b->data = (unsigned char *)uncomp; b->alloc = usize2; @@ -1718,8 +1717,7 @@ int cram_uncompress_block(cram_block *b) { free(uncomp); return -1; } - b->orig_method = ARITH_PR0 + (b->data[0]&1) - + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0); + b->orig_method = ARITH; free(b->data); b->data = (unsigned char *)uncomp; b->alloc = usize2;