Skip to content

Commit 41ea68b

Browse files
jmarshall authored and whitwham committed
Read whole lines at once in fai_retrieve()
Because fai_retrieve() is given only well-formatted input containing lines of the same length, it already knows exactly where the base and non-graphic characters are. So in general the interval to be read will look like

    ......ATGCAT    (read last six bases and line terminator)
    ATGCATGCATGC    (read complete line including line terminator)
    ATGCATGCATGC    (read complete line including line terminator)
    ATGC........    (read first four base characters)

and can be read a line at a time instead of a character at a time, with special handling for the partial first and last lines, and discarding the terminator characters at the end of each line read.
1 parent 6012472 commit 41ea68b

File tree

1 file changed

+47
-18
lines changed

1 file changed

+47
-18
lines changed

faidx.c

+47-18
Original file line numberDiff line numberDiff line change
@@ -715,9 +715,8 @@ faidx_t *fai_load_format(const char *fn, enum fai_format_options format) {
715715

716716
static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
717717
uint64_t offset, hts_pos_t beg, hts_pos_t end, hts_pos_t *len) {
718-
char *s;
719-
size_t l;
720-
int c = 0;
718+
char *buffer, *s;
719+
ssize_t nread, remaining, firstline_len, firstline_blen;
721720
int ret;
722721

723722
if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) {
@@ -743,27 +742,57 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
743742
return NULL;
744743
}
745744

746-
l = 0;
747-
s = (char*)malloc((size_t) end - beg + 2);
748-
if (!s) {
745+
// Over-allocate so there is extra space for one end-of-line sequence
746+
buffer = (char*)malloc((size_t) end - beg + val->line_len - val->line_blen + 1);
747+
if (!buffer) {
749748
*len = -1;
750749
return NULL;
751750
}
752751

753-
BGZF *fp = fai->bgzf;
754-
while ( l < end - beg && (c=bgzf_getc(fp))>=0 )
755-
if (isgraph(c)) s[l++] = c;
756-
if (c < 0) {
757-
hts_log_error("Failed to retrieve block: %s",
758-
c == -1 ? "unexpected end of file" : "error reading file");
759-
free(s);
760-
*len = -1;
761-
return NULL;
752+
remaining = *len = end - beg;
753+
firstline_blen = val->line_blen - beg % val->line_blen;
754+
755+
// Special case when the entire interval requested is within a single FASTA/Q line
756+
if (remaining <= firstline_blen) {
757+
nread = bgzf_read_small(fai->bgzf, buffer, remaining);
758+
if (nread < remaining) goto error;
759+
buffer[nread] = '\0';
760+
return buffer;
761+
}
762+
763+
s = buffer;
764+
firstline_len = val->line_len - beg % val->line_blen;
765+
766+
// Read the (partial) first line and its line terminator, but increment s past the
767+
// line contents only, so the terminator characters will be overwritten by the next line.
768+
nread = bgzf_read_small(fai->bgzf, s, firstline_len);
769+
if (nread < firstline_len) goto error;
770+
s += firstline_blen;
771+
remaining -= firstline_blen;
772+
773+
// Similarly read complete lines and their line terminator characters, but overwrite the latter.
774+
while (remaining > val->line_blen) {
775+
nread = bgzf_read_small(fai->bgzf, s, val->line_len);
776+
if (nread < (ssize_t) val->line_len) goto error;
777+
s += val->line_blen;
778+
remaining -= val->line_blen;
762779
}
763780

764-
s[l] = '\0';
765-
*len = l;
766-
return s;
781+
if (remaining > 0) {
782+
nread = bgzf_read_small(fai->bgzf, s, remaining);
783+
if (nread < remaining) goto error;
784+
s += remaining;
785+
}
786+
787+
*s = '\0';
788+
return buffer;
789+
790+
error:
791+
hts_log_error("Failed to retrieve block: %s",
792+
(nread == 0)? "unexpected end of file" : "error reading file");
793+
free(buffer);
794+
*len = -1;
795+
return NULL;
767796
}
768797

769798
static int fai_get_val(const faidx_t *fai, const char *str,

0 commit comments

Comments
 (0)