Skip to content
This repository was archived by the owner on Jun 3, 2021. It is now read-only.

Commit 9ed0641

Browse files
committed
Attempt to make tabix work for VCF and SAM with long references
The normal way of estimating n_lvls breaks down at about 4 Gbases for the default CSI min_shift. This adds a very simple parser to grab any reference lengths from the headers and find the longest. The value is then used to adjust n_lvls if necessary.
1 parent 1606913 commit 9ed0641

File tree

1 file changed

+54
-3
lines changed

1 file changed

+54
-3
lines changed

tbx.c

Lines changed: 54 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv)
9393
intv->ss = line + b; intv->se = line + i;
9494
} else if (id == conf->bc) {
9595
// here ->beg is 0-based.
96-
intv->beg = intv->end = strtol(line + b, &s, 0);
96+
intv->beg = intv->end = strtoll(line + b, &s, 0);
9797
if ( s==line+b ) return -1; // expected int
9898
if (!(conf->preset&TBX_UCSC)) --intv->beg;
9999
else ++intv->end;
@@ -103,7 +103,7 @@ int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv)
103103
if ((conf->preset&0xffff) == TBX_GENERIC) {
104104
if (id == conf->ec)
105105
{
106-
intv->end = strtol(line + b, &s, 0);
106+
intv->end = strtoll(line + b, &s, 0);
107107
if ( s==line+b ) return -1; // expected int
108108
}
109109
} else if ((conf->preset&0xffff) == TBX_SAM) {
@@ -131,7 +131,7 @@ int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv)
131131
s = strstr(line + b, ";END=");
132132
if (s) s += 5;
133133
}
134-
if (s) intv->end = strtol(s, &s, 0);
134+
if (s) intv->end = strtoll(s, &s, 0);
135135
line[i] = c;
136136
}
137137
}
@@ -220,6 +220,44 @@ static int tbx_set_meta(tbx_t *tbx)
220220
return 0;
221221
}
222222

223+
// Minimal effort parser to extract reference length out of VCF header line
224+
// This is used only used to adjust the number of levels if necessary,
225+
// so not a major problem if it doesn't always work.
226+
static void adjust_max_ref_len_vcf(const char *str, int64_t *max_ref_len)
227+
{
228+
const char *ptr;
229+
int64_t len;
230+
if (strncmp(str, "##contig", 8) != 0) return;
231+
ptr = strstr(str + 8, "length");
232+
if (!ptr) return;
233+
for (ptr += 6; *ptr == ' ' || *ptr == '='; ptr++) {}
234+
len = strtoll(ptr, NULL, 10);
235+
if (*max_ref_len < len) *max_ref_len = len;
236+
}
237+
238+
// Same for sam files
239+
static void adjust_max_ref_len_sam(const char *str, int64_t *max_ref_len)
240+
{
241+
const char *ptr;
242+
int64_t len;
243+
if (strncmp(str, "@SQ", 3) != 0) return;
244+
ptr = strstr(str + 3, "\tLN:");
245+
if (!ptr) return;
246+
ptr += 4;
247+
len = strtoll(ptr, NULL, 10);
248+
if (*max_ref_len < len) *max_ref_len = len;
249+
}
250+
251+
// Adjusts number of levels if not big enough. This can happen for
252+
// files with very large contigs.
253+
static int adjust_n_lvls(int min_shift, int n_lvls, int64_t max_len)
254+
{
255+
int64_t s = 1LL << (min_shift + n_lvls * 3);
256+
max_len += 256;
257+
for (; max_len > s; ++n_lvls, s <<= 3) {}
258+
return n_lvls;
259+
}
260+
223261
tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf)
224262
{
225263
tbx_t *tbx;
@@ -228,6 +266,7 @@ tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf)
228266
int64_t lineno = 0;
229267
uint64_t last_off = 0;
230268
tbx_intv_t intv;
269+
int64_t max_ref_len = 0;
231270

232271
str.s = 0; str.l = str.m = 0;
233272
tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
@@ -237,11 +276,23 @@ tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf)
237276
else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_TBI;
238277
while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
239278
++lineno;
279+
if (str.s[0] == tbx->conf.meta_char && fmt == HTS_FMT_CSI) {
280+
switch (tbx->conf.preset) {
281+
case TBX_SAM:
282+
adjust_max_ref_len_sam(str.s, &max_ref_len); break;
283+
case TBX_VCF:
284+
adjust_max_ref_len_vcf(str.s, &max_ref_len); break;
285+
default:
286+
break;
287+
}
288+
}
240289
if (lineno <= tbx->conf.line_skip || str.s[0] == tbx->conf.meta_char) {
241290
last_off = bgzf_tell(fp);
242291
continue;
243292
}
244293
if (first == 0) {
294+
if (fmt == HTS_FMT_CSI)
295+
n_lvls = adjust_n_lvls(min_shift, n_lvls, max_ref_len);
245296
tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls);
246297
if (!tbx->idx) goto fail;
247298
first = 1;

0 commit comments

Comments
 (0)