Skip to content
This repository was archived by the owner on Jun 3, 2021. It is now read-only.

Commit 0f0d091

Browse files
committed
Make synced_bcf_reader work with 64 bit positions.
MAX_CSI_COOR is now about 12 Tbp, which is the limit for a CSI index with min_shift 14.
1 parent 9ed0641 commit 0f0d091

File tree

4 files changed

+34
-25
lines changed

4 files changed

+34
-25
lines changed

Diff for: bcf_sr_sort.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ void debug_vbuf(sr_sort_t *srt)
288288
for (i=0; i<srt->sr->nreaders; i++)
289289
{
290290
vcf_buf_t *buf = &srt->vcf_buf[i];
291-
fprintf(stderr,"\t%d", buf->rec[j] ? buf->rec[j]->pos+1 : 0);
291+
fprintf(stderr,"\t%"PRIhts_pos, buf->rec[j] ? buf->rec[j]->pos+1 : 0);
292292
}
293293
fprintf(stderr,"\n");
294294
}
@@ -330,7 +330,7 @@ int bcf_sr_sort_add_active(sr_sort_t *srt, int idx)
330330
srt->active[srt->nactive - 1] = idx;
331331
return 0; // FIXME: check for errs in this function
332332
}
333-
static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
333+
static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t min_pos)
334334
{
335335
if ( !srt->grp_str2int )
336336
{
@@ -556,7 +556,7 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr,
556556
return 0; // FIXME: check for errs in this function
557557
}
558558

559-
int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
559+
int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t min_pos)
560560
{
561561
int i,j;
562562
assert( srt->nactive>0 );

Diff for: bcf_sr_sort.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -90,15 +90,16 @@ typedef struct
9090
int moff, noff, *off, mcharp;
9191
char **charp;
9292
const char *chr;
93-
int pos, nsr, msr;
93+
hts_pos_t pos;
94+
int nsr, msr;
9495
int pair;
9596
int nactive, mactive, *active; // list of readers with lines at the current pos
9697
}
9798
sr_sort_t;
9899

99100
sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt);
100101
void bcf_sr_sort_reset(sr_sort_t *srt);
101-
int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int pos);
102+
int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t pos);
102103
int bcf_sr_sort_set_active(sr_sort_t *srt, int i);
103104
int bcf_sr_sort_add_active(sr_sort_t *srt, int i);
104105
void bcf_sr_sort_destroy(sr_sort_t *srt);

Diff for: htslib/synced_bcf_reader.h

+5-4
Original file line numberDiff line numberDiff line change
@@ -125,8 +125,9 @@ typedef struct _bcf_sr_regions_t
125125
char **seq_names; // sequence names
126126
int nseqs; // number of sequences (chromosomes) in the file
127127
int iseq; // current position: chr name, index to snames
128-
int start, end; // current position: start, end of the region (0-based)
129-
int prev_seq, prev_start;
128+
hts_pos_t start, end; // current position: start, end of the region (0-based)
129+
int prev_seq;
130+
hts_pos_t prev_start;
130131
}
131132
bcf_sr_regions_t;
132133

@@ -241,7 +242,7 @@ int bcf_sr_next_line(bcf_srs_t *readers);
241242
* @seq: sequence name; NULL to seek to start
242243
* @pos: 0-based coordinate
243244
*/
244-
int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos);
245+
int bcf_sr_seek(bcf_srs_t *readers, const char *seq, hts_pos_t pos);
245246

246247
/**
247248
* bcf_sr_set_samples() - sets active samples
@@ -336,7 +337,7 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg);
336337
* regions and more regions exist; -2 if not in the regions and there are no more
337338
* regions left.
338339
*/
339-
int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end);
340+
int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end);
340341

341342
/*
342343
* bcf_sr_regions_flush() - calls repeatedly regs->missed_reg_handler() until

Diff for: synced_bcf_reader.c

+23-16
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,15 @@ DEALINGS IN THE SOFTWARE. */
3939
#include "htslib/thread_pool.h"
4040
#include "bcf_sr_sort.h"
4141

42-
#define MAX_CSI_COOR 0x7fffffff // maximum indexable coordinate of .csi
42+
// Maximum indexable coordinate of .csi, for default min_shift of 14.
43+
// This comes out to about 17 Tbp. Limiting factor is the bin number,
44+
// which is a uint32_t in CSI. The highest number of levels compatible
45+
// with this is 10 (needs 31 bits).
46+
#define MAX_CSI_COOR ((1LL << (14 + 30)) - 1)
4347

4448
typedef struct
4549
{
46-
uint32_t start, end;
50+
hts_pos_t start, end;
4751
}
4852
region1_t;
4953

@@ -61,7 +65,7 @@ typedef struct
6165
}
6266
aux_t;
6367

64-
static int _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end);
68+
static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end);
6569
static bcf_sr_regions_t *_regions_init_string(const char *str);
6670
static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec);
6771

@@ -420,11 +424,11 @@ static inline int has_filter(bcf_sr_t *reader, bcf1_t *line)
420424
return 0;
421425
}
422426

423-
static int _reader_seek(bcf_sr_t *reader, const char *seq, int start, int end)
427+
static int _reader_seek(bcf_sr_t *reader, const char *seq, hts_pos_t start, hts_pos_t end)
424428
{
425429
if ( end>=MAX_CSI_COOR )
426430
{
427-
hts_log_error("The coordinate is out of csi index limit: %d", end+1);
431+
hts_log_error("The coordinate is out of csi index limit: %"PRIhts_pos, end+1);
428432
exit(1);
429433
}
430434
if ( reader->itr )
@@ -446,7 +450,7 @@ static int _reader_seek(bcf_sr_t *reader, const char *seq, int start, int end)
446450
reader->itr = bcf_itr_queryi(reader->bcf_idx,tid,start,end+1);
447451
}
448452
if (!reader->itr) {
449-
hts_log_error("Could not seek: %s:%d-%d", seq, start + 1, end + 1);
453+
hts_log_error("Could not seek: %s:%"PRIhts_pos"-%"PRIhts_pos, seq, start + 1, end + 1);
450454
assert(0);
451455
}
452456
return 0;
@@ -581,7 +585,8 @@ static void _reader_shift_buffer(bcf_sr_t *reader)
581585

582586
static int next_line(bcf_srs_t *files)
583587
{
584-
int i, min_pos = INT_MAX;
588+
int i;
589+
hts_pos_t min_pos = HTS_POS_MAX;
585590
const char *chr = NULL;
586591

587592
// Loop until next suitable line is found or all readers have finished
@@ -606,7 +611,7 @@ static int next_line(bcf_srs_t *files)
606611
else if ( min_pos==files->readers[i].buffer[1]->pos )
607612
bcf_sr_sort_add_active(&BCF_SR_AUX(files)->sort, i);
608613
}
609-
if ( min_pos==INT_MAX )
614+
if ( min_pos==HTS_POS_MAX )
610615
{
611616
if ( !files->regions ) break;
612617
continue;
@@ -622,7 +627,7 @@ static int next_line(bcf_srs_t *files)
622627
for (i=0; i<files->nreaders; i++)
623628
if ( files->readers[i].nbuffer && files->readers[i].buffer[1]->pos==min_pos )
624629
_reader_shift_buffer(&files->readers[i]);
625-
min_pos = INT_MAX;
630+
min_pos = HTS_POS_MAX;
626631
chr = NULL;
627632
continue;
628633
}
@@ -672,7 +677,7 @@ static void bcf_sr_seek_start(bcf_srs_t *readers)
672677
}
673678

674679

675-
int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos)
680+
int bcf_sr_seek(bcf_srs_t *readers, const char *seq, hts_pos_t pos)
676681
{
677682
if ( !readers->regions ) return 0;
678683
bcf_sr_sort_reset(&BCF_SR_AUX(readers)->sort);
@@ -767,7 +772,7 @@ int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file)
767772

768773
// Add a new region into a list sorted by start,end. On input the coordinates
769774
// are 1-based, stored 0-based, inclusive.
770-
static int _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end)
775+
static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end)
771776
{
772777
if ( start==-1 && end==-1 )
773778
{
@@ -828,7 +833,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str)
828833

829834
kstring_t tmp = {0,0,0};
830835
const char *sp = str, *ep = str;
831-
int from, to;
836+
hts_pos_t from, to;
832837
while ( 1 )
833838
{
834839
while ( *ep && *ep!=',' && *ep!=':' ) ep++;
@@ -880,7 +885,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str)
880885

881886
// ichr,ifrom,ito are 0-based;
882887
// returns -1 on error, 0 if the line is a comment line, 1 on success
883-
static int _regions_parse_line(char *line, int ichr,int ifrom,int ito, char **chr,char **chr_end,int *from,int *to)
888+
static int _regions_parse_line(char *line, int ichr, int ifrom, int ito, char **chr, char **chr_end, hts_pos_t *from, hts_pos_t *to)
884889
{
885890
if (ifrom < 0 || ito < 0) return -1;
886891
*chr_end = NULL;
@@ -970,7 +975,8 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr
970975
while ( hts_getline(reg->file, KS_SEP_LINE, &reg->line) > 0 )
971976
{
972977
char *chr, *chr_end;
973-
int from, to, ret;
978+
hts_pos_t from, to;
979+
int ret;
974980
ret = _regions_parse_line(reg->line.s, ichr,ifrom,abs(ito), &chr,&chr_end,&from,&to);
975981
if ( ret < 0 )
976982
{
@@ -1077,7 +1083,8 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg)
10771083

10781084
// reading from tabix
10791085
char *chr, *chr_end;
1080-
int ichr = 0, ifrom = 1, ito = 2, is_bed = 0, from, to;
1086+
int ichr = 0, ifrom = 1, ito = 2, is_bed = 0;
1087+
hts_pos_t from, to;
10811088
if ( reg->tbx )
10821089
{
10831090
ichr = reg->tbx->conf.sc-1;
@@ -1196,7 +1203,7 @@ static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *re
11961203
return !(type & VCF_INDEL) ? 1 : 0;
11971204
}
11981205

1199-
int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end)
1206+
int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end)
12001207
{
12011208
int iseq;
12021209
if ( khash_str2int_get(reg->seq_hash, seq, &iseq)<0 ) return -1; // no such sequence

0 commit comments

Comments
 (0)