Skip to content

Commit 3c90481

Browse files
authored
Add support for matching VCF lines by ID (PR #1844)
See samtools/bcftools#1739
1 parent 4c1acb8 commit 3c90481

File tree

3 files changed

+25
-6
lines changed

3 files changed

+25
-6
lines changed

bcf_sr_sort.c

+13-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (C) 2017-2021 Genome Research Ltd.
2+
Copyright (C) 2017-2021,2024 Genome Research Ltd.
33
44
Author: Petr Danecek <[email protected]>
55
@@ -32,6 +32,7 @@
3232
#include "htslib/khash_str2int.h"
3333
#include "htslib/kbitset.h"
3434

35+
// Variant types and pair-wise compatibility of their combinations, see bcf_sr_init_scores()
3536
#define SR_REF 1
3637
#define SR_SNP 2
3738
#define SR_INDEL 4
@@ -366,7 +367,7 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr,
366367
// group VCFs into groups, each with a unique combination of variants in the duplicate lines
367368
int ireader,ivar,irec,igrp,ivset,iact;
368369
for (ireader=0; ireader<readers->nreaders; ireader++) srt->vcf_buf[ireader].nrec = 0;
369-
for (iact=0; iact<srt->nactive; iact++)
370+
for (iact=0; iact<srt->nactive; iact++) // process each of the active readers, ie which still have a record to process
370371
{
371372
ireader = srt->active[iact];
372373
bcf_sr_t *reader = &readers->readers[ireader];
@@ -384,6 +385,11 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr,
384385
srt->off[srt->noff++] = srt->str.l;
385386
size_t beg = srt->str.l;
386387
int end_pos = -1;
388+
if ( srt->pair & BCF_SR_PAIR_ID )
389+
{
390+
kputs(line->d.id,&srt->str);
391+
kputc(':',&srt->str);
392+
}
387393
for (ivar=1; ivar<line->n_allele; ivar++)
388394
{
389395
if ( ivar>1 ) kputc(',',&srt->str);
@@ -417,7 +423,10 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr,
417423
}
418424

419425
// Create new variant or attach to existing one. But careful, there can be duplicate
420-
// records with the same POS,REF,ALT (e.g. in dbSNP-b142)
426+
// records with the same POS,REF,ALT (e.g. in dbSNP-b142). In such case, use a
427+
// hash table (srt->var_str2int) and a counter (var_idx) to ensure they are
428+
// treated as separate variants, while still allowing them to be matched
429+
// between readers.
421430
char *var_str = beg + srt->str.s;
422431
int ret, var_idx = 0, var_end = srt->str.l;
423432
while ( 1 )
@@ -435,6 +444,7 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr,
435444
}
436445
if ( ret==-1 )
437446
{
447+
// the variant is not present, insert
438448
ivar = srt->nvar++;
439449
hts_expand0(var_t,srt->nvar,srt->mvar,srt->var);
440450
srt->var[ivar].nvcf = 0;

bcf_sr_sort.h

+10-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (C) 2017 Genome Research Ltd.
2+
Copyright (C) 2017-2019,2024 Genome Research Ltd.
33
44
Author: Petr Danecek <[email protected]>
55
@@ -55,6 +55,9 @@ typedef struct
5555
}
5656
var_t;
5757

58+
// Group is a set of variants in duplicate records within one VCF. They are identified with a key (used only
59+
// for debugging), such as C>A,C>G;C>T. Commas separate alleles in a multiallelic record, semicolons separate
60+
// VCF lines.
5861
typedef struct
5962
{
6063
char *key; // only for debugging
@@ -67,7 +70,7 @@ typedef struct
6770
{
6871
int nvar, mvar, *var; // list of compatible variants that can be output together
6972
int cnt; // number of readers in this group
70-
kbitset_t *mask; // which groups are populated in this set (replace with expandable bitmask)
73+
kbitset_t *mask; // which groups are populated in this set
7174
}
7275
varset_t;
7376

@@ -100,8 +103,13 @@ sr_sort_t;
100103
sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt);
101104
void bcf_sr_sort_reset(sr_sort_t *srt);
102105
int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t pos);
106+
107+
// initialize a new position using the i-th reader
103108
int bcf_sr_sort_set_active(sr_sort_t *srt, int i);
109+
110+
// add i-th reader with the same position, assumed bcf_sr_sort_set_active() was called with another reader
104111
int bcf_sr_sort_add_active(sr_sort_t *srt, int i);
112+
105113
void bcf_sr_sort_destroy(sr_sort_t *srt);
106114
void bcf_sr_sort_remove_reader(bcf_srs_t *readers, sr_sort_t *srt, int i);
107115

htslib/synced_bcf_reader.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/// @file htslib/synced_bcf_reader.h
22
/// Stream through multiple VCF files.
33
/*
4-
Copyright (C) 2012-2017, 2019-2023 Genome Research Ltd.
4+
Copyright (C) 2012-2017, 2019-2024 Genome Research Ltd.
55
66
Author: Petr Danecek <[email protected]>
77
@@ -89,6 +89,7 @@ extern "C" {
8989
#define BCF_SR_PAIR_SNP_REF (1<<4) // allow REF-only records with SNPs
9090
#define BCF_SR_PAIR_INDEL_REF (1<<5) // allow REF-only records with indels
9191
#define BCF_SR_PAIR_EXACT (1<<6) // require the exact same set of alleles in all files
92+
#define BCF_SR_PAIR_ID (1<<7) // require matching IDs (overlap)
9293
#define BCF_SR_PAIR_BOTH (BCF_SR_PAIR_SNPS|BCF_SR_PAIR_INDELS)
9394
#define BCF_SR_PAIR_BOTH_REF (BCF_SR_PAIR_SNPS|BCF_SR_PAIR_INDELS|BCF_SR_PAIR_SNP_REF|BCF_SR_PAIR_INDEL_REF)
9495

0 commit comments

Comments
 (0)