Skip to content

Commit 5177190

Browse files
committed
update to work with vcf 4.4 prefixed phasing info
1 parent a662866 commit 5177190

File tree

5 files changed

+203
-4
lines changed

5 files changed

+203
-4
lines changed

htslib/vcf.h

+108
Original file line numberDiff line numberDiff line change
@@ -1501,6 +1501,34 @@ static inline int bcf_float_is_vector_end(float f)
15011501
return u.i==bcf_float_vector_end ? 1 : 0;
15021502
}
15031503

1504+
typedef enum bcf_version {v41 = 1, v42, v43, v44} bcf_version;
1505+
/**
1506+
* bcf_get_version - get the version as bcf_version enumeration
1507+
* @param hdr - bcf header, to get version
1508+
* @param ipver - pointer to return version
1509+
* Returns 0 on success and -1 on failure
1510+
*/
1511+
static inline int bcf_get_version(const bcf_hdr_t *hdr, bcf_version *ver)
1512+
{
1513+
const char *version = NULL;
1514+
1515+
if (!hdr || !ver) {
1516+
return -1;
1517+
}
1518+
1519+
version = bcf_hdr_get_version(hdr);
1520+
if (!strcmp("VCFv4.1", version)) {
1521+
*ver = v41;
1522+
} else if (!strcmp("VCFv4.2", version)) {
1523+
*ver = v42;
1524+
} else if (!strcmp("VCFv4.3", version)) {
1525+
*ver = v43;
1526+
} else {
1527+
*ver = v44;
1528+
}
1529+
return 0;
1530+
}
1531+
15041532
static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
15051533
{
15061534
uint32_t e = 0;
@@ -1528,6 +1556,86 @@ static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
15281556
return e == 0 ? 0 : -1;
15291557
}
15301558

1559+
/**
1560+
* bcf_format_gt1 - formats GT information on a string
1561+
* @param hdr - bcf header, to get version
1562+
* @param fmt - pointer to bcf format data
1563+
* @param isample - position of interested sample in data
1564+
* @param str - pointer to output string
1565+
* Returns 0 on success and -1 on failure
1566+
* This method is extended from bcf_format_gt to output phasing information
1567+
* in accordance with v4.4 format, which supports explicit / prefixed phasing
1568+
* for 1st allele.
1569+
* Explicit / prefixed phasing for 1st allele is used only when it is a must to
1570+
* correctly express phasing.
1571+
*/
1572+
static inline int bcf_format_gt1(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, kstring_t *str)
1573+
{
1574+
uint32_t e = 0;
1575+
bcf_version ver = v42;
1576+
int ploidy = 1, anyunphased = 0;
1577+
int32_t val0 = 0;
1578+
kstring_t tmp1 = KS_INITIALIZE, tmp2 = KS_INITIALIZE;
1579+
1580+
if (bcf_get_version(hdr, &ver)) {
1581+
hts_log_error("Failed to get version information");
1582+
return -1;
1583+
}
1584+
#define BRANCH(type_t, convert, missing, vector_end) { \
1585+
uint8_t *ptr = fmt->p + isample*fmt->size; \
1586+
int i; \
1587+
for (i=0; i<fmt->n; i++, ptr += sizeof(type_t)) \
1588+
{ \
1589+
type_t val = convert(ptr); \
1590+
if ( val == vector_end ) break; \
1591+
if (!i) { val0 = val; } \
1592+
if (i) { \
1593+
e |= kputc("/|"[val & 1], &tmp1) < 0; \
1594+
anyunphased |= !(val & 1); \
1595+
} \
1596+
if (!(val >> 1)) e |= kputc('.', &tmp1) < 0; \
1597+
else e |= kputw((val >> 1) - 1, &tmp1) < 0; \
1598+
} \
1599+
if (i == 0) e |= kputc('.', &tmp1) < 0; \
1600+
ploidy = i; \
1601+
}
1602+
switch (fmt->type) {
1603+
case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, bcf_int8_missing, bcf_int8_vector_end); break;
1604+
case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, bcf_int16_vector_end); break;
1605+
case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, bcf_int32_vector_end); break;
1606+
case BCF_BT_NULL: e |= kputc('.', &tmp1) < 0; break;
1607+
default: hts_log_error("Unexpected type %d", fmt->type); return -2;
1608+
}
1609+
#undef BRANCH
1610+
1611+
if (ver >= v44) { //output which supports prefixed phasing
1612+
/* update 1st allele's phasing if required and append rest to it.
1613+
use prefixed phasing only when it is a must. i.e. without which the
1614+
inferred value will be incorrect */
1615+
if (val0 & 1) {
1616+
/* 1st one is phased, if ploidy is > 1 and an unphased allele exists
1617+
need to specify explicitly */
1618+
e |= (ploidy > 1 && anyunphased) ?
1619+
(kputc('|', &tmp2) < 0) :
1620+
0;
1621+
} else {
1622+
/* 1st allele is unphased, if ploidy is = 1 or allele is '.' or
1623+
ploidy > 1 and no other unphased allele exist, need to specify
1624+
explicitly */
1625+
e |= ((ploidy <= 1) || (ploidy > 1 && !anyunphased)) ?
1626+
(kputc('/', &tmp2) < 0) :
1627+
0;
1628+
}
1629+
e |= kputsn(tmp1.s, tmp1.l, &tmp2) < 0; //append rest with updated one
1630+
ks_free(&tmp1);
1631+
tmp1 = tmp2;
1632+
}
1633+
//updated v44 string or <v44 without any update
1634+
e |= kputsn(tmp1.s, tmp1.l, str) < 0;
1635+
ks_free(&tmp1);
1636+
return e == 0 ? 0 : -1;
1637+
}
1638+
15311639
static inline int bcf_enc_size(kstring_t *s, int size, int type)
15321640
{
15331641
// Most common case is first

test/test.pl

+10
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
run_test('test_bcf2vcf',$opts);
5454
run_test('test_vcf_sweep',$opts,out=>'test-vcf-sweep.out');
5555
run_test('test_vcf_various',$opts);
56+
run_test('test_vcf_44', $opts);
5657
run_test('test_bcf_sr_sort',$opts);
5758
run_test('test_bcf_sr_no_index',$opts);
5859
run_test('test_bcf_sr_range', $opts);
@@ -1159,6 +1160,15 @@ sub test_vcf_various
11591160
cmd => "$$opts{path}/test_view $$opts{path}/modhdr.vcf.gz chr22:1-2");
11601161
}
11611162

1163+
# Test of VCF v4.4 phasing: hands an "htsfile -c" command on vcf44_1.vcf to
# test_cmd, with vcf44_1.expected given as its 'out' argument.
sub test_vcf_44
{
    my ($opts, %args) = @_;

    # vcf4.4 with implicit and explicit phasing info combinations
    my $expected = "vcf44_1.expected";
    my $command  = "$$opts{bin}/htsfile -c $$opts{path}/vcf44_1.vcf";

    test_cmd($opts, %args, out => $expected, cmd => $command);
}
1171+
11621172
sub write_multiblock_bgzf {
11631173
my ($name, $frags) = @_;
11641174

test/vcf44_1.expected

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
##fileformat=VCFv4.4
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##contig=<ID=1,length=1000>
4+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
5+
##failue="test file on explicit and implicit phasing markers in 4.4"
6+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097
7+
1 61462 rs56992750 T A 100 PASS . GT 0|0|1 0/1
8+
1 61480 rs56992751 T A 100 PASS . GT 0 /0|1
9+
1 61481 rs56992752 T A 100 PASS . GT /0 |0/1
10+
1 61482 rs56992752 T A 100 PASS . GT /0 /1
11+
1 61483 rs56992752 T A 100 PASS . GT 0 1
12+
1 61484 rs56992752 T A 100 PASS . GT 0 /1
13+
1 61485 rs56992752 T A 100 PASS . GT 0 1
14+
1 61486 rs56992752 T A 100 PASS . GT 0 1
15+
1 61487 rs56992752 T A 100 PASS . GT 0 1
16+
1 61488 rs56992752 T A 100 PASS . GT 0 /1
17+
1 61489 rs56992752 T A 100 PASS . GT /0 1
18+
1 61490 rs56992752 T A 100 PASS . GT /0 1
19+
1 61491 rs56992752 T A 100 PASS . GT /0 /1
20+
1 61492 rs56992752 T A 100 PASS . GT /0|0 1/0
21+
1 61493 rs56992752 T A 100 PASS . GT 0|0 |1/0
22+
1 61494 rs56992752 T A 100 PASS . GT /0|0 1/0
23+
1 61495 rs56992752 T A 100 PASS . GT 0|0 |1/0
24+
1 61496 rs56992752 T A 100 PASS . GT . .
25+
1 61497 rs56992752 T A 100 PASS . GT ./1 .|1
26+
1 61498 rs56992752 T A 100 PASS . GT 1/. 1|.
27+
1 61499 rs56992752 T A 100 PASS . GT ./. .|.

test/vcf44_1.vcf

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
##fileformat=VCFv4.4
2+
##contig=<ID=1,length=1000>
3+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
4+
##failue="test file on explicit and implicit phasing markers in 4.4"
5+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097
6+
1 61462 rs56992750 T A 100 PASS . GT 0|0|1 0/1
7+
1 61480 rs56992751 T A 100 PASS . GT 0 /0|1
8+
1 61481 rs56992752 T A 100 PASS . GT /0 |0/1
9+
1 61482 rs56992752 T A 100 PASS . GT /0 /1
10+
1 61483 rs56992752 T A 100 PASS . GT 0 1
11+
1 61484 rs56992752 T A 100 PASS . GT 0 /1
12+
1 61485 rs56992752 T A 100 PASS . GT 0 |1
13+
1 61486 rs56992752 T A 100 PASS . GT |0 1
14+
1 61487 rs56992752 T A 100 PASS . GT |0 |1
15+
1 61488 rs56992752 T A 100 PASS . GT |0 /1
16+
1 61489 rs56992752 T A 100 PASS . GT /0 1
17+
1 61490 rs56992752 T A 100 PASS . GT /0 |1
18+
1 61491 rs56992752 T A 100 PASS . GT /0 /1
19+
1 61492 rs56992752 T A 100 PASS . GT /0|0 /1/0
20+
1 61493 rs56992752 T A 100 PASS . GT |0|0 |1/0
21+
1 61494 rs56992752 T A 100 PASS . GT /0|0 1/0
22+
1 61495 rs56992752 T A 100 PASS . GT 0|0 |1/0
23+
1 61496 rs56992752 T A 100 PASS . GT . .
24+
1 61497 rs56992752 T A 100 PASS . GT ./1 .|1
25+
1 61498 rs56992752 T A 100 PASS . GT 1/. 1|.
26+
1 61499 rs56992752 T A 100 PASS . GT ./. .|.

vcf.c

+32-4
Original file line numberDiff line numberDiff line change
@@ -3061,8 +3061,14 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
30613061
const char *t = q + 1;
30623062
int m = 0; // m: sample id
30633063
const int nsamples = bcf_hdr_nsamples(h);
3064-
3064+
bcf_version ver = v42;
30653065
const char *end = s->s + s->l;
3066+
3067+
if (bcf_get_version(h, &ver)) {
3068+
hts_log_error("Failed to get version information");
3069+
return -1;
3070+
}
3071+
30663072
while ( t<end )
30673073
{
30683074
// can we skip some samples?
@@ -3099,13 +3105,25 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
30993105
int l;
31003106
if (z->is_gt) {
31013107
// Genotypes.
3102-
// <val>([|/]<val>)+... where <val> is [0-9]+ or ".".
3108+
// ([|/])?<val>([|/]<val>)+... where <val> is [0-9]+ or ".".
31033109
int32_t is_phased = 0;
31043110
uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
31053111
uint32_t unreadable = 0;
31063112
uint32_t max = 0;
3107-
int overflow = 0;
3113+
int overflow = 0, ploidy = 0, anyunphased = 0, \
3114+
phasingprfx = 0;
3115+
3116+
/* with prefixed phasing, it is explicitly given for 1st one
3117+
with non-prefixed, set based on ploidy and phasing of other
3118+
alleles. */
3119+
if (ver >= v44 && (*t == '|' || *t == '/')) {
3120+
// cache prefix and phasing status
3121+
is_phased = *t++ == '|';
3122+
phasingprfx = 1;
3123+
}
3124+
31083125
for (l = 0;; ++t) {
3126+
ploidy++;
31093127
if (*t == '.') {
31103128
++t, x[l++] = is_phased;
31113129
} else {
@@ -3125,9 +3143,19 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
31253143
if (max < val) max = val;
31263144
x[l++] = (val + 1) << 1 | is_phased;
31273145
}
3146+
anyunphased |= (ploidy != 1) && !is_phased;
31283147
is_phased = (*t == '|');
31293148
if (*t != '|' && *t != '/') break;
31303149
}
3150+
if (ver >= v44 && !phasingprfx) {
3151+
/* no explicit phasing for 1st allele, set based on
3152+
other alleles and ploidy */
3153+
if (ploidy == 1) { //implicitly phased
3154+
x[0]|= 1;
3155+
} else { //set by other unphased alleles
3156+
x[0] |= anyunphased ? 0 : 1;
3157+
}
3158+
}
31313159
// Possibly check max against v->n_allele instead?
31323160
if (overflow || max > (INT32_MAX >> 1) - 1) {
31333161
hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
@@ -4187,7 +4215,7 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
41874215
if (!first) kputc_(':', s);
41884216
first = 0;
41894217
if (gt_i == i) {
4190-
bcf_format_gt(f,j,s);
4218+
bcf_format_gt1(h, f,j,s);
41914219
break;
41924220
}
41934221
else if (f->n == 1)

0 commit comments

Comments
 (0)