diff --git a/htslib/vcf.h b/htslib/vcf.h index 4748699d0..2acf3f8f9 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -65,11 +65,21 @@ extern "C" { #define BCF_HT_STR 3 #define BCF_HT_LONG (BCF_HT_INT | 0x100) // BCF_HT_INT, but for int64_t values; VCF only! -#define BCF_VL_FIXED 0 // variable length -#define BCF_VL_VAR 1 -#define BCF_VL_A 2 -#define BCF_VL_G 3 -#define BCF_VL_R 4 +// Values for INFO / FORMAT "Number=" fields +#define BCF_VL_FIXED 0 ///< Integer defining a fixed number of items +#define BCF_VL_VAR 1 ///< Generic variable length ("Number=.") +#define BCF_VL_A 2 ///< One value per alternate allele +#define BCF_VL_G 3 ///< One value for each possible genotype +#define BCF_VL_R 4 ///< One value for each allele, including the reference + +// The following was introduced in VCFv4.4 and is only valid for FORMAT header lines +#define BCF_VL_P 5 ///< One value for each allele value defined in GT + +// The following were introduced in VCFv4.5 and are only valid for FORMAT header lines +#define BCF_VL_LA 6 ///< As BCF_VL_A, but only alt alleles listed in LAA are considered present +#define BCF_VL_LG 7 ///< As BCF_VL_G, but only alt alleles listed in LAA are considered present +#define BCF_VL_LR 8 ///< As BCF_VL_R, but only alt alleles listed in LAA are considered present +#define BCF_VL_M 9 ///< One value for each possible base modification for the corresponding ChEBI ID /* === Dictionary === diff --git a/vcf.c b/vcf.c index 6c374108f..312519e67 100644 --- a/vcf.c +++ b/vcf.c @@ -128,6 +128,7 @@ static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr) //version macros #define VCF_DEF 4002000 #define VCF44 4004000 +#define VCF45 4005000 #define VCF_MAJOR_VER(x) ( (x) / 10000 / 100 ) #define VCF_MINOR_VER(x) ( ((x) % 1000000) / 1000 ) @@ -910,14 +911,20 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) } else if ( !strcmp(hrec->keys[i], "Number") ) { + int is_fmt = hrec->type == BCF_HL_FMT; if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A; else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R; else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G; else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR; + else if ( is_fmt && !strcmp(hrec->vals[i],"P") ) var = BCF_VL_P; + else if ( is_fmt && !strcmp(hrec->vals[i],"LA") ) var = BCF_VL_LA; + else if ( is_fmt && !strcmp(hrec->vals[i],"LR") ) var = BCF_VL_LR; + else if ( is_fmt && !strcmp(hrec->vals[i],"LG") ) var = BCF_VL_LG; + else if ( is_fmt && !strcmp(hrec->vals[i],"M") ) var = BCF_VL_M; else { - sscanf(hrec->vals[i],"%d",&num); - var = BCF_VL_FIXED; + if (sscanf(hrec->vals[i],"%d",&num) == 1) + var = BCF_VL_FIXED; } if (var != BCF_VL_FIXED) num = 0xfffff; } @@ -928,7 +935,7 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) *hrec->key == 'I' ? "An" : "A", hrec->key); type = BCF_HT_STR; } - if (var == -1) { + if (var == UINT32_MAX) { hts_log_warning("%s %s field has no Number defined. Assuming '.'", *hrec->key == 'I' ? "An" : "A", hrec->key); var = BCF_VL_VAR; @@ -1226,26 +1233,114 @@ bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, co return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type]; } +// Check the VCF header is correctly formatted as per the specification. +// Note the code that calls this doesn't bother to check return values and +// we have so many broken VCFs in the wild that for now we just reprt a +// warning and continue anyway. So currently this is a void function. void bcf_hdr_check_sanity(bcf_hdr_t *hdr) { - static int PL_warned = 0, GL_warned = 0; + int version = bcf_get_version(hdr, NULL); - if ( !PL_warned ) - { - int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PL"); - if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G ) - { - hts_log_warning("PL should be declared as Number=G"); - PL_warned = 1; + struct tag { + char name[10]; + char type_str[3]; + int type; + int version; + }; + + struct tag info_tags[] = { + {"AD", "R", BCF_VL_R, VCF_DEF}, + {"ADF", "R", BCF_VL_R, VCF_DEF}, + {"ADR", "R", BCF_VL_R, VCF_DEF}, + {"AC", "A", BCF_VL_A, VCF_DEF}, + {"AF", "A", BCF_VL_A, VCF_DEF}, + {"CIGAR", "A", BCF_VL_A, VCF_DEF}, + {"AA", "1", BCF_VL_FIXED, VCF_DEF}, + {"AN", "1", BCF_VL_FIXED, VCF_DEF}, + {"BQ", "1", BCF_VL_FIXED, VCF_DEF}, + {"DB", "0", BCF_VL_FIXED, VCF_DEF}, + {"DP", "1", BCF_VL_FIXED, VCF_DEF}, + {"END", "1", BCF_VL_FIXED, VCF_DEF}, + {"H2", "0", BCF_VL_FIXED, VCF_DEF}, + {"H3", "0", BCF_VL_FIXED, VCF_DEF}, + {"SB", "4", BCF_VL_FIXED, VCF_DEF}, + {"SOMATIC", "0", BCF_VL_FIXED, VCF_DEF}, + {"VALIDATED", "0", BCF_VL_FIXED, VCF_DEF}, + {"1000G", "0", BCF_VL_FIXED, VCF_DEF}, + }; + static int info_warned[sizeof(info_tags)/sizeof(*info_tags)] = {0}; + + struct tag fmt_tags[] = { + {"AD", "R", BCF_VL_R, VCF_DEF}, + {"ADF", "R", BCF_VL_R, VCF_DEF}, + {"ADR", "R", BCF_VL_R, VCF_DEF}, + {"EC", "A", BCF_VL_A, VCF_DEF}, + {"GL", "G", BCF_VL_G, VCF_DEF}, + {"GP", "G", BCF_VL_G, VCF_DEF}, + {"PL", "G", BCF_VL_G, VCF_DEF}, + {"PP", "G", BCF_VL_G, VCF_DEF}, + {"DP", "1", BCF_VL_FIXED, VCF_DEF}, + {"LEN", "1", BCF_VL_FIXED, VCF_DEF}, + {"FT", "1", BCF_VL_FIXED, VCF_DEF}, + {"GQ", "1", BCF_VL_FIXED, VCF_DEF}, + {"GT", "1", BCF_VL_FIXED, VCF_DEF}, + {"HQ", "2", BCF_VL_FIXED, VCF_DEF}, + {"MQ", "1", BCF_VL_FIXED, VCF_DEF}, + {"PQ", "1", BCF_VL_FIXED, VCF_DEF}, + {"PS", "1", BCF_VL_FIXED, VCF_DEF}, + {"PSL", "P", BCF_VL_P, VCF44}, + {"PSO", "P", BCF_VL_P, VCF44}, + {"PSQ", "P", BCF_VL_P, VCF44}, + {"LGL", "LG", BCF_VL_LG, VCF45}, + {"LGP", "LG", BCF_VL_LG, VCF45}, + {"LPL", "LG", BCF_VL_LG, VCF45}, + {"LPP", "LP", BCF_VL_LG, VCF45}, + {"LEC", "LA", BCF_VL_LA, VCF45}, + {"LAD", "LR", BCF_VL_LR, VCF45}, + {"LADF", "LR", BCF_VL_LR, VCF45}, + {"LADR", "LR", BCF_VL_LR, VCF45}, + }; + static int fmt_warned[sizeof(fmt_tags)/sizeof(*fmt_tags)] = {0}; + + // Check INFO tag types. We shouldn't really permit ".", but it's + // commonly misused so we let it slide unless it's a new tag and the + // file format claims to be new also. We also cannot distinguish between + // Number=1 and Number=2, but we at least report the correct term if we + // get, say, Number=G in its place. + int i; + for (i = 0; i < sizeof(info_tags)/sizeof(*info_tags); i++) { + if (info_warned[i]) + continue; + int id = bcf_hdr_id2int(hdr, BCF_DT_ID, info_tags[i].name); + if (bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, id) && + bcf_hdr_id2length(hdr, BCF_HL_INFO, id) != info_tags[i].type && + bcf_hdr_id2length(hdr, BCF_HL_INFO, id) != BCF_VL_VAR) { + hts_log_warning("%s should be declared as Number=%s", + info_tags[i].name, info_tags[i].type_str); + info_warned[i] = 1; } } - if ( !GL_warned ) - { - int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GL"); - if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G ) - { - hts_log_warning("GL should be declared as Number=G"); - GL_warned = 1; + + // Check FORMAT tag types. + for (i = 0; i < sizeof(fmt_tags)/sizeof(*fmt_tags); i++) { + if (fmt_warned[i]) + continue; + int id = bcf_hdr_id2int(hdr, BCF_DT_ID, fmt_tags[i].name); + if (bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, id) && + bcf_hdr_id2length(hdr, BCF_HL_FMT, id) != fmt_tags[i].type) { + // Permit "Number=." if this tag predates the vcf version it is + // defined within. This is a common tactic for callers to use + // new tags with older formats in order to avoid parsing failures + // with some software. + // We don't care for 4.3 and earlier as that's more of a wild-west + // and it's not abnormal to see incorrect usage of Number=. there. + if ((version < VCF44 && + bcf_hdr_id2length(hdr, BCF_HL_FMT, id) != BCF_VL_VAR) || + (version >= VCF44 && version >= fmt_tags[i].version)) { + hts_log_warning("%s should be declared as Number=%s", + fmt_tags[i].name, fmt_tags[i].type_str); + fmt_warned[i] = 1; + } } } }