Skip to content

Commit 5e09eb3

Browse files
committed
Properly handle info fields with embedded spaces by VCF version.
1 parent 5700958 commit 5e09eb3

File tree

5 files changed

+155
-3
lines changed

5 files changed

+155
-3
lines changed

src/main/java/htsjdk/variant/vcf/VCFCodec.java

+1-1
Original file line number | Diff line number | Diff line change
@@ -104,7 +104,7 @@ protected void reportDuplicateInfoKeyValue(final String duplicateKey, final Stri
104104
* @return a mapping of keys to objects
105105
*/
106106
protected Map<String, Object> parseInfo(String infoField) {
107-
if (infoField.indexOf(' ') != -1) {
107+
if ((infoField.indexOf(' ') != -1) && !version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) {
108108
generateException(
109109
String.format("Whitespace is not allowed in the INFO field in VCF version %s: %s",
110110
version == null ?

src/test/java/htsjdk/variant/vcf/VCFCodec42FeaturesTest.java

+19
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,15 @@
11
package htsjdk.variant.vcf;
22

33
import htsjdk.HtsjdkTest;
4+
import htsjdk.samtools.util.Tuple;
5+
import htsjdk.tribble.TribbleException;
6+
import htsjdk.variant.variantcontext.VariantContext;
47
import org.testng.Assert;
58
import org.testng.annotations.Test;
69

710
import java.nio.file.Path;
811
import java.nio.file.Paths;
12+
import java.util.List;
913

1014
public class VCFCodec42FeaturesTest extends HtsjdkTest {
1115
private static final Path TEST_PATH = Paths.get("src/test/resources/htsjdk/variant/");
@@ -21,4 +25,19 @@ public void testV42PedigreeParsing() {
2125
Assert.assertEquals(vcf42PedigreeLine.getClass(), VCFHeaderLine.class);
2226
Assert.assertEquals(vcf42PedigreeLine.getValue(), "<Derived=NA12891, Original=NA12878>");
2327
}
28+
29+
@Test(expectedExceptions = TribbleException.class) // passes only if a TribbleException propagates out of the method
30+
public void testVCF42RejectsInfoFieldWithSpaces() {
31+
// Reads the VCF 4.2 resource infoSpace42.vcf, whose 1st variant has an info field with a value containing an embedded space, and expects reading it to fail with the whitespace error.
32+
final Path infoSpace42File = TEST_PATH.resolve("infoSpace42.vcf");
33+
34+
try ( final VCFFileReader vcfReader = new VCFFileReader(infoSpace42File, false) ){ // NOTE(review): the boolean is presumably the require-index flag; confirm against VCFFileReader
35+
for (final VariantContext vc : vcfReader) { // body is intentionally empty and vc is unused: iterating the reader is what is expected to raise the exception
36+
37+
}
38+
} catch (final TribbleException e) {
39+
Assert.assertTrue(e.getMessage().contains("Whitespace is not allowed")); // make sure this is the whitespace failure and not some other TribbleException
40+
throw e; // rethrow so the expectedExceptions declaration on the annotation is satisfied
41+
}
42+
}
2443
}

src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java

+11-2
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,7 @@
11
package htsjdk.variant.vcf;
22

3+
import htsjdk.beta.io.IOPathUtils;
4+
import htsjdk.io.IOPath;
35
import htsjdk.samtools.util.CloseableIterator;
46
import htsjdk.samtools.util.FileExtensions;
57
import htsjdk.samtools.util.Interval;
@@ -19,8 +21,7 @@
1921
import java.nio.file.Files;
2022
import java.nio.file.Path;
2123
import java.nio.file.Paths;
22-
import java.util.ArrayList;
23-
import java.util.List;
24+
import java.util.*;
2425
import java.util.function.Function;
2526
import java.util.stream.Collectors;
2627

@@ -203,6 +204,14 @@ public void testVCF43IndexRoundTripQuery(final Path testFile) throws IOException
203204
}
204205
}
205206

207+
@Test
208+
public void testVCF43AcceptsInfoFieldWithSpaces() {
209+
// Reads the resource infoSpace43.vcf, whose 1st variant has an info field with a value containing an embedded space, and checks that the space survives in the parsed attribute.
210+
final Path infoSpaceFile = TEST_PATH.resolve("infoSpace43.vcf");
211+
final Tuple<VCFHeader, List<VariantContext>> infoSpace43 = readEntireVCFIntoMemory(infoSpaceFile); // helper defined outside this hunk; the tuple pairs the header with the list of variants
212+
Assert.assertTrue(infoSpace43.b.get(0).getAttribute("set").toString().contains(" ")); // b is the variant list; the first variant's set attribute must still contain the space
213+
}
214+
206215
//
207216
// UTF8-specific tests
208217
//

src/test/resources/htsjdk/variant/infoSpace42.vcf

+62
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,62 @@
1+
##fileformat=VCFv4.2
2+
##ALT=<ID=DEL,Description="Deletion",ExtraAltField="extra alt">
3+
##ALT=<ID=DUP,Description="Duplication">
4+
##ALT=<ID=INS,Description="Insertion">
5+
##ALT=<ID=INV,Description="Inversion">
6+
##ALT=<ID=INVDUP,Description="InvertedDUP with unknown boundaries">
7+
##ALT=<ID=TRA,Description="Translocation">
8+
##FILTER=<ID=GATK_STANDARD,Description="Standard GATK filter",ExtraFilterField="extra filter field">
9+
##FILTER=<ID=HARD_TO_VALIDATE,Description="Hard to validate">
10+
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed",ExtraFormatField="extra format">
11+
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth (only filtered reads used for calling)">
12+
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
13+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
14+
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
15+
##INFO=<ID=AB,Number=1,Type=Float,Description="Allele Balance for hets (ref/(ref+alt))",ExtraInfoField="extra info">
16+
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
17+
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
18+
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
19+
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Phred-scaled p-value From Wilcoxon Rank Sum Test of Alt Vs. Ref base qualities">
20+
##INFO=<ID=BaseQRankSumZ,Number=1,Type=Float,Description="Z-score From Wilcoxon Rank Sum Test of Alt Vs. Ref base qualities">
21+
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
22+
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
23+
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
24+
##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
25+
##INFO=<ID=Dels,Number=1,Type=Float,Description="Fraction of Reads Containing Spanning Deletions">
26+
##INFO=<ID=HRun,Number=1,Type=Integer,Description="Largest Contiguous Homopolymer Run of Variant Allele In Either Direction">
27+
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
28+
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
29+
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
30+
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Phred-scaled p-value From Wilcoxon Rank Sum Test of Alt Vs. Ref read mapping qualities">
31+
##INFO=<ID=MQRankSumZ,Number=1,Type=Float,Description="Z-score From Wilcoxon Rank Sum Test of Alt Vs. Ref read mapping qualities">
32+
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
33+
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Phred-scaled p-value From Wilcoxon Rank Sum Test of Alt Vs. Ref read position bias">
34+
##INFO=<ID=ReadPosRankSumZ,Number=1,Type=Float,Description="Z-score From Wilcoxon Rank Sum Test of Alt Vs. Ref read position bias">
35+
##INFO=<ID=SB,Number=1,Type=Float,Description="Strand Bias">
36+
##INFO=<ID=set,Number=1,Type=String,Description="Source VCF for the merged record in CombineVariants">
37+
##contig=<ID=1,length=249250621,assembly=b37,extraContigField="extra contig field">
38+
##contig=<ID=10,length=135534747,assembly=b37>
39+
##contig=<ID=11,length=135006516,assembly=b37>
40+
##contig=<ID=12,length=133851895,assembly=b37>
41+
##contig=<ID=13,length=115169878,assembly=b37>
42+
##contig=<ID=14,length=107349540,assembly=b37>
43+
##contig=<ID=15,length=102531392,assembly=b37>
44+
##contig=<ID=16,length=90354753,assembly=b37>
45+
##contig=<ID=17,length=81195210,assembly=b37>
46+
##contig=<ID=18,length=78077248,assembly=b37>
47+
##contig=<ID=19,length=59128983,assembly=b37>
48+
##contig=<ID=2,length=243199373,assembly=b37>
49+
##contig=<ID=20,length=63025520,assembly=b37>
50+
##contig=<ID=21,length=48129895,assembly=b37>
51+
##contig=<ID=22,length=51304566,assembly=b37>
52+
##contig=<ID=3,length=198022430,assembly=b37>
53+
##contig=<ID=4,length=191154276,assembly=b37>
54+
##contig=<ID=5,length=180915260,assembly=b37>
55+
##contig=<ID=6,length=171115067,assembly=b37>
56+
##contig=<ID=7,length=159138663,assembly=b37>
57+
##contig=<ID=8,length=146364022,assembly=b37>
58+
##contig=<ID=9,length=141213431,assembly=b37>
59+
##contig=<ID=X,length=155270560,assembly=b37>
60+
##contig=<ID=Y,length=59373566,assembly=b37>
61+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19238 NA19239 NA19240
62+
1 327 . T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredIn Both GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99

src/test/resources/htsjdk/variant/infoSpace43.vcf

+62
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,62 @@
1+
##fileformat=VCFv4.3
2+
##ALT=<ID=DEL,Description="Deletion",ExtraAltField="extra alt">
3+
##ALT=<ID=DUP,Description="Duplication">
4+
##ALT=<ID=INS,Description="Insertion">
5+
##ALT=<ID=INV,Description="Inversion">
6+
##ALT=<ID=INVDUP,Description="InvertedDUP with unknown boundaries">
7+
##ALT=<ID=TRA,Description="Translocation">
8+
##FILTER=<ID=GATK_STANDARD,Description="Standard GATK filter",ExtraFilterField="extra filter field">
9+
##FILTER=<ID=HARD_TO_VALIDATE,Description="Hard to validate">
10+
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed",ExtraFormatField="extra format">
11+
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth (only filtered reads used for calling)">
12+
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
13+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
14+
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
15+
##INFO=<ID=AB,Number=1,Type=Float,Description="Allele Balance for hets (ref/(ref+alt))",ExtraInfoField="extra info">
16+
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
17+
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
18+
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
19+
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Phred-scaled p-value From Wilcoxon Rank Sum Test of Alt Vs. Ref base qualities">
20+
##INFO=<ID=BaseQRankSumZ,Number=1,Type=Float,Description="Z-score From Wilcoxon Rank Sum Test of Alt Vs. Ref base qualities">
21+
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
22+
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
23+
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
24+
##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
25+
##INFO=<ID=Dels,Number=1,Type=Float,Description="Fraction of Reads Containing Spanning Deletions">
26+
##INFO=<ID=HRun,Number=1,Type=Integer,Description="Largest Contiguous Homopolymer Run of Variant Allele In Either Direction">
27+
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
28+
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
29+
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
30+
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Phred-scaled p-value From Wilcoxon Rank Sum Test of Alt Vs. Ref read mapping qualities">
31+
##INFO=<ID=MQRankSumZ,Number=1,Type=Float,Description="Z-score From Wilcoxon Rank Sum Test of Alt Vs. Ref read mapping qualities">
32+
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
33+
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Phred-scaled p-value From Wilcoxon Rank Sum Test of Alt Vs. Ref read position bias">
34+
##INFO=<ID=ReadPosRankSumZ,Number=1,Type=Float,Description="Z-score From Wilcoxon Rank Sum Test of Alt Vs. Ref read position bias">
35+
##INFO=<ID=SB,Number=1,Type=Float,Description="Strand Bias">
36+
##INFO=<ID=set,Number=1,Type=String,Description="Source VCF for the merged record in CombineVariants">
37+
##contig=<ID=1,length=249250621,assembly=b37,extraContigField="extra contig field">
38+
##contig=<ID=10,length=135534747,assembly=b37>
39+
##contig=<ID=11,length=135006516,assembly=b37>
40+
##contig=<ID=12,length=133851895,assembly=b37>
41+
##contig=<ID=13,length=115169878,assembly=b37>
42+
##contig=<ID=14,length=107349540,assembly=b37>
43+
##contig=<ID=15,length=102531392,assembly=b37>
44+
##contig=<ID=16,length=90354753,assembly=b37>
45+
##contig=<ID=17,length=81195210,assembly=b37>
46+
##contig=<ID=18,length=78077248,assembly=b37>
47+
##contig=<ID=19,length=59128983,assembly=b37>
48+
##contig=<ID=2,length=243199373,assembly=b37>
49+
##contig=<ID=20,length=63025520,assembly=b37>
50+
##contig=<ID=21,length=48129895,assembly=b37>
51+
##contig=<ID=22,length=51304566,assembly=b37>
52+
##contig=<ID=3,length=198022430,assembly=b37>
53+
##contig=<ID=4,length=191154276,assembly=b37>
54+
##contig=<ID=5,length=180915260,assembly=b37>
55+
##contig=<ID=6,length=171115067,assembly=b37>
56+
##contig=<ID=7,length=159138663,assembly=b37>
57+
##contig=<ID=8,length=146364022,assembly=b37>
58+
##contig=<ID=9,length=141213431,assembly=b37>
59+
##contig=<ID=X,length=155270560,assembly=b37>
60+
##contig=<ID=Y,length=59373566,assembly=b37>
61+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19238 NA19239 NA19240
62+
1 327 . T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredIn Both GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99

0 commit comments

Comments
 (0)