diff --git a/samples/simple_snp_trainer/pileup_hdf5_generator.py b/samples/simple_snp_trainer/pileup_hdf5_generator.py
index 3781ff1..c58ff4c 100755
--- a/samples/simple_snp_trainer/pileup_hdf5_generator.py
+++ b/samples/simple_snp_trainer/pileup_hdf5_generator.py
@@ -52,6 +52,9 @@ def generate_hdf5(args):
     # Generate the variant entries using VCF reader.
     vcf_reader = VCFReader(file_list)
 
+    for variant in vcf_reader:
+        print(variant)
+    exit(0)
     # Setup encoder for samples and labels.
     sample_encoder = PileupEncoder(window_size=100, max_reads=100,
diff --git a/tests/data/some_indels.bam b/tests/data/some_indels.bam
new file mode 100644
index 0000000..0408482
Binary files /dev/null and b/tests/data/some_indels.bam differ
diff --git a/tests/data/some_indels.bam.bai b/tests/data/some_indels.bam.bai
new file mode 100644
index 0000000..743e59e
Binary files /dev/null and b/tests/data/some_indels.bam.bai differ
diff --git a/tests/data/some_indels.vcf.gz b/tests/data/some_indels.vcf.gz
new file mode 100644
index 0000000..a959d12
Binary files /dev/null and b/tests/data/some_indels.vcf.gz differ
diff --git a/tests/data/some_indels.vcf.gz.tbi b/tests/data/some_indels.vcf.gz.tbi
new file mode 100644
index 0000000..b4e5e30
Binary files /dev/null and b/tests/data/some_indels.vcf.gz.tbi differ
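Note on the hunk above: the added loop prints every variant parsed by VCFReader and then exits, so the encoder setup below it never runs; it is a debugging aid rather than sample logic. For reference, a minimal self-contained sketch of the same read path, pairing the new test VCF with its BAM the way the tests below do (paths assume the sketch is run from the repo root so the tests/data files added in this diff resolve):

```python
# Minimal sketch of the VCFReader flow used in the sample script above.
from variantworks.io.vcfio import VCFReader

# VcfBamPath pairs a VCF with a BAM and flags false-positive sets via is_fp.
vcf_bam = VCFReader.VcfBamPath(vcf="tests/data/some_indels.vcf.gz",
                               bam="tests/data/some_indels.bam",
                               is_fp=False)
vcf_reader = VCFReader([vcf_bam])

print(len(vcf_reader))      # number of usable variant records
for variant in vcf_reader:  # iteration yields Variant entries
    print(variant)
```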
diff --git a/tests/test_pileup_encoder.py b/tests/test_pileup_encoder.py
index 594ef01..81502c9 100644
--- a/tests/test_pileup_encoder.py
+++ b/tests/test_pileup_encoder.py
@@ -27,17 +27,37 @@
 
 @pytest.fixture
 def snp_variant():
-    bam = os.path.join(get_data_folder(), "small_bam.bam")
-    variant = Variant(chrom="1", pos=240000, id="GL000235", ref='T', allele='A',
-                      quality=60, filter=None, info={'DP': 35, 'AF': 0.0185714}, format=['GT', 'GQ'],
-                      samples=[['1/1', '50']], zygosity=VariantZygosity.HOMOZYGOUS,
+    bam = os.path.join(get_data_folder(), "some_indels.bam")
+    variant = Variant(chrom="1", pos=10106775, id="rs12406448", ref='T', allele='C',
+                      quality=50, filter=[], info={}, format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
+                      samples=[['0/1', None, 638, [149, 142], [175, 174], 1079]], zygosity=VariantZygosity.HETEROZYGOUS,
                       type=VariantType.SNP, vcf='null.vcf', bam=bam)
     return variant
 
 
+@pytest.fixture
+def insertion_variant():
+    bam = os.path.join(get_data_folder(), "some_indels.bam")
+    variant = Variant(chrom="1", pos=10122622, id="rs57037935", ref='T', allele='TG',
+                      quality=50, filter=[], info={}, format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
+                      samples=[['1/1', None, 546, [0, 246], [25, 25], 330]], zygosity=VariantZygosity.HOMOZYGOUS,
+                      type=VariantType.INSERTION, vcf='null.vcf', bam=bam)
+    return variant
+
+
+@pytest.fixture
+def deletion_variant():
+    bam = os.path.join(get_data_folder(), "some_indels.bam")
+    variant = Variant(chrom="1", pos=10163457, id=None, ref='CTTTA', allele='C',
+                      quality=50, filter=[], info={}, format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
+                      samples=[['1/0', None, 177, [0, 0, 0], [0, 0, 0], 160]], zygosity=VariantZygosity.HETEROZYGOUS,
+                      type=VariantType.DELETION, vcf='null.vcf', bam=bam)
+    return variant
+
+
 def test_snp_encoder_basic(snp_variant):
     max_reads = 100
-    window_size = 5
+    window_size = 10
     width = 2 * window_size + 1
     height = max_reads
     layers = [PileupEncoder.Layer.READ]
@@ -96,7 +116,7 @@ def test_snp_encoder_base_quality(snp_variant):
     # and then converting it to a long tensor and summing up all elements to match
     # against total size.
     all_lt_1 = (encoding <= 1.0).long()
-    assert(torch.sum(all_lt_1) == (max_reads * width))
+    assert(torch.sum(all_lt_1) == (height * width))
 
 
 def test_snp_encoder_mapping_quality(snp_variant):
@@ -118,7 +138,39 @@ def test_snp_encoder_mapping_quality(snp_variant):
     # and then converting it to a long tensor and summing up all elements to match
     # against total size.
     all_lt_1 = (encoding <= 1.0).long()
-    assert(torch.sum(all_lt_1) == (max_reads * width))
+    assert(torch.sum(all_lt_1) == (height * width))
+
+
+def test_insertion_read_encoding(insertion_variant):
+    max_reads = 100
+    window_size = 30
+    width = 2 * window_size + 1
+    height = max_reads
+    layers = [PileupEncoder.Layer.READ, PileupEncoder.Layer.REFERENCE, PileupEncoder.Layer.ALLELE]
+
+    encoder = PileupEncoder(window_size=window_size,
+                            max_reads=max_reads, layers=layers)
+
+    variant = insertion_variant
+
+    encoding = encoder(variant)
+    assert(encoding.size() == torch.Size([len(layers), height, width]))
+
+
+def test_deletion_read_encoding(deletion_variant):
+    max_reads = 100
+    window_size = 10
+    width = 2 * window_size + 1
+    height = max_reads
+    layers = [PileupEncoder.Layer.READ, PileupEncoder.Layer.REFERENCE, PileupEncoder.Layer.ALLELE]
+
+    encoder = PileupEncoder(window_size=window_size,
+                            max_reads=max_reads, layers=layers)
+
+    variant = deletion_variant
+
+    encoding = encoder(variant)
+    assert(encoding.size() == torch.Size([len(layers), height, width]))
 
 
 def test_pileup_unknown_layer():
diff --git a/tests/test_vcfio.py b/tests/test_vcfio.py
index f81833e..b99ac50 100644
--- a/tests/test_vcfio.py
+++ b/tests/test_vcfio.py
@@ -28,7 +28,7 @@ def test_vcf_loader_snps(get_created_vcf_tabix_files):
     vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
     vcf_bam_tuple = VCFReader.VcfBamPath(vcf=vcf_file_path, bam=tabix_file_path, is_fp=False)
     vcf_loader = VCFReader([vcf_bam_tuple])
-    assert(len(vcf_loader) == 13)
+    assert(len(vcf_loader) == 15)
 
 
 def test_vcf_fetch_variant(get_created_vcf_tabix_files):
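The two new tests pin down the encoding geometry for indels: one channel per requested layer, max_reads rows, and 2 * window_size + 1 columns centered on the variant locus. A hedged, self-contained version of the insertion case (Variant values copied from the fixture above; the BAM path assumes the repo's tests/data directory):

```python
import torch

from variantworks.sample_encoder import PileupEncoder
from variantworks.types import Variant, VariantType, VariantZygosity

# Insertion fixture from the tests above (T -> TG at 1:10122622).
variant = Variant(chrom="1", pos=10122622, id="rs57037935", ref='T', allele='TG',
                  quality=50, filter=[], info={}, format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
                  samples=[['1/1', None, 546, [0, 246], [25, 25], 330]],
                  zygosity=VariantZygosity.HOMOZYGOUS, type=VariantType.INSERTION,
                  vcf='null.vcf', bam="tests/data/some_indels.bam")

window_size, max_reads = 30, 100
layers = [PileupEncoder.Layer.READ, PileupEncoder.Layer.REFERENCE, PileupEncoder.Layer.ALLELE]
encoder = PileupEncoder(window_size=window_size, max_reads=max_reads, layers=layers)

encoding = encoder(variant)
# One 2D pileup per layer: [channels, height, width].
assert encoding.size() == torch.Size([len(layers), max_reads, 2 * window_size + 1])
```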
diff --git a/variantworks/io/vcfio.py b/variantworks/io/vcfio.py
index 5e727e6..44a74a8 100644
--- a/variantworks/io/vcfio.py
+++ b/variantworks/io/vcfio.py
@@ -179,9 +179,6 @@ def _parse_vcf(self, vcf_file, bam, labels, is_fp=False):
                     "Can not parse record %s in %s, all samples must be called in true positive VCF file" % (
                         record, vcf_file)
                 )
-            if not record.is_snp:
-                warnings.warn("%s is filtered - not an SNP record" % record)
-                continue
             if len(record.ALT) > 1:
                 warnings.warn(
                     "%s is filtered - multiallele records are not supported" % record)
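In the sample_encoder.py diff below, the REFERENCE and ALLELE layers stop writing a single base at the center column and instead lay the full ref/alt sequence out from the center, clipping at column 2 * window_size - 1. A standalone sketch of just that indexing; encode_at_center is a hypothetical helper for illustration, not library code:

```python
def encode_at_center(sequence, window_size):
    """Return (column, base) pairs for one row of a 2 * window_size + 1 wide pileup."""
    placed = []
    # Mirrors the loop added below: bases start at the center column and are
    # truncated once they reach column 2 * window_size - 1.
    for seq_pos, pileup_pos in enumerate(
            range(window_size, min(window_size + len(sequence), 2 * window_size - 1))):
        placed.append((pileup_pos, sequence[seq_pos]))
    return placed

print(encode_at_center("CTTTA", window_size=10))
# -> [(10, 'C'), (11, 'T'), (12, 'T'), (13, 'T'), (14, 'A')]
```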
diff --git a/variantworks/sample_encoder.py b/variantworks/sample_encoder.py
index 7b70b92..d0da06b 100644
--- a/variantworks/sample_encoder.py
+++ b/variantworks/sample_encoder.py
@@ -21,7 +21,7 @@
 import torch
 
 from variantworks.base_encoder import base_enum_encoder
-from variantworks.types import Variant, VariantType, VariantZygosity
+from variantworks.types import Variant, VariantZygosity
 
 
 class SampleEncoder:
@@ -80,7 +80,7 @@ class Layer(Enum):
         REFERENCE = 3
         ALLELE = 4
 
-    def __init__(self, window_size=50, max_reads=50, layers=[Layer.READ], base_encoder=None):
+    def __init__(self, window_size=50, max_reads=50, layers=[Layer.READ], base_encoder=None, print_encoding=False):
         """Construct class instance.
 
         Args:
@@ -91,6 +91,7 @@ def __init__(self, window_size=50, max_reads=50, layers=[Layer.READ], base_encod
                      encoding follows the ordering of layers in the list. [Layer.READ]
             base_encoder : A dict defining conversion of nucleotide string chars to numeric representation.
                            [base_encoder.base_enum_encoder]
+            print_encoding : Print ASCII representation of each encoding that's converted to a tensor. [False]
 
         Returns:
             Instance of class.
@@ -108,6 +109,7 @@ def __init__(self, window_size=50, max_reads=50, layers=[Layer.READ], base_encod
                 (self.height, self.width), dtype=torch.float32)
             self.layer_tensors.append(tensor)
             self.layer_dict[layer] = tensor
+        self.print_encoding = print_encoding
 
     @property
     def width(self):
@@ -135,6 +137,9 @@ def _fill_layer(self, layer, pileupread, left_offset, right_offset, row, pileup_
             # Fetch the subsequence based on the offsets
             seq = pileupread.alignment.query_sequence[query_pos - left_offset: query_pos + right_offset]
+            if self.print_encoding:
+                print("{}{}{}".format("-" * pileup_pos_range[0], seq, "-" *
+                                      (2 * self.window_size + 1 - len(seq) - pileup_pos_range[0])))
             for seq_pos, pileup_pos in enumerate(range(pileup_pos_range[0], pileup_pos_range[1])):
                 # Encode base characters to enum
                 tensor[row, pileup_pos] = self.base_encoder[seq[seq_pos]]
@@ -153,7 +158,7 @@ def _fill_layer(self, layer, pileupread, left_offset, right_offset, row, pileup_
                 qual = qual / MAX_BASE_QUALITY
                 tensor[row, pileup_pos] = qual
         elif layer == self.Layer.MAPPING_QUALITY:
-            MAX_MAPPING_QUALITY = 50.0
+            MAX_MAPPING_QUALITY = 100.0
             # Fetch mapping quality of alignment
             map_qual = pileupread.alignment.mapping_quality
             # Missing mapping quality is 255
@@ -165,11 +170,21 @@ def _fill_layer(self, layer, pileupread, left_offset, right_offset, row, pileup_
                 # Encode base characters to enum
                 tensor[row, pileup_pos] = map_qual
         elif layer == self.Layer.REFERENCE:
+            if self.print_encoding:
+                print("{}{}{}".format("-" * self.window_size, variant.ref, "-" *
+                                      (2 * self.window_size + 1 - len(variant.ref) - self.window_size)))
             # Only encode the reference at the variant position, rest all 0
-            tensor[row, self.window_size] = self.base_encoder[variant.ref]
+            for seq_pos, pileup_pos in enumerate(
+                    range(self.window_size, min(self.window_size + len(variant.ref), 2 * self.window_size - 1))):
+                tensor[row, pileup_pos] = self.base_encoder[variant.ref[seq_pos]]
         elif layer == self.Layer.ALLELE:
+            if self.print_encoding:
+                print("{}{}{}".format("-" * self.window_size, variant.allele, "-" *
+                                      (2 * self.window_size + 1 - len(variant.allele) - self.window_size)))
             # Only encode the allele at the variant position, rest all 0
-            tensor[row, self.window_size] = self.base_encoder[variant.allele]
+            for seq_pos, pileup_pos in enumerate(
+                    range(self.window_size, min(self.window_size + len(variant.allele), 2 * self.window_size - 1))):
+                tensor[row, pileup_pos] = self.base_encoder[variant.allele[seq_pos]]
 
     def __call__(self, variant):
         """Return a torch Tensor pileup queried from a BAM file.
@@ -182,8 +197,13 @@ def __call__(self, variant):
         variant_pos = variant.pos
         bam_file = variant.bam
 
-        assert(variant.type ==
-               VariantType.SNP), "Only SNP variants supported in PileupEncoder currently."
+        # Check that the ref and alt alleles all fit in the window context.
+        if len(variant.ref) > self.window_size:
+            raise RuntimeError("Ref allele {} too large for window {}. Please increase window size.".format(
+                variant.ref, self.window_size))
+        if len(variant.allele) > self.window_size:
+            raise RuntimeError("Alt allele {} too large for window {}. Please increase window size.".format(
+                variant.allele, self.window_size))
 
         # Create BAM object if one hasn't been opened before.
         if bam_file not in self.bams:
@@ -199,6 +219,10 @@ def __call__(self, variant):
                                    truncate=True,
                                    max_depth=self.max_reads)
 
+        if self.print_encoding:
+            print("\nEncoding for {}".format(variant))
+            print("Order of rows : {}".format(self.layers))
+
         for col, pileup_col in enumerate(pileups):
             for row, pileupread in enumerate(pileup_col.pileups):
                 # Skip rows beyond the max depth
@@ -208,6 +232,9 @@ def __call__(self, variant):
                 if pileupread.is_del or pileupread.is_refskip:
                     continue
 
+                if pileupread.is_head or pileupread.is_tail:
+                    continue
+
                 # Using the variant locus as the center, find the left and right offset
                 # from that locus to use as bounds for fetching bases from reads.
                 #
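Two behavioral notes on the encoder changes: with the SNP-only assert removed, alleles longer than window_size now raise a RuntimeError up front, and print_encoding=True dumps an ASCII row per read plus the reference and allele rows as each tensor is filled. A usage sketch built on the deletion fixture from the tests above:

```python
from variantworks.sample_encoder import PileupEncoder
from variantworks.types import Variant, VariantType, VariantZygosity

# Deletion fixture from the tests above; the 5-base ref 'CTTTA' fits window_size=10.
variant = Variant(chrom="1", pos=10163457, id=None, ref='CTTTA', allele='C',
                  quality=50, filter=[], info={}, format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
                  samples=[['1/0', None, 177, [0, 0, 0], [0, 0, 0], 160]],
                  zygosity=VariantZygosity.HETEROZYGOUS, type=VariantType.DELETION,
                  vcf='null.vcf', bam="tests/data/some_indels.bam")

encoder = PileupEncoder(window_size=10, max_reads=100,
                        layers=[PileupEncoder.Layer.READ],
                        print_encoding=True)  # prints the ASCII pileup while encoding
try:
    encoding = encoder(variant)
except RuntimeError as err:
    # Raised when len(variant.ref) or len(variant.allele) exceeds window_size.
    print("Increase window_size and retry:", err)
```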