
Merge pull request #82 from tijyojwad/jdaw/add-indel-encoding
[pileup_encoder] indel encoding for pileups
Joyjit Daw authored Jul 20, 2020
2 parents 4d22a0a + a4f5b9b commit 90d9119
Showing 9 changed files with 97 additions and 18 deletions.
samples/simple_snp_trainer/pileup_hdf5_generator.py (3 additions, 0 deletions)
@@ -52,6 +52,9 @@ def generate_hdf5(args):
 
     # Generate the variant entries using VCF reader.
     vcf_reader = VCFReader(file_list)
+    for variant in vcf_reader:
+        print(variant)
+        exit(0)
 
     # Setup encoder for samples and labels.
     sample_encoder = PileupEncoder(window_size=100, max_reads=100,
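The three lines added above print the first variant yielded by the reader and then exit, a quick sanity check of the new indel-aware parsing. For context, a minimal sketch of the same reader/encoder pairing without the early exit; import paths are inferred from the repository layout, and the VCF/BAM file names are hypothetical:

from variantworks.io.vcfio import VCFReader
from variantworks.sample_encoder import PileupEncoder

# Hypothetical, pre-indexed inputs.
file_list = [VCFReader.VcfBamPath(vcf="indels.vcf.gz", bam="indels.bam", is_fp=False)]

vcf_reader = VCFReader(file_list)
encoder = PileupEncoder(window_size=100, max_reads=100)

for variant in vcf_reader:
    # Each record is a Variant; the encoder returns a
    # [len(layers), max_reads, 2 * window_size + 1] float tensor.
    encoding = encoder(variant)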
Binary file added tests/data/some_indels.bam
Binary file added tests/data/some_indels.bam.bai
Binary file added tests/data/some_indels.vcf.gz
Binary file added tests/data/some_indels.vcf.gz.tbi
tests/test_pileup_encoder.py (59 additions, 7 deletions)
@@ -27,17 +27,37 @@
 
 @pytest.fixture
 def snp_variant():
-    bam = os.path.join(get_data_folder(), "small_bam.bam")
-    variant = Variant(chrom="1", pos=240000, id="GL000235", ref='T', allele='A',
-                      quality=60, filter=None, info={'DP': 35, 'AF': 0.0185714}, format=['GT', 'GQ'],
-                      samples=[['1/1', '50']], zygosity=VariantZygosity.HOMOZYGOUS,
+    bam = os.path.join(get_data_folder(), "some_indels.bam")
+    variant = Variant(chrom="1", pos=10106775, id="rs12406448", ref='T', allele='C',
+                      quality=50, filter=[], info={}, format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
+                      samples=[['0/1', None, 638, [149, 142], [175, 174], 1079]], zygosity=VariantZygosity.HETEROZYGOUS,
                       type=VariantType.SNP, vcf='null.vcf', bam=bam)
     return variant
 
 
+@pytest.fixture
+def insertion_variant():
+    bam = os.path.join(get_data_folder(), "some_indels.bam")
+    variant = Variant(chrom="1", pos=10122622, id="rs57037935", ref='T', allele='TG',
+                      quality=50, filter=[], info={}, format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
+                      samples=[['1/1', None, 546, [0, 246], [25, 25], 330]], zygosity=VariantZygosity.HOMOZYGOUS,
+                      type=VariantType.INSERTION, vcf='null.vcf', bam=bam)
+    return variant
+
+
+@pytest.fixture
+def deletion_variant():
+    bam = os.path.join(get_data_folder(), "some_indels.bam")
+    variant = Variant(chrom="1", pos=10163457, id=None, ref='CTTTA', allele='C',
+                      quality=50, filter=[], info={}, format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
+                      samples=[['1/0', None, 177, [0, 0, 0], [0, 0, 0], 160]], zygosity=VariantZygosity.HETEROZYGOUS,
+                      type=VariantType.DELETION, vcf='null.vcf', bam=bam)
+    return variant
+
+
 def test_snp_encoder_basic(snp_variant):
     max_reads = 100
-    window_size = 5
+    window_size = 10
     width = 2 * window_size + 1
     height = max_reads
     layers = [PileupEncoder.Layer.READ]
@@ -96,7 +116,7 @@ def test_snp_encoder_base_quality(snp_variant):
     # and then converting it to a long tensor and summing up all elements to match
     # against total size.
     all_lt_1 = (encoding <= 1.0).long()
-    assert(torch.sum(all_lt_1) == (max_reads * width))
+    assert(torch.sum(all_lt_1) == (height * width))
 
 
 def test_snp_encoder_mapping_quality(snp_variant):
@@ -118,7 +138,39 @@ def test_snp_encoder_mapping_quality(snp_variant)
     # and then converting it to a long tensor and summing up all elements to match
     # against total size.
     all_lt_1 = (encoding <= 1.0).long()
-    assert(torch.sum(all_lt_1) == (max_reads * width))
+    assert(torch.sum(all_lt_1) == (height * width))
 
 
+def test_insertion_read_encoding(insertion_variant):
+    max_reads = 100
+    window_size = 30
+    width = 2 * window_size + 1
+    height = max_reads
+    layers = [PileupEncoder.Layer.READ, PileupEncoder.Layer.REFERENCE, PileupEncoder.Layer.ALLELE]
+
+    encoder = PileupEncoder(window_size=window_size,
+                            max_reads=max_reads, layers=layers)
+
+    variant = insertion_variant
+
+    encoding = encoder(variant)
+    assert(encoding.size() == torch.Size([len(layers), height, width]))
+
+
+def test_deletion_read_encoding(deletion_variant):
+    max_reads = 100
+    window_size = 10
+    width = 2 * window_size + 1
+    height = max_reads
+    layers = [PileupEncoder.Layer.READ, PileupEncoder.Layer.REFERENCE, PileupEncoder.Layer.ALLELE]
+
+    encoder = PileupEncoder(window_size=window_size,
+                            max_reads=max_reads, layers=layers)
+
+    variant = deletion_variant
+
+    encoding = encoder(variant)
+    assert(encoding.size() == torch.Size([len(layers), height, width]))
+
+
 def test_pileup_unknown_layer():
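Both new tests assert only the output shape, which follows directly from the constructor arguments: the encoding is [len(layers), max_reads, 2 * window_size + 1], i.e. 3 x 100 x 61 for the insertion case and 3 x 100 x 21 for the deletion case. The print_encoding flag added to PileupEncoder in this same commit (see variantworks/sample_encoder.py below) helps when inspecting these encodings by eye; a minimal sketch reusing the insertion fixture's coordinates, assuming it runs from the repository root so the BAM path resolves:

import torch
from variantworks.sample_encoder import PileupEncoder
from variantworks.types import Variant, VariantType, VariantZygosity

# Same insertion locus as the fixture above.
variant = Variant(chrom="1", pos=10122622, id="rs57037935", ref='T', allele='TG',
                  quality=50, filter=[], info={}, format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
                  samples=[['1/1', None, 546, [0, 246], [25, 25], 330]],
                  zygosity=VariantZygosity.HOMOZYGOUS, type=VariantType.INSERTION,
                  vcf='null.vcf', bam="tests/data/some_indels.bam")

layers = [PileupEncoder.Layer.READ, PileupEncoder.Layer.REFERENCE, PileupEncoder.Layer.ALLELE]
encoder = PileupEncoder(window_size=30, max_reads=100, layers=layers, print_encoding=True)

encoding = encoder(variant)  # prints one ASCII line per encoded row as a side effect
assert encoding.size() == torch.Size([len(layers), 100, 61])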
tests/test_vcfio.py (1 addition, 1 deletion)
@@ -28,7 +28,7 @@ def test_vcf_loader_snps(get_created_vcf_tabix_files):
     vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
     vcf_bam_tuple = VCFReader.VcfBamPath(vcf=vcf_file_path, bam=tabix_file_path, is_fp=False)
     vcf_loader = VCFReader([vcf_bam_tuple])
-    assert(len(vcf_loader) == 13)
+    assert(len(vcf_loader) == 15)
 
 
 def test_vcf_fetch_variant(get_created_vcf_tabix_files):
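The expected count rises from 13 to 15 because the reader no longer discards non-SNP records (see the variantworks/io/vcfio.py change below); the mock VCF used by this test evidently contains two indel entries that the old is_snp filter dropped.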
variantworks/io/vcfio.py (0 additions, 3 deletions)
@@ -179,9 +179,6 @@ def _parse_vcf(self, vcf_file, bam, labels, is_fp=False):
                     "Can not parse record %s in %s, all samples must be called in true positive VCF file" % (
                         record, vcf_file)
                 )
-            if not record.is_snp:
-                warnings.warn("%s is filtered - not an SNP record" % record)
-                continue
             if len(record.ALT) > 1:
                 warnings.warn(
                     "%s is filtered - multiallele records are not supported" % record)
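With the is_snp gate removed, insertion and deletion records now flow through to the encoders; multi-allelic records are still filtered by the check retained just above. For intuition, a hypothetical helper (not the library's actual implementation) mirroring the ref/alt length comparison behind the VariantType values used in the tests:

def classify_variant(ref: str, alt: str) -> str:
    """Hypothetical sketch of SNP/indel classification by allele lengths."""
    if len(ref) == 1 and len(alt) == 1:
        return "SNP"        # e.g. ref='T', alt='C'
    if len(alt) > len(ref):
        return "INSERTION"  # e.g. ref='T', alt='TG'
    return "DELETION"       # e.g. ref='CTTTA', alt='C'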
variantworks/sample_encoder.py (34 additions, 7 deletions)
@@ -21,7 +21,7 @@
 import torch
 
 from variantworks.base_encoder import base_enum_encoder
-from variantworks.types import Variant, VariantType, VariantZygosity
+from variantworks.types import Variant, VariantZygosity
 
 
 class SampleEncoder:
@@ -80,7 +80,7 @@ class Layer(Enum):
         REFERENCE = 3
         ALLELE = 4
 
-    def __init__(self, window_size=50, max_reads=50, layers=[Layer.READ], base_encoder=None):
+    def __init__(self, window_size=50, max_reads=50, layers=[Layer.READ], base_encoder=None, print_encoding=False):
         """Construct class instance.
 
         Args:
@@ -91,6 +91,7 @@ def __init__(self, window_size=50, max_reads=50, layers=[Layer.READ], base_encod
                 encoding follows the ordering of layers in the list. [Layer.READ]
             base_encoder : A dict defining conversion of nucleotide string chars to numeric representation.
                 [base_encoder.base_enum_encoder]
+            print_encoding : Print ASCII representation of each encoding that's converted to a tensor. [False]
 
         Returns:
             Instance of class.
@@ -108,6 +109,7 @@ def __init__(self, window_size=50, max_reads=50, layers=[Layer.READ], base_encod
                 (self.height, self.width), dtype=torch.float32)
             self.layer_tensors.append(tensor)
             self.layer_dict[layer] = tensor
+        self.print_encoding = print_encoding
 
     @property
     def width(self):
@@ -135,6 +137,9 @@ def _fill_layer(self, layer, pileupread, left_offset, right_offset, row, pileup_
             # Fetch the subsequence based on the offsets
             seq = pileupread.alignment.query_sequence[query_pos -
                                                       left_offset: query_pos + right_offset]
+            if self.print_encoding:
+                print("{}{}{}".format("-" * pileup_pos_range[0], seq, "-" *
+                                      (2 * self.window_size + 1 - len(seq) - pileup_pos_range[0])))
             for seq_pos, pileup_pos in enumerate(range(pileup_pos_range[0], pileup_pos_range[1])):
                 # Encode base characters to enum
                 tensor[row, pileup_pos] = self.base_encoder[seq[seq_pos]]
@@ -153,7 +158,7 @@ def _fill_layer(self, layer, pileupread, left_offset, right_offset, row, pileup_
             qual = qual / MAX_BASE_QUALITY
             tensor[row, pileup_pos] = qual
         elif layer == self.Layer.MAPPING_QUALITY:
-            MAX_MAPPING_QUALITY = 50.0
+            MAX_MAPPING_QUALITY = 100.0
             # Fetch mapping quality of alignment
             map_qual = pileupread.alignment.mapping_quality
             # Missing mapping quality is 255
@@ -165,11 +170,21 @@ def _fill_layer(self, layer, pileupread, left_offset, right_offset, row, pileup_
             # Encode base characters to enum
             tensor[row, pileup_pos] = map_qual
         elif layer == self.Layer.REFERENCE:
+            if self.print_encoding:
+                print("{}{}{}".format("-" * self.window_size, variant.ref, "-" *
+                                      (2 * self.window_size + 1 - len(variant.ref) - self.window_size)))
             # Only encode the reference at the variant position, rest all 0
-            tensor[row, self.window_size] = self.base_encoder[variant.ref]
+            for seq_pos, pileup_pos in enumerate(
+                    range(self.window_size, min(self.window_size + len(variant.ref), 2 * self.window_size - 1))):
+                tensor[row, pileup_pos] = self.base_encoder[variant.ref[seq_pos]]
         elif layer == self.Layer.ALLELE:
+            if self.print_encoding:
+                print("{}{}{}".format("-" * self.window_size, variant.allele, "-" *
+                                      (2 * self.window_size + 1 - len(variant.allele) - self.window_size)))
             # Only encode the allele at the variant position, rest all 0
-            tensor[row, self.window_size] = self.base_encoder[variant.allele]
+            for seq_pos, pileup_pos in enumerate(
+                    range(self.window_size, min(self.window_size + len(variant.allele), 2 * self.window_size - 1))):
+                tensor[row, pileup_pos] = self.base_encoder[variant.allele[seq_pos]]
 
     def __call__(self, variant):
         """Return a torch Tensor pileup queried from a BAM file.
@@ -182,8 +197,13 @@ def __call__(self, variant):
         variant_pos = variant.pos
         bam_file = variant.bam
 
-        assert(variant.type ==
-               VariantType.SNP), "Only SNP variants supported in PileupEncoder currently."
+        # Check that the ref and alt alleles all fit in the window context.
+        if len(variant.ref) > self.window_size:
+            raise RuntimeError("Ref allele {} too large for window {}. Please increase window size.".format(
+                variant.ref, self.window_size))
+        if len(variant.allele) > self.window_size:
+            raise RuntimeError("Alt allele {} too large for window {}. Please increase window size.".format(
+                variant.allele, self.window_size))
 
         # Create BAM object if one hasn't been opened before.
         if bam_file not in self.bams:
@@ -199,6 +219,10 @@ def __call__(self, variant):
                                truncate=True,
                                max_depth=self.max_reads)
 
+        if self.print_encoding:
+            print("\nEncoding for {}".format(variant))
+            print("Order of rows : {}".format(self.layers))
+
        for col, pileup_col in enumerate(pileups):
            for row, pileupread in enumerate(pileup_col.pileups):
                # Skip rows beyond the max depth
@@ -208,6 +232,9 @@ def __call__(self, variant):
                 if pileupread.is_del or pileupread.is_refskip:
                     continue
 
+                if pileupread.is_head or pileupread.is_tail:
+                    continue
+
                 # Using the variant locus as the center, find the left and right offset
                 # from that locus to use as bounds for fetching bases from reads.
                 #
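To make the reworked REFERENCE/ALLELE fill concrete, here is a standalone sketch of the new loop applied to the deletion fixture's ref allele; the base-to-integer mapping is an illustrative stand-in for base_enum_encoder:

import torch

window_size = 10
width = 2 * window_size + 1                      # 21 columns, variant locus at index 10
base_encoder = {'A': 1, 'C': 2, 'G': 3, 'T': 4}  # illustrative stand-in

ref = 'CTTTA'  # deletion fixture: ref='CTTTA', allele='C'
row = torch.zeros(width)

# Mirrors the new _fill_layer loop: one base per column starting at the window
# center, clamped by the exclusive bound 2 * window_size - 1.
for seq_pos, pileup_pos in enumerate(
        range(window_size, min(window_size + len(ref), 2 * window_size - 1))):
    row[pileup_pos] = base_encoder[ref[seq_pos]]

print(row.tolist())  # columns 10..14 hold the codes for C,T,T,T,A; the rest stay 0

Previously these layers wrote only the single base at the center column, so an insertion such as T to TG was indistinguishable from a SNP; encoding one cell per base is what makes the pileup informative for indels.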
