Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[pileup_encoder] indel encoding for pileups #82

Merged
merged 3 commits into from
Jul 20, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions samples/simple_snp_trainer/pileup_hdf5_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ def generate_hdf5(args):

# Generate the variant entries using VCF reader.
vcf_reader = VCFReader(file_list)
for variant in vcf_reader:
print(variant)
exit(0)

# Setup encoder for samples and labels.
sample_encoder = PileupEncoder(window_size=100, max_reads=100,
Expand Down
Binary file added tests/data/some_indels.bam
Binary file not shown.
Binary file added tests/data/some_indels.bam.bai
Binary file not shown.
Binary file added tests/data/some_indels.vcf.gz
Binary file not shown.
Binary file added tests/data/some_indels.vcf.gz.tbi
Binary file not shown.
66 changes: 59 additions & 7 deletions tests/test_pileup_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,37 @@

@pytest.fixture
def snp_variant():
    """Heterozygous SNP variant (T -> C) backed by the some_indels test BAM.

    The scraped diff interleaved the removed (small_bam) and added
    (some_indels) versions of this fixture, leaving an unterminated
    ``Variant(`` call; this is the reconstructed post-change version.
    """
    bam = os.path.join(get_data_folder(), "some_indels.bam")
    variant = Variant(chrom="1", pos=10106775, id="rs12406448", ref='T', allele='C',
                      quality=50, filter=[], info={}, format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
                      samples=[['0/1', None, 638, [149, 142], [175, 174], 1079]], zygosity=VariantZygosity.HETEROZYGOUS,
                      type=VariantType.SNP, vcf='null.vcf', bam=bam)
    return variant


@pytest.fixture
def insertion_variant():
    """Homozygous single-base insertion variant (T -> TG) from the some_indels BAM."""
    bam_path = os.path.join(get_data_folder(), "some_indels.bam")
    return Variant(chrom="1",
                   pos=10122622,
                   id="rs57037935",
                   ref='T',
                   allele='TG',
                   quality=50,
                   filter=[],
                   info={},
                   format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
                   samples=[['1/1', None, 546, [0, 246], [25, 25], 330]],
                   zygosity=VariantZygosity.HOMOZYGOUS,
                   type=VariantType.INSERTION,
                   vcf='null.vcf',
                   bam=bam_path)


@pytest.fixture
def deletion_variant():
    """Heterozygous multi-base deletion variant (CTTTA -> C) from the some_indels BAM."""
    bam_path = os.path.join(get_data_folder(), "some_indels.bam")
    return Variant(chrom="1",
                   pos=10163457,
                   id=None,
                   ref='CTTTA',
                   allele='C',
                   quality=50,
                   filter=[],
                   info={},
                   format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
                   samples=[['1/0', None, 177, [0, 0, 0], [0, 0, 0], 160]],
                   zygosity=VariantZygosity.HETEROZYGOUS,
                   type=VariantType.DELETION,
                   vcf='null.vcf',
                   bam=bam_path)


def test_snp_encoder_basic(snp_variant):
max_reads = 100
window_size = 5
window_size = 10
width = 2 * window_size + 1
height = max_reads
layers = [PileupEncoder.Layer.READ]
Expand Down Expand Up @@ -96,7 +116,7 @@ def test_snp_encoder_base_quality(snp_variant):
# and then converting it to a long tensor and summing up all elements to match
# against total size.
all_lt_1 = (encoding <= 1.0).long()
assert(torch.sum(all_lt_1) == (max_reads * width))
assert(torch.sum(all_lt_1) == (height * width))


def test_snp_encoder_mapping_quality(snp_variant):
Expand All @@ -118,7 +138,39 @@ def test_snp_encoder_mapping_quality(snp_variant):
# and then converting it to a long tensor and summing up all elements to match
# against total size.
all_lt_1 = (encoding <= 1.0).long()
assert(torch.sum(all_lt_1) == (max_reads * width))
assert(torch.sum(all_lt_1) == (height * width))


def test_insertion_read_encoding(insertion_variant):
    """Encoding an insertion variant yields a tensor shaped [num_layers, max_reads, 2*window+1]."""
    read_depth = 100
    half_window = 30
    encoder_layers = [PileupEncoder.Layer.READ, PileupEncoder.Layer.REFERENCE, PileupEncoder.Layer.ALLELE]

    encoder = PileupEncoder(window_size=half_window,
                            max_reads=read_depth, layers=encoder_layers)

    tensor = encoder(insertion_variant)

    expected_shape = torch.Size([len(encoder_layers), read_depth, 2 * half_window + 1])
    assert(tensor.size() == expected_shape)


def test_deletion_read_encoding(deletion_variant):
    """Encoding a deletion variant yields a tensor shaped [num_layers, max_reads, 2*window+1]."""
    read_depth = 100
    half_window = 10
    encoder_layers = [PileupEncoder.Layer.READ, PileupEncoder.Layer.REFERENCE, PileupEncoder.Layer.ALLELE]

    encoder = PileupEncoder(window_size=half_window,
                            max_reads=read_depth, layers=encoder_layers)

    tensor = encoder(deletion_variant)

    expected_shape = torch.Size([len(encoder_layers), read_depth, 2 * half_window + 1])
    assert(tensor.size() == expected_shape)


def test_pileup_unknown_layer():
Expand Down
2 changes: 1 addition & 1 deletion tests/test_vcfio.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def test_vcf_loader_snps(get_created_vcf_tabix_files):
vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
vcf_bam_tuple = VCFReader.VcfBamPath(vcf=vcf_file_path, bam=tabix_file_path, is_fp=False)
vcf_loader = VCFReader([vcf_bam_tuple])
assert(len(vcf_loader) == 13)
assert(len(vcf_loader) == 15)


def test_vcf_fetch_variant(get_created_vcf_tabix_files):
Expand Down
3 changes: 0 additions & 3 deletions variantworks/io/vcfio.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,9 +179,6 @@ def _parse_vcf(self, vcf_file, bam, labels, is_fp=False):
"Can not parse record %s in %s, all samples must be called in true positive VCF file" % (
record, vcf_file)
)
if not record.is_snp:
warnings.warn("%s is filtered - not an SNP record" % record)
continue
if len(record.ALT) > 1:
warnings.warn(
"%s is filtered - multiallele recrods are not supported" % record)
Expand Down
41 changes: 34 additions & 7 deletions variantworks/sample_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
import torch

from variantworks.base_encoder import base_enum_encoder
from variantworks.types import Variant, VariantType, VariantZygosity
from variantworks.types import Variant, VariantZygosity


class SampleEncoder:
Expand Down Expand Up @@ -80,7 +80,7 @@ class Layer(Enum):
REFERENCE = 3
ALLELE = 4

def __init__(self, window_size=50, max_reads=50, layers=[Layer.READ], base_encoder=None):
def __init__(self, window_size=50, max_reads=50, layers=[Layer.READ], base_encoder=None, print_encoding=False):
"""Construct class instance.

Args:
Expand All @@ -91,6 +91,7 @@ def __init__(self, window_size=50, max_reads=50, layers=[Layer.READ], base_encod
encoding follows the ordering of layers in the list. [Layer.READ]
base_encoder : A dict defining conversion of nucleotide string chars to numeric representation.
[base_encoder.base_enum_encoder]
print_encoding : Print ASCII representation of each encoding that's converted to a tensor. [False]

Returns:
Instance of class.
Expand All @@ -108,6 +109,7 @@ def __init__(self, window_size=50, max_reads=50, layers=[Layer.READ], base_encod
(self.height, self.width), dtype=torch.float32)
self.layer_tensors.append(tensor)
self.layer_dict[layer] = tensor
self.print_encoding = print_encoding

@property
def width(self):
Expand Down Expand Up @@ -135,6 +137,9 @@ def _fill_layer(self, layer, pileupread, left_offset, right_offset, row, pileup_
# Fetch the subsequence based on the offsets
seq = pileupread.alignment.query_sequence[query_pos -
left_offset: query_pos + right_offset]
if self.print_encoding:
print("{}{}{}".format("-" * pileup_pos_range[0], seq, "-" *
(2 * self.window_size + 1 - len(seq) - pileup_pos_range[0])))
for seq_pos, pileup_pos in enumerate(range(pileup_pos_range[0], pileup_pos_range[1])):
# Encode base characters to enum
tensor[row, pileup_pos] = self.base_encoder[seq[seq_pos]]
Expand All @@ -153,7 +158,7 @@ def _fill_layer(self, layer, pileupread, left_offset, right_offset, row, pileup_
qual = qual / MAX_BASE_QUALITY
tensor[row, pileup_pos] = qual
elif layer == self.Layer.MAPPING_QUALITY:
MAX_MAPPING_QUALITY = 50.0
MAX_MAPPING_QUALITY = 100.0
tijyojwad marked this conversation as resolved.
Show resolved Hide resolved
# Fetch mapping quality of alignment
map_qual = pileupread.alignment.mapping_quality
# Missing mapping quality is 255
Expand All @@ -165,11 +170,21 @@ def _fill_layer(self, layer, pileupread, left_offset, right_offset, row, pileup_
# Encode base characters to enum
tensor[row, pileup_pos] = map_qual
elif layer == self.Layer.REFERENCE:
if self.print_encoding:
print("{}{}{}".format("-" * self.window_size, variant.ref, "-" *
(2 * self.window_size + 1 - len(variant.ref) - self.window_size)))
# Only encode the reference at the variant position, rest all 0
tensor[row, self.window_size] = self.base_encoder[variant.ref]
for seq_pos, pileup_pos in enumerate(
range(self.window_size, min(self.window_size + len(variant.ref), 2 * self.window_size - 1))):
tensor[row, pileup_pos] = self.base_encoder[variant.ref[seq_pos]]
elif layer == self.Layer.ALLELE:
if self.print_encoding:
print("{}{}{}".format("-" * self.window_size, variant.allele, "-" *
(2 * self.window_size + 1 - len(variant.allele) - self.window_size)))
# Only encode the allele at the variant position, rest all 0
tensor[row, self.window_size] = self.base_encoder[variant.allele]
for seq_pos, pileup_pos in enumerate(
range(self.window_size, min(self.window_size + len(variant.allele), 2 * self.window_size - 1))):
tensor[row, pileup_pos] = self.base_encoder[variant.allele[seq_pos]]

def __call__(self, variant):
"""Return a torch Tensor pileup queried from a BAM file.
Expand All @@ -182,8 +197,13 @@ def __call__(self, variant):
variant_pos = variant.pos
bam_file = variant.bam

assert(variant.type ==
VariantType.SNP), "Only SNP variants supported in PileupEncoder currently."
# Check that the ref and alt alleles all fit in the window context.
if len(variant.ref) > self.window_size:
raise RuntimeError("Ref allele {} too large for window {}. Please increase window size.".format(
variant.ref, self.window_size))
if len(variant.allele) > self.window_size:
raise RuntimeError("Alt allele {} too large for window {}. Please increase window size.".format(
variant.allele, self.window_size))

# Create BAM object if one hasn't been opened before.
if bam_file not in self.bams:
Expand All @@ -199,6 +219,10 @@ def __call__(self, variant):
truncate=True,
max_depth=self.max_reads)

if self.print_encoding:
print("\nEncoding for {}".format(variant))
print("Order of rows : {}".format(self.layers))

for col, pileup_col in enumerate(pileups):
for row, pileupread in enumerate(pileup_col.pileups):
# Skip rows beyond the max depth
Expand All @@ -208,6 +232,9 @@ def __call__(self, variant):
if pileupread.is_del or pileupread.is_refskip:
continue

if pileupread.is_head or pileupread.is_tail:
continue

# Using the variant locus as the center, find the left and right offset
# from that locus to use as bounds for fetching bases from reads.
#
Expand Down