Skip to content

Commit 6aa68cb

Browse files
committed
[pileup_encoder] indel encoding initial commit
1 parent 376d7ca commit 6aa68cb

File tree

9 files changed

+98
-19
lines changed

9 files changed

+98
-19
lines changed

samples/simple_snp_trainer/pileup_hdf5_generator.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ def generate_hdf5(args):
5252

5353
# Generate the variant entries using VCF reader.
5454
vcf_reader = VCFReader(file_list)
55+
for variant in vcf_reader:
56+
print(variant)
57+
exit(0)
5558

5659
# Setup encoder for samples and labels.
5760
sample_encoder = PileupEncoder(window_size=100, max_reads=100,

tests/data/some_indels.bam

2.13 MB
Binary file not shown.

tests/data/some_indels.bam.bai

5.88 KB
Binary file not shown.

tests/data/some_indels.vcf.gz

2.93 KB
Binary file not shown.

tests/data/some_indels.vcf.gz.tbi

213 Bytes
Binary file not shown.

tests/test_pileup_encoder.py

Lines changed: 59 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,37 @@
2727

2828
@pytest.fixture
2929
def snp_variant():
30-
bam = os.path.join(get_data_folder(), "small_bam.bam")
31-
variant = Variant(chrom="1", pos=240000, id="GL000235", ref='T', allele='A',
32-
quality=60, filter=None, info={'DP': 35, 'AF': 0.0185714}, format=['GT', 'GQ'],
33-
samples=[['1/1', '50']], zygosity=VariantZygosity.HOMOZYGOUS,
30+
bam = os.path.join(get_data_folder(), "some_indels.bam")
31+
variant = Variant(chrom="1", pos=10106775, id="rs12406448", ref='T', allele='C',
32+
quality=50, filter=[], info={}, format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
33+
samples=[['0/1', None, 638, [149, 142], [175, 174], 1079]], zygosity=VariantZygosity.HETEROZYGOUS,
3434
type=VariantType.SNP, vcf='null.vcf', bam=bam)
3535
return variant
3636

3737

38+
@pytest.fixture
39+
def insertion_variant():
40+
bam = os.path.join(get_data_folder(), "some_indels.bam")
41+
variant = Variant(chrom="1", pos=10122622, id="rs57037935", ref='T', allele='TG',
42+
quality=50, filter=[], info={}, format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
43+
samples=[['1/1', None, 546, [0, 246], [25, 25], 330]], zygosity=VariantZygosity.HOMOZYGOUS,
44+
type=VariantType.INSERTION, vcf='null.vcf', bam=bam)
45+
return variant
46+
47+
48+
@pytest.fixture
49+
def deletion_variant():
50+
bam = os.path.join(get_data_folder(), "some_indels.bam")
51+
variant = Variant(chrom="1", pos=10163457, id=None, ref='CTTTA', allele='C',
52+
quality=50, filter=[], info={}, format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
53+
samples=[['1/0', None, 177, [0, 0, 0], [0, 0, 0], 160]], zygosity=VariantZygosity.HETEROZYGOUS,
54+
type=VariantType.DELETION, vcf='null.vcf', bam=bam)
55+
return variant
56+
57+
3858
def test_snp_encoder_basic(snp_variant):
3959
max_reads = 100
40-
window_size = 5
60+
window_size = 10
4161
width = 2 * window_size + 1
4262
height = max_reads
4363
layers = [PileupEncoder.Layer.READ]
@@ -96,7 +116,7 @@ def test_snp_encoder_base_quality(snp_variant):
96116
# and then converting it to a long tensor and summing up all elements to match
97117
# against total size.
98118
all_lt_1 = (encoding <= 1.0).long()
99-
assert(torch.sum(all_lt_1) == (max_reads * width))
119+
assert(torch.sum(all_lt_1) == (height * width))
100120

101121

102122
def test_snp_encoder_mapping_quality(snp_variant):
@@ -118,7 +138,39 @@ def test_snp_encoder_mapping_quality(snp_variant):
118138
# and then converting it to a long tensor and summing up all elements to match
119139
# against total size.
120140
all_lt_1 = (encoding <= 1.0).long()
121-
assert(torch.sum(all_lt_1) == (max_reads * width))
141+
assert(torch.sum(all_lt_1) == (height * width))
142+
143+
144+
def test_insertion_read_encoding(insertion_variant):
145+
max_reads = 100
146+
window_size = 30
147+
width = 2 * window_size + 1
148+
height = max_reads
149+
layers = [PileupEncoder.Layer.READ, PileupEncoder.Layer.REFERENCE, PileupEncoder.Layer.ALLELE]
150+
151+
encoder = PileupEncoder(window_size=window_size,
152+
max_reads=max_reads, layers=layers)
153+
154+
variant = insertion_variant
155+
156+
encoding = encoder(variant)
157+
assert(encoding.size() == torch.Size([len(layers), height, width]))
158+
159+
160+
def test_deletion_read_encoding(deletion_variant):
161+
max_reads = 100
162+
window_size = 10
163+
width = 2 * window_size + 1
164+
height = max_reads
165+
layers = [PileupEncoder.Layer.READ, PileupEncoder.Layer.REFERENCE, PileupEncoder.Layer.ALLELE]
166+
167+
encoder = PileupEncoder(window_size=window_size,
168+
max_reads=max_reads, layers=layers)
169+
170+
variant = deletion_variant
171+
172+
encoding = encoder(variant)
173+
assert(encoding.size() == torch.Size([len(layers), height, width]))
122174

123175

124176
def test_pileup_unknown_layer():

tests/test_vcfio.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def test_vcf_loader_snps(monkeypatch):
6161
vcf_bam_tuple = VCFReader.VcfBamPath(
6262
vcf="/dummy/path.gz", bam="temp.bam", is_fp=False)
6363
vcf_loader = MockPyVCFReader.get_vcf(monkeypatch, [vcf_bam_tuple])
64-
assert(len(vcf_loader) == 13)
64+
assert(len(vcf_loader) == 15)
6565

6666

6767
def test_vcf_fetch_variant(monkeypatch):

variantworks/io/vcfio.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -163,9 +163,6 @@ def _parse_vcf(self, vcf_file, bam, labels, is_fp=False):
163163
"Can not parse record %s in %s, all samples must be called in true positive VCF file" % (
164164
record, vcf_file)
165165
)
166-
if not record.is_snp:
167-
warnings.warn("%s is filtered - not an SNP record" % record)
168-
continue
169166
if len(record.ALT) > 1:
170167
warnings.warn(
171168
"%s is filtered - multiallelic records are not supported" % record)

variantworks/sample_encoder.py

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
import torch
2222

2323
from variantworks.base_encoder import base_enum_encoder
24-
from variantworks.types import Variant, VariantType, VariantZygosity
24+
from variantworks.types import Variant, VariantZygosity
2525

2626

2727
class SampleEncoder:
@@ -80,7 +80,7 @@ class Layer(Enum):
8080
REFERENCE = 3
8181
ALLELE = 4
8282

83-
def __init__(self, window_size=50, max_reads=50, layers=[Layer.READ], base_encoder=None):
83+
def __init__(self, window_size=50, max_reads=50, layers=[Layer.READ], base_encoder=None, print_encoding=False):
8484
"""Construct class instance.
8585
8686
Args:
@@ -91,6 +91,7 @@ def __init__(self, window_size=50, max_reads=50, layers=[Layer.READ], base_encod
9191
encoding follows the ordering of layers in the list. [Layer.READ]
9292
base_encoder : A dict defining conversion of nucleotide string chars to numeric representation.
9393
[base_encoder.base_enum_encoder]
94+
print_encoding : Print ASCII representation of each encoding that's converted to a tensor. [False]
9495
9596
Returns:
9697
Instance of class.
@@ -108,6 +109,7 @@ def __init__(self, window_size=50, max_reads=50, layers=[Layer.READ], base_encod
108109
(self.height, self.width), dtype=torch.float32)
109110
self.layer_tensors.append(tensor)
110111
self.layer_dict[layer] = tensor
112+
self.print_encoding = print_encoding
111113

112114
@property
113115
def width(self):
@@ -135,6 +137,9 @@ def _fill_layer(self, layer, pileupread, left_offset, right_offset, row, pileup_
135137
# Fetch the subsequence based on the offsets
136138
seq = pileupread.alignment.query_sequence[query_pos -
137139
left_offset: query_pos + right_offset]
140+
if self.print_encoding:
141+
print("{}{}{}".format("-" * pileup_pos_range[0], seq, "-" *
142+
(2 * self.window_size + 1 - len(seq) - pileup_pos_range[0])))
138143
for seq_pos, pileup_pos in enumerate(range(pileup_pos_range[0], pileup_pos_range[1])):
139144
# Encode base characters to enum
140145
tensor[row, pileup_pos] = self.base_encoder[seq[seq_pos]]
@@ -153,7 +158,7 @@ def _fill_layer(self, layer, pileupread, left_offset, right_offset, row, pileup_
153158
qual = qual / MAX_BASE_QUALITY
154159
tensor[row, pileup_pos] = qual
155160
elif layer == self.Layer.MAPPING_QUALITY:
156-
MAX_MAPPING_QUALITY = 50.0
161+
MAX_MAPPING_QUALITY = 100.0
157162
# Fetch mapping quality of alignment
158163
map_qual = pileupread.alignment.mapping_quality
159164
# Missing mapping quality is 255
@@ -165,11 +170,21 @@ def _fill_layer(self, layer, pileupread, left_offset, right_offset, row, pileup_
165170
# Encode base characters to enum
166171
tensor[row, pileup_pos] = map_qual
167172
elif layer == self.Layer.REFERENCE:
173+
if self.print_encoding:
174+
print("{}{}{}".format("-" * self.window_size, variant.ref, "-" *
175+
(2 * self.window_size + 1 - len(variant.ref) - self.window_size)))
168176
# Only encode the reference at the variant position, rest all 0
169-
tensor[row, self.window_size] = self.base_encoder[variant.ref]
177+
for seq_pos, pileup_pos in enumerate(
178+
range(self.window_size, min(self.window_size + len(variant.ref), 2 * self.window_size - 1))):
179+
tensor[row, pileup_pos] = self.base_encoder[variant.ref[seq_pos]]
170180
elif layer == self.Layer.ALLELE:
181+
if self.print_encoding:
182+
print("{}{}{}".format("-" * self.window_size, variant.allele, "-" *
183+
(2 * self.window_size + 1 - len(variant.allele) - self.window_size)))
171184
# Only encode the allele at the variant position, rest all 0
172-
tensor[row, self.window_size] = self.base_encoder[variant.allele]
185+
for seq_pos, pileup_pos in enumerate(
186+
range(self.window_size, min(self.window_size + len(variant.allele), 2 * self.window_size - 1))):
187+
tensor[row, pileup_pos] = self.base_encoder[variant.allele[seq_pos]]
173188

174189
def __call__(self, variant):
175190
"""Return a torch Tensor pileup queried from a BAM file.
@@ -182,8 +197,13 @@ def __call__(self, variant):
182197
variant_pos = variant.pos
183198
bam_file = variant.bam
184199

185-
assert(variant.type ==
186-
VariantType.SNP), "Only SNP variants supported in PileupEncoder currently."
200+
# Check that the ref and alt alleles all fit in the window context.
201+
if len(variant.ref) > self.window_size:
202+
raise RuntimeError("Ref allele {} too large for window {}. Please increase window size.".format(
203+
variant.ref, self.window_size))
204+
if len(variant.allele) > self.window_size:
205+
raise RuntimeError("Alt allele {} too large for window {}. Please increase window size.".format(
206+
variant.allele, self.window_size))
187207

188208
# Create BAM object if one hasn't been opened before.
189209
if bam_file not in self.bams:
@@ -193,10 +213,14 @@ def __call__(self, variant):
193213

194214
# Get pileups from BAM
195215
pileups = bam.pileup(chrom,
196-
variant_pos, variant_pos + 1,
216+
variant_pos - 1, variant_pos,
197217
truncate=True,
198218
max_depth=self.max_reads)
199219

220+
if self.print_encoding:
221+
print("\nEncoding for {}".format(variant))
222+
print("Order of rows : {}".format(self.layers))
223+
200224
for col, pileup_col in enumerate(pileups):
201225
for row, pileupread in enumerate(pileup_col.pileups):
202226
# Skip rows beyond the max depth
@@ -206,6 +230,9 @@ def __call__(self, variant):
206230
if pileupread.is_del or pileupread.is_refskip:
207231
continue
208232

233+
if pileupread.is_head or pileupread.is_tail:
234+
continue
235+
209236
# Using the variant locus as the center, find the left and right offset
210237
# from that locus to use as bounds for fetching bases from reads.
211238
#

0 commit comments

Comments
 (0)