@@ -21,7 +21,7 @@
 import torch
 
 from variantworks.base_encoder import base_enum_encoder
-from variantworks.types import Variant, VariantType, VariantZygosity
+from variantworks.types import Variant, VariantZygosity
 
 
 class SampleEncoder:
@@ -80,7 +80,7 @@ class Layer(Enum):
         REFERENCE = 3
         ALLELE = 4
 
-    def __init__(self, window_size=50, max_reads=50, layers=[Layer.READ], base_encoder=None):
+    def __init__(self, window_size=50, max_reads=50, layers=[Layer.READ], base_encoder=None, print_encoding=False):
         """Construct class instance.
 
         Args:
@@ -91,6 +91,7 @@ def __init__(self, window_size=50, max_reads=50, layers=[Layer.READ], base_encod
                      encoding follows the ordering of layers in the list. [Layer.READ]
             base_encoder : A dict defining conversion of nucleotide string chars to numeric representation.
                            [base_encoder.base_enum_encoder]
+            print_encoding : Print ASCII representation of each encoding that's converted to a tensor. [False]
 
         Returns:
             Instance of class.
@@ -108,6 +109,7 @@ def __init__(self, window_size=50, max_reads=50, layers=[Layer.READ], base_encod
                 (self.height, self.width), dtype=torch.float32)
             self.layer_tensors.append(tensor)
             self.layer_dict[layer] = tensor
+        self.print_encoding = print_encoding
 
     @property
     def width(self):
@@ -135,6 +137,9 @@ def _fill_layer(self, layer, pileupread, left_offset, right_offset, row, pileup_
             # Fetch the subsequence based on the offsets
             seq = pileupread.alignment.query_sequence[query_pos -
                                                       left_offset: query_pos + right_offset]
+            if self.print_encoding:
+                print("{}{}{}".format("-" * pileup_pos_range[0], seq, "-" *
+                                      (2 * self.window_size + 1 - len(seq) - pileup_pos_range[0])))
             for seq_pos, pileup_pos in enumerate(range(pileup_pos_range[0], pileup_pos_range[1])):
                 # Encode base characters to enum
                 tensor[row, pileup_pos] = self.base_encoder[seq[seq_pos]]
@@ -153,7 +158,7 @@ def _fill_layer(self, layer, pileupread, left_offset, right_offset, row, pileup_
                 qual = qual / MAX_BASE_QUALITY
                 tensor[row, pileup_pos] = qual
         elif layer == self.Layer.MAPPING_QUALITY:
-            MAX_MAPPING_QUALITY = 50.0
+            MAX_MAPPING_QUALITY = 100.0
             # Fetch mapping quality of alignment
             map_qual = pileupread.alignment.mapping_quality
             # Missing mapping quality is 255
@@ -165,11 +170,21 @@ def _fill_layer(self, layer, pileupread, left_offset, right_offset, row, pileup_
                 # Encode base characters to enum
                 tensor[row, pileup_pos] = map_qual
         elif layer == self.Layer.REFERENCE:
+            if self.print_encoding:
+                print("{}{}{}".format("-" * self.window_size, variant.ref, "-" *
+                                      (2 * self.window_size + 1 - len(variant.ref) - self.window_size)))
             # Only encode the reference at the variant position, rest all 0
-            tensor[row, self.window_size] = self.base_encoder[variant.ref]
+            for seq_pos, pileup_pos in enumerate(
+                    range(self.window_size, min(self.window_size + len(variant.ref), 2 * self.window_size - 1))):
+                tensor[row, pileup_pos] = self.base_encoder[variant.ref[seq_pos]]
         elif layer == self.Layer.ALLELE:
+            if self.print_encoding:
+                print("{}{}{}".format("-" * self.window_size, variant.allele, "-" *
+                                      (2 * self.window_size + 1 - len(variant.allele) - self.window_size)))
             # Only encode the allele at the variant position, rest all 0
-            tensor[row, self.window_size] = self.base_encoder[variant.allele]
+            for seq_pos, pileup_pos in enumerate(
+                    range(self.window_size, min(self.window_size + len(variant.allele), 2 * self.window_size - 1))):
+                tensor[row, pileup_pos] = self.base_encoder[variant.allele[seq_pos]]
 
     def __call__(self, variant):
         """Return a torch Tensor pileup queried from a BAM file.
@@ -182,8 +197,13 @@ def __call__(self, variant):
         variant_pos = variant.pos
         bam_file = variant.bam
 
-        assert(variant.type ==
-               VariantType.SNP), "Only SNP variants supported in PileupEncoder currently."
+        # Check that the ref and alt alleles all fit in the window context.
+        if len(variant.ref) > self.window_size:
+            raise RuntimeError("Ref allele {} too large for window {}. Please increase window size.".format(
+                variant.ref, self.window_size))
+        if len(variant.allele) > self.window_size:
+            raise RuntimeError("Alt allele {} too large for window {}. Please increase window size.".format(
+                variant.allele, self.window_size))
 
         # Create BAM object if one hasn't been opened before.
         if bam_file not in self.bams:
@@ -193,10 +213,14 @@ def __call__(self, variant):
 
         # Get pileups from BAM
         pileups = bam.pileup(chrom,
-                             variant_pos, variant_pos + 1,
+                             variant_pos - 1, variant_pos,
                              truncate=True,
                              max_depth=self.max_reads)
 
+        if self.print_encoding:
+            print("\nEncoding for {}".format(variant))
+            print("Order of rows : {}".format(self.layers))
+
         for col, pileup_col in enumerate(pileups):
             for row, pileupread in enumerate(pileup_col.pileups):
                 # Skip rows beyond the max depth
@@ -206,6 +230,9 @@ def __call__(self, variant):
                 if pileupread.is_del or pileupread.is_refskip:
                     continue
 
+                if pileupread.is_head or pileupread.is_tail:
+                    continue
+
                 # Using the variant locus as the center, find the left and right offset
                 # from that locus to use as bounds for fetching bases from reads.
                 #
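
For context, a minimal usage sketch of the encoder after this change follows. It is not part of the diff: the import path and the encode_variant helper are illustrative assumptions; only the PileupEncoder constructor parameters (window_size, max_reads, layers, base_encoder, print_encoding) and the per-variant __call__ behavior come from the code above.

# Minimal sketch, not part of the diff. The module path below and the
# encode_variant helper are assumptions made for illustration; the
# constructor arguments and the per-variant call mirror the code shown above.
from variantworks.sample_encoder import PileupEncoder  # assumed import path


def encode_variant(variant):
    """Encode a variantworks.types.Variant into a multi-layer pileup tensor.

    The variant must carry chrom, pos, ref, allele and the source BAM path.
    With this change, ref and alt alleles longer than one base are encoded
    starting at the center column, and each allele must fit inside
    window_size, otherwise __call__ raises a RuntimeError.
    """
    encoder = PileupEncoder(
        window_size=100,                  # bases of context on either side of the locus
        max_reads=100,                    # number of pileup rows per layer
        layers=[PileupEncoder.Layer.READ,
                PileupEncoder.Layer.BASE_QUALITY,
                PileupEncoder.Layer.MAPPING_QUALITY,
                PileupEncoder.Layer.REFERENCE,
                PileupEncoder.Layer.ALLELE],
        print_encoding=True)              # also print an ASCII view of each encoded row
    # One (max_reads x (2 * window_size + 1)) plane is produced per requested layer.
    return encoder(variant)

A window_size larger than the default 50 is used in the sketch so that longer indel alleles still pass the new length checks.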