1
1
# TODO: consistent reference coordinates across outputs
2
2
3
+ import sys
4
+ import subprocess
3
5
import random
4
6
import argparse
7
+ import os
5
8
6
9
from poplars .common import convert_fasta
7
10
from poplars .mafft import align
8
- import numpy as np
9
11
10
- # subset of HIV-1 group M subtype references curated by LANL
11
- with open ('../poplars/ref_genomes/HIV1_Mgroup.fasta' ) as handle :
12
- reference = convert_fasta (handle )
13
12
14
-
15
- def pdistance (seq1 , seq2 ):
13
+ def hamming (fasta ):
16
14
"""
17
- Calculate p-distance between two aligned sequences
18
- :param seq1: First sequence
19
- :param seq2: Second sequence
20
- :return: <ndiff> is the number of differences, <denom> is the number of valid positions
15
+ Convert list of lists into boolean outcomes (difference between query and reference)
16
+ :param fasta: object returned by align()
17
+ :return: dictionary of boolean lists keyed by reference label
21
18
"""
22
- denom = 0. # number of valid columns
23
- ndiff = 0
24
-
25
- seqs = np .char .asarray ([seq1 , seq2 ]) # Convert sequences to numpy arrays
26
-
27
- # Stack 2-D numpy arrays and find columns not containing '-'
28
- # Gives an array containing True and False
29
- denoms = np .where (np .all (np .isin (seqs , '-' , invert = True ), 0 ))[0 ]
30
- denom = denoms .shape [0 ]
31
-
32
- # From the valid positions, find where the sequences contain different nucleotides
33
- ndiff = np .sum (seqs [0 , :][denoms ] != seqs [1 , :][denoms ])
19
+ aln = dict (fasta )
20
+ assert "query" in aln , "Argument <fasta> must contain 'query' entry"
21
+ query = aln .pop ('query' )
22
+ _ = aln .pop ('CON_OF_CONS' )
23
+
24
+ # iterate over remaining sequences as references
25
+ results = {}
26
+ for h , s in aln .items ():
27
+ result = []
28
+ for i , nt1 in enumerate (query ):
29
+ nt2 = s [i ]
30
+ if nt1 == '-' or nt2 == '-' :
31
+ result .append (None )
32
+ continue
33
+ result .append (int (nt1 != nt2 ))
34
+ results .update ({h : result })
34
35
35
- return ndiff , denom
36
+ return results
36
37
37
38
38
- def bootstrap ( s1 , s2 , reps = 100 ):
39
+ def update_alignment ( seq , reference ):
39
40
"""
40
- Sample positions at random with replacement.
41
- :param s1: first sequence
42
- :param s2: second sequence (must be aligned to s1)
43
- :param reps: number of replicates to generate
44
-
45
- :yield: tuples of sequences generated by bootstrap resampling
41
+ Append query sequence <seq> to reference alignment and remove insertions relative to
42
+ global consensus sequence.
43
+ :param seq: the query sequence
44
+ :param reference: the reference sequence
45
+ :return: a list of [header, sequence] lists
46
46
"""
47
- seqlen = len (s1 )
48
- assert len (s2 ) == seqlen , "s1 and s2 must be of same length in bootstrap()"
49
-
50
- # Convert sequences to numpy arrays
51
- s1_np = np .char .asarray (list (s1 ))
52
- s2_np = np .char .asarray (list (s2 ))
53
-
54
- for rep in range (reps ):
55
- bootstrap = np .random .randint (0 , seqlen , seqlen )
56
- b1 = s1_np [bootstrap ]
57
- b2 = s2_np [bootstrap ]
58
- yield b1 , b2
59
-
60
-
61
- def update_alignment (seq ):
62
47
# append query sequence to reference alignment
63
48
fasta = align (seq , reference )
64
49
@@ -78,38 +63,53 @@ def update_alignment(seq):
78
63
return fasta2
79
64
80
65
81
- def riplike (seq , outfile , window = 400 , step = 5 , nrep = 100 ):
66
+ def encode (sequence ):
67
+ """
68
+ Encodes each nucleotide in a sequence using 4-bits
69
+ :param sequence: the sequence
70
+ :return: the sequence as a bitstring where each nucleotide is encoded using a 4-bits
71
+ """
72
+ seq = []
73
+ binary_nt = {'A' : 0B0001 , 'T' : 0B0010 , 'C' : 0B0011 , 'G' : 0B0100 , ' ' : 0B0000 , '-' : 0B1111 }
74
+ for nt in sequence :
75
+ seq .append (binary_nt [nt ])
76
+ return seq
77
+
78
+
79
+ def riplike (seq , reference , window = 400 , step = 5 , nrep = 100 ):
82
80
"""
83
81
:param seq: query sequence
84
- :param outfile: open file stream in write mode for results
82
+ :param reference: the alignment background
85
83
:param window: width of sliding window in nucleotides
86
84
:param step: step size of sliding window in nucleotides
87
85
:param nrep: number of replicates for nonparametric bootstrap sampling
86
+ :return: list of result dictionaries in order of window position
88
87
"""
88
+
89
89
results = []
90
90
91
- fasta = update_alignment (seq )
91
+ fasta = update_alignment (seq , reference )
92
92
query = dict (fasta )['query' ] # aligned query
93
93
seqlen = len (query )
94
+ ham = hamming (fasta )
94
95
95
- for center in range (window // 2 , seqlen - (window // 2 ), step ):
96
+ for centre in range (window // 2 , seqlen - (window // 2 ), step ):
96
97
best_p , second_p = 1. , 1. # maximum p-distance
97
98
best_ref , second_ref = None , None
98
- best_seq = ''
99
-
100
- # cut slice from query sequence for this window
101
- q1 = query [center - (window // 2 ):center + (window // 2 )]
99
+ best_seq = []
102
100
103
101
# iterate over reference genomes
104
- for h , s in fasta :
102
+ for h , s in ham . items () :
105
103
if h == 'query' or h == 'CON_OF_CONS' :
106
104
continue
107
105
108
- # slice window segment from reference
109
- s1 = s [center - (window // 2 ):center + (window // 2 )]
106
+ # slice window segment from reference
107
+ s1 = s [centre - (window // 2 ): centre + (window // 2 )]
108
+ s2 = [x for x in s1 if x is not None ]
110
109
111
110
# calculate p-distance
112
- ndiff , denom = pdistance (list (s1 ), list (q1 ))
111
+ ndiff = sum (s2 )
112
+ denom = len (s2 )
113
113
if denom == 0 :
114
114
# no overlap! TODO: require minimum overlap?
115
115
continue
@@ -121,31 +121,25 @@ def riplike(seq, outfile, window=400, step=5, nrep=100):
121
121
second_ref = best_ref
122
122
best_p = pd
123
123
best_ref = h
124
- best_seq = s1
124
+ best_seq = s2
125
125
elif pd < second_p :
126
126
# replace second best
127
127
second_p = pd
128
128
second_ref = h
129
129
130
- if best_ref is None :
131
- outfile .write ('{},{},None,,None,,\n ' .format (h , center ))
132
- continue
133
-
134
- result = {'center' : center , 'best_ref' : best_ref , 'best_p' : best_p ,
130
+ result = {'centre' : centre , 'best_ref' : best_ref , 'best_p' : best_p ,
135
131
'second_ref' : second_ref , 'second_p' : None if second_ref is None else second_p }
136
132
137
133
quant = None
138
- if second_ref is not None :
134
+ if second_ref is not None and nrep > 0 :
139
135
# use nonparametric bootstrap to determine significance
140
- boot_dist = []
141
- for bs , bq in bootstrap (best_seq , q1 , reps = nrep ):
142
- ndiff , denom = pdistance (bs , bq )
143
- if denom > 0 :
144
- boot_dist .append (ndiff / denom )
145
-
146
- # how many are closer than second best?
147
- quant = list (map (lambda x : x < second_p , boot_dist ))
148
- quant = sum (quant ) / float (len (quant ))
136
+ count = 0.
137
+ n = len (best_seq )
138
+ for rep in range (nrep ):
139
+ boot = [best_seq [round (random .random () * (n - 1 ))] for _ in range (n )]
140
+ if sum (boot ) / len (boot ) < second_p :
141
+ count += 1
142
+ quant = count / nrep
149
143
150
144
result .update ({'quant' : quant })
151
145
results .append (result )
@@ -168,18 +162,31 @@ def main():
168
162
help = '<optional, int> Window step size.' )
169
163
parser .add_argument ('-nrep' , type = int , default = 100 ,
170
164
help = '<optional, int> Number of bootstrap replicates.' )
165
+ parser .add_argument ('-custombg' , type = argparse .FileType ('r' ),
166
+ help = '<optional> FASTA file to be used as the alignment background' )
171
167
172
168
args = parser .parse_args ()
169
+
170
+ if args .custombg :
171
+ ref_seq = args .custombg
172
+ else :
173
+ # subset of HIV-1 group M subtype references curated by LANL
174
+ seq_path = os .path .dirname (os .path .abspath (__file__ ))
175
+ ref_seq = os .path .join (seq_path , 'ref_genomes/HIV1_Mgroup.fasta' )
176
+
177
+ with open (ref_seq ) as handle :
178
+ reference = convert_fasta (handle )
179
+
173
180
args .outfile .write ('qname,pos,rname,pdist,rname2,pdist2,qboot\n ' )
174
181
175
182
fasta = convert_fasta (args .infile )
176
183
for h , s in fasta :
177
184
print (h ) # crude progress monitoring
178
- results = riplike (s , args . outfile , window = args .window , step = args .step , nrep = args .nrep )
185
+ results = riplike (s , reference , window = args .window , step = args .step , nrep = args .nrep )
179
186
for result in results :
180
187
args .outfile .write (
181
- '{},{center },{best_ref},{best_p},{second_ref},{second_p},{quant}\n '
182
- .format (h , ** result )
188
+ '{},{centre },{best_ref},{best_p},{second_ref},{second_p},{quant}\n '
189
+ .format (h , ** result )
183
190
)
184
191
185
192
args .outfile .close ()
0 commit comments