15
15
with open (ref_seq ) as handle :
16
16
reference = convert_fasta (handle )
17
17
18
- def pdistance (seq1 , seq2 ):
19
- """
20
- Calculate p-distance between two aligned sequences
21
- :param seq1: First sequence
22
- :param seq2: Second sequence
23
- :return: <ndiff> is the number of differences, <denom> is the number of valid positions
18
+
19
+ def hamming (fasta ):
24
20
"""
25
- denom = 0. # number of valid columns
26
- ndiff = 0
27
- for i , nt1 in enumerate (seq1 ):
28
- nt2 = seq2 [i ]
29
- if nt1 == '-' or nt2 == '-' :
30
- continue
31
- denom += 1
32
- if nt1 != nt2 :
33
- ndiff += 1
34
- return ndiff , denom
35
-
36
-
37
- def bootstrap (s1 , s2 , reps = 100 ):
21
+ Convert list of lists into boolean outcomes (difference between query and reference)
22
+ :param fasta: object returned by align()
23
+ :return: dictionary of boolean lists keyed by reference label
38
24
"""
39
- Sample positions at random with replacement.
40
- :param s1: first sequence
41
- :param s2: second sequence (must be aligned to s1)
42
- :param reps: number of replicates to generate
43
-
44
- :yield: tuples of sequences generated by bootstrap resampling
45
- """
46
- seqlen = len (s1 )
47
- assert len (s2 )== seqlen , "s1 and s2 must be of same length in bootstrap()"
48
-
49
- for rep in range (reps ):
25
+ aln = dict (fasta )
26
+ assert "query" in aln , "Argument <fasta> must contain 'query' entry"
27
+ query = aln .pop ('query' )
28
+ _ = aln .pop ('CON_OF_CONS' )
29
+
30
+ # iterate over remaining sequences as references
31
+ results = {}
32
+ for h , s in aln .items ():
50
33
result = []
51
- bootstrap = [random .randint (0 , seqlen - 1 ) for _ in range (seqlen )]
52
- b1 = '' .join ([s1 [i ] for i in bootstrap ])
53
- b2 = '' .join ([s2 [i ] for i in bootstrap ])
54
- yield b1 , b2
34
+ for i , nt1 in enumerate (query ):
35
+ nt2 = s [i ]
36
+ if nt1 == '-' or nt2 == '-' :
37
+ result .append (None )
38
+ continue
39
+ result .append (int (nt1 != nt2 ))
40
+ results .update ({h : result })
41
+
42
+ return results
43
+
55
44
56
45
57
46
def update_alignment (seq ):
47
+ """
48
+ Append query sequence <seq> to reference alignment and remove insertions relative to
49
+ global consensus sequence.
50
+ :param seq:
51
+ :return: a list of [header, sequence] lists
52
+ """
58
53
# append query sequence to reference alignment
59
54
fasta = align (seq , reference )
60
55
@@ -81,32 +76,33 @@ def riplike(seq, window=400, step=5, nrep=100):
81
76
:param window: width of sliding window in nucleotides
82
77
:param step: step size of sliding window in nucleotides
83
78
:param nrep: number of replicates for nonparametric bootstrap sampling
79
+ :return: list of result dictionaries in order of window position
84
80
"""
85
81
86
82
results = []
87
-
83
+
88
84
fasta = update_alignment (seq )
89
85
query = dict (fasta )['query' ] # aligned query
90
86
seqlen = len (query )
91
-
87
+ ham = hamming (fasta )
88
+
92
89
for left in range (0 , seqlen - window , step ):
93
90
best_p , second_p = 1. , 1. # maximum p-distance
94
91
best_ref , second_ref = None , None
95
- best_seq = ''
96
-
97
- # cut slice from query sequence for this window
98
- q1 = query [left :(left + window )]
99
-
92
+ best_seq = []
93
+
100
94
# iterate over reference genomes
101
- for h , s in fasta :
102
- if h == 'query' or h == 'CON_OF_CONS' :
95
+ for h , s in ham . items () :
96
+ if h == 'query' or h == 'CON_OF_CONS' :
103
97
continue
104
98
105
99
# slice window segment from reference
106
100
s1 = s [left :(left + window )]
101
+ s2 = [x for x in s1 if x is not None ]
107
102
108
103
# calculate p-distance
109
- ndiff , denom = pdistance (s1 , q1 )
104
+ ndiff = sum (s2 )
105
+ denom = len (s2 )
110
106
if denom == 0 :
111
107
# no overlap! TODO: require minimum overlap?
112
108
continue
@@ -115,30 +111,25 @@ def riplike(seq, window=400, step=5, nrep=100):
115
111
if pd < best_p :
116
112
# query is closer to this reference
117
113
second_p = best_p ; second_ref = best_ref
118
- best_p = pd ; best_ref = h ; best_seq = s1
114
+ best_p = pd ; best_ref = h ; best_seq = s2
119
115
elif pd < second_p :
120
116
# replace second best
121
117
second_p = pd ; second_ref = h
122
118
123
- if best_ref is None :
124
- outfile .write ('{},{},None,,None,,\n ' .format (header , left ))
125
- continue
126
-
127
- result = {'left' : left , 'best_ref' : best_ref , 'best_p' : best_p ,
128
- 'second_ref' : second_ref , 'second_p' : None if second_ref is None else second_p }
119
+ result = {'left' : left , 'best_ref' : best_ref , 'best_p' : best_p ,
120
+ 'second_ref' : second_ref , 'second_p' : None if second_ref is None else second_p }
121
+ # print(result)
129
122
130
123
quant = None
131
- if second_ref is not None :
124
+ if second_ref is not None and nrep > 0 :
132
125
# use nonparametric bootstrap to determine significance
133
- boot_dist = []
134
- for bs , bq in bootstrap (best_seq , q1 , reps = nrep ):
135
- ndiff , denom = pdistance (bs , bq )
136
- if denom > 0 :
137
- boot_dist .append (ndiff / denom )
138
-
139
- # how many are closer than second best?
140
- quant = list (map (lambda x : x < second_p , boot_dist ))
141
- quant = sum (quant ) / float (len (quant ))
126
+ count = 0.
127
+ n = len (best_seq )
128
+ for rep in range (nrep ):
129
+ boot = [best_seq [round (random .random ()* (n - 1 ))] for _ in range (n )]
130
+ if sum (boot )/ len (boot ) < second_p :
131
+ count += 1
132
+ quant = count / nrep
142
133
143
134
result .update ({'quant' : quant })
144
135
results .append (result )
@@ -172,7 +163,7 @@ def main():
172
163
for result in results :
173
164
args .outfile .write (
174
165
'{},{left},{best_ref},{best_p},{second_ref},{second_p},{quant}\n '
175
- .format (header , ** result )
166
+ .format (h , ** result )
176
167
)
177
168
178
169
args .outfile .close ()
0 commit comments