-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmarkov_gen.py
160 lines (135 loc) · 4.33 KB
/
markov_gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/python2.7
############################################################################
# simulation.py
# Code for generating a simulated chromosome based on a k-degree markov chain and a model sequences
# Written by: Jiajun Wang, John Karro
# Date: March, 2014
# Email: [email protected]
import re
import sys
import argparse
import random
import unittest
import sys
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import pickle
fn={
'A':0, 'a':0,
'C':1, 'c':1,
'G':2, 'g':2,
'T':3, 't':3,
'N':-1, 'n':-1
}
bases = "ACGTacgt"
baseSet = set(bases)
base2index = {x:y for x,y in zip("acgtnACGTN", [0,1,2,3,-1,0,1,2,3,-1])}
def ProbTuple(Mark,kmo):
total = 0
tlist=[0]*4
for t in range(4):
total += Mark[(kmo<<2)|t]
if total == 0 :
return [0,0,0,0]
probsum = 0
for t in range(4):
tlist[t] = probsum + Mark[(kmo<<2)|t]/float(total)
probsum = tlist[t]
return tlist
def pickFromProbTuple(probTuple):
"""Pick a random index based on a probability tuple. Assumes last element of tuple is 1."""
r = random.random()
i = 0
while r > probTuple[i]:
i += 1
return i
#return the index of the first base without 'N' in following it for k length
def NextIndex(i,k,seq): # Should be having inSeq as a parameter
"""Return the smallest j >= i such that inSeq[j:j+k] contains only bases"""
j = i
while j + k < len(seq) and seq[j] not in baseSet:
j += 1
if k == 0:
return j
while j + k < len(seq):
p = 1
while p < k and seq[j+p] in baseSet:
p += 1
if p == k:
return j
j += p + 1
return len(seq)
def Markov(k,inSeq): # inSeq as parameter?
"""
Create a k-th order Markov chain based on the distribution
of inSeq
"""
Mark= [0]*(4**(k+1))
mask=(1<<((k+1)*2))-1
t=0
#get first k+1 subsequence
i=NextIndex(0,k,inSeq)
for j in range(k+1):
t=(t<<2) | base2index[inSeq[i+j]]
Mark[t] = 1
#go through the whole seq
i += k+1
while i<len(inSeq) :
if inSeq[i] not in bases:
i=NextIndex(i,k,inSeq)
if (i+k)>=len(inSeq):
break
t=0
for j in range(k+1):
t = (t<<2) | base2index[inSeq[i+j]]
i += k+1
else:
t = ((t<<2) | base2index[inSeq[i]]) & mask
i += 1
Mark[t] += 1
Klist=[[0,0,0,0]]*(4**k)
for KMOne in range(4**k):
Klist[KMOne] = ProbTuple(Mark,KMOne)
return Klist
def MarkovArray(k, inSeq):
"""
Create an array such that M[i] is the ith-order for inSeq
(0 <= i <= k).
"""
return [Markov(i,inSeq) for i in range(0,k+1)]
def generate_sequence(markov_list, seq_len):
"""Generate a random sequence of length length using the list of k-order Markov chain modeled on sequence seq"""
k = len(markov_list)-1
mask = (1 << (k*2)) - 1
j = 0
seq = ['']*seq_len
key = 0
for i in range(0, seq_len):
next_tuple = markov_list[j][key]
if next_tuple[-1] < 0.9999999:
j = 0
key = 0
next_tuple = markov_list[j][key]
base = pickFromProbTuple(next_tuple)
key = ((key << 2) | base) & mask
seq[i] = bases[base]
j = min(j+1,k)
return "".join(seq)
def pickle_markov_list(markov_list, file):
pickle.dump(markov_list, open(file, "wb"))
def create_pmck(seqFile, k, output = None):
s = "N".join([str(r.seq) for r in SeqIO.parse(seqFile, 'fasta')])
M = MarkovArray(k,s)
if not ouput:
output = re.sub("\.(fa|fasta)$", ".pmc%d" % (k), seqFile)
pickle_markov_list(M, output)
def read_pmck(file):
return pickle.load(open(file, "rb"))
if __name__ == "__main__":
parser = argparse.ArgumentParser(description = "Generate simulated sequence using a k-order Markov chain")
parser.add_argument('-k', type = int, help = "Order of Markov chain", default = 5)
parser.add_argument('-p', '--pickle_file', help = "Name of pickled file (default: <seq file>.pmc<k>)")
parser.add_argument('seq_file', help = "Model sequence file (fasta format)")
args = parser.parse_args()
create_pmck(args.seq_file, args.k, args.output)