-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathNanoSim_Wrapper.py
executable file
·164 lines (141 loc) · 6.36 KB
/
NanoSim_Wrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/env python
import sys, os, glob, re, argparse, textwrap, subprocess, random
from collections import defaultdict
from natsort import natsorted
parser = argparse.ArgumentParser(formatter_class= argparse.RawDescriptionHelpFormatter, description=textwrap.dedent('''
This program creates individual fasta files that will be used as input for NanoSim as well as a text file
that designates how many reads are wanted for each transcript along with the minimum and maximum values
for NanoSim commands '''))
group = parser.add_argument_group('required arguments')
group.add_argument('-c', '--count', type=str, required=True, help="Tab delimited file that contains desired transcript headers along with the number of desired reads")
group.add_argument('-f', '--fasta_file', type=str, required=True, help="fasta file that contains all the potential transcripts that could be simulated (gencode)")
group.add_argument('-p', '--mincov', type=float, default=0.9, help="Minimum transcript coverage to be considered full length (default 0.9)")
group.add_argument('-n', '--not_full_length', type=float, default=0.2, help="What fraction of the output reads should be not full length (default 0.2)")
group.add_argument('-M', '--model_prefix', type=str, required=True, help="Model prefix used to simulate reads e.g. Hg38_Model2/Hg38_training")
group.add_argument('-d', '--outdir', type=str, default="ns_output", help="Output directory")
group.add_argument('-O', '--combined', type=str, default="combined.fasta", help="final combined simulated reads fasta file")
#optional flag
group.add_argument('--min_len', type=int, default=0, help="Transcripts less than this length will not be simulated")
group.add_argument('--max_len', type=int, required=False, help="Transcripts greater than this length will not be simulated")
group.add_argument('-m', '--multiply', type=int, default=1, help="Use to increase the number of reads you want per transcript, will use the count number from count file and will multiply by number given. Use 1 to use original count number")
def getCounts(infile):
counts = dict()
with open(infile, 'r') as f:
for line in f:
line = line.strip()
[id, score] = line.split("\t")
counts[id] = int(score)
return counts
def yieldFasta(fasta):
"""Generator function to yield fasta id and sequence"""
ok = True
try:
with open(fasta, 'r') as fh:
while ok:
try:
line = next(fh)
except:
ok = False
yield id, sequence
if line[0] == '>':
try:
yield id, sequence
except NameError:
pass
# remove newline and '>'
id = line.strip('\n')
id = id[1:]
sequence = ''
elif line[0] != '>':
sequence += line.strip('\n')
except (IOError, OSError):
print("Error opening / processing file")
except StopIteration:
pass
def runNanosimPerTx(args, countDict):
"""For every transcript in the input fasta, see if it's in the counts file and if it passes other requirements. If so, run NanoSim"""
prog = which('simulator.py')
for id, sequence in yieldFasta(args.fasta_file):
if id in countDict:
print 'simulating for {}'.format(id)
seq_len = len(sequence)
if seq_len < args.min_len:
continue
if args.max_len:
if seq_len > args.max_len:
continue
count = countDict[id] * args.multiply
minimum = int(float(seq_len) * args.mincov)
num_full_length = int(float(count) * (1- args.not_full_length))
num_non_full_length = int(float(count) * args.not_full_length)
# create inputfile for Nanosim
nsinput = os.path.join(args.outdir, "tmp.fa")
with open(nsinput, 'w') as output:
output.write(">{}\n{}\n".format(id, sequence))
# nanoSim creates two files, capture their names
runNanoSim(prog, args, nsinput, num_full_length, num_non_full_length, seq_len, minimum)
else:
print 'skipping {}'.format(id)
def yieldCount():
"""Simple generator function to yield an incremental number"""
i=0
while True:
i += 1
yield i
def runNanoSim(NanoSim, args, inputfa, num_reads_full, num_reads_part, seq_size, min_len):
# full length reads have a size range from min_len to seq_size
outfileFull = "{}/full.{}".format(args.outdir, counter.next())
cmd=[
NanoSim, 'linear',
'-r', inputfa,
'-c', args.model_prefix,
'-n', str(num_reads_full),
'--max_len', str(seq_size),
'--min_len', str(min_len),
'-o', outfileFull
]
subprocess.call(cmd)
# partial reads have a size range from 100 to min_len
outfilePart = "{}/part.{}".format(args.outdir, counter.next())
cmd=[
NanoSim, 'linear',
'-r', inputfa,
'-c', args.model_prefix,
'-n', str(num_reads_part),
'--max_len', str(min_len -1),
'--min_len', "100",
'-o', outfilePart
]
subprocess.call(cmd)
return [outfilePart, outfileFull]
def combineFa(args, read_files):
with open(args.combined, "w") as outfile:
for f in read_files:
with open(f, "r") as infile:
outfile.writelines(infile.read())
def which(prog):
cmd = ["which", prog]
p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
res = p.stdout.readline().rstrip()
if len(res) == 0:
print >>sys.stderr, 'ERROR, cannot find program {}, please add to $PATH'.format(prog)
sys.exit(1)
return(res)
def main():
args = parser.parse_args()
if os.path.exists(args.outdir):
print >>sys.stderr, 'ERROR, output directory {} already exists, please remove or rename'.format(args.outdir)
sys.exit(1)
os.makedirs(args.outdir)
# get counts per transcript
countsDict = getCounts(args.count)
# set a counter
global counter
counter = yieldCount()
# run NanoSim
runNanosimPerTx(args, countsDict)
outfiles = glob.glob("{}/*.fasta".format(args.outdir))
# combine output
combineFa(args, outfiles)
if __name__ == '__main__':
main()