5
5
import sys
6
6
import argparse
7
7
8
def dump_transcript(gene_name,
                    gene_id,
                    gene_type,
                    gene_description,
                    gene_importance,
                    gene_start,
                    gene_end,
                    transcript_id,
                    chrom, start, end, strand, cdss, exons):
    '''
    Print one tab-separated record describing a transcript and its gene.

    The record is written to stdout with the columns in this order:
    chrom, gene_start, gene_end, gene_name, gene_importance, strand,
    transcript_id, gene_id, gene_type, gene_description, start, end,
    comma-joined exon starts, comma-joined exon ends.

    Parameters:
        gene_name, gene_id, gene_type, gene_description, gene_importance:
            values copied verbatim into the record.
        gene_start, gene_end: written as the 2nd and 3rd columns.
        transcript_id: written to the record and used in the warning below.
        chrom, strand: written verbatim into the record.
        start, end: written as the 11th and 12th columns; both must be
            convertible with int() because they are compared numerically.
        cdss: accepted for call compatibility; not used by this function.
        exons: iterable of indexable items; item[1] is emitted as the exon
            start and item[2] as the exon end.

    A warning is printed to stderr when end < start; the record is still
    emitted in that case.
    '''
    if int(end) < int(start):
        print("WARNING: end < start:", transcript_id,
              start, end, file=sys.stderr)

    # Materialise once so a one-shot iterable can feed both exon columns.
    exon_list = list(exons)
    exon_starts = ','.join(str(exon[1]) for exon in exon_list)
    exon_ends = ','.join(str(exon[2]) for exon in exon_list)

    columns = (
        chrom,
        gene_start,
        gene_end,
        gene_name,
        gene_importance,
        strand,
        transcript_id,
        gene_id,
        gene_type,
        gene_description,
        start,
        end,
        exon_starts,
        exon_ends,
    )

    # Join with a bare tab so no column picks up stray leading whitespace.
    print('\t'.join(str(column) for column in columns))
40
+
38
41
39
42
def main ():
40
43
parser = argparse .ArgumentParser (description = """
@@ -44,11 +47,11 @@ def main():
44
47
45
48
parser .add_argument ('gff_file' )
46
49
parser .add_argument ('--save-chromsizes' , default = None ,
47
- help = 'Store the chromsizes in a separate file' ,
48
- type = str )
49
- #parser.add_argument('-o', '--options', default='yo',
50
+ help = 'Store the chromsizes in a separate file' ,
51
+ type = str )
52
+ # parser.add_argument('-o', '--options', default='yo',
50
53
# help="Some option", type='str')
51
- #parser.add_argument('-u', '--useless', action='store_true',
54
+ # parser.add_argument('-u', '--useless', action='store_true',
52
55
# help='Another useless option')
53
56
54
57
args = parser .parse_args ()
@@ -57,7 +60,7 @@ def main():
57
60
with open (args .gff_file , 'r' ) as f :
58
61
transcript_id = None
59
62
chromsizes = []
60
-
63
+
61
64
for line in f :
62
65
counter += 1
63
66
if line .strip ()[0 ] == '#' :
@@ -114,7 +117,8 @@ def main():
114
117
x_split = x .split ('=' )
115
118
attrs [x_split [0 ]] = x_split [1 ]
116
119
except IndexError as ve :
117
- print ("WARNING: Strange Parts:" , to_split , ve , file = sys .stderr )
120
+ print ("WARNING: Strange Parts:" ,
121
+ to_split , ve , file = sys .stderr )
118
122
119
123
if annotation_type == 'chromosome' :
120
124
id_parts = attrs ['ID' ].split (':' )
@@ -123,23 +127,22 @@ def main():
123
127
124
128
chromsizes += [(chromname , chromsize )]
125
129
126
-
127
130
if annotation_type == 'gene' or annotation_type == 'tRNA_gene' :
128
131
if transcript_id is not None :
129
132
dump_transcript (gene_name ,
130
- gene_id ,
131
- gene_type ,
132
- gene_description ,
133
- gene_importance ,
134
- gene_start ,
135
- gene_end ,
136
- transcript_id ,
137
- transcript_chrom ,
138
- transcript_start ,
139
- transcript_end ,
140
- transcript_strand ,
141
- transcript_cdss ,
142
- transcript_exons )
133
+ gene_id ,
134
+ gene_type ,
135
+ gene_description ,
136
+ gene_importance ,
137
+ gene_start ,
138
+ gene_end ,
139
+ transcript_id ,
140
+ transcript_chrom ,
141
+ transcript_start ,
142
+ transcript_end ,
143
+ transcript_strand ,
144
+ transcript_cdss ,
145
+ transcript_exons )
143
146
144
147
split_id = attrs ['ID' ].split (':' )
145
148
gene_id = attrs ['ID' ]
@@ -149,20 +152,23 @@ def main():
149
152
elif 'Name' in attrs :
150
153
split_name = attrs ['Name' ].split (':' )
151
154
print ("split_name" , split_name , file = sys .stderr )
152
- gene_name = split_name [0 ] if len (split_name ) == 1 else split_name [1 ]
155
+ gene_name = split_name [0 ] if len (
156
+ split_name ) == 1 else split_name [1 ]
153
157
else :
154
- gene_name = split_id [0 ] if len (split_id ) == 1 else split_id [1 ]
158
+ gene_name = split_id [0 ] if len (
159
+ split_id ) == 1 else split_id [1 ]
155
160
print ("WARNING: no gene name:" , to_split , file = sys .stderr )
156
161
157
162
if 'GENE_TYPE' in attrs :
158
163
gene_type = attrs ['GENE_TYPE' ]
159
164
elif 'biotype' in attrs :
160
165
gene_type = attrs ['biotype' ]
161
166
else :
162
- print ("WARNING: no gene type (GENE_TYPE or biotype attribute)" , to_split , file = sys .stderr )
167
+ print ("WARNING: no gene type (GENE_TYPE or biotype attribute)" ,
168
+ to_split , file = sys .stderr )
163
169
164
170
gene_description = attrs ['description' ] if 'description' in attrs else '-'
165
- gene_importance = random .randint (0 ,10000 )
171
+ gene_importance = random .randint (0 , 10000 )
166
172
gene_start = start_pos
167
173
gene_end = end_pos
168
174
@@ -181,19 +187,19 @@ def main():
181
187
if annotation_type == 'transcript' or annotation_type == 'mRNA' :
182
188
if transcript_id is not None :
183
189
dump_transcript (gene_name ,
184
- gene_id ,
185
- gene_type ,
186
- gene_description ,
187
- gene_importance ,
188
- gene_start ,
189
- gene_end ,
190
- transcript_id ,
191
- transcript_chrom ,
192
- transcript_start ,
193
- transcript_end ,
194
- transcript_strand ,
195
- transcript_cdss ,
196
- transcript_exons )
190
+ gene_id ,
191
+ gene_type ,
192
+ gene_description ,
193
+ gene_importance ,
194
+ gene_start ,
195
+ gene_end ,
196
+ transcript_id ,
197
+ transcript_chrom ,
198
+ transcript_start ,
199
+ transcript_end ,
200
+ transcript_strand ,
201
+ transcript_cdss ,
202
+ transcript_exons )
197
203
198
204
transcript_exons = []
199
205
transcript_id = attrs ['ID' ]
@@ -208,33 +214,29 @@ def main():
208
214
parent_id = attrs ['Parent' ]
209
215
if parent_id != transcript_id :
210
216
print ("Exon parent doesn't match transcript_id" ,
211
- parent_id , transcript_id , file = sys .stderr )
217
+ parent_id , transcript_id , file = sys .stderr )
212
218
transcript_exons += [(chrom , start_pos , end_pos )]
213
219
214
220
dump_transcript (gene_name ,
215
- gene_id ,
216
- gene_type ,
217
- gene_description ,
218
- gene_importance ,
219
- gene_start ,
220
- gene_end ,
221
- transcript_id ,
222
- transcript_chrom ,
223
- transcript_start ,
224
- transcript_end ,
225
- transcript_strand ,
226
- transcript_cdss ,
227
- transcript_exons )
221
+ gene_id ,
222
+ gene_type ,
223
+ gene_description ,
224
+ gene_importance ,
225
+ gene_start ,
226
+ gene_end ,
227
+ transcript_id ,
228
+ transcript_chrom ,
229
+ transcript_start ,
230
+ transcript_end ,
231
+ transcript_strand ,
232
+ transcript_cdss ,
233
+ transcript_exons )
228
234
229
235
if args .save_chromsizes :
230
236
with open (args .save_chromsizes , 'w' ) as f :
231
237
for (name , size ) in chromsizes :
232
238
f .write ("{}\t {}\n " .format (name , size ))
233
239
234
240
235
-
236
-
237
241
# Run the converter only when this file is executed as a script.
if __name__ == '__main__':
    main()
239
-
240
-
0 commit comments