Skip to content

Commit 5d0341b

Browse files
mccalluc authored and pkerpedjiev committed
autopep8 scripts (#75)
1 parent 98fa413 commit 5d0341b

File tree

6 files changed

+118
-128
lines changed

6 files changed

+118
-128
lines changed

.flake8

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,6 @@ exclude =
2020
notebooks/ENSEMBL annotations and RNAseq.ipynb
2121
notebooks/h37rv gene annotations.ipynb
2222
pyprof.sh
23-
scripts/exonU.py
24-
scripts/gff_to_chromsizes.py
25-
scripts/gff_to_genepred.py
26-
scripts/replace_importances.py
27-
scripts/tsv_to_mrmatrix.py
2823
setup.py
2924

3025
ignore =

scripts/exonU.py

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,12 @@
88
import sys
99
import argparse
1010

11+
1112
class GeneInfo:
1213
def __init__(self):
1314
pass
1415

16+
1517
def merge_gene_info(gene_infos, gene_info):
1618
'''
1719
Add a new gene_info. If its txStart and txEnd overlap with a previous entry for this
@@ -20,15 +22,15 @@ def merge_gene_info(gene_infos, gene_info):
2022
merged = False
2123

2224
for existing_gene_info in gene_infos[gene_info.geneId]:
23-
if (existing_gene_info.chrName == gene_info.chrName and
25+
if (existing_gene_info.chrName == gene_info.chrName and
2426
existing_gene_info.txEnd > gene_info.txStart and
2527
gene_info.txEnd > existing_gene_info.txStart):
2628

2729
# overlapping genes, merge the exons of the second into the first
2830
existing_gene_info.txStart = min(existing_gene_info.txStart,
2931
gene_info.txStart)
3032
existing_gene_info.txEnd = max(existing_gene_info.txEnd,
31-
gene_info.txEnd)
33+
gene_info.txEnd)
3234

3335
for (exon_start, exon_end) in gene_info.exonUnions:
3436
existing_gene_info.exonUnions.add((exon_start, exon_end))
@@ -51,9 +53,9 @@ def main():
5153
""")
5254

5355
parser.add_argument('transcript_bed')
54-
#parser.add_argument('-o', '--options', default='yo',
56+
# parser.add_argument('-o', '--options', default='yo',
5557
# help="Some option", type='str')
56-
#parser.add_argument('-u', '--useless', action='store_true',
58+
# parser.add_argument('-u', '--useless', action='store_true',
5759
# help='Another useless option')
5860
args = parser.parse_args()
5961

@@ -85,29 +87,26 @@ def main():
8587
print("ERROR: line:", line, file=sys.stderr)
8688
continue
8789

88-
8990
# for some reason, exon starts and ends have trailing commas
9091
gene_info.exonStartParts = gene_info.exonStarts.strip(",").split(',')
9192
gene_info.exonEndParts = gene_info.exonEnds.strip(",").split(',')
92-
gene_info.exonUnions = set([(int(s), int(e)) for (s,e) in zip(gene_info.exonStartParts, gene_info.exonEndParts)])
93+
gene_info.exonUnions = set([(int(s), int(e)) for (s, e) in zip(
94+
gene_info.exonStartParts, gene_info.exonEndParts)])
9395

9496
# add this gene info by checking whether it overlaps with any existing ones
9597
gene_infos = merge_gene_info(gene_infos, gene_info)
9698

9799
for gene_id in gene_infos:
98100
for contig in gene_infos[gene_id]:
99101
output = "\t".join(map(str, [contig.chrName, contig.txStart, contig.txEnd,
100-
contig.geneName, contig.score, contig.strand,
101-
'union_' + gene_id, gene_id, contig.geneType, contig.geneDesc,
102-
contig.cdsStart, contig.cdsEnd,
103-
",".join([str(e[0]) for e in sorted(contig.exonUnions)]),
104-
",".join([str(e[1]) for e in sorted(contig.exonUnions)])]))
102+
contig.geneName, contig.score, contig.strand,
103+
'union_' + gene_id, gene_id, contig.geneType, contig.geneDesc,
104+
contig.cdsStart, contig.cdsEnd,
105+
",".join(
106+
[str(e[0]) for e in sorted(contig.exonUnions)]),
107+
",".join([str(e[1]) for e in sorted(contig.exonUnions)])]))
105108
print(output)
106109

107110

108-
109111
if __name__ == '__main__':
110112
main()
111-
112-
113-

scripts/gff_to_chromsizes.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,21 @@
33
import sys
44
import argparse
55

6+
67
def main():
78
parser = argparse.ArgumentParser(description="""
89
910
python gff_to_chromsizes.py
1011
""")
1112

1213
#parser.add_argument('argument', nargs=1)
13-
#parser.add_argument('-o', '--options', default='yo',
14+
# parser.add_argument('-o', '--options', default='yo',
1415
# help="Some option", type='str')
15-
#parser.add_argument('-u', '--useless', action='store_true',
16+
# parser.add_argument('-u', '--useless', action='store_true',
1617
# help='Another useless option')
1718

1819
args = parser.parse_args()
19-
20+
2021

2122
if __name__ == '__main__':
2223
main()
23-
24-

scripts/gff_to_genepred.py

Lines changed: 81 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -5,36 +5,39 @@
55
import sys
66
import argparse
77

8-
def dump_transcript(gene_name,
9-
gene_id,
10-
gene_type,
11-
gene_description,
12-
gene_importance,
13-
gene_start,
14-
gene_end,
15-
transcript_id,
16-
chrom, start, end, strand,cdss, exons):
8+
9+
def dump_transcript(gene_name,
10+
gene_id,
11+
gene_type,
12+
gene_description,
13+
gene_importance,
14+
gene_start,
15+
gene_end,
16+
transcript_id,
17+
chrom, start, end, strand, cdss, exons):
1718
'''
1819
Print out a set of transcripts for this gene
1920
'''
2021
if int(end) < int(start):
21-
print("WARNING: end < start:", transcript_id, start, end, file=sys.stderr)
22+
print("WARNING: end < start:", transcript_id,
23+
start, end, file=sys.stderr)
2224

2325
print('{chrom}\t{start}\t{end}\t{gene_name}\t{importance}\t{strand}\t{transcript_id}\t{gene_id}\t{gene_type}\t{gene_description}\t{cds_start}\t{cds_end}\t{exon_starts}\t{exon_ends}'.format(
24-
chrom=chrom,
25-
start=gene_start,
26-
end=gene_end,
27-
gene_name=gene_name,
28-
importance=gene_importance,
29-
strand=strand,
30-
transcript_id=transcript_id,
31-
gene_id=gene_id,
32-
gene_type=gene_type,
33-
gene_description=gene_description,
34-
cds_start=start,
35-
cds_end=end,
36-
exon_starts=','.join([str(e[1]) for e in exons]),
37-
exon_ends=','.join([str(e[2]) for e in exons])))
26+
chrom=chrom,
27+
start=gene_start,
28+
end=gene_end,
29+
gene_name=gene_name,
30+
importance=gene_importance,
31+
strand=strand,
32+
transcript_id=transcript_id,
33+
gene_id=gene_id,
34+
gene_type=gene_type,
35+
gene_description=gene_description,
36+
cds_start=start,
37+
cds_end=end,
38+
exon_starts=','.join([str(e[1]) for e in exons]),
39+
exon_ends=','.join([str(e[2]) for e in exons])))
40+
3841

3942
def main():
4043
parser = argparse.ArgumentParser(description="""
@@ -44,11 +47,11 @@ def main():
4447

4548
parser.add_argument('gff_file')
4649
parser.add_argument('--save-chromsizes', default=None,
47-
help='Store the chromsizes in a separate file',
48-
type=str)
49-
#parser.add_argument('-o', '--options', default='yo',
50+
help='Store the chromsizes in a separate file',
51+
type=str)
52+
# parser.add_argument('-o', '--options', default='yo',
5053
# help="Some option", type='str')
51-
#parser.add_argument('-u', '--useless', action='store_true',
54+
# parser.add_argument('-u', '--useless', action='store_true',
5255
# help='Another useless option')
5356

5457
args = parser.parse_args()
@@ -57,7 +60,7 @@ def main():
5760
with open(args.gff_file, 'r') as f:
5861
transcript_id = None
5962
chromsizes = []
60-
63+
6164
for line in f:
6265
counter += 1
6366
if line.strip()[0] == '#':
@@ -114,7 +117,8 @@ def main():
114117
x_split = x.split('=')
115118
attrs[x_split[0]] = x_split[1]
116119
except IndexError as ve:
117-
print("WARNING: Strange Parts:", to_split, ve, file=sys.stderr)
120+
print("WARNING: Strange Parts:",
121+
to_split, ve, file=sys.stderr)
118122

119123
if annotation_type == 'chromosome':
120124
id_parts = attrs['ID'].split(':')
@@ -123,23 +127,22 @@ def main():
123127

124128
chromsizes += [(chromname, chromsize)]
125129

126-
127130
if annotation_type == 'gene' or annotation_type == 'tRNA_gene':
128131
if transcript_id is not None:
129132
dump_transcript(gene_name,
130-
gene_id,
131-
gene_type,
132-
gene_description,
133-
gene_importance,
134-
gene_start,
135-
gene_end,
136-
transcript_id,
137-
transcript_chrom,
138-
transcript_start,
139-
transcript_end,
140-
transcript_strand,
141-
transcript_cdss,
142-
transcript_exons)
133+
gene_id,
134+
gene_type,
135+
gene_description,
136+
gene_importance,
137+
gene_start,
138+
gene_end,
139+
transcript_id,
140+
transcript_chrom,
141+
transcript_start,
142+
transcript_end,
143+
transcript_strand,
144+
transcript_cdss,
145+
transcript_exons)
143146

144147
split_id = attrs['ID'].split(':')
145148
gene_id = attrs['ID']
@@ -149,20 +152,23 @@ def main():
149152
elif 'Name' in attrs:
150153
split_name = attrs['Name'].split(':')
151154
print("split_name", split_name, file=sys.stderr)
152-
gene_name = split_name[0] if len(split_name) == 1 else split_name[1]
155+
gene_name = split_name[0] if len(
156+
split_name) == 1 else split_name[1]
153157
else:
154-
gene_name = split_id[0] if len(split_id) == 1 else split_id[1]
158+
gene_name = split_id[0] if len(
159+
split_id) == 1 else split_id[1]
155160
print("WARNING: no gene name:", to_split, file=sys.stderr)
156161

157162
if 'GENE_TYPE' in attrs:
158163
gene_type = attrs['GENE_TYPE']
159164
elif 'biotype' in attrs:
160165
gene_type = attrs['biotype']
161166
else:
162-
print("WARNING: no gene type (GENE_TYPE or biotype attribute)", to_split, file=sys.stderr)
167+
print("WARNING: no gene type (GENE_TYPE or biotype attribute)",
168+
to_split, file=sys.stderr)
163169

164170
gene_description = attrs['description'] if 'description' in attrs else '-'
165-
gene_importance = random.randint(0,10000)
171+
gene_importance = random.randint(0, 10000)
166172
gene_start = start_pos
167173
gene_end = end_pos
168174

@@ -181,19 +187,19 @@ def main():
181187
if annotation_type == 'transcript' or annotation_type == 'mRNA':
182188
if transcript_id is not None:
183189
dump_transcript(gene_name,
184-
gene_id,
185-
gene_type,
186-
gene_description,
187-
gene_importance,
188-
gene_start,
189-
gene_end,
190-
transcript_id,
191-
transcript_chrom,
192-
transcript_start,
193-
transcript_end,
194-
transcript_strand,
195-
transcript_cdss,
196-
transcript_exons)
190+
gene_id,
191+
gene_type,
192+
gene_description,
193+
gene_importance,
194+
gene_start,
195+
gene_end,
196+
transcript_id,
197+
transcript_chrom,
198+
transcript_start,
199+
transcript_end,
200+
transcript_strand,
201+
transcript_cdss,
202+
transcript_exons)
197203

198204
transcript_exons = []
199205
transcript_id = attrs['ID']
@@ -208,33 +214,29 @@ def main():
208214
parent_id = attrs['Parent']
209215
if parent_id != transcript_id:
210216
print("Exon parent doesn't match transcript_id",
211-
parent_id, transcript_id, file=sys.stderr)
217+
parent_id, transcript_id, file=sys.stderr)
212218
transcript_exons += [(chrom, start_pos, end_pos)]
213219

214220
dump_transcript(gene_name,
215-
gene_id,
216-
gene_type,
217-
gene_description,
218-
gene_importance,
219-
gene_start,
220-
gene_end,
221-
transcript_id,
222-
transcript_chrom,
223-
transcript_start,
224-
transcript_end,
225-
transcript_strand,
226-
transcript_cdss,
227-
transcript_exons)
221+
gene_id,
222+
gene_type,
223+
gene_description,
224+
gene_importance,
225+
gene_start,
226+
gene_end,
227+
transcript_id,
228+
transcript_chrom,
229+
transcript_start,
230+
transcript_end,
231+
transcript_strand,
232+
transcript_cdss,
233+
transcript_exons)
228234

229235
if args.save_chromsizes:
230236
with open(args.save_chromsizes, 'w') as f:
231237
for (name, size) in chromsizes:
232238
f.write("{}\t{}\n".format(name, size))
233239

234240

235-
236-
237241
if __name__ == '__main__':
238242
main()
239-
240-

0 commit comments

Comments
 (0)