-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfix_known_errors_svn_files.py
41 lines (33 loc) · 1.49 KB
/
fix_known_errors_svn_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# Fix known errors in specific revisions of the contig files under data/.
#
# Every fix follows the same pattern: read the file leniently (undecodable
# bytes are substituted rather than raising, via errors='replace'), patch the
# list of lines in memory, then overwrite the file in place. Each fix indexes
# the first line, so an empty file raises IndexError.
#
# NOTE(review): the header literals below are reproduced exactly as found;
# confirm that the spacing inside them matches what downstream parsers expect.

_CHROMOSOME_2_ID_LINE = 'ID CU329671 standard; DNA; FUN; 4539804 BP.\n'
_CHROMOSOME_3_HEADER = 'ID chromosome_3 standard; DNA; FUN; 2452883 BP.\nXX\nAC chromosome_3;\nXX\n'
_CHROMOSOME_2_HEADER = 'ID chromosome_2 standard; DNA; FUN; 4539804 BP.\nXX\nAC chromosome_2;\nXX\nFH Key Location/Qualifiers\nFH\n'


def _read_lines(path):
    """Return the lines of *path*, replacing undecodable bytes."""
    with open(path, errors='replace') as ins:
        return ins.readlines()


def _write_lines(path, lines):
    """Overwrite *path* with *lines*."""
    with open(path, 'w') as out:
        out.writelines(lines)


def _remove_from_first_line(path, text):
    """Remove every occurrence of *text* from the first line of *path*.

    Raises IndexError if the file is empty.
    """
    lines = _read_lines(path)
    lines[0] = lines[0].replace(text, '')
    _write_lines(path, lines)


def _ensure_header(path, header):
    """Prepend *header* to *path* unless its first line already contains 'ID'.

    The check is a plain substring test on the first line. The file is
    written back whether or not the header was added. Raises IndexError if
    the file is empty.
    """
    lines = _read_lines(path)
    if 'ID' not in lines[0]:
        lines = [header] + lines
    _write_lines(path, lines)


# NOTE(review): this fix was previously described as "File that has iD instead
# of ID on first line", but what the code does is strip the text 'disorders;'
# from the first line — confirm which description is accurate.
_remove_from_first_line('data/chromosome1/3219.contig', 'disorders;')

# Chromosome 2 revisions that may lack the ID line.
for f in ('data/chromosome2/5338.contig', 'data/chromosome2/5334.contig'):
    _ensure_header(f, _CHROMOSOME_2_ID_LINE)

# A bunch of revisions of chromosome 3 that do not have header
for rev in (28, 30, 31, 33, 34, 35, 39, 40):
    _ensure_header(f'data/chromosome3/{rev}.contig', _CHROMOSOME_3_HEADER)

# Chromosome 2 revisions that get a header extending through the FH lines.
for rev in (25, 26):
    _ensure_header(f'data/chromosome2/{rev}.contig', _CHROMOSOME_2_HEADER)