-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpombe_svn_diff.py
89 lines (67 loc) · 4.03 KB
/
pombe_svn_diff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""
Calculate the differences between genome versions, relies on the folder structure defined in the readme.
"""
import os
from genome_functions import genome_dict_diff, build_seqfeature_dict,read_pombe_genome, genome_sequences_are_different, make_synonym_dict
from formatting_functions import write_diff_to_files
import glob
import argparse
from custom_biopython import SeqIO
class Formatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
    # Mixin combining two argparse behaviours: show each argument's default in --help
    # (ArgumentDefaultsHelpFormatter) and keep the module docstring's line breaks
    # intact in the description (RawDescriptionHelpFormatter).
    pass
parser = argparse.ArgumentParser(description=__doc__, formatter_class=Formatter)
# Default: every subfolder of data/, matching the layout described in the readme.
parser.add_argument('--data_folders', nargs='+', default=glob.glob('data/*'), help='folders where the analysis will be ran.')
args = parser.parse_args()
# Known errors (revision number)
# skip_files: column 0 is matched against the folder basename below (presumably the
# contig name — TODO confirm), column 1 is a comma-separated list of revisions to skip.
skip_files = dict()
with open('valid_ids_data/revisions2skip.tsv') as ins:
    for line in ins:
        ls = line.strip().split('\t')
        skip_files[ls[0]] = ls[1].split(',')
# Gene-identifier synonym lookup used by the legacy genome reader in parse_genome.
synonym_dict = make_synonym_dict('valid_ids_data/gene_IDs_names.tsv', 'valid_ids_data/obsoleted_ids.tsv', 'valid_ids_data/missing_synonyms.tsv')
def parse_genome(genome_file, date):
    """Parse an EMBL genome file into a SeqRecord.

    Revisions dated on/after 2023-03-01 are read with the plain Biopython
    reader; older ones go through the custom pombe reader, which handles
    several edge-cases present in historical files. `date` is an ISO-format
    date string, so plain string comparison orders correctly.
    """
    if date >= '2023-03-01':
        # Recent files parse cleanly; replace undecodable bytes rather than fail.
        with open(genome_file, errors='replace') as handle:
            return SeqIO.read(handle, 'embl')
    # Legacy path: custom reader with synonym and exception tables.
    return read_pombe_genome(
        genome_file,
        'embl',
        synonym_dict,
        'valid_ids_data/all_systematic_ids_ever.txt',
        'valid_ids_data/known_exceptions.tsv',
    )
# Walk each data folder and diff every consecutive pair of genome revisions,
# writing location and qualifier changes to the folder's change_log directory.
for folder in args.data_folders:
    folder = os.path.normpath(folder)
    output_folder = f'{folder}/change_log'
    contig_file_name = os.path.basename(folder)
    # revisions.txt: one whitespace-separated record per revision, newest first;
    # field 0 is the revision id, field 2 a date string (used by parse_genome).
    with open(f'{folder}/revisions.txt') as f:
        revisions = f.read().splitlines()
    # Remove the known errors
    if contig_file_name in skip_files:
        revisions = [r for r in revisions if r.split()[0] not in skip_files[contig_file_name]]
    # Prepare first iteration
    old_genome_dict = None
    sequences_different_prev_iteration = None
    for i in range(len(revisions)-1):
        new_revision_list = revisions[i].split()
        old_revision_list = revisions[i+1].split()
        new_genome_file = f'{folder}/{new_revision_list[0]}.contig'
        old_genome_file = f'{folder}/{old_revision_list[0]}.contig'
        locations_output_file = f'{output_folder}/locations/{new_revision_list[0]}.tsv'
        qualifiers_output_file = f'{output_folder}/qualifiers/{new_revision_list[0]}.tsv'
        # Check if output files exist, if so skip
        if os.path.isfile(locations_output_file) and os.path.isfile(qualifiers_output_file):
            print(f'{folder}, skipped diff {new_revision_list[0]} & {old_revision_list[0]}')
            # Next genome will have to be read: the cached dict no longer matches
            # the next iteration's "new" revision.
            old_genome_dict = None
            continue
        print(f'{folder}, performing diff {new_revision_list[0]} & {old_revision_list[0]}')
        # Check if genome sequence has changed, in that case we use a custom SeqFeature that takes longer to load
        sequences_different_this_iteration = genome_sequences_are_different(new_genome_file, old_genome_file)
        if sequences_different_this_iteration:
            print('> Loading feature sequences, this comparison will be slow')
        # Keep data from last iteration to avoid re-reading contig file: the previous
        # iteration's "old" genome is this iteration's "new" one, reusable only if it
        # was built with the same sequences_different flag.
        if (old_genome_dict is not None) and (sequences_different_this_iteration == sequences_different_prev_iteration):
            new_genome_dict = old_genome_dict
        else:
            new_genome_dict = build_seqfeature_dict(parse_genome(new_genome_file, new_revision_list[2]), sequences_different_this_iteration)
        old_genome_dict = build_seqfeature_dict(parse_genome(old_genome_file, old_revision_list[2]), sequences_different_this_iteration)
        # Get diffs
        locations_added, locations_removed, qualifiers_added, qualifiers_removed = genome_dict_diff(new_genome_dict, old_genome_dict, sequences_different_this_iteration)
        write_diff_to_files(locations_added, locations_removed, qualifiers_added, qualifiers_removed, new_revision_list, locations_output_file, qualifiers_output_file)
        # BUG FIX: record this iteration's flag. Previously this variable was never
        # updated, so the cache comparison above was always `bool == None` (false)
        # and the contig-reuse optimization never triggered.
        sequences_different_prev_iteration = sequences_different_this_iteration