Skip to content

Commit

Permalink
docs: small fixes in generate_from_genbank script
Browse files Browse the repository at this point in the history
  • Loading branch information
rneher committed Jan 14, 2024
1 parent 152e1b4 commit 39ce2c4
Showing 1 changed file with 14 additions and 10 deletions.
24 changes: 14 additions & 10 deletions docs/example-workflow/scripts/generate_from_genbank.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/env python3
import shutil
import os
from Bio import SeqIO
from collections import defaultdict

Expand Down Expand Up @@ -63,6 +63,8 @@ def check_name(new_name):
args = parse_args()
reference = get_reference_sequence(args.reference)
gff = get_gff(args.reference)
if not os.path.isdir(args.output_dir):
os.makedirs(args.output_dir)

# write the reference sequence to the output directory
SeqIO.write(reference, f"{args.output_dir}/reference.fasta", "fasta")
Expand Down Expand Up @@ -102,15 +104,16 @@ def check_name(new_name):
available_attributes_by_type = {'CDS':set(), 'mature_protein': set()}
for annot, annot_set in available_attributes_by_type.items():
for feat in all_cds.values():
for seg in feat[annot]:
if len(annot_set):
annot_set = annot_set.intersection(list(seg[1].keys()))
else:
annot_set = annot_set.union(list(seg[1].keys()))
if annot in feat:
for seg in feat[annot]:
if len(annot_set):
annot_set = annot_set.intersection(list(seg[1].keys()))
else:
annot_set = annot_set.union(list(seg[1].keys()))
available_attributes_by_type[annot] = annot_set
print(f"\nCommon attributes of all available {annot} annotations are:\n", annot_set, '\n')

available_attributes = available_attributes_by_type['CDS'] if annot_set==1 else \
available_attributes = available_attributes_by_type['CDS'] if annotation_choice==1 else \
available_attributes_by_type['CDS'].intersection(available_attributes_by_type['mature_protein'])

print("Note that CDS names should be short, unique, and recognizable as they are used to label amino acid mutations. Space, commas, colons, etc are not allowed.\n")
Expand All @@ -120,13 +123,15 @@ def check_name(new_name):
break
else:
print("invalid choice: ", name_choice)
print("pick one of", available_attributes)

streamlined_cds = {}
names_by_id = {}
name_count = defaultdict(int)
# loop through all CDS and ask the user to pick one of the annotations
# allow renaming of the CDS to user friendly names
for cds_id, cds_sets in all_cds.items():
cds_set = list(cds_sets.keys())[0]
if annotation_choice==0:
if len(cds_sets)>1:
print(f"\nCDS {cds_id} is annotated in multiple ways:")
Expand All @@ -147,8 +152,6 @@ def check_name(new_name):
segment_id = segment[1]["ID"]
# only needed for one segment per CDS, skip if already in streamlined_cds
if segment_id not in streamlined_cds:
streamlined_cds[segment_id] = []

if name_choice:
new_name = segment[1][name_choice]
else:
Expand All @@ -158,10 +161,11 @@ def check_name(new_name):
new_name = input("Enter desired name (leave empty to drop): ")
if new_name=='': continue

streamlined_cds[segment_id] = []
new_name = check_name(new_name)
name_count[new_name] += 1
if name_count[new_name]>1:
print(f"CDS name {new_name} is not unique, attaching index to make unique.")
print(f"\nWARNING: CDS name {new_name} is not unique, attaching index to make unique.\n")
new_name = f"{new_name}_{name_count[new_name]:03d}"

names_by_id[segment_id] = new_name
Expand Down

0 comments on commit 39ce2c4

Please sign in to comment.