Skip to content

Commit ff8dcf4

Browse files
author
Rachel Colquhoun
committed
add code to publish from config
1 parent 7cf18c4 commit ff8dcf4

7 files changed

+373
-42
lines changed

bin/add_mask.py

+1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/usr/bin/env python3
2+
23
from Bio import SeqIO
34
import argparse
45
import re

bin/geography_cleaning.py

+2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#!/usr/bin/env python3
2+
13
import csv
24
import argparse
35
import geopandas as gp

bin/publish_from_config.py

+129
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
import glob
import json
import os
import shlex
import shutil
import subprocess
import sys
7+
8+
def parse_args():
    """Parse the command line of the publish-from-config script.

    Every option is required and is stored under an attribute with the same
    name as the flag (without the leading dashes).

    Returns:
        argparse.Namespace holding the FASTA, metadata, variants and recipes
        paths given on the command line.
    """
    parser = argparse.ArgumentParser(
        description="""Create published files from config file""",
        formatter_class=argparse.RawTextHelpFormatter,
    )

    # (option name, help text) in the order they should appear in --help.
    required_options = (
        ("unaligned_fasta", "Raw FASTA"),
        ("aligned_fasta", "Aligned, masked, untrimmed FASTA"),
        ("trimmed_fasta", "Aligned, masked, trimmed and filtered FASTA"),
        ("cog_global_fasta", "COG GISAID aligned FASTA"),
        ("cog_metadata", "MASSIVE CSV"),
        ("cog_global_metadata", "MASSIVE CSV"),
        ("cog_variants", "Mutations CSV"),
        ("cog_global_variants", "Mutations CSV"),
        ("recipes", "JSON of recipes"),
    )
    for option_name, help_text in required_options:
        parser.add_argument(
            "--" + option_name,
            dest=option_name,
            required=True,
            help=help_text,
        )

    return parser.parse_args()
26+
27+
#"data": "cog" or "cog_global"
28+
#"fasta": "unaligned", "aligned", "trimmed", "cog_global"
29+
#"metadata_fields": []
30+
#"variants": True or False to add columns from variants
31+
#"where": free text to be passed to fastafunk fetch --where-column
32+
#"suffix": something to append to file names
33+
34+
def _publish_path(outdir, info_dict, ending):
    """Return "<outdir>/<data>_<date>[_<suffix>]<ending>" for one recipe."""
    if "date" not in info_dict:
        sys.exit("Config entries need to specify a date to name published files")
    name_parts = [info_dict["data"], info_dict["date"]]
    if info_dict["suffix"] is not None:
        name_parts.append(info_dict["suffix"])
    return "%s/%s%s" % (outdir, "_".join(str(part) for part in name_parts), ending)


def get_info_from_config(config_dict, outdir, fasta_dict, csv_dict, var_dict):
    """Resolve one publish recipe into concrete input and output paths.

    Recipe keys read from ``config_dict`` (anything missing gets a default):
        "data": "cog" or "cog_global"
        "fasta": "unaligned", "aligned", "trimmed" or "cog_global"
        "metadata_fields": list of fields (only its truthiness is used here)
        "variants": True or False, whether a variants-augmented CSV is named
        "where": free text, carried through unchanged
        "suffix": something to append to file names
        "date": used in every published file name; needed as soon as any
            output file has to be named

    Args:
        config_dict: the recipe, merged over the defaults.
        outdir: directory prefix for the published file names.
        fasta_dict: maps the "fasta" choices to input FASTA paths.
        csv_dict: maps "cog"/"cog_global" to input metadata paths.
        var_dict: maps "cog"/"cog_global" to input variants paths.

    Returns:
        dict with the recipe values plus "in_fa", "in_csv", "in_var",
        "out_fa", "out_csv" and "out_var". Outputs that are not published
        keep their defaults ("tmp.fa", "tmp.csv", None).

    Exits (SystemExit) when neither a known "fasta" nor a known "data" is
    given, or when a file has to be named and the recipe has no "date".
    """
    info_dict = {
        "suffix": None, "data": None, "fasta": None, "metadata_fields": None,
        "where": None, "variants": False,
        "in_fa": None, "in_csv": None, "in_var": None,
        "out_fa": "tmp.fa", "out_csv": "tmp.csv", "out_var": None,
    }
    info_dict.update(config_dict)

    # Input FASTA: an explicit "fasta" choice wins, otherwise fall back on "data".
    if info_dict["fasta"] in fasta_dict:
        info_dict["in_fa"] = fasta_dict[info_dict["fasta"]]
    elif info_dict["data"] == "cog_global":
        info_dict["in_fa"] = fasta_dict["cog_global"]
    elif info_dict["data"] == "cog":
        info_dict["in_fa"] = fasta_dict["trimmed"]
    else:
        sys.exit("Config entries need to specify either fasta in "
                 "['unaligned', 'aligned', 'trimmed', 'cog_global'] or data "
                 "in ['cog', 'cog_global']")

    # Infer "data" from the fasta choice BEFORE picking the metadata and
    # variants inputs, so a fasta-only recipe still gets matching CSVs.
    if info_dict["data"] is None:
        if info_dict["fasta"] == "cog_global":
            info_dict["data"] = "cog_global"
        else:
            info_dict["data"] = "cog"

    if info_dict["data"] in ("cog", "cog_global"):
        info_dict["in_csv"] = csv_dict[info_dict["data"]]
        info_dict["in_var"] = var_dict[info_dict["data"]]

    # With variants the published CSV is the augmented one; the plain
    # metadata CSV then stays a temporary file.
    if info_dict["variants"]:
        info_dict["out_var"] = _publish_path(outdir, info_dict, "_metadata.csv")
    elif info_dict["metadata_fields"]:
        info_dict["out_csv"] = _publish_path(outdir, info_dict, "_metadata.csv")

    # Sequence output is FASTA, so it is named with a FASTA extension.
    if info_dict["fasta"]:
        ending = "_alignment.fa" if info_dict["metadata_fields"] else ".fa"
        info_dict["out_fa"] = _publish_path(outdir, info_dict, ending)

    return info_dict
87+
88+
89+
def publish_file(outdir, info_dict):
    """Create the published files described by one resolved recipe.

    Without "metadata_fields" the input FASTA is simply copied to "out_fa".
    Otherwise ``fastafunk fetch`` writes the FASTA and metadata outputs and,
    when "variants" is set, ``fastafunk add_columns`` joins the variants
    table onto that metadata. Leftover ``tmp.*`` files in the working
    directory are removed afterwards.

    Args:
        outdir: directory to publish into; created if it does not exist.
        info_dict: recipe dict with the keys "metadata_fields", "where",
            "variants", "in_fa", "in_csv", "in_var", "out_fa", "out_csv"
            and "out_var".

    Raises:
        subprocess.CalledProcessError: if a fastafunk command fails.
    """
    # Several recipes can share one outdir, so an existing directory is fine.
    os.makedirs(outdir, exist_ok=True)

    if info_dict["metadata_fields"] is None:
        shutil.copy(info_dict["in_fa"], info_dict["out_fa"])
        return

    # Argument lists (no shell) keep paths containing spaces intact.
    fetch_cmd = [
        "fastafunk", "fetch",
        "--in-fasta", info_dict["in_fa"],
        "--in-metadata", info_dict["in_csv"],
        "--index-column", "sequence_name",
        "--out-fasta", info_dict["out_fa"],
        "--out-metadata", info_dict["out_csv"],
        "--restrict",
    ]
    if info_dict["where"]:
        # "where" is free text that may hold several shell-style tokens.
        fetch_cmd.append("--where-column")
        fetch_cmd.extend(shlex.split(info_dict["where"]))
    subprocess.run(fetch_cmd, check=True)

    if info_dict["variants"]:
        add_columns_cmd = [
            "fastafunk", "add_columns",
            "--in-metadata", info_dict["out_csv"],
            "--in-data", info_dict["in_var"],
            "--index-column", "sequence_name",
            "--join-on", "query",
            "--out-metadata", info_dict["out_var"],
        ]
        subprocess.run(add_columns_cmd, check=True)

    # Outputs that were not published were written as tmp.* in the cwd.
    for leftover in glob.glob("tmp.*"):
        if os.path.isfile(leftover):
            os.remove(leftover)
111+
112+
def main():
    """Publish every recipe listed in the recipes JSON file.

    The JSON is read as a mapping from output directory to a list of
    recipes. Each recipe is resolved against the input files given on the
    command line and then published into its output directory.
    """
    args = parse_args()

    # Lookup tables from the recipe vocabulary to the command-line inputs.
    fasta_dict = {
        "unaligned": args.unaligned_fasta,
        "aligned": args.aligned_fasta,
        "trimmed": args.trimmed_fasta,
        "cog_global": args.cog_global_fasta,
    }
    csv_dict = {"cog": args.cog_metadata, "cog_global": args.cog_global_metadata}
    var_dict = {"cog": args.cog_variants, "cog_global": args.cog_global_variants}

    with open(args.recipes, 'r') as handle:
        recipes = json.load(handle)

    for outdir, recipe_list in recipes.items():
        for recipe in recipe_list:
            resolved = get_info_from_config(recipe, outdir, fasta_dict, csv_dict, var_dict)
            publish_file(outdir, resolved)


if __name__ == '__main__':
    main()

config/base.config

+6-2
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,21 @@ params {
33

44
date = false
55
publish_dir = "publish"
6+
publish_dev = "publish_dev"
67

78
// new cog-uk files each week
89
uk_fasta = "test/matched.fa"
910
uk_metadata = "test/matched.tsv"
1011
uk_accessions = "test/accessions.tsv"
12+
uk_variants = "test/matched2.variants" // null param so exists
1113

1214
// if carrying forward from previous
1315
uk_previous_metadata = ""
1416

1517
// latest gisaid results output by gisaid pipeline
16-
gisaid_fasta = false
17-
gisaid_metdata = false
18+
gisaid_fasta = "test/gisaid.matched.fa"
19+
gisaid_metadata = "test/gisaid.matched.csv"
20+
gisaid_variants = "test/gisaid.matched.variants"
1821

1922
// resources files
2023
uk_updated_dates = "resources/date_corrections.csv"
@@ -25,6 +28,7 @@ params {
2528
dels = "resources/dels.csv"
2629
mask_file = "resources/mask.txt"
2730
uk_geography = "resources/geography/"
31+
publish_recipes = "resources/publish_recipes.json"
2832

2933
// parameter values set
3034
time_window = false

modules/align_and_variant_call_cog_uk.nf

+2-3
Original file line numberDiff line numberDiff line change
@@ -253,12 +253,11 @@ aas = file(params.aas)
253253
dels = file(params.dels)
254254
reference_fasta = file(params.reference_fasta)
255255
reference_genbank = file(params.reference_genbank)
256-
publish_dir = file(params.publish_dir)
257256

258257

259258
workflow {
260-
uk_fasta = file(params.uk_fasta)
261-
uk_metadata = file(params.uk_metadata)
259+
uk_fasta = Channel.fromPath(params.uk_fasta)
260+
uk_metadata = Channel.fromPath(params.uk_metadata)
262261

263262
align_and_variant_call_cog_uk(uk_fasta,
264263
uk_metadata)

0 commit comments

Comments
 (0)