
Commit 165f919

Rachel Colquhoun (rmcolq) authored and committed
label source id duplicates, don't remove, keep UK-ENG format for public metadata
1 parent ccbd142 · commit 165f919

5 files changed: +31 -137 lines

bin/add_to_uk_metadata.py

+7 -1

@@ -48,6 +48,11 @@ def add_sample_date(row, date_dict):
     except:
         row["sample_date"] = ""
 
+def add_source_id(row):
+    row["source_id"] = row["biosample_source_id"]
+    if row["root_biosample_source_id"] not in [None,""]:
+        row["source_id"] = row["root_biosample_source_id"]
+
 def add_pillar_2(row):
     if row['collection_pillar'] == 2 or row['central_sample_id'][0:4] in ["ALDP", "CAMC", "MILK", "QEUH"]:
         row["pillar_2"] = True
@@ -151,7 +156,7 @@ def main():
 
     date_dict = load_updated_dates(args.updated_date_file)
     accession_dict = load_accession(args.accession_file, log_handle)
-    new_columns = ["sample_date", "pillar_2", "sequence_name", "covv_accession_id", "edin_epi_week", "edin_epi_day", "why_excluded"]
+    new_columns = ["sample_date", "source_id", "pillar_2", "sequence_name", "covv_accession_id", "edin_epi_week", "edin_epi_day", "why_excluded"]
 
     with open(args.in_metadata, 'r', newline = '') as csv_in, \
         open(args.out_metadata, 'w', newline = '') as csv_out:
@@ -163,6 +168,7 @@ def main():
         for row in reader:
             try:
                 add_sample_date(row, date_dict)
+                add_source_id(row)
                 add_pillar_2(row)
                 add_sequence_name(row)
                 add_covv_accession_id(row, accession_dict)
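Note: add_source_id gives root_biosample_source_id precedence over biosample_source_id whenever the root id is non-empty. A minimal sketch of that precedence, runnable on its own (the two example rows are hypothetical):

# Hypothetical rows illustrating the precedence in add_source_id:
# root_biosample_source_id wins whenever it is non-empty.
def add_source_id(row):
    row["source_id"] = row["biosample_source_id"]
    if row["root_biosample_source_id"] not in [None, ""]:
        row["source_id"] = row["root_biosample_source_id"]

row_a = {"biosample_source_id": "BIO-1", "root_biosample_source_id": ""}
row_b = {"biosample_source_id": "BIO-2", "root_biosample_source_id": "ROOT-7"}
add_source_id(row_a)
add_source_id(row_b)
assert row_a["source_id"] == "BIO-1"   # falls back to biosample_source_id
assert row_b["source_id"] == "ROOT-7"  # root id takes precedence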

modules/deduplicate_cog_uk.nf

+16 -128

@@ -148,43 +148,37 @@ process uk_unify_headers {
 }
 
 
-process uk_remove_duplicates_biosamplesourceid_by_date {
+process uk_label_sourceid_duplicates_to_omit {
     /**
-    * Where duplicate biosample_source_id, keeps the earliest
+    * Where duplicate source_id, labels all but the earliest as duplicates
     * @input uk_fasta, uk_metadata
     * @output uk_fasta_updated, uk_metadata_updated
     */
 
     publishDir "${publish_dev}/", pattern: "*.log", mode: 'copy'
 
     input:
-    path uk_fasta
     path uk_metadata
 
     output:
-    path "${uk_fasta.baseName}.deduplicated_by_biosamplesourceid.fa", emit: uk_fasta_updated
-    path "${uk_metadata.baseName}.deduplicated_by_biosamplesourceid.csv", emit: uk_metadata_updated
-    path "deduplicated_by_biosamplesourceid.log", emit: deduplicate_log
+    path "${uk_metadata.baseName}.deduplicated_by_sourceid.csv", emit: uk_metadata_updated
+    path "deduplicated_by_sourceid.log", emit: deduplicate_log
 
     script:
     """
     #!/usr/bin/env python3
     from Bio import SeqIO
     import csv
 
-    alignment = SeqIO.index("${uk_fasta}", "fasta")
-
     dup_dict = {}
     tokeep = set()
 
     with open("${uk_metadata}", 'r', newline = '') as csv_in:
         reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
 
         for row in reader:
-            if row["why_excluded"]:
-                continue
             fasta_header = row["sequence_name"]
-            id = row["biosample_source_id"]
+            id = row["source_id"]
             epi_day = int(row["edin_epi_day"])
             completeness = float(row["unmapped_genome_completeness"])
@@ -200,7 +194,7 @@ process uk_remove_duplicates_biosamplesourceid_by_date {
             else:
                 dup_dict[id] = [{"fasta_header": fasta_header, "epi_day": epi_day, "completeness":completeness}]
 
-    with open("deduplicated_by_biosamplesourceid.log", "w") as log:
+    with open("deduplicated_by_sourceid.log", "w") as log:
         for k,v in dup_dict.items():
             tokeep.add(v[0]["fasta_header"])
             if len(v) > 1:
@@ -211,145 +205,39 @@ process uk_remove_duplicates_biosamplesourceid_by_date {
 
 
     with open("${uk_metadata}", 'r', newline = '') as csv_in, \
-        open("${uk_metadata.baseName}.deduplicated_by_biosamplesourceid.csv", 'w', newline = '') as csv_out, \
-        open("${uk_fasta.baseName}.deduplicated_by_biosamplesourceid.fa", 'w') as fasta_out:
+        open("${uk_metadata.baseName}.deduplicated_by_sourceid.csv", 'w', newline = '') as csv_out:
 
         reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
-        writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix")
+        writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames + ["duplicate"], delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix")
         writer.writeheader()
 
         for row in reader:
-            if row["why_excluded"]:
-                writer.writerow(row)
-                continue
+            row["duplicate"] = None
             fasta_header = row["sequence_name"]
-            if fasta_header in tokeep:
-                writer.writerow(row)
-                seqrec = alignment[fasta_header]
-                fasta_out.write(">" + seqrec.id + "\\n")
-                fasta_out.write(str(seqrec.seq) + "\\n")
-            else:
-                row["why_excluded"] = "duplicate biosample_source_id"
-                writer.writerow(row)
+            if fasta_header not in tokeep:
+                row["duplicate"] = "True"
+            writer.writerow(row)
     """
 }
 
-process uk_remove_duplicates_rootbiosample_by_date {
-    /**
-    * Where duplicate root_biosample, keeps the oldest
-    * @input uk_fasta, uk_metadata
-    * @output uk_fasta_updated, uk_metadata_updated
-    * @params date
-    */
-
-    publishDir "${publish_dev}/", pattern: "*.log", mode: 'copy'
-
-    input:
-    path uk_fasta
-    path uk_metadata
-
-    output:
-    path "${uk_fasta.baseName}.deduplicated_by_rootbiosample.fa", emit: uk_fasta_updated
-    path "${uk_metadata.baseName}.deduplicated_by_rootbiosample.csv", emit: uk_metadata_updated
-    path "deduplicated_by_rootbiosample.log", emit: deduplicate_log
-
-    script:
-    """
-    #!/usr/bin/env python3
-    from Bio import SeqIO
-    import csv
 
-    alignment = SeqIO.index("${uk_fasta}", "fasta")
-
-    dup_dict = {}
-    tokeep = set()
-
-    with open("${uk_metadata}", 'r', newline = '') as csv_in:
-        reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
-
-        for row in reader:
-            if row["why_excluded"]:
-                continue
-            fasta_header = row["sequence_name"]
-            id = row["root_biosample_source_id"]
-            epi_day = int(row["edin_epi_day"])
-            completeness = float(row["unmapped_genome_completeness"])
-
-            if id in ["None", "", None]:
-                tokeep.add(fasta_header)
-                continue
-
-            if id in dup_dict:
-                if epi_day < dup_dict[id][0]["epi_day"]:
-                    dup_dict[id].insert(0, {"fasta_header": fasta_header, "epi_day": epi_day, "completeness":completeness})
-                else:
-                    dup_dict[id].append({"fasta_header": fasta_header, "epi_day": epi_day, "completeness":completeness})
-            else:
-                dup_dict[id] = [{"fasta_header": fasta_header, "epi_day": epi_day, "completeness":completeness}]
-
-    with open("deduplicated_by_rootbiosample.log", "w") as log:
-        for k,v in dup_dict.items():
-            tokeep.add(v[0]["fasta_header"])
-            if len(v) > 1:
-                for dup in v[1:]:
-                    log.write("For id %s, %s epi_day:%s completeness:%s kept, %s epi_day:%s completeness:%s removed as duplicate\\n" \
-                        %(k, v[0]["fasta_header"], v[0]["epi_day"], v[0]["completeness"], dup["fasta_header"], \
-                        dup["epi_day"], dup["completeness"]))
-
-    with open("${uk_metadata}", 'r', newline = '') as csv_in, \
-        open("${uk_metadata.baseName}.deduplicated_by_rootbiosample.csv", 'w', newline = '') as csv_out, \
-        open("${uk_fasta.baseName}.deduplicated_by_rootbiosample.fa", 'w') as fasta_out:
-
-        reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
-        writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix")
-        writer.writeheader()
-
-        for row in reader:
-            if row["why_excluded"]:
-                writer.writerow(row)
-                continue
-            fasta_header = row["sequence_name"]
-            if fasta_header in tokeep:
-                writer.writerow(row)
-                seqrec = alignment[fasta_header]
-                fasta_out.write(">" + seqrec.id + "\\n")
-                fasta_out.write(str(seqrec.seq) + "\\n")
-            else:
-                row["why_excluded"] = "duplicate root_biosample_source_id"
-                writer.writerow(row)
-    """
-}
-
-
-workflow deduplicate_by_cogid_cog_uk {
+workflow deduplicate_cog_uk {
     take:
         uk_fasta
         uk_metadata
     main:
         uk_annotate_with_unmapped_genome_completeness(uk_fasta, uk_metadata)
         uk_remove_duplicates_COGID_by_proportionN(uk_fasta, uk_annotate_with_unmapped_genome_completeness.out)
         uk_unify_headers(uk_remove_duplicates_COGID_by_proportionN.out.uk_fasta_updated, uk_remove_duplicates_COGID_by_proportionN.out.uk_metadata_updated)
+        uk_label_sourceid_duplicates_to_omit(uk_remove_duplicates_COGID_by_proportionN.out.uk_metadata_updated)
     emit:
         fasta = uk_unify_headers.out
-        metadata = uk_remove_duplicates_COGID_by_proportionN.out.uk_metadata_updated
-}
-
-workflow deduplicate_by_biosample_cog_uk {
-    take:
-        uk_fasta
-        uk_metadata
-    main:
-        uk_remove_duplicates_biosamplesourceid_by_date(uk_fasta, uk_metadata)
-        uk_remove_duplicates_rootbiosample_by_date(uk_remove_duplicates_biosamplesourceid_by_date.out.uk_fasta_updated, uk_remove_duplicates_biosamplesourceid_by_date.out.uk_metadata_updated)
-    emit:
-        fasta = uk_remove_duplicates_rootbiosample_by_date.out.uk_fasta_updated
-        metadata = uk_remove_duplicates_rootbiosample_by_date.out.uk_metadata_updated
+        metadata = uk_label_sourceid_duplicates_to_omit.out.uk_metadata_updated
 }
 
 
 workflow {
     uk_fasta = file(params.uk_fasta)
     uk_metadata = file(params.uk_metadata)
-    deduplicate_by_cogid_cog_uk(uk_fasta, uk_metadata)
-    deduplicate_by_biosample_cog_uk(deduplicate_by_cogid_cog_uk.out.fasta, deduplicate_by_cogid_cog_uk.out.metadata)
+    deduplicate_cog_uk(uk_fasta, uk_metadata)
 }
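Note: the relabelled process keeps every metadata row and no longer rewrites the fasta. Per source_id it still prefers the record with the earliest edin_epi_day; the rest are now marked in a new duplicate column instead of having why_excluded set and their sequences dropped. A minimal sketch of the keep-earliest labelling, with hypothetical rows:

# Keep-earliest labelling per source_id, mirroring the process above.
# All rows and field values here are hypothetical.
rows = [
    {"sequence_name": "seq1", "source_id": "S1", "edin_epi_day": 10},
    {"sequence_name": "seq2", "source_id": "S1", "edin_epi_day": 7},
    {"sequence_name": "seq3", "source_id": "S2", "edin_epi_day": 12},
]

earliest = {}  # source_id -> row with the smallest edin_epi_day seen so far
for row in rows:
    best = earliest.get(row["source_id"])
    if best is None or row["edin_epi_day"] < best["edin_epi_day"]:
        earliest[row["source_id"]] = row

tokeep = {r["sequence_name"] for r in earliest.values()}
for row in rows:
    # Label rather than remove: downstream steps decide what to omit.
    row["duplicate"] = None if row["sequence_name"] in tokeep else "True"

assert [r["duplicate"] for r in rows] == ["True", None, None]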

modules/publish_all.nf

+3 -2

@@ -33,7 +33,7 @@ process combine_cog_gisaid {
         --filter-column fasta_header covv_accession_id central_sample_id biosample_source_id secondary_identifier root_sample_id \
                         pillar_2 \
                         sequence_name sample_date epi_week \
-                        country adm1 adm2 outer_postcode adm2_raw adm2_source nuts1 region latitude longitude location \
+                        country adm1 adm2 outer_postcode adm2_raw adm2_source NUTS1 region latitude longitude location \
                         submission_org_code is_surveillance is_community is_hcw \
                         is_travel_history travel_history \
                         lineage lineage_support lineages_version \
@@ -160,7 +160,8 @@ process uk_geography {
         --in-metadata ${uk_metadata} \
         --index-column sequence_name \
         --filter-column central_sample_id sequence_name sample_date epi_week \
-                        adm0 adm1 adm2 adm2_private \
+                        adm0 adm1 adm2 adm2_private adm1_UK \
+        --where-column adm1_UK=adm1 \
         --out-fasta geography_tmp/fetch.fa \
         --out-metadata geography_tmp/fetch.csv \
         --restrict
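Note: --where-column adm1_UK=adm1 appears to seed a new adm1_UK column from the existing adm1 values before the geography cleaning rewrites adm1, which is what lets the UK-ENG style codes survive downstream. A sketch of that presumed copy semantics; apply_where_column and the rows are hypothetical, not the pipeline's actual tooling:

# Hypothetical illustration of a where-column mapping NEW=OLD:
# the new column is filled from the old one when it is empty or absent.
def apply_where_column(rows, mapping):
    """mapping like {"adm1_UK": "adm1"}: copy OLD into NEW per row."""
    for row in rows:
        for new_col, old_col in mapping.items():
            if not row.get(new_col):
                row[new_col] = row.get(old_col, "")
    return rows

rows = [{"adm1": "UK-ENG"}, {"adm1": "UK-SCT"}]
apply_where_column(rows, {"adm1_UK": "adm1"})
# adm1_UK now snapshots the UK-style codes before adm1 is rewritten
assert [r["adm1_UK"] for r in rows] == ["UK-ENG", "UK-SCT"]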

resources/publish_recipes.json

+1 -1

@@ -24,7 +24,7 @@
         "suffix": "public",
         "data": "cog_global",
         "metadata_fields": ["sequence_name","cog_id","gisaid_id","sample_date","epi_week","country","adm1","pillar_2","is_surveillance","is_travel_history","travel_history","lineage","lineage_support","lineages_version"],
-        "where": "gisaid_id=covv_accession_id cog_id=central_sample_id"
+        "where": "gisaid_id=covv_accession_id cog_id=central_sample_id adm1=adm1_UK"
     },
     {
         "suffix": "consortium",

workflows/process_cog_uk.nf

+4 -5

@@ -19,12 +19,11 @@ workflow process_cog_uk {
     main:
         preprocess_cog_uk(uk_fasta, uk_metadata, uk_accessions)
         pangolin_cog_uk(preprocess_cog_uk.out.fasta, preprocess_cog_uk.out.metadata)
-        deduplicate_by_cogid_cog_uk(preprocess_cog_uk.out.fasta, pangolin_cog_uk.out.metadata)
-        align_and_variant_call_cog_uk(deduplicate_by_cogid_cog_uk.out.fasta)
-        deduplicate_by_biosample_cog_uk(align_and_variant_call_cog_uk.out.fasta,deduplicate_by_cogid_cog_uk.out.metadata)
-        filter_and_trim_cog_uk(deduplicate_by_biosample_cog_uk.out.fasta, deduplicate_by_biosample_cog_uk.out.metadata)
+        deduplicate_cog_uk(preprocess_cog_uk.out.fasta, pangolin_cog_uk.out.metadata)
+        align_and_variant_call_cog_uk(deduplicate_cog_uk.out.fasta)
+        filter_and_trim_cog_uk(deduplicate_cog_uk.out.fasta, deduplicate_cog_uk.out.metadata)
     emit:
-        unaligned_fasta = deduplicate_by_cogid_cog_uk.out.fasta
+        unaligned_fasta = deduplicate_cog_uk.out.fasta
         aligned_fasta = align_and_variant_call_cog_uk.out.fasta
         trimmed_fasta = filter_and_trim_cog_uk.out.fasta
         metadata = filter_and_trim_cog_uk.out.metadata
