@@ -148,43 +148,37 @@ process uk_unify_headers {
}


-process uk_remove_duplicates_biosamplesourceid_by_date {
+process uk_label_sourceid_duplicates_to_omit {
    /**
-    * Where duplicate biosample_source_id, keeps the earliest
+    * Where duplicate source_id, labels all but the earliest as duplicates
    * @input uk_fasta, uk_metadata
    * @output uk_fasta_updated, uk_metadata_updated
    */

    publishDir "${publish_dev}/", pattern: "*.log", mode: 'copy'

    input:
-    path uk_fasta
    path uk_metadata

    output:
-    path "${uk_fasta.baseName}.deduplicated_by_biosamplesourceid.fa", emit: uk_fasta_updated
-    path "${uk_metadata.baseName}.deduplicated_by_biosamplesourceid.csv", emit: uk_metadata_updated
-    path "deduplicated_by_biosamplesourceid.log", emit: deduplicate_log
+    path "${uk_metadata.baseName}.deduplicated_by_sourceid.csv", emit: uk_metadata_updated
+    path "deduplicated_by_sourceid.log", emit: deduplicate_log

    script:
    """
    #!/usr/bin/env python3
    from Bio import SeqIO
    import csv

-    alignment = SeqIO.index("${uk_fasta}", "fasta")
-
    dup_dict = {}
    tokeep = set()

    with open("${uk_metadata}", 'r', newline = '') as csv_in:
        reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")

        for row in reader:
-            if row["why_excluded"]:
-                continue
            fasta_header = row["sequence_name"]
-            id = row["biosample_source_id"]
+            id = row["source_id"]
            epi_day = int(row["edin_epi_day"])
            completeness = float(row["unmapped_genome_completeness"])

@@ -200,7 +194,7 @@ process uk_remove_duplicates_biosamplesourceid_by_date {
            else:
                dup_dict[id] = [{"fasta_header": fasta_header, "epi_day": epi_day, "completeness":completeness}]

-    with open("deduplicated_by_biosamplesourceid.log", "w") as log:
+    with open("deduplicated_by_sourceid.log", "w") as log:
        for k,v in dup_dict.items():
            tokeep.add(v[0]["fasta_header"])
            if len(v) > 1:
@@ -211,145 +205,39 @@ process uk_remove_duplicates_biosamplesourceid_by_date {


    with open("${uk_metadata}", 'r', newline = '') as csv_in, \
-        open("${uk_metadata.baseName}.deduplicated_by_biosamplesourceid.csv", 'w', newline = '') as csv_out, \
-        open("${uk_fasta.baseName}.deduplicated_by_biosamplesourceid.fa", 'w') as fasta_out:
+        open("${uk_metadata.baseName}.deduplicated_by_sourceid.csv", 'w', newline = '') as csv_out:

        reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
-        writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix")
+        writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames + ["duplicate"], delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix")
        writer.writeheader()

        for row in reader:
-            if row["why_excluded"]:
-                writer.writerow(row)
-                continue
+            row["duplicate"] = None
            fasta_header = row["sequence_name"]
-            if fasta_header in tokeep:
-                writer.writerow(row)
-                seqrec = alignment[fasta_header]
-                fasta_out.write(">" + seqrec.id + "\\n")
-                fasta_out.write(str(seqrec.seq) + "\\n")
-            else:
-                row["why_excluded"] = "duplicate biosample_source_id"
-                writer.writerow(row)
+            if fasta_header not in tokeep:
+                row["duplicate"] = "True"
+            writer.writerow(row)
    """
}

-process uk_remove_duplicates_rootbiosample_by_date {
-    /**
-    * Where duplicate root_biosample, keeps the oldest
-    * @input uk_fasta, uk_metadata
-    * @output uk_fasta_updated, uk_metadata_updated
-    * @params date
-    */
-
-    publishDir "${publish_dev}/", pattern: "*.log", mode: 'copy'
-
-    input:
-    path uk_fasta
-    path uk_metadata
-
-    output:
-    path "${uk_fasta.baseName}.deduplicated_by_rootbiosample.fa", emit: uk_fasta_updated
-    path "${uk_metadata.baseName}.deduplicated_by_rootbiosample.csv", emit: uk_metadata_updated
-    path "deduplicated_by_rootbiosample.log", emit: deduplicate_log
-
-    script:
-    """
-    #!/usr/bin/env python3
-    from Bio import SeqIO
-    import csv
-
-    alignment = SeqIO.index("${uk_fasta}", "fasta")
-
-    dup_dict = {}
-    tokeep = set()
-
-    with open("${uk_metadata}", 'r', newline = '') as csv_in:
-        reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
-
-        for row in reader:
-            if row["why_excluded"]:
-                continue
-            fasta_header = row["sequence_name"]
-            id = row["root_biosample_source_id"]
-            epi_day = int(row["edin_epi_day"])
-            completeness = float(row["unmapped_genome_completeness"])
-
-            if id in ["None", "", None]:
-                tokeep.add(fasta_header)
-                continue
-
-            if id in dup_dict:
-                if epi_day < dup_dict[id][0]["epi_day"]:
-                    dup_dict[id].insert(0, {"fasta_header": fasta_header, "epi_day": epi_day, "completeness":completeness})
-                else:
-                    dup_dict[id].append({"fasta_header": fasta_header, "epi_day": epi_day, "completeness":completeness})
-            else:
-                dup_dict[id] = [{"fasta_header": fasta_header, "epi_day": epi_day, "completeness":completeness}]
-
-    with open("deduplicated_by_rootbiosample.log", "w") as log:
-        for k,v in dup_dict.items():
-            tokeep.add(v[0]["fasta_header"])
-            if len(v) > 1:
-                for dup in v[1:]:
-                    log.write("For id %s, %s epi_day:%s completeness:%s kept, %s epi_day:%s completeness:%s removed as duplicate\\n" \
-                        %(k, v[0]["fasta_header"], v[0]["epi_day"], v[0]["completeness"], dup["fasta_header"], \
-                        dup["epi_day"], dup["completeness"]))
-
-    with open("${uk_metadata}", 'r', newline = '') as csv_in, \
-        open("${uk_metadata.baseName}.deduplicated_by_rootbiosample.csv", 'w', newline = '') as csv_out, \
-        open("${uk_fasta.baseName}.deduplicated_by_rootbiosample.fa", 'w') as fasta_out:
-
-        reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
-        writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix")
-        writer.writeheader()
-
-        for row in reader:
-            if row["why_excluded"]:
-                writer.writerow(row)
-                continue
-            fasta_header = row["sequence_name"]
-            if fasta_header in tokeep:
-                writer.writerow(row)
-                seqrec = alignment[fasta_header]
-                fasta_out.write(">" + seqrec.id + "\\n")
-                fasta_out.write(str(seqrec.seq) + "\\n")
-            else:
-                row["why_excluded"] = "duplicate root_biosample_source_id"
-                writer.writerow(row)
-    """
-}
-
-
-workflow deduplicate_by_cogid_cog_uk {
+workflow deduplicate_cog_uk {
    take:
        uk_fasta
        uk_metadata
    main:
        uk_annotate_with_unmapped_genome_completeness(uk_fasta, uk_metadata)
        uk_remove_duplicates_COGID_by_proportionN(uk_fasta, uk_annotate_with_unmapped_genome_completeness.out)
        uk_unify_headers(uk_remove_duplicates_COGID_by_proportionN.out.uk_fasta_updated, uk_remove_duplicates_COGID_by_proportionN.out.uk_metadata_updated)
+        uk_label_sourceid_duplicates_to_omit(uk_remove_duplicates_COGID_by_proportionN.out.uk_metadata_updated)
    emit:
        fasta = uk_unify_headers.out
-        metadata = uk_remove_duplicates_COGID_by_proportionN.out.uk_metadata_updated
-}
-
-workflow deduplicate_by_biosample_cog_uk {
-    take:
-        uk_fasta
-        uk_metadata
-    main:
-        uk_remove_duplicates_biosamplesourceid_by_date(uk_fasta, uk_metadata)
-        uk_remove_duplicates_rootbiosample_by_date(uk_remove_duplicates_biosamplesourceid_by_date.out.uk_fasta_updated, uk_remove_duplicates_biosamplesourceid_by_date.out.uk_metadata_updated)
-    emit:
-        fasta = uk_remove_duplicates_rootbiosample_by_date.out.uk_fasta_updated
-        metadata = uk_remove_duplicates_rootbiosample_by_date.out.uk_metadata_updated
+        metadata = uk_label_sourceid_duplicates_to_omit.out.uk_metadata_updated
}


workflow {
    uk_fasta = file(params.uk_fasta)
    uk_metadata = file(params.uk_metadata)
-    deduplicate_by_cogid_cog_uk(uk_fasta, uk_metadata)
-    deduplicate_by_biosample_cog_uk(deduplicate_by_cogid_cog_uk.out.fasta, deduplicate_by_cogid_cog_uk.out.metadata)
+    deduplicate_cog_uk(uk_fasta, uk_metadata)
}
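
For context, the relabelled process keeps the earliest `edin_epi_day` record per `source_id` and marks every later record in a new `duplicate` column, instead of dropping rows and rewriting the FASTA; downstream steps decide what to omit. Since `csv.DictWriter` writes `None` as an empty string, kept rows round-trip with an empty `duplicate` field and flagged rows with `"True"`. A minimal sketch of such a downstream filter, assuming only that column (both file names below are illustrative placeholders, not pipeline outputs):

```python
#!/usr/bin/env python3
# Hypothetical downstream filter over the labelled metadata.
# Assumes the "duplicate" column written by uk_label_sourceid_duplicates_to_omit;
# the input/output file names are illustrative placeholders.
import csv

with open("uk_metadata.deduplicated_by_sourceid.csv", newline='') as csv_in, \
     open("uk_metadata.duplicates_omitted.csv", "w", newline='') as csv_out:
    reader = csv.DictReader(csv_in, dialect="unix")
    writer = csv.DictWriter(csv_out, fieldnames=reader.fieldnames, dialect="unix")
    writer.writeheader()
    for row in reader:
        # rows flagged "True" are the later records sharing a source_id
        if row["duplicate"] != "True":
            writer.writerow(row)
```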