Bugfixes now that full metadata is kept

Rachel Colquhoun · rmcolq · commit 229dd9cdcf31 · 2021-02-08T09:10:20.000Z
diff --git a/modules/deduplicate_cog_uk.nf b/modules/deduplicate_cog_uk.nf
@@ -138,9 +138,12 @@ process uk_unify_headers {
         open("${uk_fasta.baseName}.UH.fa", "w") as fasta_out:
         reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
         for row in reader:
-            record = alignment[row["fasta_header"]]
-            fasta_out.write(">" + row["sequence_name"] + "\\n")
-            fasta_out.write(str(record.seq) + "\\n")
+            if row["why_excluded"]:
+                continue
+            if row["fasta_header"] in alignment:
+                record = alignment[row["fasta_header"]]
+                fasta_out.write(">" + row["sequence_name"] + "\\n")
+                fasta_out.write(str(record.seq) + "\\n")
     """
 }
 
@@ -216,15 +219,17 @@ process uk_remove_duplicates_biosamplesourceid_by_date {
         writer.writeheader()
 
         for row in reader:
+            if row["why_excluded"]:
+                writer.writerow(row)
+                continue
             fasta_header = row["sequence_name"]
             if fasta_header in tokeep:
                 writer.writerow(row)
                 seqrec = alignment[fasta_header]
                 fasta_out.write(">" + seqrec.id + "\\n")
                 fasta_out.write(str(seqrec.seq) + "\\n")
             else:
-                if not row["why_excluded"]:
-                    row["why_excluded"] = "duplicate biosample_source_id"
+                row["why_excluded"] = "duplicate biosample_source_id"
                 writer.writerow(row)
     """
 }
@@ -300,15 +305,17 @@ process uk_remove_duplicates_rootbiosample_by_date {
         writer.writeheader()
 
         for row in reader:
+            if row["why_excluded"]:
+                writer.writerow(row)
+                continue
             fasta_header = row["sequence_name"]
             if fasta_header in tokeep:
                 writer.writerow(row)
                 seqrec = alignment[fasta_header]
                 fasta_out.write(">" + seqrec.id + "\\n")
                 fasta_out.write(str(seqrec.seq) + "\\n")
             else:
-                if not row["why_excluded"]:
-                    row["why_excluded"] = "duplicate root_biosample_source_id"
+                row["why_excluded"] = "duplicate root_biosample_source_id"
                 writer.writerow(row)
     """
 }
diff --git a/modules/filter_and_trim_cog_uk.nf b/modules/filter_and_trim_cog_uk.nf
@@ -47,6 +47,9 @@ process uk_filter_low_coverage_sequences {
             writer.writeheader()
 
             for row in reader:
+                if row["why_excluded"]:
+                    writer.writerow(row)
+                    continue
                 id = row["sequence_name"]
                 if id in alignment:
                     seq = str(alignment[id].seq)
@@ -56,8 +59,7 @@ process uk_filter_low_coverage_sequences {
                         fasta_out.write(">" + id + "\\n")
                         fasta_out.write(seq + "\\n")
                     else:
-                        if not row["why_excluded"]:
-                            row["why_excluded"] = "low mapped_completeness"
+                        row["why_excluded"] = "low mapped_completeness"
                         writer.writerow(row)
         """
 }