From 4454321e1d632c139f6bc0bce66ce6699ecaee68 Mon Sep 17 00:00:00 2001 From: Matt Stone Date: Fri, 19 Jan 2024 12:12:08 -0500 Subject: [PATCH 1/8] feat: add --umi-prefix to CopyUmiFromReadName --- .../com/fulcrumgenomics/umi/CopyUmiFromReadName.scala | 9 +++++++-- src/main/scala/com/fulcrumgenomics/umi/Umis.scala | 11 ++++++----- .../fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala | 11 +++++++++-- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala b/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala index 3f658d013..a6583567f 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala @@ -42,11 +42,16 @@ import com.fulcrumgenomics.util.{Io, ProgressLogger} | |If a read name contains multiple UMIs they may be delimited by either hyphens (`-`) or pluses (`+`). The |resulting UMI in the `RX` tag will always be hyphen delimited. + | + |To obtain behavior similar to `umi_tools`' `--umi-separator=":r"`, specify the delimiter and + |prefix separately, i.e. `--umi-delimiter=":"` and `--umi-prefix="r"`. """) class CopyUmiFromReadName ( @arg(flag='i', doc="The input BAM file") input: PathToBam, @arg(flag='o', doc="The output BAM file") output: PathToBam, - @arg(doc="Remove the UMI from the read name") removeUmi: Boolean = false + @arg(doc="Remove the UMI from the read name") removeUmi: Boolean = false, + @arg(doc="Delimiter between the read name and UMI.") umiDelimiter: Char = ':', + @arg(doc="Any characters preceding the UMI sequence in the read name.") umiPrefix: Option[String] = None, ) extends FgBioTool with LazyLogging { Io.assertReadable(input) @@ -58,7 +63,7 @@ class CopyUmiFromReadName val progress = new ProgressLogger(logger) source.foreach { rec => progress.record(rec) - writer += Umis.copyUmiFromReadName(rec=rec, removeUmi=removeUmi) + writer += Umis.copyUmiFromReadName(rec=rec, removeUmi=removeUmi, delimiter=umiDelimiter, prefix=umiPrefix) } progress.logLast() source.safelyClose() diff --git a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala index 2fbd4cfe9..ef9962007 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala @@ -41,9 +41,9 @@ object Umis { * @param delimiter the delimiter of fields within the read name * @return the modified record */ - def copyUmiFromReadName(rec: SamRecord, removeUmi: Boolean = false, delimiter: Char = ':'): SamRecord = { + def copyUmiFromReadName(rec: SamRecord, removeUmi: Boolean = false, delimiter: Char = ':', prefix: Option[String] = None): SamRecord = { // Extract and set the UMI - val umi = extractUmisFromReadName(rec.name, delimiter, strict=false) + val umi = extractUmisFromReadName(rec.name, delimiter, strict=false, prefix=prefix) require(umi.nonEmpty, f"No valid UMI found in: ${rec.name}") umi.foreach(u => rec(ConsensusTags.UmiBases) = u) @@ -67,7 +67,7 @@ object Umis { * * If `strict` is false the last segment is returned so long as it appears to be a valid UMI. */ - def extractUmisFromReadName(name: String, delimiter: Char = ':', strict: Boolean): Option[String] = { + def extractUmisFromReadName(name: String, delimiter: Char = ':', strict: Boolean, prefix: Option[String] = None): Option[String] = { // If strict, check that the read name actually has eight parts, which is expected val rawUmi = if (strict) { val colons = name.count(_ == delimiter) @@ -80,8 +80,9 @@ object Umis { Some(name.substring(idx + 1, name.length)) } - val umi = rawUmi.map(raw => (if (raw.indexOf('+') > 0) raw.replace('+', '-') else raw).toUpperCase) - val valid = umi.forall(u => u.forall(isValidUmiCharacter)) + val umiSeq = rawUmi.map(seq => (if (prefix.isEmpty) seq else seq.stripPrefix(prefix.get))) + val umi = umiSeq.map(raw => (if (raw.indexOf('+') > 0) raw.replace('+', '-') else raw).toUpperCase) + val valid = umi.forall(u => u.forall(isValidUmiCharacter)) if (strict && !valid) throw new IllegalArgumentException(s"Invalid UMI '${umi.get}' extracted from name '${name}") else if (!valid) None diff --git a/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala b/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala index 0f17dc0a8..3503fdeba 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala @@ -33,14 +33,14 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues { private case class Result(name: String, umi: String) /** Runs CopyUmiFromReadName using the given read names returning the output read names and UMIs. */ - private def run(names: Iterable[String], removeUmi: Boolean): IndexedSeq[Result] = { + private def run(names: Iterable[String], removeUmi: Boolean, umiPrefix: Option[String] = None): IndexedSeq[Result] = { // build the reads val builder = new SamBuilder() names.foreach { name => builder.addFrag(name=name, unmapped=true) } // run the tool val out = makeTempFile("test.", ".bam") - val tool = new CopyUmiFromReadName(input=builder.toTempFile(), output=out, removeUmi=removeUmi) + val tool = new CopyUmiFromReadName(input=builder.toTempFile(), output=out, removeUmi=removeUmi, umiPrefix=umiPrefix) executeFgbioTool(tool) // slurp the results @@ -69,4 +69,11 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues { results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah") results.map(_.umi) should contain theSameElementsInOrderAs Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC") } + + it should "remove any additional separator characters preceding the UMI" in { + val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA-CCCC") + val results = run(names=names, removeUmi=true, umiPrefix=Some("r")) + results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah") + results.map(_.umi) should contain theSameElementsInOrderAs Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC") + } } From 2fd68b281cdf286e16f6d8fb8b4186cd59ef5eb2 Mon Sep 17 00:00:00 2001 From: jdidion Date: Fri, 12 Jul 2024 13:04:56 -0700 Subject: [PATCH 2/8] introduce umiDelimiter, rcPrefix, and normalizeRcUmis options; make changes from PR comments --- .../umi/CopyUmiFromReadName.scala | 33 ++++++++---- .../scala/com/fulcrumgenomics/umi/Umis.scala | 54 ++++++++++++++----- .../umi/CopyUmiFromReadNameTest.scala | 24 +++++++-- .../com/fulcrumgenomics/umi/UmisTest.scala | 6 +-- 4 files changed, 86 insertions(+), 31 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala b/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala index a6583567f..a2ea75414 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala @@ -36,22 +36,30 @@ import com.fulcrumgenomics.util.{Io, ProgressLogger} """ |Copies the UMI at the end of the BAM's read name to the RX tag. | - |The read name is split on `:` characters with the last field is assumed to be the UMI sequence. The UMI + |The read name is split on `:` characters with the last field assumed to be the UMI sequence. The UMI |will be copied to the `RX` tag as per the SAM specification. If any read does not have a UMI composed of |valid bases (ACGTN), the program will report the error and fail. | - |If a read name contains multiple UMIs they may be delimited by either hyphens (`-`) or pluses (`+`). The - |resulting UMI in the `RX` tag will always be hyphen delimited. + |If a read name contains multiple UMIs they may be delimited (typically by a hyphen (`-`) or plus (`+`)). + |The `--umi-delimiter` option specifies the delimiter on which to split. The resulting UMI in the `RX` tag + |will always be hyphen delimited. + | + |Some tools (e.g. BCL Convert) may reverse-complement UMIs on R2 and add a prefix to indicate that the sequence + |has been reverse-complemented. The `--rc-prefix` option specifies the prefix character(s) and causes them to + |be removed. Additionally, if the `--normalize-rc-umis` flag is specified, any reverse-complemented UMIs will + |be normalized (i.e., reverse-completemented back to be in the forward orientation). | |To obtain behavior similar to `umi_tools`' `--umi-separator=":r"`, specify the delimiter and - |prefix separately, i.e. `--umi-delimiter=":"` and `--umi-prefix="r"`. + |prefix separately, i.e. `--field-delimiter=":"` and `--rc-prefix="r"`. """) class CopyUmiFromReadName -( @arg(flag='i', doc="The input BAM file") input: PathToBam, - @arg(flag='o', doc="The output BAM file") output: PathToBam, - @arg(doc="Remove the UMI from the read name") removeUmi: Boolean = false, - @arg(doc="Delimiter between the read name and UMI.") umiDelimiter: Char = ':', - @arg(doc="Any characters preceding the UMI sequence in the read name.") umiPrefix: Option[String] = None, +( @arg(flag='i', doc="The input BAM file.") input: PathToBam, + @arg(flag='o', doc="The output BAM file.") output: PathToBam, + @arg(doc="Remove the UMI from the read name.") removeUmi: Boolean = false, + @arg(doc="Delimiter between the read name and UMI.") fieldDelimiter: Char = ':', + @arg(doc="Delimiter between UMI sequences.") umiDelimiter: Char = '+', + @arg(doc="The prefix to a UMI sequence that indicates it is reverse-complemented.") rcPrefix: Option[String] = None, + @arg(doc="Whether to reverse-complement UMI sequences with the '--rc-prefix'.") normalizeRcUmis: Boolean = false, ) extends FgBioTool with LazyLogging { Io.assertReadable(input) @@ -63,7 +71,12 @@ class CopyUmiFromReadName val progress = new ProgressLogger(logger) source.foreach { rec => progress.record(rec) - writer += Umis.copyUmiFromReadName(rec=rec, removeUmi=removeUmi, delimiter=umiDelimiter, prefix=umiPrefix) + writer += Umis.copyUmiFromReadName(rec=rec, + removeUmi=removeUmi, + fieldDelimiter=fieldDelimiter, + umiDelimiter=umiDelimiter, + rcPrefix=rcPrefix, + normalizeRcUmis=normalizeRcUmis) } progress.logLast() source.safelyClose() diff --git a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala index ef9962007..610dd55c2 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala @@ -26,6 +26,7 @@ package com.fulcrumgenomics.umi import com.fulcrumgenomics.bam.api.SamRecord +import com.fulcrumgenomics.util.Sequences object Umis { @@ -38,17 +39,30 @@ object Umis { * * @param rec the record to modify * @param removeUmi true to remove the UMI from the read name, otherwise only copy the UMI to the tag - * @param delimiter the delimiter of fields within the read name + * @param fieldDelimiter the delimiter of fields within the read name + * @param umiDelimiter the delimiter between sequences in the UMI string + * @param rcPrefix the prefix of a UMI that indicates it is reverse-complimented + * @param normalizeRcUmis whether to normalize reverse-complemented UMIs * @return the modified record */ - def copyUmiFromReadName(rec: SamRecord, removeUmi: Boolean = false, delimiter: Char = ':', prefix: Option[String] = None): SamRecord = { + def copyUmiFromReadName(rec: SamRecord, + removeUmi: Boolean = false, + fieldDelimiter: Char = ':', + umiDelimiter: Char = '+', + rcPrefix: Option[String] = None, + normalizeRcUmis: Boolean = false): SamRecord = { // Extract and set the UMI - val umi = extractUmisFromReadName(rec.name, delimiter, strict=false, prefix=prefix) + val umi = extractUmisFromReadName(rec.name, + fieldDelimiter, + strict=false, + umiDelimiter, + rcPrefix=rcPrefix, + normalizeRcUmis=normalizeRcUmis) require(umi.nonEmpty, f"No valid UMI found in: ${rec.name}") umi.foreach(u => rec(ConsensusTags.UmiBases) = u) // Remove the UMI from the read name if requested - if (removeUmi) rec.name = rec.name.substring(0, rec.name.lastIndexOf(delimiter)) + if (removeUmi) rec.name = rec.name.substring(0, rec.name.lastIndexOf(fieldDelimiter)) rec } @@ -59,29 +73,43 @@ object Umis { * * See https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm * The UMI field is optional, so read names may or may not contain it. Illumina also specifies that the UMI - * field may contain multiple UMIs, in which case they will delimit them with `+` characters. Pluses will be - * translated to hyphens before returning. + * field may contain multiple UMIs, in which case they will delimit them with `umiDelimiter` characters, which + * will be translated to hyphens before returning. * * If `strict` is true the name _must_ contain either 7 or 8 colon-separated segments, with the UMI being the last in the case of 8 and `None` in the case of 7. * * If `strict` is false the last segment is returned so long as it appears to be a valid UMI. */ - def extractUmisFromReadName(name: String, delimiter: Char = ':', strict: Boolean, prefix: Option[String] = None): Option[String] = { + def extractUmisFromReadName(name: String, + fieldDelimiter: Char = ':', + strict: Boolean, + umiDelimiter: Char = '+', + rcPrefix: Option[String] = None, + normalizeRcUmis: Boolean = false): Option[String] = { // If strict, check that the read name actually has eight parts, which is expected val rawUmi = if (strict) { - val colons = name.count(_ == delimiter) + val colons = name.count(_ == fieldDelimiter) if (colons == 6) None - else if (colons == 7) Some(name.substring(name.lastIndexOf(delimiter) + 1, name.length)) + else if (colons == 7) Some(name.substring(name.lastIndexOf(fieldDelimiter) + 1, name.length)) else throw new IllegalArgumentException(s"Trying to extract UMI from read with ${colons + 1} parts (7-8 expected): ${name}") } else { - val idx = name.lastIndexOf(delimiter) - require(idx != -1, s"Read did not have multiple '${delimiter}'-separated fields: ${name}") + val idx = name.lastIndexOf(fieldDelimiter) + require(idx != -1, s"Read did not have multiple '${fieldDelimiter}'-separated fields: ${name}") Some(name.substring(idx + 1, name.length)) } - val umiSeq = rawUmi.map(seq => (if (prefix.isEmpty) seq else seq.stripPrefix(prefix.get))) - val umi = umiSeq.map(raw => (if (raw.indexOf('+') > 0) raw.replace('+', '-') else raw).toUpperCase) + var umi = rawUmi.map(raw => rcPrefix match { + case Some(prefix) if raw.indexOf(prefix) >= 0 && normalizeRcUmis => + raw.split(umiDelimiter).map(seq => + (if (seq.startsWith(prefix)) Sequences.revcomp(seq.stripPrefix(prefix)) else seq).toUpperCase + ).mkString("-") + case Some(prefix) if raw.indexOf(prefix) >= 0 => + raw.replace(prefix, "").replace(umiDelimiter, '-').toUpperCase + case _ if raw.indexOf(umiDelimiter) > 0 => raw.replace(umiDelimiter, '-').toUpperCase + case _ => raw.toUpperCase + }) + val valid = umi.forall(u => u.forall(isValidUmiCharacter)) if (strict && !valid) throw new IllegalArgumentException(s"Invalid UMI '${umi.get}' extracted from name '${name}") diff --git a/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala b/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala index 3503fdeba..85c4a81c0 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala @@ -33,14 +33,21 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues { private case class Result(name: String, umi: String) /** Runs CopyUmiFromReadName using the given read names returning the output read names and UMIs. */ - private def run(names: Iterable[String], removeUmi: Boolean, umiPrefix: Option[String] = None): IndexedSeq[Result] = { + private def run(names: Iterable[String], + removeUmi: Boolean, + rcPrefix: Option[String] = None, + normalizeRcUmis: Boolean = false): IndexedSeq[Result] = { // build the reads val builder = new SamBuilder() names.foreach { name => builder.addFrag(name=name, unmapped=true) } // run the tool val out = makeTempFile("test.", ".bam") - val tool = new CopyUmiFromReadName(input=builder.toTempFile(), output=out, removeUmi=removeUmi, umiPrefix=umiPrefix) + val tool = new CopyUmiFromReadName(input=builder.toTempFile(), + output=out, + removeUmi=removeUmi, + rcPrefix=rcPrefix, + normalizeRcUmis=normalizeRcUmis) executeFgbioTool(tool) // slurp the results @@ -70,10 +77,17 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues { results.map(_.umi) should contain theSameElementsInOrderAs Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC") } - it should "remove any additional separator characters preceding the UMI" in { - val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA-CCCC") - val results = run(names=names, removeUmi=true, umiPrefix=Some("r")) + it should "remove a reverse-complement prefix to the UMI" in { + val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC") + val results = run(names=names, removeUmi=true, rcPrefix=Some("r"), normalizeRcUmis = false) results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah") results.map(_.umi) should contain theSameElementsInOrderAs Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC") } + + it should "remove a reverse-complement prefix to the UMI and reverse-complement the UMI when '--normalize-rc-umis'" in { + val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC") + val results = run(names=names, removeUmi=true, rcPrefix=Some("r"), normalizeRcUmis = true) + results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah") + results.map(_.umi) should contain theSameElementsInOrderAs Seq("TTTT", "GGGG", "CCCC", "TTTT-CCCC") + } } diff --git a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala index 96298b246..3d8c010b3 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala @@ -92,9 +92,9 @@ class UmisTest extends UnitSpec with OptionValues { } it should "split on a different name delimiter if specified" in { - copyUmiFromReadName(rec=rec("UMI-A"), delimiter='-').nameAndUmi shouldBe ("UMI-A", "A") - copyUmiFromReadName(rec=rec("UMI-C-A"), delimiter='-').nameAndUmi shouldBe ("UMI-C-A", "A") - copyUmiFromReadName(rec=rec("UMI-C-ACC+GGT"), delimiter='-').nameAndUmi shouldBe ("UMI-C-ACC+GGT", "ACC-GGT") + copyUmiFromReadName(rec=rec("UMI-A"), fieldDelimiter='-').nameAndUmi shouldBe ("UMI-A", "A") + copyUmiFromReadName(rec=rec("UMI-C-A"), fieldDelimiter='-').nameAndUmi shouldBe ("UMI-C-A", "A") + copyUmiFromReadName(rec=rec("UMI-C-ACC+GGT"), fieldDelimiter='-').nameAndUmi shouldBe ("UMI-C-ACC+GGT", "ACC-GGT") } it should "change the UMI delimiter from + to -" in { From 4d056ecb504a3e73b191d120668e1d5d5671231a Mon Sep 17 00:00:00 2001 From: jdidion Date: Fri, 12 Jul 2024 13:07:10 -0700 Subject: [PATCH 3/8] add codeowners --- .github/CODEOWNERS | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .github/CODEOWNERS diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 000000000..4b23d67e4 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,2 @@ +@nh13 +@tfenne \ No newline at end of file From c17ed1e6daa4d0d3157f2aecef3f15e2c9ba862a Mon Sep 17 00:00:00 2001 From: jdidion Date: Fri, 12 Jul 2024 13:08:02 -0700 Subject: [PATCH 4/8] fix typo --- .../scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala b/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala index a2ea75414..b8f5274d9 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala @@ -47,7 +47,7 @@ import com.fulcrumgenomics.util.{Io, ProgressLogger} |Some tools (e.g. BCL Convert) may reverse-complement UMIs on R2 and add a prefix to indicate that the sequence |has been reverse-complemented. The `--rc-prefix` option specifies the prefix character(s) and causes them to |be removed. Additionally, if the `--normalize-rc-umis` flag is specified, any reverse-complemented UMIs will - |be normalized (i.e., reverse-completemented back to be in the forward orientation). + |be normalized (i.e., reverse-complemented back to be in the forward orientation). | |To obtain behavior similar to `umi_tools`' `--umi-separator=":r"`, specify the delimiter and |prefix separately, i.e. `--field-delimiter=":"` and `--rc-prefix="r"`. From 51369ba69807190f95c2727d0728a25fae582e57 Mon Sep 17 00:00:00 2001 From: John Didion Date: Mon, 15 Jul 2024 10:32:13 -0700 Subject: [PATCH 5/8] Update src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala Co-authored-by: Nils Homer --- .../fulcrumgenomics/umi/CopyUmiFromReadName.scala | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala b/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala index b8f5274d9..bd26525d9 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala @@ -71,12 +71,14 @@ class CopyUmiFromReadName val progress = new ProgressLogger(logger) source.foreach { rec => progress.record(rec) - writer += Umis.copyUmiFromReadName(rec=rec, - removeUmi=removeUmi, - fieldDelimiter=fieldDelimiter, - umiDelimiter=umiDelimiter, - rcPrefix=rcPrefix, - normalizeRcUmis=normalizeRcUmis) + writer += Umis.copyUmiFromReadName( + rec = rec, + removeUmi = removeUmi, + fieldDelimiter = fieldDelimiter, + umiDelimiter = umiDelimiter, + rcPrefix = rcPrefix, + normalizeRcUmis = normalizeRcUmis + ) } progress.logLast() source.safelyClose() From 41882df237391ba8d1c91d31f6f833356e450f9a Mon Sep 17 00:00:00 2001 From: jdidion Date: Mon, 15 Jul 2024 10:45:13 -0700 Subject: [PATCH 6/8] update tests --- .../umi/CopyUmiFromReadName.scala | 20 ++++++------ .../scala/com/fulcrumgenomics/umi/Umis.scala | 30 +++++++++-------- .../umi/CopyUmiFromReadNameTest.scala | 32 +++++++++++++------ 3 files changed, 49 insertions(+), 33 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala b/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala index bd26525d9..58d428160 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala @@ -50,7 +50,7 @@ import com.fulcrumgenomics.util.{Io, ProgressLogger} |be normalized (i.e., reverse-complemented back to be in the forward orientation). | |To obtain behavior similar to `umi_tools`' `--umi-separator=":r"`, specify the delimiter and - |prefix separately, i.e. `--field-delimiter=":"` and `--rc-prefix="r"`. + |prefix separately, i.e. `--field-delimiter=":"` and `--reverse-complement-prefix="r"`. """) class CopyUmiFromReadName ( @arg(flag='i', doc="The input BAM file.") input: PathToBam, @@ -58,12 +58,13 @@ class CopyUmiFromReadName @arg(doc="Remove the UMI from the read name.") removeUmi: Boolean = false, @arg(doc="Delimiter between the read name and UMI.") fieldDelimiter: Char = ':', @arg(doc="Delimiter between UMI sequences.") umiDelimiter: Char = '+', - @arg(doc="The prefix to a UMI sequence that indicates it is reverse-complemented.") rcPrefix: Option[String] = None, - @arg(doc="Whether to reverse-complement UMI sequences with the '--rc-prefix'.") normalizeRcUmis: Boolean = false, + @arg(flag='p', doc="The prefix to a UMI sequence that indicates it is reverse-complemented.") reverseComplementPrefix: Option[String] = None, + @arg(flag='r', doc="Whether to reverse-complement UMI sequences with the '--reverse-complement-prefix'.") normalizeReverseComplementUmis: Boolean = false, ) extends FgBioTool with LazyLogging { Io.assertReadable(input) Io.assertCanWriteFile(output) + validate(reverseComplementPrefix.forall(_.nonEmpty), "--reverse-complement-prefix cannot be an empty string") override def execute(): Unit = { val source = SamSource(input) @@ -71,13 +72,14 @@ class CopyUmiFromReadName val progress = new ProgressLogger(logger) source.foreach { rec => progress.record(rec) + writer += Umis.copyUmiFromReadName( - rec = rec, - removeUmi = removeUmi, - fieldDelimiter = fieldDelimiter, - umiDelimiter = umiDelimiter, - rcPrefix = rcPrefix, - normalizeRcUmis = normalizeRcUmis + rec = rec, + removeUmi = removeUmi, + fieldDelimiter = fieldDelimiter, + umiDelimiter = umiDelimiter, + reverseComplementPrefix = reverseComplementPrefix, + normalizeReverseComplementUmis = normalizeReverseComplementUmis ) } progress.logLast() diff --git a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala index 610dd55c2..323dac633 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala @@ -41,23 +41,25 @@ object Umis { * @param removeUmi true to remove the UMI from the read name, otherwise only copy the UMI to the tag * @param fieldDelimiter the delimiter of fields within the read name * @param umiDelimiter the delimiter between sequences in the UMI string - * @param rcPrefix the prefix of a UMI that indicates it is reverse-complimented - * @param normalizeRcUmis whether to normalize reverse-complemented UMIs + * @param reverseComplementPrefix the prefix of a UMI that indicates it is reverse-complimented + * @param normalizeReverseComplementUmis whether to normalize reverse-complemented UMIs * @return the modified record */ def copyUmiFromReadName(rec: SamRecord, removeUmi: Boolean = false, fieldDelimiter: Char = ':', umiDelimiter: Char = '+', - rcPrefix: Option[String] = None, - normalizeRcUmis: Boolean = false): SamRecord = { + reverseComplementPrefix: Option[String] = None, + normalizeReverseComplementUmis: Boolean = false): SamRecord = { // Extract and set the UMI - val umi = extractUmisFromReadName(rec.name, - fieldDelimiter, - strict=false, - umiDelimiter, - rcPrefix=rcPrefix, - normalizeRcUmis=normalizeRcUmis) + val umi = extractUmisFromReadName( + name = rec.name, + fieldDelimiter = fieldDelimiter, + strict = false, + umiDelimiter = umiDelimiter, + reverseComplementPrefix = reverseComplementPrefix, + normalizeReverseComplementUmis = normalizeReverseComplementUmis + ) require(umi.nonEmpty, f"No valid UMI found in: ${rec.name}") umi.foreach(u => rec(ConsensusTags.UmiBases) = u) @@ -85,8 +87,8 @@ object Umis { fieldDelimiter: Char = ':', strict: Boolean, umiDelimiter: Char = '+', - rcPrefix: Option[String] = None, - normalizeRcUmis: Boolean = false): Option[String] = { + reverseComplementPrefix: Option[String] = None, + normalizeReverseComplementUmis: Boolean = false): Option[String] = { // If strict, check that the read name actually has eight parts, which is expected val rawUmi = if (strict) { val colons = name.count(_ == fieldDelimiter) @@ -99,8 +101,8 @@ object Umis { Some(name.substring(idx + 1, name.length)) } - var umi = rawUmi.map(raw => rcPrefix match { - case Some(prefix) if raw.indexOf(prefix) >= 0 && normalizeRcUmis => + var umi = rawUmi.map(raw => reverseComplementPrefix match { + case Some(prefix) if raw.indexOf(prefix) >= 0 && normalizeReverseComplementUmis => raw.split(umiDelimiter).map(seq => (if (seq.startsWith(prefix)) Sequences.revcomp(seq.stripPrefix(prefix)) else seq).toUpperCase ).mkString("-") diff --git a/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala b/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala index 85c4a81c0..13e975129 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala @@ -35,19 +35,21 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues { /** Runs CopyUmiFromReadName using the given read names returning the output read names and UMIs. */ private def run(names: Iterable[String], removeUmi: Boolean, - rcPrefix: Option[String] = None, - normalizeRcUmis: Boolean = false): IndexedSeq[Result] = { + reverseComplementPrefix: Option[String] = None, + normalizeReverseComplementUmis: Boolean = false): IndexedSeq[Result] = { // build the reads val builder = new SamBuilder() names.foreach { name => builder.addFrag(name=name, unmapped=true) } // run the tool val out = makeTempFile("test.", ".bam") - val tool = new CopyUmiFromReadName(input=builder.toTempFile(), - output=out, - removeUmi=removeUmi, - rcPrefix=rcPrefix, - normalizeRcUmis=normalizeRcUmis) + val tool = new CopyUmiFromReadName( + input = builder.toTempFile(), + output = out, + removeUmi = removeUmi, + reverseComplementPrefix = reverseComplementPrefix, + normalizeReverseComplementUmis = normalizeReverseComplementUmis + ) executeFgbioTool(tool) // slurp the results @@ -79,14 +81,24 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues { it should "remove a reverse-complement prefix to the UMI" in { val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC") - val results = run(names=names, removeUmi=true, rcPrefix=Some("r"), normalizeRcUmis = false) + val results = run( + names = names, + removeUmi = true, + reverseComplementPrefix = Some("r"), + normalizeReverseComplementUmis = false + ) results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah") results.map(_.umi) should contain theSameElementsInOrderAs Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC") } - it should "remove a reverse-complement prefix to the UMI and reverse-complement the UMI when '--normalize-rc-umis'" in { + it should "remove a reverse-complement prefix to the UMI and reverse-complement the UMI when '--normalize-reverse-complement-umis'" in { val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC") - val results = run(names=names, removeUmi=true, rcPrefix=Some("r"), normalizeRcUmis = true) + val results = run( + names = names, + removeUmi = true, + reverseComplementPrefix = Some("r"), + normalizeReverseComplementUmis = true + ) results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah") results.map(_.umi) should contain theSameElementsInOrderAs Seq("TTTT", "GGGG", "CCCC", "TTTT-CCCC") } From 700741e72115cdaa415d1a0a9152784922a40f33 Mon Sep 17 00:00:00 2001 From: jdidion Date: Mon, 15 Jul 2024 12:33:10 -0700 Subject: [PATCH 7/8] remove option to normalize reverse-complemented UMIs --- .../umi/CopyUmiFromReadName.scala | 8 +++----- .../scala/com/fulcrumgenomics/umi/Umis.scala | 12 +++--------- .../umi/CopyUmiFromReadNameTest.scala | 19 ++----------------- 3 files changed, 8 insertions(+), 31 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala b/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala index 58d428160..332471c26 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala @@ -45,9 +45,9 @@ import com.fulcrumgenomics.util.{Io, ProgressLogger} |will always be hyphen delimited. | |Some tools (e.g. BCL Convert) may reverse-complement UMIs on R2 and add a prefix to indicate that the sequence - |has been reverse-complemented. The `--rc-prefix` option specifies the prefix character(s) and causes them to - |be removed. Additionally, if the `--normalize-rc-umis` flag is specified, any reverse-complemented UMIs will - |be normalized (i.e., reverse-complemented back to be in the forward orientation). + |has been reverse-complemented. The `--reverse-complement-prefix` option specifies the prefix character(s) and + |causes them to be removed. Any reverse-complemented UMIs will be normalized (i.e., reverse-complemented back to + |be in the forward orientation). | |To obtain behavior similar to `umi_tools`' `--umi-separator=":r"`, specify the delimiter and |prefix separately, i.e. `--field-delimiter=":"` and `--reverse-complement-prefix="r"`. @@ -59,7 +59,6 @@ class CopyUmiFromReadName @arg(doc="Delimiter between the read name and UMI.") fieldDelimiter: Char = ':', @arg(doc="Delimiter between UMI sequences.") umiDelimiter: Char = '+', @arg(flag='p', doc="The prefix to a UMI sequence that indicates it is reverse-complemented.") reverseComplementPrefix: Option[String] = None, - @arg(flag='r', doc="Whether to reverse-complement UMI sequences with the '--reverse-complement-prefix'.") normalizeReverseComplementUmis: Boolean = false, ) extends FgBioTool with LazyLogging { Io.assertReadable(input) @@ -79,7 +78,6 @@ class CopyUmiFromReadName fieldDelimiter = fieldDelimiter, umiDelimiter = umiDelimiter, reverseComplementPrefix = reverseComplementPrefix, - normalizeReverseComplementUmis = normalizeReverseComplementUmis ) } progress.logLast() diff --git a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala index 323dac633..b293dac83 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala @@ -42,15 +42,13 @@ object Umis { * @param fieldDelimiter the delimiter of fields within the read name * @param umiDelimiter the delimiter between sequences in the UMI string * @param reverseComplementPrefix the prefix of a UMI that indicates it is reverse-complimented - * @param normalizeReverseComplementUmis whether to normalize reverse-complemented UMIs * @return the modified record */ def copyUmiFromReadName(rec: SamRecord, removeUmi: Boolean = false, fieldDelimiter: Char = ':', umiDelimiter: Char = '+', - reverseComplementPrefix: Option[String] = None, - normalizeReverseComplementUmis: Boolean = false): SamRecord = { + reverseComplementPrefix: Option[String] = None): SamRecord = { // Extract and set the UMI val umi = extractUmisFromReadName( name = rec.name, @@ -58,7 +56,6 @@ object Umis { strict = false, umiDelimiter = umiDelimiter, reverseComplementPrefix = reverseComplementPrefix, - normalizeReverseComplementUmis = normalizeReverseComplementUmis ) require(umi.nonEmpty, f"No valid UMI found in: ${rec.name}") umi.foreach(u => rec(ConsensusTags.UmiBases) = u) @@ -87,8 +84,7 @@ object Umis { fieldDelimiter: Char = ':', strict: Boolean, umiDelimiter: Char = '+', - reverseComplementPrefix: Option[String] = None, - normalizeReverseComplementUmis: Boolean = false): Option[String] = { + reverseComplementPrefix: Option[String] = None): Option[String] = { // If strict, check that the read name actually has eight parts, which is expected val rawUmi = if (strict) { val colons = name.count(_ == fieldDelimiter) @@ -102,12 +98,10 @@ object Umis { } var umi = rawUmi.map(raw => reverseComplementPrefix match { - case Some(prefix) if raw.indexOf(prefix) >= 0 && normalizeReverseComplementUmis => + case Some(prefix) if raw.indexOf(prefix) >= 0 => raw.split(umiDelimiter).map(seq => (if (seq.startsWith(prefix)) Sequences.revcomp(seq.stripPrefix(prefix)) else seq).toUpperCase ).mkString("-") - case Some(prefix) if raw.indexOf(prefix) >= 0 => - raw.replace(prefix, "").replace(umiDelimiter, '-').toUpperCase case _ if raw.indexOf(umiDelimiter) > 0 => raw.replace(umiDelimiter, '-').toUpperCase case _ => raw.toUpperCase }) diff --git a/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala b/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala index 13e975129..1159e99ef 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala @@ -35,8 +35,7 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues { /** Runs CopyUmiFromReadName using the given read names returning the output read names and UMIs. */ private def run(names: Iterable[String], removeUmi: Boolean, - reverseComplementPrefix: Option[String] = None, - normalizeReverseComplementUmis: Boolean = false): IndexedSeq[Result] = { + reverseComplementPrefix: Option[String] = None): IndexedSeq[Result] = { // build the reads val builder = new SamBuilder() names.foreach { name => builder.addFrag(name=name, unmapped=true) } @@ -48,7 +47,6 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues { output = out, removeUmi = removeUmi, reverseComplementPrefix = reverseComplementPrefix, - normalizeReverseComplementUmis = normalizeReverseComplementUmis ) executeFgbioTool(tool) @@ -84,20 +82,7 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues { val results = run( names = names, removeUmi = true, - reverseComplementPrefix = Some("r"), - normalizeReverseComplementUmis = false - ) - results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah") - results.map(_.umi) should contain theSameElementsInOrderAs Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC") - } - - it should "remove a reverse-complement prefix to the UMI and reverse-complement the UMI when '--normalize-reverse-complement-umis'" in { - val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC") - val results = run( - names = names, - removeUmi = true, - reverseComplementPrefix = Some("r"), - normalizeReverseComplementUmis = true + reverseComplementPrefix = Some("r"), ) results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah") results.map(_.umi) should contain theSameElementsInOrderAs Seq("TTTT", "GGGG", "CCCC", "TTTT-CCCC") From 56b4d757c45155384bdf064fafe270ca19a1b910 Mon Sep 17 00:00:00 2001 From: jdidion Date: Mon, 15 Jul 2024 13:41:39 -0700 Subject: [PATCH 8/8] change to having an option that disables default behavior --- .../umi/CopyUmiFromReadName.scala | 24 +++++------ .../scala/com/fulcrumgenomics/umi/Umis.scala | 43 ++++++++++++------- .../umi/CopyUmiFromReadNameTest.scala | 27 ++++++++---- 3 files changed, 56 insertions(+), 38 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala b/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala index 332471c26..deb38f3a6 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala @@ -44,13 +44,10 @@ import com.fulcrumgenomics.util.{Io, ProgressLogger} |The `--umi-delimiter` option specifies the delimiter on which to split. The resulting UMI in the `RX` tag |will always be hyphen delimited. | - |Some tools (e.g. BCL Convert) may reverse-complement UMIs on R2 and add a prefix to indicate that the sequence - |has been reverse-complemented. The `--reverse-complement-prefix` option specifies the prefix character(s) and - |causes them to be removed. Any reverse-complemented UMIs will be normalized (i.e., reverse-complemented back to - |be in the forward orientation). - | - |To obtain behavior similar to `umi_tools`' `--umi-separator=":r"`, specify the delimiter and - |prefix separately, i.e. `--field-delimiter=":"` and `--reverse-complement-prefix="r"`. + |Some tools (e.g. BCL Convert) may reverse-complement UMIs on R2 and add an 'r' prefix to indicate that the sequence + |has been reverse-complemented. By default, the 'r' prefix is removed and the sequence is reverse-complemented + |back to the forward orientation. The `--override-reverse-complement-umis` disables the latter behavior, such that + |the 'r' prefix is removed but the UMI sequence is left as reverse-complemented. """) class CopyUmiFromReadName ( @arg(flag='i', doc="The input BAM file.") input: PathToBam, @@ -58,12 +55,11 @@ class CopyUmiFromReadName @arg(doc="Remove the UMI from the read name.") removeUmi: Boolean = false, @arg(doc="Delimiter between the read name and UMI.") fieldDelimiter: Char = ':', @arg(doc="Delimiter between UMI sequences.") umiDelimiter: Char = '+', - @arg(flag='p', doc="The prefix to a UMI sequence that indicates it is reverse-complemented.") reverseComplementPrefix: Option[String] = None, + @arg(doc="Do not reverse-complement UMIs prefixed with 'r'.") overrideReverseComplementUmis: Boolean = false, ) extends FgBioTool with LazyLogging { Io.assertReadable(input) Io.assertCanWriteFile(output) - validate(reverseComplementPrefix.forall(_.nonEmpty), "--reverse-complement-prefix cannot be an empty string") override def execute(): Unit = { val source = SamSource(input) @@ -73,11 +69,11 @@ class CopyUmiFromReadName progress.record(rec) writer += Umis.copyUmiFromReadName( - rec = rec, - removeUmi = removeUmi, - fieldDelimiter = fieldDelimiter, - umiDelimiter = umiDelimiter, - reverseComplementPrefix = reverseComplementPrefix, + rec = rec, + removeUmi = removeUmi, + fieldDelimiter = fieldDelimiter, + umiDelimiter = umiDelimiter, + reverseComplementPrefixedUmis = !overrideReverseComplementUmis, ) } progress.logLast() diff --git a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala index b293dac83..08369a682 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala @@ -29,6 +29,7 @@ import com.fulcrumgenomics.bam.api.SamRecord import com.fulcrumgenomics.util.Sequences object Umis { + val RevcompPrefix: String = "r" /** Copies the UMI sequence from the read name. * @@ -41,21 +42,21 @@ object Umis { * @param removeUmi true to remove the UMI from the read name, otherwise only copy the UMI to the tag * @param fieldDelimiter the delimiter of fields within the read name * @param umiDelimiter the delimiter between sequences in the UMI string - * @param reverseComplementPrefix the prefix of a UMI that indicates it is reverse-complimented + * @param reverseComplementPrefixedUmis whether to reverse-compliment UMIs prefixed with 'r' * @return the modified record */ def copyUmiFromReadName(rec: SamRecord, removeUmi: Boolean = false, fieldDelimiter: Char = ':', umiDelimiter: Char = '+', - reverseComplementPrefix: Option[String] = None): SamRecord = { + reverseComplementPrefixedUmis: Boolean = true): SamRecord = { // Extract and set the UMI val umi = extractUmisFromReadName( - name = rec.name, - fieldDelimiter = fieldDelimiter, - strict = false, - umiDelimiter = umiDelimiter, - reverseComplementPrefix = reverseComplementPrefix, + name = rec.name, + fieldDelimiter = fieldDelimiter, + strict = false, + umiDelimiter = umiDelimiter, + reverseComplementPrefixedUmis = reverseComplementPrefixedUmis, ) require(umi.nonEmpty, f"No valid UMI found in: ${rec.name}") umi.foreach(u => rec(ConsensusTags.UmiBases) = u) @@ -84,7 +85,7 @@ object Umis { fieldDelimiter: Char = ':', strict: Boolean, umiDelimiter: Char = '+', - reverseComplementPrefix: Option[String] = None): Option[String] = { + reverseComplementPrefixedUmis: Boolean = true): Option[String] = { // If strict, check that the read name actually has eight parts, which is expected val rawUmi = if (strict) { val colons = name.count(_ == fieldDelimiter) @@ -97,14 +98,24 @@ object Umis { Some(name.substring(idx + 1, name.length)) } - var umi = rawUmi.map(raw => reverseComplementPrefix match { - case Some(prefix) if raw.indexOf(prefix) >= 0 => - raw.split(umiDelimiter).map(seq => - (if (seq.startsWith(prefix)) Sequences.revcomp(seq.stripPrefix(prefix)) else seq).toUpperCase - ).mkString("-") - case _ if raw.indexOf(umiDelimiter) > 0 => raw.replace(umiDelimiter, '-').toUpperCase - case _ => raw.toUpperCase - }) + // Remove 'r' prefixes, optionally reverse-complementing the prefixed UMIs if + // reverseComplementPrefixedUmis = true, replace the delimiter (if any) with '-', + // and make sure the sequence is upper-case. + var umi = rawUmi.map(raw => + (raw.indexOf(RevcompPrefix) >= 0, raw.indexOf(umiDelimiter) > 0) match { + case (true, true) if reverseComplementPrefixedUmis => + raw.split(umiDelimiter).map(seq => + if (seq.startsWith(RevcompPrefix)) Sequences.revcomp(seq.stripPrefix(RevcompPrefix)) + else seq.stripPrefix(RevcompPrefix) + ).mkString("-").toUpperCase + case (true, false) if reverseComplementPrefixedUmis => + Sequences.revcomp(raw.stripPrefix(RevcompPrefix)).toUpperCase + case (true, true) => raw.replace(RevcompPrefix, "").replace(umiDelimiter, '-').toUpperCase + case (true, false) => raw.replace(RevcompPrefix, "").toUpperCase + case (false, true) => raw.replace(umiDelimiter, '-').toUpperCase + case (false, false) => raw.toUpperCase + } + ) val valid = umi.forall(u => u.forall(isValidUmiCharacter)) diff --git a/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala b/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala index 1159e99ef..2e4eb14de 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala @@ -35,7 +35,7 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues { /** Runs CopyUmiFromReadName using the given read names returning the output read names and UMIs. */ private def run(names: Iterable[String], removeUmi: Boolean, - reverseComplementPrefix: Option[String] = None): IndexedSeq[Result] = { + overrideReverseComplementUmis: Boolean = false): IndexedSeq[Result] = { // build the reads val builder = new SamBuilder() names.foreach { name => builder.addFrag(name=name, unmapped=true) } @@ -43,10 +43,10 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues { // run the tool val out = makeTempFile("test.", ".bam") val tool = new CopyUmiFromReadName( - input = builder.toTempFile(), - output = out, - removeUmi = removeUmi, - reverseComplementPrefix = reverseComplementPrefix, + input = builder.toTempFile(), + output = out, + removeUmi = removeUmi, + overrideReverseComplementUmis = overrideReverseComplementUmis ) executeFgbioTool(tool) @@ -80,9 +80,20 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues { it should "remove a reverse-complement prefix to the UMI" in { val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC") val results = run( - names = names, - removeUmi = true, - reverseComplementPrefix = Some("r"), + names = names, + removeUmi = true, + overrideReverseComplementUmis = true + ) + results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah") + results.map(_.umi) should contain theSameElementsInOrderAs Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC") + } + + it should "remove a reverse-complement prefix to the UMI and reverse-complement the UMI" in { + val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC") + val results = run( + names = names, + removeUmi = true, + overrideReverseComplementUmis = false ) results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah") results.map(_.umi) should contain theSameElementsInOrderAs Seq("TTTT", "GGGG", "CCCC", "TTTT-CCCC")