From bc9839caa95e551b4e00bab59c18efdc16f59b1a Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Thu, 23 Jun 2022 22:23:48 +0200 Subject: [PATCH 1/5] feat: retry with reverse complement when seed matching fails Adds flag `--retry-reverse-complement` which enables additional attempt of seed matching when initial attempt fails. The second attempt is performed on reverse-complemented sequence. As a consequence, the output alignment, peptides and analysis results correspond to this modified sequence and not to the original. This functionality is opt-in and the default behavior is to skip sequence with a warning. --- packages_rs/nextclade/src/align/align.rs | 31 ++++++++++++------- packages_rs/nextclade/src/align/params.rs | 8 ++++- .../src/translate/translate_genes.rs | 2 +- 3 files changed, 28 insertions(+), 13 deletions(-) diff --git a/packages_rs/nextclade/src/align/align.rs b/packages_rs/nextclade/src/align/align.rs index 5378f8ddc..3d17fc6e1 100644 --- a/packages_rs/nextclade/src/align/align.rs +++ b/packages_rs/nextclade/src/align/align.rs @@ -8,8 +8,9 @@ use crate::io::aa::Aa; use crate::io::letter::Letter; use crate::io::nuc::Nuc; use crate::make_error; +use crate::translate::complement::reverse_complement_in_place; use eyre::Report; -use log::trace; +use log::{info, trace}; fn align_pairwise>( qry_seq: &[T], @@ -17,14 +18,14 @@ fn align_pairwise>( gap_open_close: &[i32], params: &AlignPairwiseParams, stripes: &[Stripe], -) -> Result, Report> { +) -> AlignmentOutput { trace!("Align pairwise: started. Params: {params:?}"); let max_indel = params.max_indel; let ScoreMatrixResult { scores, paths } = score_matrix(qry_seq, ref_seq, gap_open_close, stripes, params); - Ok(backtrace(qry_seq, ref_seq, &scores, &paths)) + backtrace(qry_seq, ref_seq, &scores, &paths) } /// align nucleotide sequences via seed alignment and banded smith watermann without penalizing terminal gaps @@ -42,13 +43,21 @@ pub fn align_nuc( ); } - let stripes = seed_alignment(qry_seq, ref_seq, params)?; - - let result = align_pairwise(qry_seq, ref_seq, gap_open_close, params, &stripes)?; - - trace!("Score: {}", result.alignment_score); - - Ok(result) + #[allow(clippy::map_err_ignore)] + match seed_alignment(qry_seq, ref_seq, params) { + Ok(stripes) => Ok(align_pairwise(qry_seq, ref_seq, gap_open_close, params, &stripes)), + Err(report) => { + if params.retry_reverse_complement { + info!("Seed matching failed. Retrying reverse complement"); // TODO: would be nice to have sequence index and name here + let mut qry_seq = qry_seq.to_owned(); + reverse_complement_in_place(&mut qry_seq); + let stripes = seed_alignment(&qry_seq, ref_seq, params).map_err(|_| report)?; + Ok(align_pairwise(&qry_seq, ref_seq, gap_open_close, params, &stripes)) + } else { + Err(report) + } + } + } } /// align amino acids using a fixed bandwidth banded alignment while penalizing terminal indels @@ -59,7 +68,7 @@ pub fn align_aa( params: &AlignPairwiseParams, band_width: usize, mean_shift: i32, -) -> Result, Report> { +) -> AlignmentOutput { let stripes = simple_stripes(mean_shift, band_width, ref_seq.len(), qry_seq.len()); align_pairwise(qry_seq, ref_seq, gap_open_close, params, &stripes) diff --git a/packages_rs/nextclade/src/align/params.rs b/packages_rs/nextclade/src/align/params.rs index 4ee255424..a27575346 100644 --- a/packages_rs/nextclade/src/align/params.rs +++ b/packages_rs/nextclade/src/align/params.rs @@ -13,6 +13,7 @@ pub enum GapAlignmentSide { // as well as adds a method `.merge_opt(&opt)` to the original struct, which merges values from the optional counterpart // into self (mutably). +#[allow(clippy::struct_excessive_bools)] #[optfield(pub AlignPairwiseParamsOptional, attrs, doc, field_attrs, field_doc, merge_fn = pub)] #[derive(Parser, Debug, Clone, Serialize, Deserialize)] pub struct AlignPairwiseParams { @@ -70,8 +71,12 @@ pub struct AlignPairwiseParams { #[clap(long)] pub seed_spacing: i32, + /// Retry seed matching step with a reverse complement if the first attempt failed + #[clap(long, takes_value = false, forbid_empty_values = false, default_missing_value = "true")] + pub retry_reverse_complement: bool, + /// If this flag is present, the amino acid sequences will be truncated at the first stop codon, if mutations or sequencing errors cause premature stop codons to be present. No amino acid mutations in the truncated region will be recorded. - #[clap(long)] + #[clap(long, takes_value = false, forbid_empty_values = false, default_missing_value = "true")] pub no_translate_past_stop: bool, // Internal alignment parameter @@ -112,6 +117,7 @@ impl Default for AlignPairwiseParams { min_match_rate: 0.3, seed_spacing: 100, mismatches_allowed: 3, + retry_reverse_complement: false, no_translate_past_stop: false, left_terminal_gaps_free: true, right_terminal_gaps_free: true, diff --git a/packages_rs/nextclade/src/translate/translate_genes.rs b/packages_rs/nextclade/src/translate/translate_genes.rs index 2c019c7d2..a3292d8a8 100644 --- a/packages_rs/nextclade/src/translate/translate_genes.rs +++ b/packages_rs/nextclade/src/translate/translate_genes.rs @@ -176,7 +176,7 @@ pub fn translate_gene( &aa_params, band_width, mean_shift, - )?; + ); let mut stripped = insertions_strip(&alignment.qry_seq, &alignment.ref_seq); From 879f78051ef858c3edf691b86199d916bbfb22b0 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 24 Jun 2022 21:08:18 +0200 Subject: [PATCH 2/5] feat: append suffix to sequence if reverse complemented --- .../src/cli/nextalign_ordered_writer.rs | 16 ++++++++++++---- .../src/cli/nextclade_ordered_writer.rs | 8 ++++++++ packages_rs/nextclade/src/align/align.rs | 4 +++- packages_rs/nextclade/src/align/backtrace.rs | 3 +++ packages_rs/nextclade/src/constants.rs | 1 + packages_rs/nextclade/src/lib.rs | 1 + .../nextclade/src/run/nextalign_run_one.rs | 3 +++ .../nextclade/src/run/nextclade_run_one.rs | 11 ++++++++++- packages_rs/nextclade/src/types/outputs.rs | 2 ++ 9 files changed, 43 insertions(+), 6 deletions(-) create mode 100644 packages_rs/nextclade/src/constants.rs diff --git a/packages_rs/nextclade-cli/src/cli/nextalign_ordered_writer.rs b/packages_rs/nextclade-cli/src/cli/nextalign_ordered_writer.rs index 49f4ff6e4..3052600eb 100644 --- a/packages_rs/nextclade-cli/src/cli/nextalign_ordered_writer.rs +++ b/packages_rs/nextclade-cli/src/cli/nextalign_ordered_writer.rs @@ -1,6 +1,7 @@ use crate::cli::nextalign_loop::NextalignRecord; use eyre::{Report, WrapErr}; use log::warn; +use nextclade::constants::REVERSE_COMPLEMENT_SUFFIX; use nextclade::io::errors_csv::ErrorsCsvWriter; use nextclade::io::fasta::{FastaPeptideWriter, FastaRecord, FastaWriter}; use nextclade::io::gene_map::GeneMap; @@ -87,24 +88,31 @@ impl<'a> NextalignOrderedWriter<'a> { translations, warnings, missing_genes, + is_reverse_complement, } = output; + let seq_name = if *is_reverse_complement { + format!("{seq_name}{REVERSE_COMPLEMENT_SUFFIX}") + } else { + seq_name.clone() + }; + if let Some(fasta_writer) = &mut self.fasta_writer { - fasta_writer.write(seq_name, &from_nuc_seq(&stripped.qry_seq))?; + fasta_writer.write(&seq_name, &from_nuc_seq(&stripped.qry_seq))?; } if let Some(fasta_peptide_writer) = &mut self.fasta_peptide_writer { for translation in translations { - fasta_peptide_writer.write(seq_name, translation)?; + fasta_peptide_writer.write(&seq_name, translation)?; } } if let Some(insertions_csv_writer) = &mut self.insertions_csv_writer { - insertions_csv_writer.write(seq_name, &stripped.insertions, translations)?; + insertions_csv_writer.write(&seq_name, &stripped.insertions, translations)?; } if let Some(errors_csv_writer) = &mut self.errors_csv_writer { - errors_csv_writer.write_aa_errors(seq_name, warnings, missing_genes)?; + errors_csv_writer.write_aa_errors(&seq_name, warnings, missing_genes)?; } } Err(report) => { diff --git a/packages_rs/nextclade-cli/src/cli/nextclade_ordered_writer.rs b/packages_rs/nextclade-cli/src/cli/nextclade_ordered_writer.rs index 934294d49..3b5a064d4 100644 --- a/packages_rs/nextclade-cli/src/cli/nextclade_ordered_writer.rs +++ b/packages_rs/nextclade-cli/src/cli/nextclade_ordered_writer.rs @@ -2,6 +2,7 @@ use crate::cli::nextclade_loop::NextcladeRecord; use eyre::{Report, WrapErr}; use itertools::Itertools; use log::warn; +use nextclade::constants::REVERSE_COMPLEMENT_SUFFIX; use nextclade::io::errors_csv::ErrorsCsvWriter; use nextclade::io::fasta::{FastaPeptideWriter, FastaRecord, FastaWriter}; use nextclade::io::gene_map::GeneMap; @@ -118,9 +119,16 @@ impl<'a> NextcladeOrderedWriter<'a> { warnings, insertions, missing_genes, + is_reverse_complement, .. } = &nextclade_outputs; + let seq_name = if *is_reverse_complement { + format!("{seq_name}{REVERSE_COMPLEMENT_SUFFIX}") + } else { + seq_name.clone() + }; + if let Some(fasta_writer) = &mut self.fasta_writer { fasta_writer.write(&seq_name, &from_nuc_seq(&qry_seq_stripped))?; } diff --git a/packages_rs/nextclade/src/align/align.rs b/packages_rs/nextclade/src/align/align.rs index 3d17fc6e1..34cae2c93 100644 --- a/packages_rs/nextclade/src/align/align.rs +++ b/packages_rs/nextclade/src/align/align.rs @@ -52,7 +52,9 @@ pub fn align_nuc( let mut qry_seq = qry_seq.to_owned(); reverse_complement_in_place(&mut qry_seq); let stripes = seed_alignment(&qry_seq, ref_seq, params).map_err(|_| report)?; - Ok(align_pairwise(&qry_seq, ref_seq, gap_open_close, params, &stripes)) + let mut result = align_pairwise(&qry_seq, ref_seq, gap_open_close, params, &stripes); + result.is_reverse_complement = true; + Ok(result) } else { Err(report) } diff --git a/packages_rs/nextclade/src/align/backtrace.rs b/packages_rs/nextclade/src/align/backtrace.rs index ae60a0962..fb53c9ef2 100644 --- a/packages_rs/nextclade/src/align/backtrace.rs +++ b/packages_rs/nextclade/src/align/backtrace.rs @@ -14,6 +14,7 @@ pub struct AlignmentOutput { pub qry_seq: Vec, pub ref_seq: Vec, pub alignment_score: i32, + pub is_reverse_complement: bool, } pub fn backtrace>( @@ -89,6 +90,7 @@ pub fn backtrace>( qry_seq: aln_qry, ref_seq: aln_ref, alignment_score: scores[(num_rows - 1, num_cols - 1)], + is_reverse_complement: false, } } @@ -156,6 +158,7 @@ mod tests { qry_seq: to_nuc_seq("---CTCGCT")?, ref_seq: to_nuc_seq("ACGCTCGCT")?, alignment_score: 18, + is_reverse_complement: false }; let output = backtrace(&qry_seq, &ref_seq, &scores, &paths); diff --git a/packages_rs/nextclade/src/constants.rs b/packages_rs/nextclade/src/constants.rs new file mode 100644 index 000000000..b102b539f --- /dev/null +++ b/packages_rs/nextclade/src/constants.rs @@ -0,0 +1 @@ +pub const REVERSE_COMPLEMENT_SUFFIX: &str = " |(reverse complement)"; diff --git a/packages_rs/nextclade/src/lib.rs b/packages_rs/nextclade/src/lib.rs index 058de692f..744d3c773 100644 --- a/packages_rs/nextclade/src/lib.rs +++ b/packages_rs/nextclade/src/lib.rs @@ -1,5 +1,6 @@ pub mod align; pub mod analyze; +pub mod constants; pub mod gene; pub mod io; pub mod qc; diff --git a/packages_rs/nextclade/src/run/nextalign_run_one.rs b/packages_rs/nextclade/src/run/nextalign_run_one.rs index cecf66492..ebb2ff33f 100644 --- a/packages_rs/nextclade/src/run/nextalign_run_one.rs +++ b/packages_rs/nextclade/src/run/nextalign_run_one.rs @@ -51,12 +51,15 @@ pub fn nextalign_run_one( .cloned() .collect_vec(); + let is_reverse_complement = alignment.is_reverse_complement; + Ok(NextalignOutputs { stripped, alignment, translations, warnings, missing_genes, + is_reverse_complement, }) } } diff --git a/packages_rs/nextclade/src/run/nextclade_run_one.rs b/packages_rs/nextclade/src/run/nextclade_run_one.rs index 518b55441..6ed919cfd 100644 --- a/packages_rs/nextclade/src/run/nextclade_run_one.rs +++ b/packages_rs/nextclade/src/run/nextclade_run_one.rs @@ -12,6 +12,7 @@ use crate::analyze::nuc_changes::{find_nuc_changes, FindNucChangesOutput}; use crate::analyze::pcr_primer_changes::get_pcr_primer_changes; use crate::analyze::pcr_primers::PcrPrimer; use crate::analyze::virus_properties::VirusProperties; +use crate::constants::REVERSE_COMPLEMENT_SUFFIX; use crate::io::aa::Aa; use crate::io::gene_map::GeneMap; use crate::io::letter::Letter; @@ -46,6 +47,7 @@ pub fn nextclade_run_one( translations, warnings, missing_genes, + is_reverse_complement, } = nextalign_run_one( qry_seq, ref_seq, @@ -154,11 +156,17 @@ pub fn nextclade_run_one( qc_config, ); + let seq_name = if is_reverse_complement { + format!("{seq_name}{REVERSE_COMPLEMENT_SUFFIX}") + } else { + seq_name.to_owned() + }; + Ok(( stripped.qry_seq, translations, NextcladeOutputs { - seq_name: seq_name.to_owned(), + seq_name, substitutions, total_substitutions, deletions, @@ -195,6 +203,7 @@ pub fn nextclade_run_one( qc, custom_node_attributes: clade_node_attrs, nearest_node_id, + is_reverse_complement, }, )) } diff --git a/packages_rs/nextclade/src/types/outputs.rs b/packages_rs/nextclade/src/types/outputs.rs index 8b34a6067..2953698c2 100644 --- a/packages_rs/nextclade/src/types/outputs.rs +++ b/packages_rs/nextclade/src/types/outputs.rs @@ -31,6 +31,7 @@ pub struct NextalignOutputs { pub translations: Vec, pub warnings: Vec, pub missing_genes: Vec, + pub is_reverse_complement: bool, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -75,6 +76,7 @@ pub struct NextcladeOutputs { pub qc: QcResult, pub custom_node_attributes: BTreeMap, pub nearest_node_id: usize, + pub is_reverse_complement: bool, } impl NextcladeOutputs { From fa8f27594ff52efb4085fc4e23d015a4d6aefc4e Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 24 Jun 2022 21:56:28 +0200 Subject: [PATCH 3/5] feat(cli): issue a warning when a sequence was reverse-complemented --- .../nextclade-cli/src/cli/nextalign_loop.rs | 2 + .../nextclade-cli/src/cli/nextclade_loop.rs | 1 + packages_rs/nextclade-web/src/wasm/analyze.rs | 5 +++ .../src/workers/nextcladeWasm.worker.ts | 1 + packages_rs/nextclade/src/align/align.rs | 41 ++++++++++--------- .../nextclade/src/run/nextalign_run_one.rs | 4 +- .../nextclade/src/run/nextclade_run_one.rs | 4 ++ packages_rs/nextclade/src/types/outputs.rs | 1 + 8 files changed, 39 insertions(+), 20 deletions(-) diff --git a/packages_rs/nextclade-cli/src/cli/nextalign_loop.rs b/packages_rs/nextclade-cli/src/cli/nextalign_loop.rs index 71304276b..241c66052 100644 --- a/packages_rs/nextclade-cli/src/cli/nextalign_loop.rs +++ b/packages_rs/nextclade-cli/src/cli/nextalign_loop.rs @@ -115,6 +115,8 @@ pub fn nextalign_run(run_args: NextalignRunArgs) -> Result<(), Report> { .wrap_err_with(|| format!("When processing sequence #{index} '{seq_name}'")) .and_then(|qry_seq| { nextalign_run_one( + index, + &seq_name, &qry_seq, ref_seq, ref_peptides, diff --git a/packages_rs/nextclade-cli/src/cli/nextclade_loop.rs b/packages_rs/nextclade-cli/src/cli/nextclade_loop.rs index 0ca1087dc..f20b75e39 100644 --- a/packages_rs/nextclade-cli/src/cli/nextclade_loop.rs +++ b/packages_rs/nextclade-cli/src/cli/nextclade_loop.rs @@ -178,6 +178,7 @@ pub fn nextclade_run(run_args: NextcladeRunArgs) -> Result<(), Report> { .wrap_err_with(|| format!("When processing sequence #{index} '{seq_name}'")) .and_then(|qry_seq| { nextclade_run_one( + index, &seq_name, &qry_seq, ref_seq, diff --git a/packages_rs/nextclade-web/src/wasm/analyze.rs b/packages_rs/nextclade-web/src/wasm/analyze.rs index cd60aeaed..c20d92d39 100644 --- a/packages_rs/nextclade-web/src/wasm/analyze.rs +++ b/packages_rs/nextclade-web/src/wasm/analyze.rs @@ -76,6 +76,9 @@ impl NextcladeParams { #[wasm_bindgen] #[derive(Clone, Serialize, Deserialize, TypescriptDefinition, Debug)] pub struct AnalysisInput { + #[wasm_bindgen(getter_with_clone)] + pub qry_index: usize, + #[wasm_bindgen(getter_with_clone)] pub qry_seq_name: String, @@ -218,6 +221,7 @@ impl Nextclade { pub fn run(&mut self, input: &AnalysisInput) -> Result { let AnalysisInput { + qry_index, qry_seq_name, qry_seq_str, } = input; @@ -225,6 +229,7 @@ impl Nextclade { let qry_seq = &to_nuc_seq(qry_seq_str).wrap_err("When converting query sequence")?; match nextclade_run_one( + *qry_index, qry_seq_name, qry_seq, &self.ref_seq, diff --git a/packages_rs/nextclade-web/src/workers/nextcladeWasm.worker.ts b/packages_rs/nextclade-web/src/workers/nextcladeWasm.worker.ts index 1e43165b8..e22f2f6cc 100644 --- a/packages_rs/nextclade-web/src/workers/nextcladeWasm.worker.ts +++ b/packages_rs/nextclade-web/src/workers/nextcladeWasm.worker.ts @@ -90,6 +90,7 @@ async function analyze(record: FastaRecord): Promise { const { index, seqName, seq } = record const input = AnalysisInput.from_js({ + qry_index: index, qry_seq_name: seqName, qry_seq_str: seq, }) diff --git a/packages_rs/nextclade/src/align/align.rs b/packages_rs/nextclade/src/align/align.rs index 34cae2c93..83f7d9173 100644 --- a/packages_rs/nextclade/src/align/align.rs +++ b/packages_rs/nextclade/src/align/align.rs @@ -10,7 +10,7 @@ use crate::io::nuc::Nuc; use crate::make_error; use crate::translate::complement::reverse_complement_in_place; use eyre::Report; -use log::{info, trace}; +use log::{info, trace, warn}; fn align_pairwise>( qry_seq: &[T], @@ -30,6 +30,8 @@ fn align_pairwise>( /// align nucleotide sequences via seed alignment and banded smith watermann without penalizing terminal gaps pub fn align_nuc( + index: usize, + seq_name: &str, qry_seq: &[Nuc], ref_seq: &[Nuc], gap_open_close: &[i32], @@ -48,12 +50,13 @@ pub fn align_nuc( Ok(stripes) => Ok(align_pairwise(qry_seq, ref_seq, gap_open_close, params, &stripes)), Err(report) => { if params.retry_reverse_complement { - info!("Seed matching failed. Retrying reverse complement"); // TODO: would be nice to have sequence index and name here + info!("When processing sequence #{index} '{seq_name}': Seed matching failed. Retrying reverse complement"); let mut qry_seq = qry_seq.to_owned(); reverse_complement_in_place(&mut qry_seq); let stripes = seed_alignment(&qry_seq, ref_seq, params).map_err(|_| report)?; let mut result = align_pairwise(&qry_seq, ref_seq, gap_open_close, params, &stripes); result.is_reverse_complement = true; + warn!("When processing sequence #{index} '{seq_name}': Sequence is reverse-complemented: Seed matching failed for the original sequence, but succeeded for its reverse complement. Outputs will be derived from the reverse complement and 'reverse complement' suffix will be added to sequence ID."); Ok(result) } else { Err(report) @@ -131,7 +134,7 @@ mod tests { let qry_seq = to_nuc_seq("ACGCTCGCT")?; let ref_seq = to_nuc_seq("ACGCTCGCT")?; - let result = align_nuc(&qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; + let result = align_nuc(0, "", &qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; assert_eq!(from_nuc_seq(&ref_seq), from_nuc_seq(&result.ref_seq)); assert_eq!(from_nuc_seq(&qry_seq), from_nuc_seq(&result.qry_seq)); @@ -145,7 +148,7 @@ mod tests { let ref_seq = to_nuc_seq("ACGCTCGCT")?; let qry_aln = to_nuc_seq("-CGCTCGCT")?; - let result = align_nuc(&qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; + let result = align_nuc(0, "", &qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; assert_eq!(from_nuc_seq(&ref_seq), from_nuc_seq(&result.ref_seq)); assert_eq!(from_nuc_seq(&qry_aln), from_nuc_seq(&result.qry_seq)); @@ -159,7 +162,7 @@ mod tests { let ref_seq = to_nuc_seq("ACGCTCGCT")?; let qry_aln = to_nuc_seq("---CTCGCT")?; - let result = align_nuc(&qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; + let result = align_nuc(0, "", &qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; assert_eq!(from_nuc_seq(&ref_seq), from_nuc_seq(&result.ref_seq)); assert_eq!(from_nuc_seq(&qry_aln), from_nuc_seq(&result.qry_seq)); @@ -174,7 +177,7 @@ mod tests { let qry_aln = to_nuc_seq("-----TCCAATCA")?; // ^ - let result = align_nuc(&qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; + let result = align_nuc(0, "", &qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; assert_eq!(from_nuc_seq(&ref_seq), from_nuc_seq(&result.ref_seq)); assert_eq!(from_nuc_seq(&qry_aln), from_nuc_seq(&result.qry_seq)); @@ -189,7 +192,7 @@ mod tests { let qry_aln = to_nuc_seq("-----TGTTACCTGCGC")?; // ^^ - let result = align_nuc(&qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; + let result = align_nuc(0, "", &qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; assert_eq!(from_nuc_seq(&ref_seq), from_nuc_seq(&result.ref_seq)); assert_eq!(from_nuc_seq(&qry_aln), from_nuc_seq(&result.qry_seq)); @@ -203,7 +206,7 @@ mod tests { let ref_seq = to_nuc_seq("ACGCTCGCT")?; let qry_aln = to_nuc_seq("ACGCTC---")?; - let result = align_nuc(&qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; + let result = align_nuc(0, "", &qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; assert_eq!(from_nuc_seq(&ref_seq), from_nuc_seq(&result.ref_seq)); assert_eq!(from_nuc_seq(&qry_aln), from_nuc_seq(&result.qry_seq)); @@ -218,7 +221,7 @@ mod tests { let qry_aln = to_nuc_seq("CCAATCAT-----")?; // ^ - let result = align_nuc(&qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; + let result = align_nuc(0, "", &qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; assert_eq!(from_nuc_seq(&ref_seq), from_nuc_seq(&result.ref_seq)); assert_eq!(from_nuc_seq(&qry_aln), from_nuc_seq(&result.qry_seq)); @@ -233,7 +236,7 @@ mod tests { let qry_aln = to_nuc_seq("CCGATCAT-----")?; // ^ ^ - let result = align_nuc(&qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; + let result = align_nuc(0, "", &qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; assert_eq!(from_nuc_seq(&ref_seq), from_nuc_seq(&result.ref_seq)); assert_eq!(from_nuc_seq(&qry_aln), from_nuc_seq(&result.qry_seq)); @@ -247,7 +250,7 @@ mod tests { let ref_seq = to_nuc_seq("GCCACGCTCGCT")?; let qry_aln = to_nuc_seq("---ACGCTC---")?; - let result = align_nuc(&qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; + let result = align_nuc(0, "", &qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; assert_eq!(from_nuc_seq(&ref_seq), from_nuc_seq(&result.ref_seq)); assert_eq!(from_nuc_seq(&qry_aln), from_nuc_seq(&result.qry_seq)); @@ -261,7 +264,7 @@ mod tests { let ref_seq = to_nuc_seq("ACGCTC")?; let ref_aln = to_nuc_seq("---ACGCTC---")?; - let result = align_nuc(&qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; + let result = align_nuc(0, "", &qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; assert_eq!(from_nuc_seq(&ref_aln), from_nuc_seq(&result.ref_seq)); assert_eq!(from_nuc_seq(&qry_seq), from_nuc_seq(&result.qry_seq)); @@ -275,7 +278,7 @@ mod tests { let ref_seq = to_nuc_seq("GCCACGCTCGCT")?; let qry_aln = to_nuc_seq("GCCA--CTCCCT")?; - let result = align_nuc(&qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; + let result = align_nuc(0, "", &qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; // assert_eq!(18, result.alignment_score); assert_eq!(from_nuc_seq(&ref_seq), from_nuc_seq(&result.ref_seq)); @@ -290,7 +293,7 @@ mod tests { let ref_seq = to_nuc_seq("GCCACTCGCT")?; let ref_aln = to_nuc_seq("GCCA--CTCGCT")?; - let result = align_nuc(&qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; + let result = align_nuc(0, "", &qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; assert_eq!(from_nuc_seq(&ref_aln), from_nuc_seq(&result.ref_seq)); assert_eq!(from_nuc_seq(&qry_seq), from_nuc_seq(&result.qry_seq)); @@ -304,7 +307,7 @@ mod tests { let ref_seq = to_nuc_seq("ACATATACTTC")?; let qry_aln = to_nuc_seq("ACAT---CTTC")?; - let result = align_nuc(&qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; + let result = align_nuc(0, "", &qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; assert_eq!(from_nuc_seq(&ref_seq), from_nuc_seq(&result.ref_seq)); assert_eq!(from_nuc_seq(&qry_aln), from_nuc_seq(&result.qry_seq)); @@ -318,7 +321,7 @@ mod tests { let ref_seq = to_nuc_seq("ACATCTTG")?; let ref_aln = to_nuc_seq("ACAT---CTTG")?; - let result = align_nuc(&qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; + let result = align_nuc(0, "", &qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; assert_eq!(from_nuc_seq(&ref_aln), from_nuc_seq(&result.ref_seq)); assert_eq!(from_nuc_seq(&qry_seq), from_nuc_seq(&result.qry_seq)); @@ -333,7 +336,7 @@ mod tests { let qry_aln = to_nuc_seq("AAAAAAAAAAAA----------")?; let ref_aln = to_nuc_seq("---------AAATTTTTTTTTT")?; - let result = align_nuc(&qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; + let result = align_nuc(0, "", &qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; assert_eq!(from_nuc_seq(&ref_aln), from_nuc_seq(&result.ref_seq)); assert_eq!(from_nuc_seq(&qry_aln), from_nuc_seq(&result.qry_seq)); @@ -348,7 +351,7 @@ mod tests { let ref_aln = to_nuc_seq("AAAAAAAAAAAA----------")?; let qry_aln = to_nuc_seq("---------AAATTTTTTTTTT")?; - let result = align_nuc(&qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; + let result = align_nuc(0, "", &qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; assert_eq!(from_nuc_seq(&ref_aln), from_nuc_seq(&result.ref_seq)); assert_eq!(from_nuc_seq(&qry_aln), from_nuc_seq(&result.qry_seq)); @@ -363,7 +366,7 @@ mod tests { let ref_aln = to_nuc_seq("CTTGGAGGTTCCGTG----GCTAGATAACAGAACATTCTTGGAATGCTGATCTTTATAAGCTCATGCGACACTTCGCATGGTG---AGCCTTTGT")?; let qry_aln = to_nuc_seq("CTTGGAGGTTCCGTGGCTATAAAGATAACAGAACATTCTTGGAATGCTGATC-----AAGCTCATGGGACANNNNNCATGGTGGACAGCCTTTGT")?; - let result = align_nuc(&qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; + let result = align_nuc(0, "", &qry_seq, &ref_seq, &ctx.gap_open_close, &ctx.params)?; assert_eq!(from_nuc_seq(&ref_aln), from_nuc_seq(&result.ref_seq)); assert_eq!(from_nuc_seq(&qry_aln), from_nuc_seq(&result.qry_seq)); diff --git a/packages_rs/nextclade/src/run/nextalign_run_one.rs b/packages_rs/nextclade/src/run/nextalign_run_one.rs index ebb2ff33f..3a6210f1d 100644 --- a/packages_rs/nextclade/src/run/nextalign_run_one.rs +++ b/packages_rs/nextclade/src/run/nextalign_run_one.rs @@ -11,6 +11,8 @@ use itertools::{Either, Itertools}; use std::collections::HashSet; pub fn nextalign_run_one( + index: usize, + seq_name: &str, qry_seq: &[Nuc], ref_seq: &[Nuc], ref_peptides: &TranslationMap, @@ -19,7 +21,7 @@ pub fn nextalign_run_one( gap_open_close_aa: &[i32], params: &AlignPairwiseParams, ) -> Result { - match align_nuc(qry_seq, ref_seq, gap_open_close_nuc, params) { + match align_nuc(index, seq_name, qry_seq, ref_seq, gap_open_close_nuc, params) { Err(report) => Err(report), Ok(alignment) => { diff --git a/packages_rs/nextclade/src/run/nextclade_run_one.rs b/packages_rs/nextclade/src/run/nextclade_run_one.rs index 6ed919cfd..6536b5c7b 100644 --- a/packages_rs/nextclade/src/run/nextclade_run_one.rs +++ b/packages_rs/nextclade/src/run/nextclade_run_one.rs @@ -28,6 +28,7 @@ use crate::types::outputs::{NextalignOutputs, NextcladeOutputs}; use eyre::Report; pub fn nextclade_run_one( + index: usize, seq_name: &str, qry_seq: &[Nuc], ref_seq: &[Nuc], @@ -49,6 +50,8 @@ pub fn nextclade_run_one( missing_genes, is_reverse_complement, } = nextalign_run_one( + index, + seq_name, qry_seq, ref_seq, ref_peptides, @@ -166,6 +169,7 @@ pub fn nextclade_run_one( stripped.qry_seq, translations, NextcladeOutputs { + index, seq_name, substitutions, total_substitutions, diff --git a/packages_rs/nextclade/src/types/outputs.rs b/packages_rs/nextclade/src/types/outputs.rs index 2953698c2..09f98d2be 100644 --- a/packages_rs/nextclade/src/types/outputs.rs +++ b/packages_rs/nextclade/src/types/outputs.rs @@ -37,6 +37,7 @@ pub struct NextalignOutputs { #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct NextcladeOutputs { + pub index: usize, pub seq_name: String, pub substitutions: Vec, pub total_substitutions: usize, From 2605ccaa64c302ebd5fb087f6d93255d94fae3ee Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 24 Jun 2022 22:54:43 +0200 Subject: [PATCH 4/5] feat: add warning to errors.csv when sequence gets reverse-complemented --- packages_rs/nextclade/src/run/nextalign_run_one.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/packages_rs/nextclade/src/run/nextalign_run_one.rs b/packages_rs/nextclade/src/run/nextalign_run_one.rs index 3a6210f1d..7094598b3 100644 --- a/packages_rs/nextclade/src/run/nextalign_run_one.rs +++ b/packages_rs/nextclade/src/run/nextalign_run_one.rs @@ -9,6 +9,7 @@ use crate::utils::error::report_to_string; use eyre::Report; use itertools::{Either, Itertools}; use std::collections::HashSet; +use std::fmt::format; pub fn nextalign_run_one( index: usize, @@ -36,7 +37,7 @@ pub fn nextalign_run_one( let stripped = insertions_strip(&alignment.qry_seq, &alignment.ref_seq); - let (translations, warnings): (Vec, Vec) = + let (translations, mut warnings): (Vec, Vec) = translations.into_iter().partition_map(|(gene_name, res)| match res { Ok(tr) => Either::Left(tr), Err(err) => Either::Right(PeptideWarning { @@ -55,6 +56,13 @@ pub fn nextalign_run_one( let is_reverse_complement = alignment.is_reverse_complement; + if is_reverse_complement { + warnings.push(PeptideWarning { + gene_name: "nuc".to_owned(), + warning: format!("When processing sequence #{index} '{seq_name}': Sequence is reverse-complemented: Seed matching failed for the original sequence, but succeeded for its reverse complement. Outputs will be derived from the reverse complement and 'reverse complement' suffix will be added to sequence ID.") + }); + } + Ok(NextalignOutputs { stripped, alignment, From 18bdc94dbb1f7412e32e56aa59d0e009a549c933 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 24 Jun 2022 22:58:30 +0200 Subject: [PATCH 5/5] feat: add "isReverseComplement" columt to csv and tsv outputs --- packages_rs/nextclade/src/io/nextclade_csv.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/packages_rs/nextclade/src/io/nextclade_csv.rs b/packages_rs/nextclade/src/io/nextclade_csv.rs index c39b3b198..e5562768a 100644 --- a/packages_rs/nextclade/src/io/nextclade_csv.rs +++ b/packages_rs/nextclade/src/io/nextclade_csv.rs @@ -86,6 +86,7 @@ static NEXTCLADE_CSV_HEADERS: &[&str] = &[ "qc.stopCodons.totalStopCodons", "qc.stopCodons.score", "qc.stopCodons.status", + "isReverseComplement", // "failedGenes", // "warnings", "errors", @@ -169,6 +170,7 @@ impl NextcladeResultsCsvWriter { // divergence, qc, custom_node_attributes, + is_reverse_complement, .. } = nextclade_outputs; @@ -365,6 +367,7 @@ impl NextcladeResultsCsvWriter { "qc.stopCodons.status", qc.stop_codons.as_ref().map(|sc| sc.status.to_string()), )?; + self.add_entry("isReverseComplement", &is_reverse_complement.to_string())?; // self.add_entry("failedGenes", &format_failed_genes(missing_genes, ARRAY_ITEM_DELIMITER))?; // self.add_entry("warnings", &format_aa_warnings(translations, ARRAY_ITEM_DELIMITER))?; self.add_entry("errors", &"")?;