From ed264931cbc1e40b91821d82eeab58828b26a217 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Wed, 5 Jun 2024 18:59:34 +0200 Subject: [PATCH 1/8] refactor: split "find aa changes" code into modules In preparation for https://github.com/nextstrain/nextclade/pull/1454 Among other things, in order to render private and relative aa mutations we need to group them similarly to how absolute aa mutations are grouped. This involves finding adjacent mutations and nuc context for these mutations. Sadly, the code for grouping is quite complex and is not immediately reusable (it combines aa mutation search and grouping at the same time), so let's do some refactoring. Let's start from factoring away structs and functions that will be unchanged, to clear up some space for the action, to minimize diffs and to reduce scrolling. --- .../src/analyze/aa_change_with_context.rs | 89 ++++++++ .../nextclade/src/analyze/aa_changes_find.rs | 51 +++++ ..._changes.rs => aa_changes_find_for_cds.rs} | 199 +----------------- .../nextclade/src/analyze/aa_changes_group.rs | 49 +++++ packages/nextclade/src/analyze/aa_sub.rs | 16 +- packages/nextclade/src/analyze/mod.rs | 5 +- .../nextclade/src/run/nextclade_run_one.rs | 6 +- packages/nextclade/src/types/outputs.rs | 2 +- 8 files changed, 222 insertions(+), 195 deletions(-) create mode 100644 packages/nextclade/src/analyze/aa_change_with_context.rs create mode 100644 packages/nextclade/src/analyze/aa_changes_find.rs rename packages/nextclade/src/analyze/{aa_changes.rs => aa_changes_find_for_cds.rs} (56%) create mode 100644 packages/nextclade/src/analyze/aa_changes_group.rs diff --git a/packages/nextclade/src/analyze/aa_change_with_context.rs b/packages/nextclade/src/analyze/aa_change_with_context.rs new file mode 100644 index 000000000..14626554b --- /dev/null +++ b/packages/nextclade/src/analyze/aa_change_with_context.rs @@ -0,0 +1,89 @@ +use crate::alphabet::aa::Aa; +use crate::alphabet::letter::{serde_deserialize_seq, serde_serialize_seq}; +use crate::alphabet::nuc::Nuc; +use crate::analyze::aa_sub::is_aa_mutated_or_deleted; +use crate::coord::coord_map_cds_to_global::cds_codon_pos_to_ref_range; +use crate::coord::position::{AaRefPosition, NucRefGlobalPosition, PositionLike}; +use crate::coord::range::NucRefGlobalRange; +use crate::gene::cds::Cds; +use crate::gene::gene::GeneStrand; +use crate::translate::complement::reverse_complement_in_place; +use crate::translate::translate_genes::CdsTranslation; +use itertools::Itertools; +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Default, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct AaChangeWithContext { + pub cds_name: String, + pub pos: AaRefPosition, + pub ref_aa: Aa, + pub qry_aa: Aa, + pub nuc_pos: NucRefGlobalPosition, + + #[schemars(with = "String")] + #[serde(serialize_with = "serde_serialize_seq")] + #[serde(deserialize_with = "serde_deserialize_seq")] + pub ref_triplet: Vec, + + #[schemars(with = "String")] + #[serde(serialize_with = "serde_serialize_seq")] + #[serde(deserialize_with = "serde_deserialize_seq")] + pub qry_triplet: Vec, + pub nuc_ranges: Vec, +} + +impl AaChangeWithContext { + pub fn new( + cds: &Cds, + pos: AaRefPosition, + qry_seq: &[Nuc], + ref_seq: &[Nuc], + ref_tr: &CdsTranslation, + qry_tr: &CdsTranslation, + ) -> Self { + let ref_aa = ref_tr.seq[pos.as_usize()]; + let qry_aa = qry_tr.seq[pos.as_usize()]; + let nuc_ranges = cds_codon_pos_to_ref_range(cds, pos); + + let ref_triplet = nuc_ranges + .iter() + .flat_map(|(range, strand)| { + let mut nucs = ref_seq[range.to_std()].to_vec(); + if strand == &GeneStrand::Reverse { + reverse_complement_in_place(&mut nucs); + } + nucs + }) + .collect_vec(); + + let qry_triplet = nuc_ranges + .iter() + .flat_map(|(range, strand)| { + let mut nucs = qry_seq[range.clamp_range(0, qry_seq.len()).to_std()].to_vec(); + if strand == &GeneStrand::Reverse { + reverse_complement_in_place(&mut nucs); + } + nucs + }) + .collect_vec(); + + let nuc_ranges = nuc_ranges.into_iter().map(|(range, _)| range).collect_vec(); + + Self { + cds_name: cds.name.clone(), + pos, + ref_aa, + qry_aa, + nuc_pos: nuc_ranges[0].begin, + nuc_ranges, + ref_triplet, + qry_triplet, + } + } + + #[inline] + pub fn is_mutated_or_deleted(&self) -> bool { + is_aa_mutated_or_deleted(self.ref_aa, self.qry_aa) + } +} diff --git a/packages/nextclade/src/analyze/aa_changes_find.rs b/packages/nextclade/src/analyze/aa_changes_find.rs new file mode 100644 index 000000000..9b1ea5eed --- /dev/null +++ b/packages/nextclade/src/analyze/aa_changes_find.rs @@ -0,0 +1,51 @@ +use crate::alphabet::nuc::Nuc; +use crate::analyze::aa_changes_find_for_cds::{aa_changes_find_for_cds, FindAaChangesOutput}; +use crate::analyze::nuc_del::NucDelRange; +use crate::analyze::nuc_sub::NucSub; +use crate::gene::gene_map::GeneMap; +use crate::translate::translate_genes::Translation; +use crate::utils::collections::extend_map_of_vecs; +use eyre::Report; + +/// Finds aminoacid substitutions and deletions in query peptides relative to reference peptides, in all genes +/// +/// ## Precondition +/// Nucleotide sequences and peptides are required to be stripped from insertions +pub fn aa_changes_find( + ref_seq: &[Nuc], + qry_seq: &[Nuc], + ref_translation: &Translation, + qry_translation: &Translation, + gene_map: &GeneMap, + nuc_subs: &[NucSub], + nuc_dels: &[NucDelRange], +) -> Result { + let mut changes = qry_translation + .iter_cdses() + .map(|(qry_name, qry_cds_tr)| { + let ref_cds_tr = ref_translation.get_cds(qry_name)?; + let cds = gene_map.get_cds(&qry_cds_tr.name)?; + Ok(aa_changes_find_for_cds( + cds, qry_seq, ref_seq, ref_cds_tr, qry_cds_tr, nuc_subs, nuc_dels, + )) + }) + .collect::, Report>>()? + .into_iter() + // Merge changes from all CDSes into one struct + .fold(FindAaChangesOutput::default(), |mut output, changes| { + output.aa_changes_groups.extend(changes.aa_changes_groups); + output.aa_substitutions.extend(changes.aa_substitutions); + output.aa_deletions.extend(changes.aa_deletions); + extend_map_of_vecs(&mut output.nuc_to_aa_muts, changes.nuc_to_aa_muts); + output + }); + + changes.aa_substitutions.sort(); + changes.aa_deletions.sort(); + changes.nuc_to_aa_muts.iter_mut().for_each(|(_, vals)| { + vals.sort(); + vals.dedup(); + }); + + Ok(changes) +} diff --git a/packages/nextclade/src/analyze/aa_changes.rs b/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs similarity index 56% rename from packages/nextclade/src/analyze/aa_changes.rs rename to packages/nextclade/src/analyze/aa_changes_find_for_cds.rs index e4a0e514e..8889efdab 100644 --- a/packages/nextclade/src/analyze/aa_changes.rs +++ b/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs @@ -1,145 +1,21 @@ -use crate::alphabet::aa::Aa; use crate::alphabet::letter::Letter; -use crate::alphabet::letter::{serde_deserialize_seq, serde_serialize_seq}; use crate::alphabet::nuc::Nuc; +use crate::analyze::aa_change_with_context::AaChangeWithContext; +use crate::analyze::aa_changes_group::AaChangesGroup; use crate::analyze::aa_del::AaDel; -use crate::analyze::aa_sub::AaSub; +use crate::analyze::aa_sub::{is_aa_mutated_or_deleted, is_codon_sequenced, AaSub}; use crate::analyze::nuc_del::NucDelRange; use crate::analyze::nuc_sub::NucSub; use crate::coord::coord_map_cds_to_global::cds_codon_pos_to_ref_range; -use crate::coord::position::{AaRefPosition, NucRefGlobalPosition, PositionLike}; -use crate::coord::range::{have_intersection, AaRefRange, NucRefGlobalRange}; +use crate::coord::position::PositionLike; +use crate::coord::range::{have_intersection, AaRefRange}; use crate::gene::cds::Cds; -use crate::gene::gene::GeneStrand; -use crate::gene::gene_map::GeneMap; -use crate::translate::complement::reverse_complement_in_place; -use crate::translate::translate_genes::{CdsTranslation, Translation}; -use crate::utils::collections::extend_map_of_vecs; +use crate::translate::translate_genes::CdsTranslation; use either::Either; -use eyre::Report; -use itertools::{Itertools, MinMaxResult}; +use itertools::Itertools; use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; -#[derive(Clone, Debug, Default, Serialize, Deserialize, schemars::JsonSchema)] -#[serde(rename_all = "camelCase")] -pub struct AaChangeWithContext { - pub cds_name: String, - pub pos: AaRefPosition, - pub ref_aa: Aa, - pub qry_aa: Aa, - pub nuc_pos: NucRefGlobalPosition, - - #[schemars(with = "String")] - #[serde(serialize_with = "serde_serialize_seq")] - #[serde(deserialize_with = "serde_deserialize_seq")] - pub ref_triplet: Vec, - - #[schemars(with = "String")] - #[serde(serialize_with = "serde_serialize_seq")] - #[serde(deserialize_with = "serde_deserialize_seq")] - pub qry_triplet: Vec, - pub nuc_ranges: Vec, -} - -impl AaChangeWithContext { - pub fn new( - cds: &Cds, - pos: AaRefPosition, - qry_seq: &[Nuc], - ref_seq: &[Nuc], - ref_tr: &CdsTranslation, - qry_tr: &CdsTranslation, - ) -> Self { - let ref_aa = ref_tr.seq[pos.as_usize()]; - let qry_aa = qry_tr.seq[pos.as_usize()]; - let nuc_ranges = cds_codon_pos_to_ref_range(cds, pos); - - let ref_triplet = nuc_ranges - .iter() - .flat_map(|(range, strand)| { - let mut nucs = ref_seq[range.to_std()].to_vec(); - if strand == &GeneStrand::Reverse { - reverse_complement_in_place(&mut nucs); - } - nucs - }) - .collect_vec(); - - let qry_triplet = nuc_ranges - .iter() - .flat_map(|(range, strand)| { - let mut nucs = qry_seq[range.clamp_range(0, qry_seq.len()).to_std()].to_vec(); - if strand == &GeneStrand::Reverse { - reverse_complement_in_place(&mut nucs); - } - nucs - }) - .collect_vec(); - - let nuc_ranges = nuc_ranges.into_iter().map(|(range, _)| range).collect_vec(); - - Self { - cds_name: cds.name.clone(), - pos, - ref_aa, - qry_aa, - nuc_pos: nuc_ranges[0].begin, - nuc_ranges, - ref_triplet, - qry_triplet, - } - } - - #[inline] - pub fn is_mutated_or_deleted(&self) -> bool { - is_aa_mutated_or_deleted(self.ref_aa, self.qry_aa) - } -} - -#[derive(Clone, Debug, Default, Serialize, Deserialize, schemars::JsonSchema)] -#[serde(rename_all = "camelCase")] -pub struct AaChangesGroup { - name: String, - range: AaRefRange, - changes: Vec, - nuc_subs: Vec, - nuc_dels: Vec, -} - -impl AaChangesGroup { - pub fn new(name: impl AsRef) -> Self { - Self::with_changes(name, vec![]) - } - - pub fn with_changes(name: impl AsRef, changes: Vec) -> Self { - Self { - name: name.as_ref().to_owned(), - range: Self::find_codon_range(&changes), - changes, - nuc_subs: vec![], - nuc_dels: vec![], - } - } - - pub fn push(&mut self, change: AaChangeWithContext) { - self.changes.push(change); - self.range = Self::find_codon_range(&self.changes); - } - - pub fn last(&self) -> Option<&AaChangeWithContext> { - self.changes.last() - } - - fn find_codon_range(changes: &[AaChangeWithContext]) -> AaRefRange { - match changes.iter().minmax_by_key(|change| change.pos) { - MinMaxResult::NoElements => AaRefRange::from_isize(0, 0), - MinMaxResult::OneElement(one) => AaRefRange::new(one.pos, one.pos + 1), - MinMaxResult::MinMax(first, last) => AaRefRange::new(first.pos, last.pos + 1), - } - } -} - #[derive(Clone, Debug, Default, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "camelCase")] pub struct FindAaChangesOutput { @@ -149,49 +25,6 @@ pub struct FindAaChangesOutput { pub nuc_to_aa_muts: BTreeMap>, } -/// Finds aminoacid substitutions and deletions in query peptides relative to reference peptides, in all genes -/// -/// ## Precondition -/// Nucleotide sequences and peptides are required to be stripped from insertions -pub fn find_aa_changes( - ref_seq: &[Nuc], - qry_seq: &[Nuc], - ref_translation: &Translation, - qry_translation: &Translation, - gene_map: &GeneMap, - nuc_subs: &[NucSub], - nuc_dels: &[NucDelRange], -) -> Result { - let mut changes = qry_translation - .iter_cdses() - .map(|(qry_name, qry_cds_tr)| { - let ref_cds_tr = ref_translation.get_cds(qry_name)?; - let cds = gene_map.get_cds(&qry_cds_tr.name)?; - Ok(find_aa_changes_for_cds( - cds, qry_seq, ref_seq, ref_cds_tr, qry_cds_tr, nuc_subs, nuc_dels, - )) - }) - .collect::, Report>>()? - .into_iter() - // Merge changes from all CDSes into one struct - .fold(FindAaChangesOutput::default(), |mut output, changes| { - output.aa_changes_groups.extend(changes.aa_changes_groups); - output.aa_substitutions.extend(changes.aa_substitutions); - output.aa_deletions.extend(changes.aa_deletions); - extend_map_of_vecs(&mut output.nuc_to_aa_muts, changes.nuc_to_aa_muts); - output - }); - - changes.aa_substitutions.sort(); - changes.aa_deletions.sort(); - changes.nuc_to_aa_muts.iter_mut().for_each(|(_, vals)| { - vals.sort(); - vals.dedup(); - }); - - Ok(changes) -} - /// Finds aminoacid substitutions and deletions in query peptides relative to reference peptides, in one gene /// /// ## Precondition @@ -200,13 +33,13 @@ pub fn find_aa_changes( /// /// ## Implementation details /// We compare reference and query peptides (extracted by the preceding call to Nextalign), -/// one aminoacid at at time, and deduce changes. We then report the change and relevant nucleotide context surrounding +/// one aminoacid at a time, and deduce changes. We then report the change and relevant nucleotide context surrounding /// this change. /// Previously we reported one-to-one mapping of aminoacid changes to corresponding nucleotide changes. However, it /// was not always accurate, because if there are multiple nucleotide changes in a codon, the direct correspondence /// might not always be established without knowing the order in which nucleotide changes have occurred. And in the /// context of Nextclade we don't have this information. -fn find_aa_changes_for_cds( +pub fn aa_changes_find_for_cds( cds: &Cds, qry_seq: &[Nuc], ref_seq: &[Nuc], @@ -404,17 +237,3 @@ fn find_aa_changes_for_cds( nuc_to_aa_muts, } } - -/// Check whether a given pair if reference and query aminoacids constitute a mutation or deletion -#[inline] -fn is_aa_mutated_or_deleted(ref_aa: Aa, qry_aa: Aa) -> bool { - // NOTE: We chose to ignore mutations to `X`. - qry_aa != ref_aa && qry_aa != Aa::X -} - -/// Check whether a given codon position corresponds to a sequenced aminoacid -fn is_codon_sequenced(aa_alignment_ranges: &[AaRefRange], codon: AaRefPosition) -> bool { - aa_alignment_ranges - .iter() - .any(|aa_alignment_range| aa_alignment_range.contains(codon)) -} diff --git a/packages/nextclade/src/analyze/aa_changes_group.rs b/packages/nextclade/src/analyze/aa_changes_group.rs new file mode 100644 index 000000000..0733aeb8b --- /dev/null +++ b/packages/nextclade/src/analyze/aa_changes_group.rs @@ -0,0 +1,49 @@ +use crate::analyze::aa_change_with_context::AaChangeWithContext; +use crate::analyze::nuc_del::NucDelRange; +use crate::analyze::nuc_sub::NucSub; +use crate::coord::range::AaRefRange; +use itertools::{Itertools, MinMaxResult}; +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Default, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct AaChangesGroup { + pub name: String, + pub range: AaRefRange, + pub changes: Vec, + pub nuc_subs: Vec, + pub nuc_dels: Vec, +} + +impl AaChangesGroup { + pub fn new(name: impl AsRef) -> Self { + Self::with_changes(name, vec![]) + } + + pub fn with_changes(name: impl AsRef, changes: Vec) -> Self { + Self { + name: name.as_ref().to_owned(), + range: Self::find_codon_range(&changes), + changes, + nuc_subs: vec![], + nuc_dels: vec![], + } + } + + pub fn push(&mut self, change: AaChangeWithContext) { + self.changes.push(change); + self.range = Self::find_codon_range(&self.changes); + } + + pub fn last(&self) -> Option<&AaChangeWithContext> { + self.changes.last() + } + + fn find_codon_range(changes: &[AaChangeWithContext]) -> AaRefRange { + match changes.iter().minmax_by_key(|change| change.pos) { + MinMaxResult::NoElements => AaRefRange::from_isize(0, 0), + MinMaxResult::OneElement(one) => AaRefRange::new(one.pos, one.pos + 1), + MinMaxResult::MinMax(first, last) => AaRefRange::new(first.pos, last.pos + 1), + } + } +} diff --git a/packages/nextclade/src/analyze/aa_sub.rs b/packages/nextclade/src/analyze/aa_sub.rs index ec3a122cd..063500610 100644 --- a/packages/nextclade/src/analyze/aa_sub.rs +++ b/packages/nextclade/src/analyze/aa_sub.rs @@ -1,9 +1,10 @@ use crate::alphabet::aa::{from_aa, Aa}; use crate::alphabet::letter::Letter; -use crate::analyze::aa_changes::AaChangeWithContext; +use crate::analyze::aa_change_with_context::AaChangeWithContext; use crate::analyze::aa_del::AaDel; use crate::analyze::abstract_mutation::{AbstractMutation, MutParams, Pos, QryLetter, RefLetter}; use crate::coord::position::AaRefPosition; +use crate::coord::range::AaRefRange; use crate::io::parse_pos::parse_pos; use crate::make_error; use eyre::{Report, WrapErr}; @@ -152,3 +153,16 @@ impl From<&AaChangeWithContext> for AaSub { } } } + +/// Check whether a given pair if reference and query aminoacids constitute a mutation or deletion +pub fn is_aa_mutated_or_deleted(ref_aa: Aa, qry_aa: Aa) -> bool { + // NOTE: We chose to ignore mutations to `X`. + qry_aa != ref_aa && qry_aa != Aa::X +} + +/// Check whether a given codon position corresponds to a sequenced aminoacid +pub fn is_codon_sequenced(aa_alignment_ranges: &[AaRefRange], codon: AaRefPosition) -> bool { + aa_alignment_ranges + .iter() + .any(|aa_alignment_range| aa_alignment_range.contains(codon)) +} diff --git a/packages/nextclade/src/analyze/mod.rs b/packages/nextclade/src/analyze/mod.rs index b4833b2e6..a1f02982b 100644 --- a/packages/nextclade/src/analyze/mod.rs +++ b/packages/nextclade/src/analyze/mod.rs @@ -1,4 +1,7 @@ -pub mod aa_changes; +pub mod aa_change_with_context; +pub mod aa_changes_find; +pub mod aa_changes_find_for_cds; +pub mod aa_changes_group; pub mod aa_del; pub mod aa_sub; pub mod abstract_mutation; diff --git a/packages/nextclade/src/run/nextclade_run_one.rs b/packages/nextclade/src/run/nextclade_run_one.rs index 1fd99de25..bef8304ec 100644 --- a/packages/nextclade/src/run/nextclade_run_one.rs +++ b/packages/nextclade/src/run/nextclade_run_one.rs @@ -3,7 +3,9 @@ use crate::align::insertions_strip::{get_aa_insertions, insertions_strip, AaIns, use crate::alphabet::aa::Aa; use crate::alphabet::letter::Letter; use crate::alphabet::nuc::Nuc; -use crate::analyze::aa_changes::{find_aa_changes, AaChangesGroup, FindAaChangesOutput}; +use crate::analyze::aa_changes_find::aa_changes_find; +use crate::analyze::aa_changes_find_for_cds::FindAaChangesOutput; +use crate::analyze::aa_changes_group::AaChangesGroup; use crate::analyze::aa_del::AaDel; use crate::analyze::aa_sub::AaSub; use crate::analyze::divergence::calculate_branch_length; @@ -198,7 +200,7 @@ pub fn nextclade_run_one( aa_substitutions, aa_deletions, nuc_to_aa_muts, - } = find_aa_changes( + } = aa_changes_find( ref_seq, &stripped.qry_seq, ref_translation, diff --git a/packages/nextclade/src/types/outputs.rs b/packages/nextclade/src/types/outputs.rs index e603e65ae..135bdf4dd 100644 --- a/packages/nextclade/src/types/outputs.rs +++ b/packages/nextclade/src/types/outputs.rs @@ -1,6 +1,6 @@ use crate::align::insertions_strip::{AaIns, Insertion}; use crate::alphabet::nuc::Nuc; -use crate::analyze::aa_changes::AaChangesGroup; +use crate::analyze::aa_changes_group::AaChangesGroup; use crate::analyze::aa_del::AaDel; use crate::analyze::aa_sub::AaSub; use crate::analyze::find_aa_motifs_changes::{AaMotifsChangesMap, AaMotifsMap}; From 5ee7936f98c4fb937778da4adee340683a0f4f0d Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Wed, 5 Jun 2024 20:33:18 +0200 Subject: [PATCH 2/8] refactor: extract aa alignment pair to a struct Rather than carrying the 2 vecs for qry and ref sequences, lets group them into a class with meaningful methods. This prevents position lookup code from spreading everywhere, and removes extra param to pass into functions. --- .../nextclade/src/analyze/aa_alignment.rs | 34 ++++++++++++++ .../src/analyze/aa_change_with_context.rs | 17 ++----- .../src/analyze/aa_changes_find_for_cds.rs | 45 +++++-------------- packages/nextclade/src/analyze/aa_sub.rs | 9 ++++ packages/nextclade/src/analyze/mod.rs | 1 + 5 files changed, 60 insertions(+), 46 deletions(-) create mode 100644 packages/nextclade/src/analyze/aa_alignment.rs diff --git a/packages/nextclade/src/analyze/aa_alignment.rs b/packages/nextclade/src/analyze/aa_alignment.rs new file mode 100644 index 000000000..26e7cd20b --- /dev/null +++ b/packages/nextclade/src/analyze/aa_alignment.rs @@ -0,0 +1,34 @@ +use crate::alphabet::aa::Aa; +use crate::analyze::aa_sub::AaSubMin; +use crate::coord::position::{AaRefPosition, PositionLike}; +use crate::gene::cds::Cds; +use crate::translate::translate_genes::CdsTranslation; + +/// Represents a pair of aligned amino acid sequences (resulting from pairwise alignment +pub struct AaAlignment<'c, 'r, 'q> { + cds: &'c Cds, + ref_tr: &'r CdsTranslation, + qry_tr: &'q CdsTranslation, +} + +impl<'c, 'r, 'q> AaAlignment<'c, 'r, 'q> { + pub const fn new(cds: &'c Cds, ref_tr: &'r CdsTranslation, qry_tr: &'q CdsTranslation) -> Self { + Self { cds, ref_tr, qry_tr } + } + + pub fn ref_at(&self, pos: AaRefPosition) -> Aa { + self.ref_tr.seq[pos.as_usize()] + } + + pub fn qry_at(&self, pos: AaRefPosition) -> Aa { + self.qry_tr.seq[pos.as_usize()] + } + + pub fn mut_at(&self, pos: AaRefPosition) -> AaSubMin { + AaSubMin { + ref_aa: self.ref_at(pos), + pos, + qry_aa: self.qry_at(pos), + } + } +} diff --git a/packages/nextclade/src/analyze/aa_change_with_context.rs b/packages/nextclade/src/analyze/aa_change_with_context.rs index 14626554b..79af98e05 100644 --- a/packages/nextclade/src/analyze/aa_change_with_context.rs +++ b/packages/nextclade/src/analyze/aa_change_with_context.rs @@ -1,14 +1,13 @@ use crate::alphabet::aa::Aa; use crate::alphabet::letter::{serde_deserialize_seq, serde_serialize_seq}; use crate::alphabet::nuc::Nuc; -use crate::analyze::aa_sub::is_aa_mutated_or_deleted; +use crate::analyze::aa_sub::{is_aa_mutated_or_deleted, AaSubMin}; use crate::coord::coord_map_cds_to_global::cds_codon_pos_to_ref_range; -use crate::coord::position::{AaRefPosition, NucRefGlobalPosition, PositionLike}; +use crate::coord::position::{AaRefPosition, NucRefGlobalPosition}; use crate::coord::range::NucRefGlobalRange; use crate::gene::cds::Cds; use crate::gene::gene::GeneStrand; use crate::translate::complement::reverse_complement_in_place; -use crate::translate::translate_genes::CdsTranslation; use itertools::Itertools; use serde::{Deserialize, Serialize}; @@ -34,16 +33,8 @@ pub struct AaChangeWithContext { } impl AaChangeWithContext { - pub fn new( - cds: &Cds, - pos: AaRefPosition, - qry_seq: &[Nuc], - ref_seq: &[Nuc], - ref_tr: &CdsTranslation, - qry_tr: &CdsTranslation, - ) -> Self { - let ref_aa = ref_tr.seq[pos.as_usize()]; - let qry_aa = qry_tr.seq[pos.as_usize()]; + pub fn new(cds: &Cds, sub: &AaSubMin, qry_seq: &[Nuc], ref_seq: &[Nuc]) -> Self { + let AaSubMin { pos, ref_aa, qry_aa } = *sub; let nuc_ranges = cds_codon_pos_to_ref_range(cds, pos); let ref_triplet = nuc_ranges diff --git a/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs b/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs index 8889efdab..2b76b0243 100644 --- a/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs +++ b/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs @@ -1,5 +1,6 @@ use crate::alphabet::letter::Letter; use crate::alphabet::nuc::Nuc; +use crate::analyze::aa_alignment::AaAlignment; use crate::analyze::aa_change_with_context::AaChangeWithContext; use crate::analyze::aa_changes_group::AaChangesGroup; use crate::analyze::aa_del::AaDel; @@ -51,6 +52,8 @@ pub fn aa_changes_find_for_cds( assert_eq!(ref_tr.seq.len(), qry_tr.seq.len()); assert_eq!(qry_seq.len(), ref_seq.len()); + let tr = AaAlignment::new(cds, ref_tr, qry_tr); + let aa_alignment_ranges = &qry_tr.alignment_ranges; let mut aa_changes_groups = vec![AaChangesGroup::new(&cds.name)]; let mut curr_group = aa_changes_groups.last_mut().unwrap(); @@ -61,24 +64,18 @@ pub fn aa_changes_find_for_cds( let ref_aa = ref_tr.seq[codon.as_usize()]; let qry_aa = qry_tr.seq[codon.as_usize()]; + if is_aa_mutated_or_deleted(ref_aa, qry_aa) { match curr_group.last() { // If current group is empty, then we are about to insert the first codon into the first group. None => { if codon > 0 && is_codon_sequenced(aa_alignment_ranges, codon - 1) { // Also prepend one codon to the left, for additional context, to start the group - curr_group.push(AaChangeWithContext::new( - cds, - codon - 1, - qry_seq, - ref_seq, - ref_tr, - qry_tr, - )); + curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon - 1), qry_seq, ref_seq)); } // The current codon itself - curr_group.push(AaChangeWithContext::new(cds, codon, qry_seq, ref_seq, ref_tr, qry_tr)); + curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon), qry_seq, ref_seq)); } // Current group is not empty Some(prev) => { @@ -88,18 +85,11 @@ pub fn aa_changes_find_for_cds( // If previous codon in the group is not exactly adjacent, there is 1 item in between, // then cover the hole by inserting previous codon. if codon == prev.pos + 2 && is_codon_sequenced(aa_alignment_ranges, codon - 1) { - curr_group.push(AaChangeWithContext::new( - cds, - codon - 1, - qry_seq, - ref_seq, - ref_tr, - qry_tr, - )); + curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon - 1), qry_seq, ref_seq)); } // And insert the current codon - curr_group.push(AaChangeWithContext::new(cds, codon, qry_seq, ref_seq, ref_tr, qry_tr)); + curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon), qry_seq, ref_seq)); } // If previous codon in the group is not adjacent, then terminate the current group and start a new group. else { @@ -107,11 +97,9 @@ pub fn aa_changes_find_for_cds( if is_codon_sequenced(aa_alignment_ranges, prev.pos + 1) { curr_group.push(AaChangeWithContext::new( cds, - prev.pos + 1, + &tr.mut_at(prev.pos + 1), qry_seq, ref_seq, - ref_tr, - qry_tr, )); } @@ -120,18 +108,11 @@ pub fn aa_changes_find_for_cds( // Start a new group and push the current codon into it. if is_codon_sequenced(aa_alignment_ranges, codon - 1) { // Also prepend one codon to the left, for additional context, to start the new group. - new_group.push(AaChangeWithContext::new( - cds, - codon - 1, - qry_seq, - ref_seq, - ref_tr, - qry_tr, - )); + new_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon - 1), qry_seq, ref_seq)); } // Push the current codon to the new group - new_group.push(AaChangeWithContext::new(cds, codon, qry_seq, ref_seq, ref_tr, qry_tr)); + new_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon), qry_seq, ref_seq)); aa_changes_groups.push(new_group); @@ -147,11 +128,9 @@ pub fn aa_changes_find_for_cds( if is_codon_sequenced(aa_alignment_ranges, last.pos + 1) { curr_group.push(AaChangeWithContext::new( cds, - last.pos + 1, + &tr.mut_at(last.pos + 1), qry_seq, ref_seq, - ref_tr, - qry_tr, )); } } diff --git a/packages/nextclade/src/analyze/aa_sub.rs b/packages/nextclade/src/analyze/aa_sub.rs index 063500610..3c055038f 100644 --- a/packages/nextclade/src/analyze/aa_sub.rs +++ b/packages/nextclade/src/analyze/aa_sub.rs @@ -14,6 +14,15 @@ use serde::{Deserialize, Serialize}; use std::fmt::{Display, Formatter}; use std::str::FromStr; +/// Represents minimal aminoacid substitution (regardless of CDS) +#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct AaSubMin { + pub pos: AaRefPosition, + pub ref_aa: Aa, + pub qry_aa: Aa, +} + /// Represents aminoacid substitution #[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "camelCase")] diff --git a/packages/nextclade/src/analyze/mod.rs b/packages/nextclade/src/analyze/mod.rs index a1f02982b..546046f63 100644 --- a/packages/nextclade/src/analyze/mod.rs +++ b/packages/nextclade/src/analyze/mod.rs @@ -5,6 +5,7 @@ pub mod aa_changes_group; pub mod aa_del; pub mod aa_sub; pub mod abstract_mutation; +pub mod aa_alignment; pub mod count_gaps; pub mod divergence; pub mod find_aa_motifs; From 2cbf269e3aa7f690d80f73687ec453abd5394fbf Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Wed, 5 Jun 2024 21:43:30 +0200 Subject: [PATCH 3/8] refactor: move position-based checks into the new aa alignment class Let's make the free functions checking for certain conditions at certain positions into class methods. These functions are useful for all kinds of mutation structs, so let's move some into the `AbstractMutation` trait. Turns out that most of these functions can have neat default implementations just from existing trait methods. --- packages/nextclade/src/alphabet/aa.rs | 1 + packages/nextclade/src/alphabet/letter.rs | 1 + packages/nextclade/src/alphabet/nuc.rs | 1 + .../nextclade/src/analyze/aa_alignment.rs | 31 ++++++++-- .../src/analyze/aa_change_with_context.rs | 38 ++++++++++-- .../src/analyze/aa_changes_find_for_cds.rs | 28 ++++----- packages/nextclade/src/analyze/aa_sub.rs | 39 ++++-------- packages/nextclade/src/analyze/aa_sub_min.rs | 61 +++++++++++++++++++ .../src/analyze/abstract_mutation.rs | 32 ++++++++++ packages/nextclade/src/analyze/mod.rs | 3 +- packages/nextclade/src/analyze/nuc_sub.rs | 5 -- 11 files changed, 181 insertions(+), 59 deletions(-) create mode 100644 packages/nextclade/src/analyze/aa_sub_min.rs diff --git a/packages/nextclade/src/alphabet/aa.rs b/packages/nextclade/src/alphabet/aa.rs index cbc67d4a8..a0b3b6133 100644 --- a/packages/nextclade/src/alphabet/aa.rs +++ b/packages/nextclade/src/alphabet/aa.rs @@ -66,6 +66,7 @@ impl Display for Aa { impl Letter for Aa { const GAP: Aa = Aa::Gap; + const UNKNOWN: Aa = Aa::X; #[inline] fn is_gap(&self) -> bool { diff --git a/packages/nextclade/src/alphabet/letter.rs b/packages/nextclade/src/alphabet/letter.rs index a5623e2ea..25b644b6b 100644 --- a/packages/nextclade/src/alphabet/letter.rs +++ b/packages/nextclade/src/alphabet/letter.rs @@ -11,6 +11,7 @@ pub trait ScoreMatrixLookup { /// Generic representation of a character defining nucleotide or amino acid pub trait Letter: Copy + Display + Eq + Ord + ScoreMatrixLookup { const GAP: L; + const UNKNOWN: L; fn is_gap(&self) -> bool; diff --git a/packages/nextclade/src/alphabet/nuc.rs b/packages/nextclade/src/alphabet/nuc.rs index 4b839285c..6043b6d61 100644 --- a/packages/nextclade/src/alphabet/nuc.rs +++ b/packages/nextclade/src/alphabet/nuc.rs @@ -57,6 +57,7 @@ impl Display for Nuc { impl Letter for Nuc { const GAP: Nuc = Nuc::Gap; + const UNKNOWN: Nuc = Nuc::N; #[inline] fn is_gap(&self) -> bool { diff --git a/packages/nextclade/src/analyze/aa_alignment.rs b/packages/nextclade/src/analyze/aa_alignment.rs index 26e7cd20b..b32387e72 100644 --- a/packages/nextclade/src/analyze/aa_alignment.rs +++ b/packages/nextclade/src/analyze/aa_alignment.rs @@ -1,19 +1,31 @@ use crate::alphabet::aa::Aa; -use crate::analyze::aa_sub::AaSubMin; +use crate::analyze::aa_sub_min::AaSubMin; use crate::coord::position::{AaRefPosition, PositionLike}; +use crate::coord::range::AaRefRange; use crate::gene::cds::Cds; use crate::translate::translate_genes::CdsTranslation; /// Represents a pair of aligned amino acid sequences (resulting from pairwise alignment -pub struct AaAlignment<'c, 'r, 'q> { +pub struct AaAlignment<'c, 'r, 'q, 'ar> { cds: &'c Cds, ref_tr: &'r CdsTranslation, qry_tr: &'q CdsTranslation, + aln_ranges: &'ar [AaRefRange], } -impl<'c, 'r, 'q> AaAlignment<'c, 'r, 'q> { - pub const fn new(cds: &'c Cds, ref_tr: &'r CdsTranslation, qry_tr: &'q CdsTranslation) -> Self { - Self { cds, ref_tr, qry_tr } +impl<'c, 'r, 'q, 'ar> AaAlignment<'c, 'r, 'q, 'ar> { + pub const fn new( + cds: &'c Cds, + ref_tr: &'r CdsTranslation, + qry_tr: &'q CdsTranslation, + aln_ranges: &'ar [AaRefRange], + ) -> Self { + Self { + cds, + ref_tr, + qry_tr, + aln_ranges, + } } pub fn ref_at(&self, pos: AaRefPosition) -> Aa { @@ -31,4 +43,13 @@ impl<'c, 'r, 'q> AaAlignment<'c, 'r, 'q> { qry_aa: self.qry_at(pos), } } + + /// Check whether a given codon position falls within alignment + pub fn is_codon_sequenced(&self, pos: AaRefPosition) -> bool { + pos >= 0 + && self + .aln_ranges + .iter() + .any(|aa_alignment_range| aa_alignment_range.contains(pos)) + } } diff --git a/packages/nextclade/src/analyze/aa_change_with_context.rs b/packages/nextclade/src/analyze/aa_change_with_context.rs index 79af98e05..1bce9223b 100644 --- a/packages/nextclade/src/analyze/aa_change_with_context.rs +++ b/packages/nextclade/src/analyze/aa_change_with_context.rs @@ -1,7 +1,8 @@ use crate::alphabet::aa::Aa; use crate::alphabet::letter::{serde_deserialize_seq, serde_serialize_seq}; use crate::alphabet::nuc::Nuc; -use crate::analyze::aa_sub::{is_aa_mutated_or_deleted, AaSubMin}; +use crate::analyze::aa_sub_min::AaSubMin; +use crate::analyze::abstract_mutation::{AbstractMutation, MutParams, Pos, QryLetter, RefLetter}; use crate::coord::coord_map_cds_to_global::cds_codon_pos_to_ref_range; use crate::coord::position::{AaRefPosition, NucRefGlobalPosition}; use crate::coord::range::NucRefGlobalRange; @@ -32,6 +33,36 @@ pub struct AaChangeWithContext { pub nuc_ranges: Vec, } +impl Pos for AaChangeWithContext { + fn pos(&self) -> AaRefPosition { + self.pos + } +} + +impl QryLetter for AaChangeWithContext { + fn qry_letter(&self) -> Aa { + self.qry_aa + } +} + +impl RefLetter for AaChangeWithContext { + fn ref_letter(&self) -> Aa { + self.ref_aa + } +} + +impl AbstractMutation for AaChangeWithContext { + fn clone_with(&self, params: MutParams) -> Self { + Self { + cds_name: self.cds_name.clone(), + pos: params.pos, + ref_aa: params.ref_letter, + qry_aa: params.qry_letter, + ..self.clone() + } + } +} + impl AaChangeWithContext { pub fn new(cds: &Cds, sub: &AaSubMin, qry_seq: &[Nuc], ref_seq: &[Nuc]) -> Self { let AaSubMin { pos, ref_aa, qry_aa } = *sub; @@ -72,9 +103,4 @@ impl AaChangeWithContext { qry_triplet, } } - - #[inline] - pub fn is_mutated_or_deleted(&self) -> bool { - is_aa_mutated_or_deleted(self.ref_aa, self.qry_aa) - } } diff --git a/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs b/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs index 2b76b0243..e8a3c354e 100644 --- a/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs +++ b/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs @@ -4,11 +4,11 @@ use crate::analyze::aa_alignment::AaAlignment; use crate::analyze::aa_change_with_context::AaChangeWithContext; use crate::analyze::aa_changes_group::AaChangesGroup; use crate::analyze::aa_del::AaDel; -use crate::analyze::aa_sub::{is_aa_mutated_or_deleted, is_codon_sequenced, AaSub}; +use crate::analyze::aa_sub::AaSub; +use crate::analyze::abstract_mutation::AbstractMutation; use crate::analyze::nuc_del::NucDelRange; use crate::analyze::nuc_sub::NucSub; use crate::coord::coord_map_cds_to_global::cds_codon_pos_to_ref_range; -use crate::coord::position::PositionLike; use crate::coord::range::{have_intersection, AaRefRange}; use crate::gene::cds::Cds; use crate::translate::translate_genes::CdsTranslation; @@ -52,24 +52,22 @@ pub fn aa_changes_find_for_cds( assert_eq!(ref_tr.seq.len(), qry_tr.seq.len()); assert_eq!(qry_seq.len(), ref_seq.len()); - let tr = AaAlignment::new(cds, ref_tr, qry_tr); + let tr = AaAlignment::new(cds, ref_tr, qry_tr, &qry_tr.alignment_ranges); - let aa_alignment_ranges = &qry_tr.alignment_ranges; let mut aa_changes_groups = vec![AaChangesGroup::new(&cds.name)]; let mut curr_group = aa_changes_groups.last_mut().unwrap(); for codon in AaRefRange::from_usize(0, qry_tr.seq.len()).iter() { - if !is_codon_sequenced(aa_alignment_ranges, codon) { + if !tr.is_codon_sequenced(codon) { continue; } - let ref_aa = ref_tr.seq[codon.as_usize()]; - let qry_aa = qry_tr.seq[codon.as_usize()]; + let aa_mut = tr.mut_at(codon); - if is_aa_mutated_or_deleted(ref_aa, qry_aa) { + if aa_mut.is_mutated_and_not_unknown() { match curr_group.last() { // If current group is empty, then we are about to insert the first codon into the first group. None => { - if codon > 0 && is_codon_sequenced(aa_alignment_ranges, codon - 1) { + if codon > 0 && tr.is_codon_sequenced(codon - 1) { // Also prepend one codon to the left, for additional context, to start the group curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon - 1), qry_seq, ref_seq)); } @@ -84,7 +82,7 @@ pub fn aa_changes_find_for_cds( if codon <= prev.pos + 2 { // If previous codon in the group is not exactly adjacent, there is 1 item in between, // then cover the hole by inserting previous codon. - if codon == prev.pos + 2 && is_codon_sequenced(aa_alignment_ranges, codon - 1) { + if codon == prev.pos + 2 && tr.is_codon_sequenced(codon - 1) { curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon - 1), qry_seq, ref_seq)); } @@ -94,7 +92,7 @@ pub fn aa_changes_find_for_cds( // If previous codon in the group is not adjacent, then terminate the current group and start a new group. else { // Add one codon to the right, for additional context, to finalize the current group - if is_codon_sequenced(aa_alignment_ranges, prev.pos + 1) { + if tr.is_codon_sequenced(prev.pos + 1) { curr_group.push(AaChangeWithContext::new( cds, &tr.mut_at(prev.pos + 1), @@ -106,7 +104,7 @@ pub fn aa_changes_find_for_cds( let mut new_group = AaChangesGroup::new(&cds.name); // Start a new group and push the current codon into it. - if is_codon_sequenced(aa_alignment_ranges, codon - 1) { + if tr.is_codon_sequenced(codon - 1) { // Also prepend one codon to the left, for additional context, to start the new group. new_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon - 1), qry_seq, ref_seq)); } @@ -125,7 +123,7 @@ pub fn aa_changes_find_for_cds( // Add one codon to the right, for additional context, to finalize the last group if let Some(last) = curr_group.last() { - if is_codon_sequenced(aa_alignment_ranges, last.pos + 1) { + if tr.is_codon_sequenced(last.pos + 1) { curr_group.push(AaChangeWithContext::new( cds, &tr.mut_at(last.pos + 1), @@ -165,7 +163,7 @@ pub fn aa_changes_find_for_cds( let (aa_substitutions, aa_deletions): (Vec, Vec) = aa_changes_groups .iter() .flat_map(|aa_changes_group| &aa_changes_group.changes) - .filter(|change| is_aa_mutated_or_deleted(change.ref_aa, change.qry_aa)) + .filter(|change| change.is_mutated_and_not_unknown()) .partition_map(|change| { if change.qry_aa.is_gap() { Either::Right(AaDel { @@ -190,7 +188,7 @@ pub fn aa_changes_find_for_cds( group .changes .iter() - .filter(|change| AaChangeWithContext::is_mutated_or_deleted(change)) + .filter(|change| change.is_mutated_and_not_unknown()) .flat_map(|change| { change.nuc_ranges.iter().flat_map(move |range| { range.iter() diff --git a/packages/nextclade/src/analyze/aa_sub.rs b/packages/nextclade/src/analyze/aa_sub.rs index 3c055038f..8a5f0d962 100644 --- a/packages/nextclade/src/analyze/aa_sub.rs +++ b/packages/nextclade/src/analyze/aa_sub.rs @@ -4,7 +4,6 @@ use crate::analyze::aa_change_with_context::AaChangeWithContext; use crate::analyze::aa_del::AaDel; use crate::analyze::abstract_mutation::{AbstractMutation, MutParams, Pos, QryLetter, RefLetter}; use crate::coord::position::AaRefPosition; -use crate::coord::range::AaRefRange; use crate::io::parse_pos::parse_pos; use crate::make_error; use eyre::{Report, WrapErr}; @@ -14,15 +13,6 @@ use serde::{Deserialize, Serialize}; use std::fmt::{Display, Formatter}; use std::str::FromStr; -/// Represents minimal aminoacid substitution (regardless of CDS) -#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize, schemars::JsonSchema)] -#[serde(rename_all = "camelCase")] -pub struct AaSubMin { - pub pos: AaRefPosition, - pub ref_aa: Aa, - pub qry_aa: Aa, -} - /// Represents aminoacid substitution #[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "camelCase")] @@ -63,11 +53,6 @@ impl Pos for AaSub { } impl AaSub { - /// Checks whether this substitution is a deletion (substitution of letter `Gap`) - pub fn is_del(&self) -> bool { - self.qry_aa.is_gap() - } - pub fn from_str_and_gene(mut_str: impl AsRef, cds_name: impl AsRef) -> Result { Self::from_str(&format!("{}:{}", cds_name.as_ref(), mut_str.as_ref())) } @@ -163,15 +148,15 @@ impl From<&AaChangeWithContext> for AaSub { } } -/// Check whether a given pair if reference and query aminoacids constitute a mutation or deletion -pub fn is_aa_mutated_or_deleted(ref_aa: Aa, qry_aa: Aa) -> bool { - // NOTE: We chose to ignore mutations to `X`. - qry_aa != ref_aa && qry_aa != Aa::X -} - -/// Check whether a given codon position corresponds to a sequenced aminoacid -pub fn is_codon_sequenced(aa_alignment_ranges: &[AaRefRange], codon: AaRefPosition) -> bool { - aa_alignment_ranges - .iter() - .any(|aa_alignment_range| aa_alignment_range.contains(codon)) -} +// /// Check whether a given pair if reference and query aminoacids constitute a mutation or deletion +// pub fn is_aa_mutated_or_deleted(ref_aa: Aa, qry_aa: Aa) -> bool { +// // NOTE: We chose to ignore mutations to `X`. +// qry_aa != ref_aa && qry_aa != Aa::X +// } +// +// /// Check whether a given codon position corresponds to a sequenced aminoacid +// pub fn is_codon_sequenced(aa_alignment_ranges: &[AaRefRange], codon: AaRefPosition) -> bool { +// aa_alignment_ranges +// .iter() +// .any(|aa_alignment_range| aa_alignment_range.contains(codon)) +// } diff --git a/packages/nextclade/src/analyze/aa_sub_min.rs b/packages/nextclade/src/analyze/aa_sub_min.rs new file mode 100644 index 000000000..2ef1bd3ad --- /dev/null +++ b/packages/nextclade/src/analyze/aa_sub_min.rs @@ -0,0 +1,61 @@ +use crate::alphabet::aa::{from_aa, Aa}; +use crate::analyze::abstract_mutation::{AbstractMutation, MutParams, Pos, QryLetter, RefLetter}; +use crate::coord::position::AaRefPosition; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use std::fmt::{Display, Formatter}; + +/// Represents minimal aminoacid substitution (without feature name) +#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct AaSubMin { + pub pos: AaRefPosition, + pub ref_aa: Aa, + pub qry_aa: Aa, +} + +impl AbstractMutation for AaSubMin { + fn clone_with(&self, params: MutParams) -> Self { + Self { + pos: params.pos, + ref_aa: params.ref_letter, + qry_aa: params.qry_letter, + } + } +} + +impl QryLetter for AaSubMin { + fn qry_letter(&self) -> Aa { + self.qry_aa + } +} + +impl RefLetter for AaSubMin { + fn ref_letter(&self) -> Aa { + self.ref_aa + } +} + +impl Pos for AaSubMin { + fn pos(&self) -> AaRefPosition { + self.pos + } +} + +impl AaSubMin { + #[must_use] + pub const fn invert(&self) -> Self { + Self { + ref_aa: self.qry_aa, + pos: self.pos, + qry_aa: self.ref_aa, + } + } +} + +impl Display for AaSubMin { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + // NOTE: by convention, in bioinformatics, amino acids are numbered starting from 1, however our arrays are 0-based + write!(f, "{}{}{}", from_aa(self.ref_aa), self.pos + 1, from_aa(self.qry_aa)) + } +} diff --git a/packages/nextclade/src/analyze/abstract_mutation.rs b/packages/nextclade/src/analyze/abstract_mutation.rs index 5f8f42034..066775a88 100644 --- a/packages/nextclade/src/analyze/abstract_mutation.rs +++ b/packages/nextclade/src/analyze/abstract_mutation.rs @@ -16,6 +16,13 @@ pub trait RefLetter> { fn ref_letter(&self) -> L; } +pub trait Sub +where + P: PositionLike, + L: Letter, +{ +} + pub struct MutParams where P: PositionLike, @@ -35,4 +42,29 @@ where /// Creates a copy of the mutation, overwriting some of the fields according to the provided parameters #[must_use] fn clone_with(&self, params: MutParams) -> Self; + + #[must_use] + fn is_mutated(&self) -> bool { + self.qry_letter() != self.ref_letter() + } + + #[must_use] + fn is_sub(&self) -> bool { + self.is_mutated() && !self.qry_letter().is_gap() && !self.is_unknown() + } + + #[must_use] + fn is_del(&self) -> bool { + self.is_mutated() && self.qry_letter().is_gap() + } + + #[must_use] + fn is_unknown(&self) -> bool { + self.qry_letter().is_unknown() + } + + #[must_use] + fn is_mutated_and_not_unknown(&self) -> bool { + self.is_sub() || self.is_del() + } } diff --git a/packages/nextclade/src/analyze/mod.rs b/packages/nextclade/src/analyze/mod.rs index 546046f63..a17c9707c 100644 --- a/packages/nextclade/src/analyze/mod.rs +++ b/packages/nextclade/src/analyze/mod.rs @@ -1,11 +1,12 @@ +pub mod aa_alignment; pub mod aa_change_with_context; pub mod aa_changes_find; pub mod aa_changes_find_for_cds; pub mod aa_changes_group; pub mod aa_del; pub mod aa_sub; +pub mod aa_sub_min; pub mod abstract_mutation; -pub mod aa_alignment; pub mod count_gaps; pub mod divergence; pub mod find_aa_motifs; diff --git a/packages/nextclade/src/analyze/nuc_sub.rs b/packages/nextclade/src/analyze/nuc_sub.rs index 48fdce9b5..7e3f94d6b 100644 --- a/packages/nextclade/src/analyze/nuc_sub.rs +++ b/packages/nextclade/src/analyze/nuc_sub.rs @@ -50,11 +50,6 @@ impl Pos for NucSub { } impl NucSub { - /// Checks whether this substitution is a deletion (substitution of letter `Gap`) - pub fn is_del(&self) -> bool { - self.qry_nuc.is_gap() - } - pub const fn genotype(&self) -> Genotype { Genotype { pos: self.pos, From dad16425d1bf8e7340a37779c7352928c1656036 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Thu, 6 Jun 2024 03:31:05 +0200 Subject: [PATCH 4/8] refactor: split aa mut search and grouping loops --- .../nextclade/src/analyze/aa_changes_find_for_cds.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs b/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs index e8a3c354e..f7a2faf49 100644 --- a/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs +++ b/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs @@ -56,12 +56,13 @@ pub fn aa_changes_find_for_cds( let mut aa_changes_groups = vec![AaChangesGroup::new(&cds.name)]; let mut curr_group = aa_changes_groups.last_mut().unwrap(); - for codon in AaRefRange::from_usize(0, qry_tr.seq.len()).iter() { - if !tr.is_codon_sequenced(codon) { - continue; - } - let aa_mut = tr.mut_at(codon); + let aa_muts = AaRefRange::from_usize(0, qry_tr.seq.len()) + .iter() + .filter_map(|codon| tr.is_codon_sequenced(codon).then_some(tr.mut_at(codon))); + + for aa_mut in aa_muts { + let codon = aa_mut.pos; if aa_mut.is_mutated_and_not_unknown() { match curr_group.last() { From 2cf2c03c071ab2fbdc9c1ad03eeb04c9889e635d Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Thu, 6 Jun 2024 03:55:13 +0200 Subject: [PATCH 5/8] refactor: factor out aa changes grouping into a new function --- .../nextclade/src/analyze/aa_alignment.rs | 3 +- .../src/analyze/aa_changes_find_for_cds.rs | 32 +++++++++++++------ 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/packages/nextclade/src/analyze/aa_alignment.rs b/packages/nextclade/src/analyze/aa_alignment.rs index b32387e72..cf84ac2d1 100644 --- a/packages/nextclade/src/analyze/aa_alignment.rs +++ b/packages/nextclade/src/analyze/aa_alignment.rs @@ -14,12 +14,13 @@ pub struct AaAlignment<'c, 'r, 'q, 'ar> { } impl<'c, 'r, 'q, 'ar> AaAlignment<'c, 'r, 'q, 'ar> { - pub const fn new( + pub fn new( cds: &'c Cds, ref_tr: &'r CdsTranslation, qry_tr: &'q CdsTranslation, aln_ranges: &'ar [AaRefRange], ) -> Self { + assert_eq!(ref_tr.seq.len(), qry_tr.seq.len()); Self { cds, ref_tr, diff --git a/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs b/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs index f7a2faf49..637024c3c 100644 --- a/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs +++ b/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs @@ -1,3 +1,4 @@ +use crate::alphabet::aa::Aa; use crate::alphabet::letter::Letter; use crate::alphabet::nuc::Nuc; use crate::analyze::aa_alignment::AaAlignment; @@ -9,6 +10,7 @@ use crate::analyze::abstract_mutation::AbstractMutation; use crate::analyze::nuc_del::NucDelRange; use crate::analyze::nuc_sub::NucSub; use crate::coord::coord_map_cds_to_global::cds_codon_pos_to_ref_range; +use crate::coord::position::AaRefPosition; use crate::coord::range::{have_intersection, AaRefRange}; use crate::gene::cds::Cds; use crate::translate::translate_genes::CdsTranslation; @@ -49,20 +51,32 @@ pub fn aa_changes_find_for_cds( nuc_subs: &[NucSub], nuc_dels: &[NucDelRange], ) -> FindAaChangesOutput { - assert_eq!(ref_tr.seq.len(), qry_tr.seq.len()); assert_eq!(qry_seq.len(), ref_seq.len()); let tr = AaAlignment::new(cds, ref_tr, qry_tr, &qry_tr.alignment_ranges); + let aa_changes = AaRefRange::from_usize(0, qry_tr.seq.len()) + .iter() + .filter_map(|codon| tr.is_codon_sequenced(codon).then_some(tr.mut_at(codon))) + .collect_vec(); + + aa_changes_group(&aa_changes, cds, qry_seq, ref_seq, nuc_subs, nuc_dels, &tr) +} + +pub fn aa_changes_group>( + aa_muts: &[M], + cds: &Cds, + qry_seq: &[Nuc], + ref_seq: &[Nuc], + nuc_subs: &[NucSub], + nuc_dels: &[NucDelRange], + tr: &AaAlignment, +) -> FindAaChangesOutput { let mut aa_changes_groups = vec![AaChangesGroup::new(&cds.name)]; let mut curr_group = aa_changes_groups.last_mut().unwrap(); - let aa_muts = AaRefRange::from_usize(0, qry_tr.seq.len()) - .iter() - .filter_map(|codon| tr.is_codon_sequenced(codon).then_some(tr.mut_at(codon))); - for aa_mut in aa_muts { - let codon = aa_mut.pos; + let codon = aa_mut.pos(); if aa_mut.is_mutated_and_not_unknown() { match curr_group.last() { @@ -193,9 +207,9 @@ pub fn aa_changes_find_for_cds( .flat_map(|change| { change.nuc_ranges.iter().flat_map(move |range| { range.iter() - // TODO: We convert position to string here, because when communicating with WASM we will pass through - // JSON schema, and JSON object keys must be strings. Maybe there is a way to keep the keys as numbers? - .map(move |pos| (pos.to_string(), AaSub::from(change))) + // TODO: We convert position to string here, because when communicating with WASM we will pass through + // JSON schema, and JSON object keys must be strings. Maybe there is a way to keep the keys as numbers? + .map(move |pos| (pos.to_string(), AaSub::from(change))) }) }) }) From b905fbbe07d13e739ea2806bb5204eff8429d4ae Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Thu, 6 Jun 2024 04:18:08 +0200 Subject: [PATCH 6/8] refactor: hoist `AaAlignment` contruction --- Cargo.toml | 1 + .../nextclade/src/analyze/aa_alignment.rs | 31 ++++++++-------- .../nextclade/src/analyze/aa_changes_find.rs | 36 +++++++++---------- .../src/analyze/aa_changes_find_for_cds.rs | 11 ++---- 4 files changed, 35 insertions(+), 44 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d6ac268a9..8475f0865 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -180,6 +180,7 @@ indexing_slicing = "allow" integer_division = "allow" iter_nth_zero = "allow" large_digit_groups = "allow" +len_without_is_empty = "allow" len_zero = "allow" let_underscore_must_use = "allow" manual_string_new = "allow" diff --git a/packages/nextclade/src/analyze/aa_alignment.rs b/packages/nextclade/src/analyze/aa_alignment.rs index cf84ac2d1..3d79e2948 100644 --- a/packages/nextclade/src/analyze/aa_alignment.rs +++ b/packages/nextclade/src/analyze/aa_alignment.rs @@ -1,32 +1,28 @@ use crate::alphabet::aa::Aa; use crate::analyze::aa_sub_min::AaSubMin; use crate::coord::position::{AaRefPosition, PositionLike}; -use crate::coord::range::AaRefRange; use crate::gene::cds::Cds; use crate::translate::translate_genes::CdsTranslation; /// Represents a pair of aligned amino acid sequences (resulting from pairwise alignment -pub struct AaAlignment<'c, 'r, 'q, 'ar> { +pub struct AaAlignment<'c, 'r, 'q> { cds: &'c Cds, ref_tr: &'r CdsTranslation, qry_tr: &'q CdsTranslation, - aln_ranges: &'ar [AaRefRange], } -impl<'c, 'r, 'q, 'ar> AaAlignment<'c, 'r, 'q, 'ar> { - pub fn new( - cds: &'c Cds, - ref_tr: &'r CdsTranslation, - qry_tr: &'q CdsTranslation, - aln_ranges: &'ar [AaRefRange], - ) -> Self { +impl<'c, 'r, 'q> AaAlignment<'c, 'r, 'q> { + pub fn new(cds: &'c Cds, ref_tr: &'r CdsTranslation, qry_tr: &'q CdsTranslation) -> Self { assert_eq!(ref_tr.seq.len(), qry_tr.seq.len()); - Self { - cds, - ref_tr, - qry_tr, - aln_ranges, - } + Self { cds, ref_tr, qry_tr } + } + + pub const fn cds(&self) -> &Cds { + self.cds + } + + pub fn len(&self) -> usize { + self.ref_tr.seq.len() } pub fn ref_at(&self, pos: AaRefPosition) -> Aa { @@ -49,7 +45,8 @@ impl<'c, 'r, 'q, 'ar> AaAlignment<'c, 'r, 'q, 'ar> { pub fn is_codon_sequenced(&self, pos: AaRefPosition) -> bool { pos >= 0 && self - .aln_ranges + .qry_tr + .alignment_ranges .iter() .any(|aa_alignment_range| aa_alignment_range.contains(pos)) } diff --git a/packages/nextclade/src/analyze/aa_changes_find.rs b/packages/nextclade/src/analyze/aa_changes_find.rs index 9b1ea5eed..8898de66f 100644 --- a/packages/nextclade/src/analyze/aa_changes_find.rs +++ b/packages/nextclade/src/analyze/aa_changes_find.rs @@ -1,4 +1,5 @@ use crate::alphabet::nuc::Nuc; +use crate::analyze::aa_alignment::AaAlignment; use crate::analyze::aa_changes_find_for_cds::{aa_changes_find_for_cds, FindAaChangesOutput}; use crate::analyze::nuc_del::NucDelRange; use crate::analyze::nuc_sub::NucSub; @@ -20,25 +21,22 @@ pub fn aa_changes_find( nuc_subs: &[NucSub], nuc_dels: &[NucDelRange], ) -> Result { - let mut changes = qry_translation - .iter_cdses() - .map(|(qry_name, qry_cds_tr)| { - let ref_cds_tr = ref_translation.get_cds(qry_name)?; - let cds = gene_map.get_cds(&qry_cds_tr.name)?; - Ok(aa_changes_find_for_cds( - cds, qry_seq, ref_seq, ref_cds_tr, qry_cds_tr, nuc_subs, nuc_dels, - )) - }) - .collect::, Report>>()? - .into_iter() - // Merge changes from all CDSes into one struct - .fold(FindAaChangesOutput::default(), |mut output, changes| { - output.aa_changes_groups.extend(changes.aa_changes_groups); - output.aa_substitutions.extend(changes.aa_substitutions); - output.aa_deletions.extend(changes.aa_deletions); - extend_map_of_vecs(&mut output.nuc_to_aa_muts, changes.nuc_to_aa_muts); - output - }); + let mut changes = qry_translation.iter_cdses().map(|(qry_name, qry_tr)| { + let ref_tr = ref_translation.get_cds(qry_name)?; + let cds = gene_map.get_cds(&qry_tr.name)?; + let tr = AaAlignment::new(cds, ref_tr, qry_tr); + Ok(aa_changes_find_for_cds(qry_seq, ref_seq, &tr, nuc_subs, nuc_dels)) + }) + .collect::, Report>>()? + .into_iter() + // Merge changes from all CDSes into one struct + .fold(FindAaChangesOutput::default(), |mut output, changes| { + output.aa_changes_groups.extend(changes.aa_changes_groups); + output.aa_substitutions.extend(changes.aa_substitutions); + output.aa_deletions.extend(changes.aa_deletions); + extend_map_of_vecs(&mut output.nuc_to_aa_muts, changes.nuc_to_aa_muts); + output + }); changes.aa_substitutions.sort(); changes.aa_deletions.sort(); diff --git a/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs b/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs index 637024c3c..7667ebe35 100644 --- a/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs +++ b/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs @@ -13,7 +13,6 @@ use crate::coord::coord_map_cds_to_global::cds_codon_pos_to_ref_range; use crate::coord::position::AaRefPosition; use crate::coord::range::{have_intersection, AaRefRange}; use crate::gene::cds::Cds; -use crate::translate::translate_genes::CdsTranslation; use either::Either; use itertools::Itertools; use serde::{Deserialize, Serialize}; @@ -43,24 +42,20 @@ pub struct FindAaChangesOutput { /// might not always be established without knowing the order in which nucleotide changes have occurred. And in the /// context of Nextclade we don't have this information. pub fn aa_changes_find_for_cds( - cds: &Cds, qry_seq: &[Nuc], ref_seq: &[Nuc], - ref_tr: &CdsTranslation, - qry_tr: &CdsTranslation, + tr: &AaAlignment, nuc_subs: &[NucSub], nuc_dels: &[NucDelRange], ) -> FindAaChangesOutput { assert_eq!(qry_seq.len(), ref_seq.len()); - let tr = AaAlignment::new(cds, ref_tr, qry_tr, &qry_tr.alignment_ranges); - - let aa_changes = AaRefRange::from_usize(0, qry_tr.seq.len()) + let aa_changes = AaRefRange::from_usize(0, tr.len()) .iter() .filter_map(|codon| tr.is_codon_sequenced(codon).then_some(tr.mut_at(codon))) .collect_vec(); - aa_changes_group(&aa_changes, cds, qry_seq, ref_seq, nuc_subs, nuc_dels, &tr) + aa_changes_group(&aa_changes, tr.cds(), qry_seq, ref_seq, nuc_subs, nuc_dels, tr) } pub fn aa_changes_group>( From 102339c1885998c59e1156c14415e3fba5e99f02 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Thu, 6 Jun 2024 04:22:12 +0200 Subject: [PATCH 7/8] refactor: rename function to make it more generic --- packages/nextclade/src/analyze/aa_alignment.rs | 3 +-- .../nextclade/src/analyze/aa_changes_find_for_cds.rs | 12 ++++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/packages/nextclade/src/analyze/aa_alignment.rs b/packages/nextclade/src/analyze/aa_alignment.rs index 3d79e2948..db4102387 100644 --- a/packages/nextclade/src/analyze/aa_alignment.rs +++ b/packages/nextclade/src/analyze/aa_alignment.rs @@ -41,8 +41,7 @@ impl<'c, 'r, 'q> AaAlignment<'c, 'r, 'q> { } } - /// Check whether a given codon position falls within alignment - pub fn is_codon_sequenced(&self, pos: AaRefPosition) -> bool { + pub fn is_sequenced(&self, pos: AaRefPosition) -> bool { pos >= 0 && self .qry_tr diff --git a/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs b/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs index 7667ebe35..134bf412b 100644 --- a/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs +++ b/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs @@ -52,7 +52,7 @@ pub fn aa_changes_find_for_cds( let aa_changes = AaRefRange::from_usize(0, tr.len()) .iter() - .filter_map(|codon| tr.is_codon_sequenced(codon).then_some(tr.mut_at(codon))) + .filter_map(|codon| tr.is_sequenced(codon).then_some(tr.mut_at(codon))) .collect_vec(); aa_changes_group(&aa_changes, tr.cds(), qry_seq, ref_seq, nuc_subs, nuc_dels, tr) @@ -77,7 +77,7 @@ pub fn aa_changes_group>( match curr_group.last() { // If current group is empty, then we are about to insert the first codon into the first group. None => { - if codon > 0 && tr.is_codon_sequenced(codon - 1) { + if codon > 0 && tr.is_sequenced(codon - 1) { // Also prepend one codon to the left, for additional context, to start the group curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon - 1), qry_seq, ref_seq)); } @@ -92,7 +92,7 @@ pub fn aa_changes_group>( if codon <= prev.pos + 2 { // If previous codon in the group is not exactly adjacent, there is 1 item in between, // then cover the hole by inserting previous codon. - if codon == prev.pos + 2 && tr.is_codon_sequenced(codon - 1) { + if codon == prev.pos + 2 && tr.is_sequenced(codon - 1) { curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon - 1), qry_seq, ref_seq)); } @@ -102,7 +102,7 @@ pub fn aa_changes_group>( // If previous codon in the group is not adjacent, then terminate the current group and start a new group. else { // Add one codon to the right, for additional context, to finalize the current group - if tr.is_codon_sequenced(prev.pos + 1) { + if tr.is_sequenced(prev.pos + 1) { curr_group.push(AaChangeWithContext::new( cds, &tr.mut_at(prev.pos + 1), @@ -114,7 +114,7 @@ pub fn aa_changes_group>( let mut new_group = AaChangesGroup::new(&cds.name); // Start a new group and push the current codon into it. - if tr.is_codon_sequenced(codon - 1) { + if tr.is_sequenced(codon - 1) { // Also prepend one codon to the left, for additional context, to start the new group. new_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon - 1), qry_seq, ref_seq)); } @@ -133,7 +133,7 @@ pub fn aa_changes_group>( // Add one codon to the right, for additional context, to finalize the last group if let Some(last) = curr_group.last() { - if tr.is_codon_sequenced(last.pos + 1) { + if tr.is_sequenced(last.pos + 1) { curr_group.push(AaChangeWithContext::new( cds, &tr.mut_at(last.pos + 1), From 44d3f0dca1d5a8df0b07d8baaf5d30ea8a38227a Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Thu, 6 Jun 2024 04:44:39 +0200 Subject: [PATCH 8/8] refactor: extract nuc alignment pair to a struct This is a nuc version of 5ee7936f98c4fb937778da4adee340683a0f4f0d --- .../src/analyze/aa_change_with_context.rs | 7 ++- .../nextclade/src/analyze/aa_changes_find.rs | 7 +-- .../src/analyze/aa_changes_find_for_cds.rs | 43 +++++--------- packages/nextclade/src/analyze/mod.rs | 1 + .../nextclade/src/analyze/nuc_alignment.rs | 56 +++++++++++++++++++ .../nextclade/src/run/nextclade_run_one.rs | 16 +++--- 6 files changed, 87 insertions(+), 43 deletions(-) create mode 100644 packages/nextclade/src/analyze/nuc_alignment.rs diff --git a/packages/nextclade/src/analyze/aa_change_with_context.rs b/packages/nextclade/src/analyze/aa_change_with_context.rs index 1bce9223b..af4c382b5 100644 --- a/packages/nextclade/src/analyze/aa_change_with_context.rs +++ b/packages/nextclade/src/analyze/aa_change_with_context.rs @@ -3,6 +3,7 @@ use crate::alphabet::letter::{serde_deserialize_seq, serde_serialize_seq}; use crate::alphabet::nuc::Nuc; use crate::analyze::aa_sub_min::AaSubMin; use crate::analyze::abstract_mutation::{AbstractMutation, MutParams, Pos, QryLetter, RefLetter}; +use crate::analyze::nuc_alignment::NucAlignment; use crate::coord::coord_map_cds_to_global::cds_codon_pos_to_ref_range; use crate::coord::position::{AaRefPosition, NucRefGlobalPosition}; use crate::coord::range::NucRefGlobalRange; @@ -64,14 +65,14 @@ impl AbstractMutation for AaChangeWithContext { } impl AaChangeWithContext { - pub fn new(cds: &Cds, sub: &AaSubMin, qry_seq: &[Nuc], ref_seq: &[Nuc]) -> Self { + pub fn new(cds: &Cds, sub: &AaSubMin, aln: &NucAlignment) -> Self { let AaSubMin { pos, ref_aa, qry_aa } = *sub; let nuc_ranges = cds_codon_pos_to_ref_range(cds, pos); let ref_triplet = nuc_ranges .iter() .flat_map(|(range, strand)| { - let mut nucs = ref_seq[range.to_std()].to_vec(); + let mut nucs = aln.ref_range(range).to_vec(); if strand == &GeneStrand::Reverse { reverse_complement_in_place(&mut nucs); } @@ -82,7 +83,7 @@ impl AaChangeWithContext { let qry_triplet = nuc_ranges .iter() .flat_map(|(range, strand)| { - let mut nucs = qry_seq[range.clamp_range(0, qry_seq.len()).to_std()].to_vec(); + let mut nucs = aln.qry_range(range).to_vec(); if strand == &GeneStrand::Reverse { reverse_complement_in_place(&mut nucs); } diff --git a/packages/nextclade/src/analyze/aa_changes_find.rs b/packages/nextclade/src/analyze/aa_changes_find.rs index 8898de66f..5a3425868 100644 --- a/packages/nextclade/src/analyze/aa_changes_find.rs +++ b/packages/nextclade/src/analyze/aa_changes_find.rs @@ -1,6 +1,6 @@ -use crate::alphabet::nuc::Nuc; use crate::analyze::aa_alignment::AaAlignment; use crate::analyze::aa_changes_find_for_cds::{aa_changes_find_for_cds, FindAaChangesOutput}; +use crate::analyze::nuc_alignment::NucAlignment; use crate::analyze::nuc_del::NucDelRange; use crate::analyze::nuc_sub::NucSub; use crate::gene::gene_map::GeneMap; @@ -13,8 +13,7 @@ use eyre::Report; /// ## Precondition /// Nucleotide sequences and peptides are required to be stripped from insertions pub fn aa_changes_find( - ref_seq: &[Nuc], - qry_seq: &[Nuc], + aln: &NucAlignment, ref_translation: &Translation, qry_translation: &Translation, gene_map: &GeneMap, @@ -25,7 +24,7 @@ pub fn aa_changes_find( let ref_tr = ref_translation.get_cds(qry_name)?; let cds = gene_map.get_cds(&qry_tr.name)?; let tr = AaAlignment::new(cds, ref_tr, qry_tr); - Ok(aa_changes_find_for_cds(qry_seq, ref_seq, &tr, nuc_subs, nuc_dels)) + Ok(aa_changes_find_for_cds(aln, &tr, nuc_subs, nuc_dels)) }) .collect::, Report>>()? .into_iter() diff --git a/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs b/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs index 134bf412b..03c3f29d5 100644 --- a/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs +++ b/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs @@ -1,18 +1,17 @@ use crate::alphabet::aa::Aa; use crate::alphabet::letter::Letter; -use crate::alphabet::nuc::Nuc; use crate::analyze::aa_alignment::AaAlignment; use crate::analyze::aa_change_with_context::AaChangeWithContext; use crate::analyze::aa_changes_group::AaChangesGroup; use crate::analyze::aa_del::AaDel; use crate::analyze::aa_sub::AaSub; use crate::analyze::abstract_mutation::AbstractMutation; +use crate::analyze::nuc_alignment::NucAlignment; use crate::analyze::nuc_del::NucDelRange; use crate::analyze::nuc_sub::NucSub; use crate::coord::coord_map_cds_to_global::cds_codon_pos_to_ref_range; use crate::coord::position::AaRefPosition; use crate::coord::range::{have_intersection, AaRefRange}; -use crate::gene::cds::Cds; use either::Either; use itertools::Itertools; use serde::{Deserialize, Serialize}; @@ -42,31 +41,27 @@ pub struct FindAaChangesOutput { /// might not always be established without knowing the order in which nucleotide changes have occurred. And in the /// context of Nextclade we don't have this information. pub fn aa_changes_find_for_cds( - qry_seq: &[Nuc], - ref_seq: &[Nuc], + aln: &NucAlignment, tr: &AaAlignment, nuc_subs: &[NucSub], nuc_dels: &[NucDelRange], ) -> FindAaChangesOutput { - assert_eq!(qry_seq.len(), ref_seq.len()); - let aa_changes = AaRefRange::from_usize(0, tr.len()) .iter() .filter_map(|codon| tr.is_sequenced(codon).then_some(tr.mut_at(codon))) .collect_vec(); - aa_changes_group(&aa_changes, tr.cds(), qry_seq, ref_seq, nuc_subs, nuc_dels, tr) + aa_changes_group(&aa_changes, aln, tr, nuc_subs, nuc_dels) } pub fn aa_changes_group>( aa_muts: &[M], - cds: &Cds, - qry_seq: &[Nuc], - ref_seq: &[Nuc], + aln: &NucAlignment, + tr: &AaAlignment, nuc_subs: &[NucSub], nuc_dels: &[NucDelRange], - tr: &AaAlignment, ) -> FindAaChangesOutput { + let cds = tr.cds(); let mut aa_changes_groups = vec![AaChangesGroup::new(&cds.name)]; let mut curr_group = aa_changes_groups.last_mut().unwrap(); @@ -79,11 +74,11 @@ pub fn aa_changes_group>( None => { if codon > 0 && tr.is_sequenced(codon - 1) { // Also prepend one codon to the left, for additional context, to start the group - curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon - 1), qry_seq, ref_seq)); + curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon - 1), aln)); } // The current codon itself - curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon), qry_seq, ref_seq)); + curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon), aln)); } // Current group is not empty Some(prev) => { @@ -93,22 +88,17 @@ pub fn aa_changes_group>( // If previous codon in the group is not exactly adjacent, there is 1 item in between, // then cover the hole by inserting previous codon. if codon == prev.pos + 2 && tr.is_sequenced(codon - 1) { - curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon - 1), qry_seq, ref_seq)); + curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon - 1), aln)); } // And insert the current codon - curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon), qry_seq, ref_seq)); + curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon), aln)); } // If previous codon in the group is not adjacent, then terminate the current group and start a new group. else { // Add one codon to the right, for additional context, to finalize the current group if tr.is_sequenced(prev.pos + 1) { - curr_group.push(AaChangeWithContext::new( - cds, - &tr.mut_at(prev.pos + 1), - qry_seq, - ref_seq, - )); + curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(prev.pos + 1), aln)); } let mut new_group = AaChangesGroup::new(&cds.name); @@ -116,11 +106,11 @@ pub fn aa_changes_group>( // Start a new group and push the current codon into it. if tr.is_sequenced(codon - 1) { // Also prepend one codon to the left, for additional context, to start the new group. - new_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon - 1), qry_seq, ref_seq)); + new_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon - 1), aln)); } // Push the current codon to the new group - new_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon), qry_seq, ref_seq)); + new_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon), aln)); aa_changes_groups.push(new_group); @@ -134,12 +124,7 @@ pub fn aa_changes_group>( // Add one codon to the right, for additional context, to finalize the last group if let Some(last) = curr_group.last() { if tr.is_sequenced(last.pos + 1) { - curr_group.push(AaChangeWithContext::new( - cds, - &tr.mut_at(last.pos + 1), - qry_seq, - ref_seq, - )); + curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(last.pos + 1), aln)); } } diff --git a/packages/nextclade/src/analyze/mod.rs b/packages/nextclade/src/analyze/mod.rs index a17c9707c..371b85a76 100644 --- a/packages/nextclade/src/analyze/mod.rs +++ b/packages/nextclade/src/analyze/mod.rs @@ -16,6 +16,7 @@ pub mod find_private_nuc_mutations; pub mod is_sequenced; pub mod letter_composition; pub mod letter_ranges; +pub mod nuc_alignment; pub mod nuc_changes; pub mod nuc_del; pub mod nuc_sub; diff --git a/packages/nextclade/src/analyze/nuc_alignment.rs b/packages/nextclade/src/analyze/nuc_alignment.rs new file mode 100644 index 000000000..1b1f32821 --- /dev/null +++ b/packages/nextclade/src/analyze/nuc_alignment.rs @@ -0,0 +1,56 @@ +use crate::alphabet::nuc::Nuc; +use crate::analyze::nuc_sub::NucSub; +use crate::coord::position::{NucRefGlobalPosition, PositionLike}; +use crate::coord::range::NucRefGlobalRange; + +/// Represents a pair of aligned amino acid sequences (resulting from pairwise alignment +pub struct NucAlignment<'r, 'q, 'a> { + ref_seq: &'r [Nuc], + qry_seq: &'q [Nuc], + alignment_range: &'a NucRefGlobalRange, +} + +impl<'r, 'q, 'a> NucAlignment<'r, 'q, 'a> { + pub fn new(ref_seq: &'r [Nuc], qry_seq: &'q [Nuc], alignment_range: &'a NucRefGlobalRange) -> Self { + assert_eq!(ref_seq.len(), qry_seq.len()); + Self { + ref_seq, + qry_seq, + alignment_range, + } + } + + pub const fn len(&self) -> usize { + self.ref_seq.len() + } + + pub fn ref_at(&self, pos: NucRefGlobalPosition) -> Nuc { + self.ref_seq[pos.as_usize()] + } + + pub fn qry_at(&self, pos: NucRefGlobalPosition) -> Nuc { + self.qry_seq[pos.as_usize()] + } + + pub fn ref_range(&self, range: &NucRefGlobalRange) -> &[Nuc] { + let range = range.clamp_range(0, self.len()); + &self.ref_seq[range.to_std()] + } + + pub fn qry_range(&self, range: &NucRefGlobalRange) -> &[Nuc] { + let range = range.clamp_range(0, self.len()); + &self.qry_seq[range.to_std()] + } + + pub fn mut_at(&self, pos: NucRefGlobalPosition) -> NucSub { + NucSub { + ref_nuc: self.ref_at(pos), + pos, + qry_nuc: self.qry_at(pos), + } + } + + pub fn is_sequenced(&self, pos: NucRefGlobalPosition) -> bool { + pos >= 0 && self.alignment_range.contains(pos) + } +} diff --git a/packages/nextclade/src/run/nextclade_run_one.rs b/packages/nextclade/src/run/nextclade_run_one.rs index bef8304ec..0ed47f6ba 100644 --- a/packages/nextclade/src/run/nextclade_run_one.rs +++ b/packages/nextclade/src/run/nextclade_run_one.rs @@ -17,6 +17,7 @@ use crate::analyze::letter_composition::get_letter_composition; use crate::analyze::letter_ranges::{ find_aa_letter_ranges, find_letter_ranges, find_letter_ranges_by, CdsAaRange, NucRange, }; +use crate::analyze::nuc_alignment::NucAlignment; use crate::analyze::nuc_changes::{find_nuc_changes, FindNucChangesOutput}; use crate::analyze::nuc_del::NucDelRange; use crate::analyze::pcr_primer_changes::get_pcr_primer_changes; @@ -110,6 +111,8 @@ pub fn nextclade_run_one( alignment_range, } = find_nuc_changes(&stripped.qry_seq, ref_seq); + let aln = NucAlignment::new(ref_seq, &stripped.qry_seq, &alignment_range); + let total_substitutions = substitutions.len(); let total_deletions = deletions.iter().map(NucDelRange::len).sum(); @@ -201,8 +204,7 @@ pub fn nextclade_run_one( aa_deletions, nuc_to_aa_muts, } = aa_changes_find( - ref_seq, - &stripped.qry_seq, + &aln, ref_translation, &translation, gene_map, @@ -261,11 +263,11 @@ pub fn nextclade_run_one( let nearest_nodes = params.general.include_nearest_node_info.then_some( nearest_node_candidates - .iter() - // Choose all nodes with distance equal to the distance of the nearest node - .filter(|n| n.distance == nearest_node_candidates[0].distance) - .map(|n| Ok(graph.get_node(n.node_key)?.payload().name.clone())) - .collect::, Report>>()?, + .iter() + // Choose all nodes with distance equal to the distance of the nearest node + .filter(|n| n.distance == nearest_node_candidates[0].distance) + .map(|n| Ok(graph.get_node(n.node_key)?.payload().name.clone())) + .collect::, Report>>()?, ); let clade = nearest_node.clade();